--- /dev/null
+---
+Language: Cpp
+# BasedOnStyle: LLVM
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlinesLeft: true
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: false
+BinPackParameters: false
+BreakBeforeBraces: Custom
+BraceWrapping:
+ AfterClass: true
+ AfterControlStatement: false
+ AfterEnum: false
+ AfterFunction: true
+ AfterNamespace: false
+ AfterObjCDeclaration: false
+ AfterStruct: false
+ AfterUnion: false
+ BeforeCatch: false
+ BeforeElse: false
+ IndentBraces: false
+BreakBeforeBinaryOperators: None
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit: 90
+CommentPragmas: '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: false
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+ForEachMacros:
+ - foreach
+ - Q_FOREACH
+ - BOOST_FOREACH
+IncludeCategories:
+ - Regex: '^"(gnuradio)/'
+ Priority: 1
+ - Regex: '^<(gnuradio)/'
+ Priority: 2
+ - Regex: '^<(boost)/'
+ Priority: 98
+ - Regex: '^<[a-z]*>$'
+ Priority: 99
+ - Regex: '^".*"$'
+ Priority: 0
+ - Regex: '.*'
+ Priority: 10
+
+IncludeIsMainRegex: '(Test)?$'
+IndentCaseLabels: false
+IndentWidth: 4
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: true
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 2
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Left
+ReflowComments: true
+SortIncludes: true
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp11
+TabWidth: 8
+UseTab: Never
+
+
--- /dev/null
+#
+# Copyright 2022 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+name: Build on Android NDK
+
+on: [push, pull_request]
+
+jobs:
+ build:
+ name: Build on Android NDK ${{ matrix.arch.name }}
+
+ strategy:
+ fail-fast: false
+ matrix:
+ arch:
+ - { name: armeabi-v7a, allow_fail: false }
+ - { name: arm64-v8a, allow_fail: false }
+ - { name: x86, allow_fail: false }
+ - { name: x86_64, allow_fail: false }
+
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v3.0.0
+ with:
+ submodules: 'recursive'
+
+ - name: Update repositories
+ run: sudo apt update
+
+ # All dependencies
+ - name: Install dependencies
+ run: sudo apt install -y cmake openjdk-11-jre-headless wget unzip make python3-mako
+
+ # Setup Android SDK, and auto-accept licenses
+ - name: Install Android SDK
+ run: wget --quiet --output-document=android-sdk.zip https://dl.google.com/android/repository/commandlinetools-linux-8512546_latest.zip && mkdir android-sdk-linux && unzip -qq android-sdk.zip -d android-sdk-linux && export ANDROID_HOME=./android-sdk-linux && echo y | $ANDROID_HOME/cmdline-tools/bin/sdkmanager --sdk_root=android-sdk-linux --update && (echo y; echo y; echo y; echo y; echo y; echo y; echo y; echo y) | $ANDROID_HOME/cmdline-tools/bin/sdkmanager --sdk_root=android-sdk-linux --licenses
+
+ # Call SDKManager to install the Android NDK
+ - name: Install Android NDK
+ run: $GITHUB_WORKSPACE/android-sdk-linux/cmdline-tools/bin/sdkmanager --sdk_root=$GITHUB_WORKSPACE/android-sdk-linux --install "ndk;24.0.8215888" --channel=3
+
+ # Setup build directory
+ - name: Setup ${{ matrix.arch.name }}
+ shell: bash
+ run: cd $GITHUB_WORKSPACE/ && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/android-sdk-linux/ndk/24.0.8215888/build/cmake/android.toolchain.cmake -DANDROID_ABI=${{ matrix.arch.name }} -DANDROID_PLATFORM=android-23 ..
+
+ # Build
+ - name: Build ${{ matrix.arch.name }}
+ shell: bash
+ run: cd $GITHUB_WORKSPACE/build && make
+ continue-on-error: ${{ matrix.arch.allow_fail }}
--- /dev/null
+#
+# Copyright 2020, 2022 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+name: Check PR Formatting
+
+on:
+ push:
+ paths-ignore:
+ - 'tmpl/'
+ - 'include/volk/sse2neon.h'
+ pull_request:
+ paths-ignore:
+ - 'tmpl/'
+ - 'include/volk/sse2neon.h'
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v2
+ - uses: gnuradio/clang-format-lint-action@v0.5-4
+ with:
+ source: '.'
+ exclude: './tmpl,./include/volk/sse2neon.h'
+ extensions: 'c,cc,cpp,cxx,h,hh'
\ No newline at end of file
--- /dev/null
+#
+# Copyright 2020 - 2022 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+name: Run VOLK tests
+
+on: [push, pull_request]
+
+jobs:
+ build-ubuntu:
+ name: Build on ${{ matrix.compiler.distro }} ${{ matrix.compiler.name }}
+
+ strategy:
+ fail-fast: false
+ matrix:
+ compiler:
+ - { name: g++-7, cc: gcc-7, cxx: g++-7, distro: ubuntu-20.04 }
+ - { name: g++-8, cc: gcc-8, cxx: g++-8, distro: ubuntu-20.04 }
+ - { name: g++-9, cc: gcc-9, cxx: g++-9, distro: ubuntu-latest }
+ - { name: g++-10, cc: gcc-10, cxx: g++-10, distro: ubuntu-latest }
+ - { name: clang-7, cc: clang-7, cxx: clang++-7, distro: ubuntu-20.04 }
+ - { name: clang-8, cc: clang-8, cxx: clang++-8, distro: ubuntu-20.04 }
+ - { name: clang-9, cc: clang-9, cxx: clang++-9, distro: ubuntu-20.04 }
+ - { name: clang-10, cc: clang-10, cxx: clang++-10, distro: ubuntu-20.04 }
+ - { name: clang-11, cc: clang-11, cxx: clang++-11, distro: ubuntu-20.04 }
+ - { name: clang-12, cc: clang-12, cxx: clang++-12, distro: ubuntu-latest }
+ - { name: clang-13, cc: clang-13, cxx: clang++-13, distro: ubuntu-latest }
+ - { name: clang-14, cc: clang-14, cxx: clang++-14, distro: ubuntu-latest }
+ # - { name: clang-15, cc: clang-15, cxx: clang++-15, distro: ubuntu-latest }
+
+ runs-on: ${{ matrix.compiler.distro }}
+
+ steps:
+ - uses: actions/checkout@v3.1.0
+ with:
+ submodules: 'recursive'
+ - name: Install dependencies
+ run: sudo apt install python3-mako liborc-dev ${{ matrix.compiler.name }}
+ - name: Configure
+ env:
+ CC: ${{ matrix.compiler.cc }}
+ CXX: ${{ matrix.compiler.cxx }}
+ run: mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-Werror" ..
+ - name: Build
+ run: |
+ echo "Build with $(nproc) thread(s)"
+ cmake --build build -j$(nproc)
+ - name: Print info
+ run: |
+ ./build/cpu_features/list_cpu_features
+ ./build/apps/volk-config-info --alignment
+ ./build/apps/volk-config-info --avail-machines
+ ./build/apps/volk-config-info --all-machines
+ ./build/apps/volk-config-info --malloc
+ ./build/apps/volk-config-info --cc
+ - name: Test
+ run: |
+ cd build
+ ctest -V
+
+ build-ubuntu-arm:
+ # The host should always be linux
+ # see: https://github.com/uraimo/run-on-arch-action
+ runs-on: ubuntu-latest
+ name: Build on ${{ matrix.distro }} ${{ matrix.arch }} ${{ matrix.compiler.name }}
+
+ # Run steps on a matrix of compilers and possibly archs.
+ strategy:
+ fail-fast: false
+ matrix:
+ include:
+ - arch: aarch64
+ distro: ubuntu20.04
+ compiler: { name: g++-8, cc: gcc-8, cxx: g++-8 }
+ - arch: aarch64
+ distro: ubuntu20.04
+ compiler: { name: g++-9, cc: gcc-9, cxx: g++-9 }
+ - arch: aarch64
+ distro: ubuntu20.04
+ compiler: { name: g++-10, cc: gcc-10, cxx: g++-10 }
+ - arch: aarch64
+ distro: ubuntu22.04
+ compiler: { name: g++-12, cc: gcc-12, cxx: g++-12 }
+ - arch: aarch64
+ distro: ubuntu20.04
+ compiler: { name: clang-9, cc: clang-9, cxx: clang++-9 }
+ - arch: aarch64
+ distro: ubuntu20.04
+ compiler: { name: clang-10, cc: clang-10, cxx: clang++-10 }
+ - arch: aarch64
+ distro: ubuntu22.04
+ compiler: { name: clang-14, cc: clang-14, cxx: clang++-14 }
+ - arch: armv7
+ distro: ubuntu22.04
+ compiler: { name: g++-12, cc: gcc-12, cxx: g++-12 }
+ - arch: ppc64le
+ distro: ubuntu22.04
+ compiler: { name: g++-12, cc: gcc-12, cxx: g++-12 }
+ - arch: s390x
+ distro: ubuntu22.04
+ compiler: { name: g++-12, cc: gcc-12, cxx: g++-12 }
+          # RISC-V support is still experimental; this job is allowed to fail (see continue-on-error below).
+ - arch: riscv64
+ distro: ubuntu22.04
+ compiler: { name: g++-12, cc: gcc-12, cxx: g++-12 }
+
+ steps:
+ - uses: actions/checkout@v3.1.0
+ with:
+ submodules: 'recursive'
+ - uses: uraimo/run-on-arch-action@v2.5.0
+ name: Build in non-x86 container
+ continue-on-error: ${{ contains(fromJson('["ppc64le", "s390x", "riscv64"]'), matrix.arch) || contains(fromJson('["clang-14"]'), matrix.compiler.name) }}
+ id: build
+ with:
+ arch: ${{ matrix.arch }}
+ distro: ${{ matrix.distro }}
+
+ # Not required, but speeds up builds
+ githubToken: ${{ github.token }}
+
+ setup: |
+ mkdir -p "${PWD}/build"
+
+ dockerRunArgs: |
+ --volume "${PWD}:/volk"
+
+ env: | # YAML, but pipe character is necessary
+ CC: ${{ matrix.compiler.cc }}
+ CXX: ${{ matrix.compiler.cxx }}
+
+ shell: /bin/sh
+
+ install: |
+ case "${{ matrix.distro }}" in
+ ubuntu*|jessie|stretch|buster)
+ apt-get update -q -y
+ apt-get install -q -y git cmake python3-mako liborc-dev ${{ matrix.compiler.name }}
+ ;;
+ fedora*)
+ dnf -y update
+ dnf -y install git which
+ ;;
+ esac
+
+ run: |
+ cd /volk
+ cd build
+ cmake -DCMAKE_CXX_FLAGS="-Werror" ..
+ echo "Build with $(nproc) thread(s)"
+ make -j$(nproc)
+ ./cpu_features/list_cpu_features
+ ./apps/volk-config-info --alignment
+ ./apps/volk-config-info --avail-machines
+ ./apps/volk-config-info --all-machines
+ ./apps/volk-config-info --malloc
+ ./apps/volk-config-info --cc
+ ctest -V
+
+ build-ubuntu-static:
+ name: Build static on ubuntu-latest
+
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v3.1.0
+ with:
+ submodules: 'recursive'
+ - name: dependencies
+ run: sudo apt install python3-mako liborc-dev
+ - name: configure
+ run: mkdir build && cd build && cmake -DENABLE_STATIC_LIBS=True ..
+ - name: build
+ run: cmake --build build -j$(nproc)
+ - name: Print info
+ run: |
+ ./build/cpu_features/list_cpu_features
+ ./build/apps/volk-config-info --alignment
+ ./build/apps/volk-config-info --avail-machines
+ ./build/apps/volk-config-info --all-machines
+ ./build/apps/volk-config-info --malloc
+ ./build/apps/volk-config-info --cc
+ - name: test
+ run: cd build && ctest -V
+
+ build-windows:
+
+ runs-on: windows-latest
+
+ steps:
+ - uses: actions/checkout@v3.1.0
+ with:
+ submodules: 'recursive'
+ - name: dependencies
+ run: pip install mako
+ - name: configure
+ run: mkdir build && cd build && cmake ..
+ - name: build
+ run: cmake --build build --config Release --target INSTALL -j2
+ - name: test
+ run: cd build && ctest -V -C Release
+
+ # build-windows-msys2:
+ # name: Build on windows-latest using MinGW and MSYS2
+
+ # runs-on: windows-latest
+ # steps:
+ # - uses: msys2/setup-msys2@v2
+ # with:
+ # update: true
+ # install: >-
+ # base-devel
+ # git
+ # mingw-w64-x86_64-gcc-libs
+ # mingw-w64-x86_64-orc
+ # python
+ # python-mako
+ # python-six
+ # mingw-w64-x86_64-gcc
+ # mingw-w64-x86_64-cmake
+ # - uses: actions/checkout@v2
+ # - name: Checkout submodules
+ # run: git submodule update --init --recursive
+ # - name: Configure
+ # shell: msys2 {0}
+ # run: mkdir build && cd build && cmake -G "MSYS Makefiles" -DCMAKE_BUILD_TYPE=Release -DPYTHON_EXECUTABLE=/usr/bin/python3 -DCMAKE_CXX_FLAGS="-Werror" ..
+ # - name: Build
+ # shell: msys2 {0}
+ # run: cd build && make -j$(nproc)
+ # - name: Test
+ # shell: msys2 {0}
+ # run: |
+ # cd build
+ # ctest -V
+
+ build-macos:
+
+ runs-on: macos-latest
+
+ steps:
+ - uses: actions/checkout@v3.1.0
+ with:
+ submodules: 'recursive'
+ - name: dependencies
+ run: pip3 install mako
+ - name: configure
+ run: mkdir build && cd build && cmake ..
+ - name: build
+ run: cmake --build build --config Debug -j3
+ - name: Print info
+ run: |
+ ./build/cpu_features/list_cpu_features
+ # ./build/apps/volk-config-info --alignment
+ # ./build/apps/volk-config-info --avail-machines
+ # ./build/apps/volk-config-info --all-machines
+ # ./build/apps/volk-config-info --malloc
+ # ./build/apps/volk-config-info --cc
+ - name: test
+ run: cd build && ctest -V
--- /dev/null
+*~
+*.pyc
+*.pyo
+*build*/
+archives/
--- /dev/null
+#
+# Copyright 2018 - 2020, 2022 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+# This file is a template, and might need editing before it works on your project.
+# use an official Ubuntu image
+# other versions can be used as well, e.g. ubuntu:20.04
+# see https://hub.docker.com/_/ubuntu/
+image: ubuntu:18.04
+
+build:
+ stage: build
+ # instead of calling g++ directly you can also use some build toolkit like make
+ # install the necessary build tools when needed
+ before_script:
+ - apt update && apt -y install make cmake python python-pip && pip install mako
+ script:
+ - mkdir build && cd build && cmake .. && make -j
+ artifacts:
+ paths:
+ - build/
+ # depending on your build setup it's most likely a good idea to cache outputs to reduce the build time
+ # cache:
+ # paths:
+ # - "*.o"
+
+# run tests using the binary built before
+test:
+ stage: test
+ before_script:
+ - apt update && apt -y install cmake python python-pip && pip install mako
+ script:
+ - cd build && ctest -V
+
--- /dev/null
+[submodule "cpu_features"]
+ path = cpu_features
+ url = https://github.com/google/cpu_features.git
--- /dev/null
+#
+# Copyright 2018 - 2022 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+language: cpp
+
+os: linux
+dist: bionic
+
+addons:
+ apt:
+ packages: &common_packages
+ - python3-mako
+ - liborc-dev
+
+env:
+ global:
+ - SDE_VERSION=sde-external-8.50.0-2020-03-26-lin
+ - SDE_URL=http://software.intel.com/content/dam/develop/external/us/en/protected/
+
+matrix:
+ include:
+ # Job 1 ... gcc-7 with Intel SDE
+ - name: Linux x86 Intel SDE GCC 7
+ env: MATRIX_EVAL="CC=gcc-7 && CXX=g++-7 CMAKE_ARG=-DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/intel-sde.cmake"
+ addons: {apt: {sources: "ubuntu-toolchain-r-test", packages: [*common_packages]}}
+ cache:
+ directories:
+ - ${TRAVIS_BUILD_DIR}/cache
+ before_script:
+ - cd ${TRAVIS_BUILD_DIR} && ./scripts/ci/download_intel_sde.sh
+
+ # Job 4 ... gcc-6
+ - name: Linux x86 GCC 6
+ env: MATRIX_EVAL="CC=gcc-6 && CXX=g++-6"
+ addons: {apt: {sources: "ubuntu-toolchain-r-test", packages: [*common_packages, g++-6]}}
+
+ # Job 7 ... ARMv7 cross compile
+ - name: Linux ARMv7 Qemu GCC 7
+ env: MATRIX_EVAL="CMAKE_ARG=-DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/arm-linux-gnueabihf.cmake"
+ addons: {apt: {sources: "ubuntu-toolchain-r-test", packages: [*common_packages, g++-arm-linux-gnueabihf, qemu-user]}}
+
+ # Job 8 ... ARMv8 (aarch64) cross compile
+ - name: Linux ARMv8 (aarch64) Qemu GCC 7
+ env: MATRIX_EVAL="CMAKE_ARG=-DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/aarch64-linux-gnu.cmake"
+ addons: {apt: {sources: "ubuntu-toolchain-r-test", packages: [*common_packages, g++-aarch64-linux-gnu, qemu-user]}}
+
+ # Job 9 ... clang
+ - name: Linux x86 Clang 6
+ env: MATRIX_EVAL="CC=\"clang -fprofile-instr-generate -fcoverage-mapping\" && CXX=\"clang++ -fprofile-instr-generate -fcoverage-mapping\""
+ addons: {apt: {packages: [*common_packages, ]}}
+
+ - name: Linux ARMv8 (aarch64) GCC 7
+ arch: arm64
+ env: MATRIX_EVAL="CC=gcc-7 && CXX=g++-7"
+ addons: {apt: {packages: [*common_packages, ]}}
+
+ - name: Linux ARMv8 (aarch64) Clang 6
+ arch: arm64
+ env: MATRIX_EVAL="CC=clang && CXX=clang++"
+ addons: {apt: {packages: [*common_packages, ]}}
+
+script:
+ - eval "${MATRIX_EVAL}"
+ - lscpu
+ - git submodule update --init --recursive
+ - mkdir build && cd build
+ - cmake ${CMAKE_ARG} ../
+ - make
+ - echo $(./apps/volk-config-info --malloc) && echo $(./apps/volk-config-info --alignment) && echo "All compiled VOLK machines:" $(./apps/volk-config-info --all-machines) && echo "Available VOLK machines:" $(./apps/volk-config-info --avail-machines)
+ - ctest -V
--- /dev/null
+{
+ "title": "Vector-Optimized Library of Kernels (VOLK)",
+ "license": {
+ "id": "LGPL-3.0-or-later"
+ },
+ "keywords": [
+ "communication",
+ "software radio",
+ "SDR",
+ "C",
+ "C++",
+ "SIMD",
+ "library"
+ ],
+ "creators": [
+ {
+ "affiliation": "Department of Communications Engineering, University of Bremen, Germany",
+ "name": "Demel, Johannes",
+ "orcid": "0000-0002-5434-7232"
+ },
+ {
+ "name": "Dickens, Michael"
+ },
+ {
+ "name": "Anderson, Douglas"
+ },
+ {
+ "name": "Ashton, Brennan"
+ },
+ {
+ "name": "Balister, Philip"
+ },
+ {
+ "name": "Behar, Doron"
+ },
+ {
+ "name": "Behnke, Steven"
+ },
+ {
+ "name": "Bekhit, Amr"
+ },
+ {
+ "affiliation": "Carnegie Mellon University, IIT Bombay",
+ "name": "Bhowmick, Abhishek"
+ },
+ {
+ "name": "Blossom, Eric"
+ },
+ {
+ "name": "Blum, Josh"
+ },
+ {
+ "name": "Bottoms, A. Maitland"
+ },
+ {
+ "name": "Briggs, Elliot"
+ },
+ {
+ "name": "Cardoso, Jeison"
+ },
+ {
+ "name": "Cercueil, Paul"
+ },
+ {
+ "affiliation": "Corgan Labs",
+ "name": "Corgan, Johnathan"
+ },
+ {
+ "affiliation": "Skylark Wireless (@skylarkwireless)",
+ "name": "Corgan, Nicholas"
+ },
+ {
+ "name": "Cruz, Luigi"
+ },
+ {
+ "name": "Economos, Ron"
+ },
+ {
+ "name": "Enochs, Brandon P."
+ },
+ {
+ "affiliation": "Centre Tecnol\u00f2gic de Telecomunicacions de Catalunya (CTTC)",
+ "name": "Fernandez, Carles"
+ },
+ {
+ "name": "Fischer, Moritz"
+ },
+ {
+ "name": "Foster, Nick"
+ },
+ {
+ "name": "Geiger, Douglas"
+ },
+ {
+ "name": "Giard, Pascal"
+ },
+ {
+ "name": "Goavec-Merou, Gwenhael"
+ },
+ {
+ "name": "Hilburn, Ben"
+ },
+ {
+ "name": "Holguin, Albert"
+ },
+ {
+ "name": "Iwamoto, Jessica"
+ },
+ {
+ "name": "Kaesberger, Martin"
+ },
+ {
+ "name": "Lichtman, Marc"
+ },
+ {
+ "name": "Logue, Kyle A"
+ },
+ {
+ "name": "Lundmark, Magnus"
+ },
+ {
+ "name": "Markgraf, Steve"
+ },
+ {
+ "name": "Mayer, Christoph"
+ },
+ {
+ "name": "McCarthy, Nicholas"
+ },
+ {
+ "name": "McCarthy, Nick"
+ },
+ {
+ "affiliation": "University of Colorado Boulder",
+ "name": "Miralles, Damian"
+ },
+ {
+ "name": "Munaut, Sylvain"
+ },
+ {
+ "name": "M\u00fcller, Marcus"
+ },
+ {
+ "affiliation": "GCN Development",
+ "name": "Nieboer, Geof"
+ },
+ {
+ "affiliation": "@deepsig",
+ "name": "O'Shea, Tim"
+ },
+ {
+ "name": "Olivain, Julien"
+ },
+ {
+ "name": "Oltmanns, Stefan"
+ },
+ {
+ "name": "Pinkava, Jiri"
+ },
+ {
+ "name": "Piscopo, Mike"
+ },
+ {
+ "name": "Quiceno, Jam M. Hernandez"
+ },
+ {
+ "name": "Rene, Mathieu"
+ },
+ {
+ "name": "Ritterhoff, Florian"
+ },
+ {
+ "name": "Robertson, Dan"
+ },
+ {
+ "name": "Rocca, Federico 'Larroca' La"
+ },
+ {
+ "name": "Rode, Andrej"
+ },
+ {
+ "name": "Rodionov, Andrey"
+ },
+ {
+ "name": "Roe, Michael"
+ },
+ {
+ "affiliation": "GNU Radio",
+ "name": "Rondeau, Tom"
+ },
+ {
+ "name": "Sekine, Takehiro"
+ },
+ {
+ "name": "Semich, Karl"
+ },
+ {
+ "name": "Sergeev, Vanya"
+ },
+ {
+ "name": "Slokva, Alexey"
+ },
+ {
+ "name": "Smith, Clayton"
+ },
+ {
+ "affiliation": "Medurit AB",
+ "name": "Stigo, Albin"
+ },
+ {
+ "name": "Thompson, Adam"
+ },
+ {
+ "name": "Thompson, Roy"
+ },
+ {
+ "name": "Velichkov, Vasil"
+ },
+ {
+ "affiliation": "@MITHaystack",
+ "name": "Volz, Ryan"
+ },
+ {
+ "name": "Walls, Andy"
+ },
+ {
+ "name": "Ward, Don"
+ },
+ {
+ "name": "West, Nathan"
+ },
+ {
+ "name": "Wiedemann, Bernhard M."
+ },
+ {
+ "name": "Wunsch, Stefan"
+ },
+ {
+ "name": "Zapodovnikov, Valerii"
+ },
+ {
+ "name": "\u0160karvada, Jaroslav"
+ },
+ {
+ "name": "Aang23"
+ },
+ {
+ "name": "AlexandreRouma"
+ },
+ {
+ "name": "Andrew"
+ },
+ {
+ "name": "Zlika"
+ },
+ {
+ "name": "luz.paz"
+ },
+ {
+ "name": "rear1019"
+ }
+ ]
+}
\ No newline at end of file
--- /dev/null
+#
+# Copyright 2011-2020 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+
+########################################################################
+# Project setup
+########################################################################
+cmake_minimum_required(VERSION 3.8)
+set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "Choose build type: None Debug Release RelWithDebInfo MinSizeRel")
+project(volk)
+
+enable_language(CXX)
+enable_language(C)
+
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_CXX_STANDARD 17)
+
+enable_testing()
+
+
+########################################################################
+# Common compile flags
+########################################################################
+
+# Disable complex math NaN/INF range checking for performance
+include(CheckCXXCompilerFlag)
+check_cxx_compiler_flag(-fcx-limited-range HAVE_CX_LIMITED_RANGE)
+if(HAVE_CX_LIMITED_RANGE)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcx-limited-range")
+endif(HAVE_CX_LIMITED_RANGE)
+
+include(CheckCCompilerFlag)
+check_c_compiler_flag(-fcx-limited-range HAVE_C_LIMITED_RANGE)
+if(HAVE_C_LIMITED_RANGE)
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fcx-limited-range")
+endif(HAVE_C_LIMITED_RANGE)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
+add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)
+
+if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU")
+ # Abort compilation if kernel implementations have inconsistent function
+ # prototypes, i.e. if
+ #
+ # kernel_foo_sse(uint32_t *dst, lv32fc_t *src)
+ # kernel_foo_avx(uint16_t *dst, lv32fc_t *src)
+ #
+    # are defined. Note the different data type of the first argument. By
+ # default 'incompatible-pointer-types' is a warning only and 'pointer-sign'
+ # is a warning enabled by '-Wall'. These warnings are only applicable to C.
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=incompatible-pointer-types -Werror=pointer-sign")
+endif()
+
+set(CMAKE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) #allows this to be a sub-project
+set(CMAKE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) #allows this to be a sub-project
+list(INSERT CMAKE_MODULE_PATH 0 ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules) #location for custom "Modules"
+
+include(VolkBuildTypes)
+#select the release build type by default to get optimization flags
+if(NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE "Release")
+ message(STATUS "Build type not specified: defaulting to release.")
+endif()
+VOLK_CHECK_BUILD_TYPE(${CMAKE_BUILD_TYPE})
+set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "")
+message(STATUS "Build type set to ${CMAKE_BUILD_TYPE}.")
+
+########################################################################
+# Version setup
+########################################################################
+
+set(VERSION_INFO_MAJOR_VERSION 3)
+set(VERSION_INFO_MINOR_VERSION 0)
+set(VERSION_INFO_MAINT_VERSION 0)
+include(VolkVersion) #setup version info
+
+macro(set_version_str VAR)
+ set(IN_VER ${VERSION_INFO_${VAR}_VERSION})
+ string(LENGTH "${IN_VER}" VER_LEN)
+ if(${VER_LEN} EQUAL 1)
+ set(VOLK_VERSION_${VAR} "0${IN_VER}")
+ else()
+ set(VOLK_VERSION_${VAR} "${IN_VER}")
+ endif()
+endmacro()
+
+set_version_str(MAJOR)
+set_version_str(MINOR)
+set_version_str(MAINT)
+
+configure_file(
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_version.h.in
+ ${CMAKE_BINARY_DIR}/include/volk/volk_version.h
+@ONLY)
+
+########################################################################
+# Environment setup
+########################################################################
+IF(NOT DEFINED CROSSCOMPILE_MULTILIB)
+ SET(CROSSCOMPILE_MULTILIB "")
+ENDIF()
+SET(CROSSCOMPILE_MULTILIB ${CROSSCOMPILE_MULTILIB} CACHE STRING "Define \"true\" if you have and want to use multiple C development libs installed for cross compile")
+
+if(MSVC)
+ add_definitions(-D_USE_MATH_DEFINES) #enables math constants on all supported versions of MSVC
+ add_compile_options(/W1) #reduce warnings
+ add_compile_options(/wo4309)
+ add_compile_options(/wd4752)
+ add_compile_options(/wo4273)
+ add_compile_options(/wo4838)
+endif(MSVC)
+
+########################################################################
+# Dependencies setup
+########################################################################
+
+# cpu_features - sensible defaults, user settable option
+if(CMAKE_SYSTEM_PROCESSOR MATCHES
+ "(^mips)|(^arm)|(^aarch64)|(x86_64)|(AMD64|amd64)|(^i.86$)|(^powerpc)|(^ppc)")
+ option(VOLK_CPU_FEATURES "Volk uses cpu_features" ON)
+else()
+ option(VOLK_CPU_FEATURES "Volk uses cpu_features" OFF)
+endif()
+
+if (VOLK_CPU_FEATURES)
+ find_package(CpuFeatures QUIET)
+ if(NOT CpuFeatures_FOUND)
+ message(STATUS "cpu_features package not found. Requiring cpu_features submodule ...")
+ if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/cpu_features/CMakeLists.txt" )
+ message(FATAL_ERROR "cpu_features/CMakeLists.txt not found. Did you forget to git clone recursively?\nFix with: git submodule update --init")
+ endif()
+ message(STATUS "Building Volk with cpu_features")
+ set(BUILD_TESTING OFF CACHE BOOL "Build cpu_features without tests." FORCE)
+ set(BUILD_PIC ON CACHE BOOL
+ "Build cpu_features with Position Independent Code (PIC)."
+ FORCE)
+ set(CMAKE_POSITION_INDEPENDENT_CODE ON CACHE BOOL
+ "Build cpu_features with Position Independent Code (PIC)."
+ FORCE)
+ set(BUILD_SHARED_LIBS_SAVED "${BUILD_SHARED_LIBS}")
+ set(BUILD_SHARED_LIBS OFF)
+ add_subdirectory(cpu_features)
+ set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}")
+ endif()
+else()
+ message(STATUS "Building Volk without cpu_features")
+endif()
+
+# Python
+include(VolkPython) #sets PYTHON_EXECUTABLE and PYTHON_DASH_B
+VOLK_PYTHON_CHECK_MODULE("python >= 3.4" sys "sys.version_info >= (3, 4)" PYTHON_MIN_VER_FOUND)
+VOLK_PYTHON_CHECK_MODULE("mako >= 0.4.2" mako "mako.__version__ >= '0.4.2'" MAKO_FOUND)
+
+if(NOT PYTHON_MIN_VER_FOUND)
+ message(FATAL_ERROR "Python 3.4 or greater required to build VOLK")
+endif()
+
+# Mako
+if(NOT MAKO_FOUND)
+ message(FATAL_ERROR "Mako templates required to build VOLK")
+endif()
+
+# Check if we have std::filesystem
+find_package(FILESYSTEM COMPONENTS Final Experimental REQUIRED)
+
+set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+
+########################################################################
+# check for aligned_alloc, since some compilers lack this C11 feature.
+# For Apple-clang use `posix_memalign`
+# For MSVC use `_aligned_malloc`.
+########################################################################
+include(CheckSymbolExists)
+if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin"))
+ CHECK_SYMBOL_EXISTS(aligned_alloc stdlib.h USE_ALIGNED_ALLOC)
+endif()
+if(NOT USE_ALIGNED_ALLOC)
+ CHECK_SYMBOL_EXISTS(posix_memalign stdlib.h HAVE_POSIX_MEMALIGN)
+endif()
+
+########################################################################
+# Check if Orc is available
+########################################################################
+option(ENABLE_ORC "Enable Orc" True)
+if(ENABLE_ORC)
+ find_package(ORC)
+else(ENABLE_ORC)
+ message(STATUS "Disabling use of ORC")
+endif(ENABLE_ORC)
+
+########################################################################
+# Setup doxygen
+########################################################################
+add_subdirectory(docs)
+
+########################################################################
+# Detect /lib versus /lib64
+########################################################################
+if (${CMAKE_INSTALL_LIBDIR} MATCHES lib64)
+ set(LIB_SUFFIX 64)
+endif()
+
+########################################################################
+# Setup the package config file
+########################################################################
+#set variables found in the pc.in file
+set(prefix ${CMAKE_INSTALL_PREFIX})
+set(exec_prefix "\${prefix}")
+set(libdir "\${exec_prefix}/lib${LIB_SUFFIX}")
+set(includedir "\${prefix}/include")
+
+configure_file(
+ ${CMAKE_CURRENT_SOURCE_DIR}/tmpl/volk.pc.in
+ ${CMAKE_CURRENT_BINARY_DIR}/volk.pc
+@ONLY)
+
+install(
+ FILES ${CMAKE_CURRENT_BINARY_DIR}/volk.pc
+ DESTINATION lib${LIB_SUFFIX}/pkgconfig
+ COMPONENT "volk_devel"
+)
+
+########################################################################
+# Install all headers in the include directories
+########################################################################
+set(VOLK_RUNTIME_DIR bin)
+set(VOLK_LIBRARY_DIR lib${LIB_SUFFIX})
+set(VOLK_INCLUDE_DIR include)
+
+install(
+ DIRECTORY ${CMAKE_SOURCE_DIR}/kernels/volk
+ DESTINATION include COMPONENT "volk_devel"
+ FILES_MATCHING PATTERN "*.h"
+)
+
+install(FILES
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_prefs.h
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_alloc.hh
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_complex.h
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_common.h
+ ${CMAKE_SOURCE_DIR}/include/volk/saturation_arithmetic.h
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h
+ ${CMAKE_BINARY_DIR}/include/volk/volk.h
+ ${CMAKE_BINARY_DIR}/include/volk/volk_cpu.h
+ ${CMAKE_BINARY_DIR}/include/volk/volk_config_fixed.h
+ ${CMAKE_BINARY_DIR}/include/volk/volk_typedefs.h
+ ${CMAKE_SOURCE_DIR}/include/volk/volk_malloc.h
+ ${CMAKE_BINARY_DIR}/include/volk/volk_version.h
+ ${CMAKE_SOURCE_DIR}/include/volk/constants.h
+ DESTINATION include/volk
+ COMPONENT "volk_devel"
+)
+
+########################################################################
+# On Apple only, set install name and use rpath correctly, if not already set
+########################################################################
+if(APPLE)
+ if(NOT CMAKE_INSTALL_NAME_DIR)
+ set(CMAKE_INSTALL_NAME_DIR
+ ${CMAKE_INSTALL_PREFIX}/${VOLK_LIBRARY_DIR} CACHE
+ PATH "Library Install Name Destination Directory" FORCE)
+ endif(NOT CMAKE_INSTALL_NAME_DIR)
+ if(NOT CMAKE_INSTALL_RPATH)
+ set(CMAKE_INSTALL_RPATH
+ ${CMAKE_INSTALL_PREFIX}/${VOLK_LIBRARY_DIR} CACHE
+ PATH "Library Install RPath" FORCE)
+ endif(NOT CMAKE_INSTALL_RPATH)
+ if(NOT CMAKE_BUILD_WITH_INSTALL_RPATH)
+ set(CMAKE_BUILD_WITH_INSTALL_RPATH ON CACHE
+ BOOL "Do Build Using Library Install RPath" FORCE)
+ endif(NOT CMAKE_BUILD_WITH_INSTALL_RPATH)
+endif(APPLE)
+
+########################################################################
+# Create uninstall target
+########################################################################
+configure_file(
+ ${CMAKE_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in
+ ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake
+@ONLY)
+
+# Only add the target if there isn't one defined already
+if(NOT TARGET uninstall)
+ add_custom_target(uninstall
+ ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake
+ )
+endif()
+
+
+########################################################################
+# Install our Cmake modules into $prefix/lib/cmake/volk
+# See "Package Configuration Files" on page:
+# http://www.cmake.org/Wiki/CMake/Tutorials/Packaging
+########################################################################
+
+configure_file(
+ ${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfig.cmake.in
+ ${CMAKE_BINARY_DIR}/cmake/Modules/VolkConfig.cmake
+@ONLY)
+
+configure_file(
+ ${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfigVersion.cmake.in
+ ${CMAKE_BINARY_DIR}/cmake/Modules/VolkConfigVersion.cmake
+@ONLY)
+
+
+########################################################################
+# Install cmake search routine for external use
+########################################################################
+
+if(NOT CMAKE_MODULES_DIR)
+ set(CMAKE_MODULES_DIR lib${LIB_SUFFIX}/cmake)
+endif(NOT CMAKE_MODULES_DIR)
+
+install(
+ FILES
+ ${CMAKE_CURRENT_BINARY_DIR}/cmake/Modules/VolkConfig.cmake
+ ${CMAKE_CURRENT_BINARY_DIR}/cmake/Modules/VolkConfigVersion.cmake
+ DESTINATION ${CMAKE_MODULES_DIR}/volk
+ COMPONENT "volk_devel" )
+
+install(EXPORT VOLK-export FILE VolkTargets.cmake
+ NAMESPACE Volk:: DESTINATION ${CMAKE_MODULES_DIR}/volk )
+
+########################################################################
+# Option to enable QA testing, on by default
+########################################################################
+OPTION(ENABLE_TESTING "Enable QA testing" ON)
+if(ENABLE_TESTING)
+ message(STATUS "QA Testing is enabled.")
+else()
+ message(STATUS "QA Testing is disabled.")
+endif()
+message(STATUS " Modify using: -DENABLE_TESTING=ON/OFF")
+
+########################################################################
+# Option to enable post-build profiling using volk_profile, off by default
+########################################################################
+OPTION(ENABLE_PROFILING "Launch system profiler after build" OFF)
+if(ENABLE_PROFILING)
+ if(DEFINED VOLK_CONFIGPATH)
+ get_filename_component(VOLK_CONFIGPATH ${VOLK_CONFIGPATH} ABSOLUTE)
+ set(VOLK_CONFIGPATH "${VOLK_CONFIGPATH}/volk")
+ message(STATUS "System profiling is enabled, using path: ${VOLK_CONFIGPATH}")
+ elseif(DEFINED ENV{VOLK_CONFIGPATH})
+ set(VOLK_CONFIGPATH "$ENV{VOLK_CONFIGPATH}/volk")
+ message(STATUS "System profiling is enabled, using env path: $ENV{VOLK_CONFIGPATH}")
+ else()
+ message(STATUS "System profiling is enabled with default paths.")
+ if(DEFINED ENV{HOME})
+ set(VOLK_CONFIGPATH "$ENV{HOME}/.volk")
+ elseif(DEFINED ENV{APPDATA})
+ set(VOLK_CONFIGPATH "$ENV{APPDATA}/.volk")
+ endif()
+ endif()
+else()
+ message(STATUS "System profiling is disabled.")
+endif()
+message(STATUS " Modify using: -DENABLE_PROFILING=ON/OFF")
+
+########################################################################
+# Setup the library
+########################################################################
+add_subdirectory(lib)
+
+########################################################################
+# And the utility apps
+########################################################################
+add_subdirectory(apps)
+option(ENABLE_MODTOOL "Enable volk_modtool python utility" True)
+if(ENABLE_MODTOOL)
+ add_subdirectory(python/volk_modtool)
+endif()
+
+########################################################################
+# And the LGPL license check
+########################################################################
+# detect default for LGPL
+if(VERSION_INFO_MAJOR_VERSION GREATER_EQUAL 3)
+ OPTION(ENABLE_LGPL "Enable testing for LGPL" ON)
+else()
+ OPTION(ENABLE_LGPL "Enable testing for LGPL" OFF)
+endif()
+
+if(ENABLE_TESTING AND ENABLE_LGPL)
+ message(STATUS "Checking for LGPL resubmission enabled")
+ add_test("check_lgpl" ${CMAKE_SOURCE_DIR}/scripts/licensing/count_contrib.sh ${CMAKE_SOURCE_DIR}/AUTHORS_RESUBMITTING_UNDER_LGPL_LICENSE.md)
+endif()
+
+########################################################################
+# Print summary
+########################################################################
+message(STATUS "Using install prefix: ${CMAKE_INSTALL_PREFIX}")
--- /dev/null
+ GNU LESSER GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+ This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+ 0. Additional Definitions.
+
+ As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+ "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+ An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+ A "Combined Work" is a work produced by combining or linking an
+Application with the Library. The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+ The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+ The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+ 1. Exception to Section 3 of the GNU GPL.
+
+ You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+ 2. Conveying Modified Versions.
+
+ If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+ a) under this License, provided that you make a good faith effort to
+ ensure that, in the event an Application does not supply the
+ function or data, the facility still operates, and performs
+ whatever part of its purpose remains meaningful, or
+
+ b) under the GNU GPL, with none of the additional permissions of
+ this License applicable to that copy.
+
+ 3. Object Code Incorporating Material from Library Header Files.
+
+ The object code form of an Application may incorporate material from
+a header file that is part of the Library. You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+ a) Give prominent notice with each copy of the object code that the
+ Library is used in it and that the Library and its use are
+ covered by this License.
+
+ b) Accompany the object code with a copy of the GNU GPL and this license
+ document.
+
+ 4. Combined Works.
+
+ You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+ a) Give prominent notice with each copy of the Combined Work that
+ the Library is used in it and that the Library and its use are
+ covered by this License.
+
+ b) Accompany the Combined Work with a copy of the GNU GPL and this license
+ document.
+
+ c) For a Combined Work that displays copyright notices during
+ execution, include the copyright notice for the Library among
+ these notices, as well as a reference directing the user to the
+ copies of the GNU GPL and this license document.
+
+ d) Do one of the following:
+
+ 0) Convey the Minimal Corresponding Source under the terms of this
+ License, and the Corresponding Application Code in a form
+ suitable for, and under terms that permit, the user to
+ recombine or relink the Application with a modified version of
+ the Linked Version to produce a modified Combined Work, in the
+ manner specified by section 6 of the GNU GPL for conveying
+ Corresponding Source.
+
+ 1) Use a suitable shared library mechanism for linking with the
+ Library. A suitable mechanism is one that (a) uses at run time
+ a copy of the Library already present on the user's computer
+ system, and (b) will operate properly with a modified version
+ of the Library that is interface-compatible with the Linked
+ Version.
+
+ e) Provide Installation Information, but only if you would otherwise
+ be required to provide such information under section 6 of the
+ GNU GPL, and only to the extent that such information is
+ necessary to install and execute a modified version of the
+ Combined Work produced by recombining or relinking the
+ Application with a modified version of the Linked Version. (If
+ you use option 4d0, the Installation Information must accompany
+ the Minimal Corresponding Source and Corresponding Application
+ Code. If you use option 4d1, you must provide the Installation
+ Information in the manner specified by section 6 of the GNU GPL
+ for conveying Corresponding Source.)
+
+ 5. Combined Libraries.
+
+ You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+ a) Accompany the combined library with a copy of the same work based
+ on the Library, uncombined with any other library facilities,
+ conveyed under the terms of this License.
+
+ b) Give prominent notice with the combined library that part of it
+ is a work based on the Library, and explaining where to find the
+ accompanying uncombined form of the same work.
+
+ 6. Revised Versions of the GNU Lesser General Public License.
+
+ The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+ If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
--- /dev/null
+[](https://travis-ci.com/gnuradio/volk) [](https://ci.appveyor.com/project/gnuradio/volk/branch/master)
+
+
+[](https://doi.org/10.5281/zenodo.3360942)
+
+
+
+# Welcome to VOLK!
+
+VOLK is a sub-project of GNU Radio. Please see http://libvolk.org for bug
+tracking, documentation, source code, and contact information about VOLK.
+See https://www.gnuradio.org/ for information about GNU Radio.
+
+VOLK is the Vector-Optimized Library of Kernels. It is a library that contains kernels of hand-written SIMD code for different mathematical operations. Since each SIMD architecture can be very different and no compiler has yet come along to handle vectorization properly or highly efficiently, VOLK approaches the problem differently.
+
+For each architecture or platform that a developer wishes to vectorize for, a new proto-kernel is added to VOLK. At runtime, VOLK selects the correct proto-kernel. In this way, users of VOLK call a single, platform/architecture-agnostic kernel for each operation, which allows us to write portable SIMD code.
+
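+A minimal sketch of this call pattern, using the `volk_32f_x2_add_32f` add kernel and the aligned-memory helpers from `volk/volk.h` (error handling omitted); the selection of the best proto-kernel for the running CPU happens inside the `volk_*` call:
+
+```cpp
+#include <volk/volk.h>
+
+int main(void)
+{
+    const unsigned int num_points = 1024;
+    const size_t alignment = volk_get_alignment();
+
+    // volk_malloc returns memory aligned for the widest SIMD unit on this machine.
+    float* a = (float*)volk_malloc(num_points * sizeof(float), alignment);
+    float* b = (float*)volk_malloc(num_points * sizeof(float), alignment);
+    float* c = (float*)volk_malloc(num_points * sizeof(float), alignment);
+
+    for (unsigned int i = 0; i < num_points; i++) {
+        a[i] = (float)i;
+        b[i] = 2.0f;
+    }
+
+    // The best available proto-kernel (SSE, AVX, NEON, generic, ...) is picked at runtime.
+    volk_32f_x2_add_32f(c, a, b, num_points);
+
+    volk_free(a);
+    volk_free(b);
+    volk_free(c);
+    return 0;
+}
+```
+
+Link against the installed library, e.g. with `-lvolk` or via the CMake package that `make install` provides.
+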
+Bleeding edge code can be found in our git repository at
+https://www.gnuradio.org/git/volk.git/.
+
+## How to use VOLK:
+
+For detailed instructions see http://libvolk.org/doxygen/using_volk.html
+
+See these steps for a quick build guide.
+
+### Building on most x86 (32-bit and 64-bit) platforms
+
+```bash
+$ mkdir build
+$ cd build
+$ cmake ..
+$ make
+$ make test
+$ sudo make install
+
+# You might want to explore "make -j[SOMEVALUE]" options for your multicore CPU.
+
+# Perform post-installation steps
+
+# Linux OS: Link and cache shared library
+$ sudo ldconfig
+
+# macOS/Windows: Update PATH environment variable to point to lib install location
+
+# volk_profile will profile your system so that the best kernel is used
+$ volk_profile
+```
+
+#### Missing submodule
+We use [cpu_features](https://github.com/google/cpu_features) as a git submodule to detect CPU features, e.g. AVX.
+There are two options to get the required code:
+```bash
+git clone --recursive https://github.com/gnuradio/volk.git
+```
+will automatically clone submodules as well.
+In case you missed that, you can just run:
+```bash
+git submodule update --init --recursive
+```
+that'll pull in missing submodules.
+
+
+### Building on Raspberry Pi and other ARM boards (32 bit)
+
+To build for these boards, you need to specify the correct CMake toolchain file to get the best performance.
+
+_Note: There is no need to add extra compiler options for 64-bit builds._
+
+* Raspberry Pi 4 `arm_cortex_a72_hardfp_native.cmake`
+* Raspberry Pi 3 `arm_cortex_a53_hardfp_native.cmake`
+
+```bash
+$ mkdir build && cd build
+$ cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/arm_cortex_a72_hardfp_native.cmake ..
+# make -j4 might be faster
+$ make
+$ make test
+$ sudo make install
+
+# volk_profile will profile your system so that the best kernel is used
+$ volk_profile
+```
+
+## Code of Conduct
+We want to make sure everyone feels welcome. Thus, we follow our [Code of Conduct](docs/CODE_OF_CONDUCT.md).
+
+## Contributing
+We are happy to accept contributions. [Please refer to our contributing guide for further details](docs/CONTRIBUTING.md).
+Also, make sure to read the [Developer's Certificate of Origin](docs/DCO.txt) and to sign every commit with `git commit -s`.
+
+## Releases and development
+We maintain a [CHANGELOG](docs/CHANGELOG.md) for every release. Please refer to this file for more detailed information.
+We follow semantic versioning as outlined in [our versioning guide](docs/versioning.md).
+
+## Supported platforms
+VOLK aims to be portable to as many platforms as possible. We can only run tests on some platforms.
+
+### Hardware architectures
+Currently VOLK aims to run with optimized kernels on x86 with SSE/AVX and ARM with NEON.
+Support for MIPS and RISC-V is experimental; some kernels are known not to work on these architectures.
+
+### OS / Distro
+We run tests on a variety of Ubuntu versions and aim to support as many current distros as possible.
+The same goal applies to different OSes. Although this rarely happens, VOLK might not work on obsolete distros, e.g. Ubuntu 12.04.
+
+### Compilers
+We want to make sure VOLK works with C/C++ standard-compliant compilers. As an open source project we naturally focus on open source compilers, most notably GCC and Clang.
+We also want VOLK to compile on a wide variety of other compilers, so we target AppleClang and MSVC as well. Mind that MSVC lacks `aligned_alloc` for aligned arrays; in that case we use MSVC-specific functions, and the resulting buffers cannot be released with plain `free()` (use `volk_free()` instead).
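+
+A minimal sketch of the portable pattern, assuming the `volk::vector` container alias provided by the installed `volk/volk_alloc.hh` header (it wraps `volk_malloc()`/`volk_free()` in a standard C++ allocator, so the matching deallocator is used on every platform):
+
+```cpp
+#include <volk/volk.h>
+#include <volk/volk_alloc.hh> // assumed to provide volk::vector<T>
+
+void aligned_buffers(unsigned int num_points)
+{
+    // C-style: pair volk_malloc() with volk_free(); on MSVC this maps to
+    // _aligned_malloc()/_aligned_free(), so plain free() must not be used.
+    float* raw = (float*)volk_malloc(num_points * sizeof(float), volk_get_alignment());
+    /* ... use raw ... */
+    volk_free(raw);
+
+    // C++ alternative: the allocator releases the storage correctly on every platform.
+    volk::vector<float> vec(num_points);
+    /* ... use vec.data() ... */
+} // vec's storage is returned through volk_free() here
+```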
+
+
+## License
+
+**VOLK 3.0 and later are licensed under the GNU Lesser General Public License v3.0 or later (LGPL-3.0-or-later).**
+
+### Previous VOLK version license
+
+Earlier versions of VOLK (before VOLK 3.0) were licensed under the GNU General Public License v3.0 or later (GPL-3.0-or-later).
+Since then, VOLK migrated to the LGPL-3.0-or-later.
+
+Being technical: there are 3 people left (out of 74) whom we have not been able to contact at all, accounting for 4 (out of 1092) commits, 13 (of 282822) additions, and 7 (of 170421) deletions. We have reviewed these commits: all are simple, trivial changes (e.g., a 1-line change) and most are no longer relevant (e.g., they touch a file that no longer exists). The VOLK maintainers (@michaelld and @jdemel) agree that this combination -- small numbers of changes per committer, simple changes per commit, commits that are no longer relevant -- means we can proceed with relicensing without the approval of these contributors. We will keep reaching out to them periodically, but we consider it unlikely that we will get a reply.
+We kindly request these contributors to re-submit their GPL-3.0-or-later licensed code contributions under the LGPL-3.0-or-later by adding their name, GitHub handle, and email address(es) used for VOLK commits
+to the file [AUTHORS_RESUBMITTING_UNDER_LGPL_LICENSE.md](docs/AUTHORS_RESUBMITTING_UNDER_LGPL_LICENSE.md).
+
+### Legal Matters
+
+Some files have been changed many times throughout the years. Copyright
+notices at the top of source files list which years changes have been
+made. For some files, changes have occurred in many consecutive years.
+These files may often have the format of a year range (e.g., "2006 - 2011"),
+which indicates that these files have had copyrightable changes made
+during each year in the range, inclusive.
\ No newline at end of file
--- /dev/null
+#
+# Copyright 2011-2013 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+
+########################################################################
+# Setup profiler
+########################################################################
+
+# POSIX_MEMALIGN: If we have to fall back to `posix_memalign`.
+if(HAVE_POSIX_MEMALIGN)
+ message(STATUS "Use `posix_memalign` for aligned malloc!")
+ add_definitions(-DHAVE_POSIX_MEMALIGN)
+endif(HAVE_POSIX_MEMALIGN)
+
+# MAKE volk_profile
+add_executable(volk_profile
+ ${CMAKE_CURRENT_SOURCE_DIR}/volk_profile.cc
+ ${PROJECT_SOURCE_DIR}/lib/qa_utils.cc
+ ${CMAKE_CURRENT_SOURCE_DIR}/volk_option_helpers.cc
+)
+
+if(MSVC)
+ target_include_directories(volk_profile
+ PRIVATE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/cmake/msvc>
+ )
+endif(MSVC)
+
+target_include_directories(volk_profile
+ PRIVATE $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>
+ PRIVATE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
+ PRIVATE $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/lib>
+ PRIVATE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/lib>
+ PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
+ PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+)
+
+add_definitions(-DHAS_STD_FILESYSTEM=1)
+if(${find_experimental})
+ add_definitions(-DHAS_STD_FILESYSTEM_EXPERIMENTAL=1)
+endif()
+target_link_libraries(volk_profile PRIVATE std::filesystem)
+
+if(ENABLE_STATIC_LIBS)
+ target_link_libraries(volk_profile PRIVATE volk_static)
+ set_target_properties(volk_profile PROPERTIES LINK_FLAGS "-static")
+else()
+ target_link_libraries(volk_profile PRIVATE volk)
+endif()
+
+install(
+ TARGETS volk_profile
+ DESTINATION bin
+ COMPONENT "volk"
+)
+
+# MAKE volk-config-info
+add_executable(volk-config-info volk-config-info.cc ${CMAKE_CURRENT_SOURCE_DIR}/volk_option_helpers.cc
+ )
+
+if(ENABLE_STATIC_LIBS)
+ target_link_libraries(volk-config-info volk_static)
+ set_target_properties(volk-config-info PROPERTIES LINK_FLAGS "-static")
+else()
+ target_link_libraries(volk-config-info volk)
+endif()
+
+install(
+ TARGETS volk-config-info
+ DESTINATION bin
+ COMPONENT "volk"
+)
+
+# Launch volk_profile if requested to do so
+if(ENABLE_PROFILING)
+ if(DEFINED VOLK_CONFIGPATH)
+ set( VOLK_CONFIG_ARG "-p${VOLK_CONFIGPATH}" )
+ set( VOLK_CONFIG "${VOLK_CONFIGPATH}/volk_config" )
+ endif()
+
+ add_custom_command(OUTPUT ${VOLK_CONFIG}
+ COMMAND volk_profile "${VOLK_CONFIG_ARG}"
+ DEPENDS volk_profile
+ COMMENT "Launching profiler, this may take a few minutes..."
+ )
+ add_custom_target(volk-profile-run ALL DEPENDS ${VOLK_CONFIG})
+
+endif()
--- /dev/null
+#!/usr/bin/env python3
+# Copyright 2019 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+# This script is used to compare the generic kernels to the highest performing kernel, for each operation
+# Run:
+# ./volk_profile -j volk_results.json
+# Then run this script under python3
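+#
+# The resulting bar chart shows, for each kernel, the runtime of the fastest
+# non-generic implementation divided by the runtime of the generic one;
+# bars below the dashed line at 1.0 mean the SIMD kernel is faster.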
+
+import matplotlib.pyplot as plt
+import numpy as np
+import json
+
+filename = 'volk_results.json'
+
+operations = []
+metrics = []
+with open(filename) as json_file:
+ data = json.load(json_file)
+ for test in data['volk_tests']:
+        if ('generic' in test['results']) or ('u_generic' in test['results']): # some don't have a generic kernel
+ operations.append(test['name'][5:]) # remove volk_ prefix that they all have
+ extension_performance = []
+ for key, val in test['results'].items():
+ if key not in ['generic', 'u_generic']: # exclude generic results, when trying to find fastest time
+ extension_performance.append(val['time'])
+ try:
+ generic_time = test['results']['generic']['time']
+            except KeyError:
+ generic_time = test['results']['u_generic']['time']
+ metrics.append(extension_performance[np.argmin(extension_performance)]/generic_time)
+
+
+plt.bar(np.arange(len(metrics)), metrics)
+plt.hlines(1.0, -1, len(metrics), colors='r', linestyles='dashed')
+plt.axis([-1, len(metrics), 0, 2])
+plt.xticks(np.arange(len(operations)), operations, rotation=90)
+plt.ylabel('Time taken of fastest kernel relative to generic kernel')
+plt.tight_layout()
+plt.show()
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2013, 2016, 2018 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <volk/constants.h> // for volk_available_machines, volk_c_com...
+#include <iostream> // for operator<<, endl, cout, ostream
+#include <string> // for string
+
+#include "volk/volk.h" // for volk_get_alignment, volk_get_machine
+#include "volk_option_helpers.h" // for option_list, option_t
+
+void print_alignment()
+{
+ std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl;
+}
+
+void print_malloc()
+{
+    // We don't want to duplicate the volk_malloc code, so mirror its #if/#elif
+    // structure here and report which implementation is in use.
+ std::cout << "Used malloc implementation: ";
+#if HAVE_POSIX_MEMALIGN
+ std::cout << "posix_memalign" << std::endl;
+#elif defined(_MSC_VER)
+ std::cout << "_aligned_malloc" << std::endl;
+#else
+ std::cout << "C11 aligned_alloc" << std::endl;
+#endif
+}
+
+
+int main(int argc, char** argv)
+{
+
+ option_list our_options("volk-config-info");
+ our_options.add(
+ option_t("prefix", "", "print the VOLK installation prefix", volk_prefix()));
+ our_options.add(
+ option_t("cc", "", "print the VOLK C compiler version", volk_c_compiler()));
+ our_options.add(
+ option_t("cflags", "", "print the VOLK CFLAGS", volk_compiler_flags()));
+ our_options.add(option_t(
+ "all-machines", "", "print VOLK machines built", volk_available_machines()));
+ our_options.add(option_t("avail-machines",
+ "",
+ "print VOLK machines on the current "
+ "platform",
+ volk_list_machines));
+ our_options.add(option_t("machine",
+ "",
+ "print the current VOLK machine that will be used",
+ volk_get_machine()));
+ our_options.add(
+ option_t("alignment", "", "print the memory alignment", print_alignment));
+ our_options.add(option_t("malloc",
+ "",
+ "print the malloc implementation used in volk_malloc",
+ print_malloc));
+ our_options.add(option_t("version", "v", "print the VOLK version", volk_version()));
+
+ our_options.parse(argc, argv);
+
+ return 0;
+}
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2018-2020 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+
+#include "volk_option_helpers.h"
+
+#include <limits.h> // IWYU pragma: keep
+#include <cstdlib> // IWYU pragma: keep
+#include <cstring> // IWYU pragma: keep
+#include <exception> // for exception
+#include <iostream> // for operator<<, endl, basic_ostream, cout, ostream
+#include <utility> // for pair
+
+/*
+ * Option type
+ */
+option_t::option_t(std::string t_longform,
+ std::string t_shortform,
+ std::string t_msg,
+ void (*t_callback)())
+ : longform("--" + t_longform),
+ shortform("-" + t_shortform),
+ msg(t_msg),
+ callback(t_callback)
+{
+ option_type = VOID_CALLBACK;
+}
+
+option_t::option_t(std::string t_longform,
+ std::string t_shortform,
+ std::string t_msg,
+ void (*t_callback)(int))
+ : longform("--" + t_longform),
+ shortform("-" + t_shortform),
+ msg(t_msg),
+ callback((void (*)())t_callback)
+{
+ option_type = INT_CALLBACK;
+}
+
+option_t::option_t(std::string t_longform,
+ std::string t_shortform,
+ std::string t_msg,
+ void (*t_callback)(float))
+ : longform("--" + t_longform),
+ shortform("-" + t_shortform),
+ msg(t_msg),
+ callback((void (*)())t_callback)
+{
+ option_type = FLOAT_CALLBACK;
+}
+
+option_t::option_t(std::string t_longform,
+ std::string t_shortform,
+ std::string t_msg,
+ void (*t_callback)(bool))
+ : longform("--" + t_longform),
+ shortform("-" + t_shortform),
+ msg(t_msg),
+ callback((void (*)())t_callback)
+{
+ option_type = BOOL_CALLBACK;
+}
+
+option_t::option_t(std::string t_longform,
+ std::string t_shortform,
+ std::string t_msg,
+ void (*t_callback)(std::string))
+ : longform("--" + t_longform),
+ shortform("-" + t_shortform),
+ msg(t_msg),
+ callback((void (*)())t_callback)
+{
+ option_type = STRING_CALLBACK;
+}
+
+option_t::option_t(std::string t_longform,
+ std::string t_shortform,
+ std::string t_msg,
+ std::string t_printval)
+ : longform("--" + t_longform),
+ shortform("-" + t_shortform),
+ msg(t_msg),
+ printval(t_printval)
+{
+ option_type = STRING;
+}
+
+
+/*
+ * Option List
+ */
+
+option_list::option_list(std::string program_name) : d_program_name(program_name)
+{
+ d_internal_list = std::vector<option_t>();
+}
+
+
+void option_list::add(option_t opt) { d_internal_list.push_back(opt); }
+
+void option_list::parse(int argc, char** argv)
+{
+ for (int arg_number = 0; arg_number < argc; ++arg_number) {
+ for (std::vector<option_t>::iterator this_option = d_internal_list.begin();
+ this_option != d_internal_list.end();
+ this_option++) {
+ int int_val = INT_MIN;
+ if (this_option->longform == std::string(argv[arg_number]) ||
+ this_option->shortform == std::string(argv[arg_number])) {
+
+ if (d_present_options.count(this_option->longform) == 0) {
+ d_present_options.insert(
+ std::pair<std::string, int>(this_option->longform, 1));
+ } else {
+ d_present_options[this_option->longform] += 1;
+ }
+ switch (this_option->option_type) {
+ case VOID_CALLBACK:
+ this_option->callback();
+ break;
+ case INT_CALLBACK:
+ try {
+ int_val = atoi(argv[++arg_number]);
+ ((void (*)(int))this_option->callback)(int_val);
+ } catch (std::exception& exc) {
+ std::cout << "An int option can only receive a number"
+ << std::endl;
+ throw std::exception();
+ };
+ break;
+ case FLOAT_CALLBACK:
+ try {
+ double double_val = atof(argv[++arg_number]);
+ ((void (*)(float))this_option->callback)(double_val);
+ } catch (std::exception& exc) {
+ std::cout << "A float option can only receive a number"
+ << std::endl;
+ throw std::exception();
+ };
+ break;
+ case BOOL_CALLBACK:
+ try {
+ if (arg_number == (argc - 1)) { // this is the last arg
+ int_val = 1;
+ } else { // sneak a look at the next arg since it's present
+ char* next_arg = argv[arg_number + 1];
+ if ((strncmp(next_arg, "-", 1) == 0) ||
+ (strncmp(next_arg, "--", 2) == 0)) {
+ // the next arg is actually an arg, the bool is just
+ // present, set to true
+ int_val = 1;
+ } else if (strncmp(next_arg, "true", 4) == 0) {
+ int_val = 1;
+ } else if (strncmp(next_arg, "false", 5) == 0) {
+ int_val = 0;
+ } else {
+ // we got a number or a string.
+ // convert it to a number and depend on the catch to
+ // report an error condition
+ int_val = (bool)atoi(argv[++arg_number]);
+ }
+ }
+ } catch (std::exception& e) {
+ int_val = INT_MIN;
+ };
+ if (int_val == INT_MIN) {
+ std::cout
+ << "option: '" << argv[arg_number - 1]
+ << "' -> received an unknown value. Boolean "
+ "options should receive one of '0', '1', 'true', 'false'."
+ << std::endl;
+ throw std::exception();
+ } else if (int_val) {
+ ((void (*)(bool))this_option->callback)(int_val);
+ }
+ break;
+ case STRING_CALLBACK:
+ try {
+ ((void (*)(std::string))this_option->callback)(
+ argv[++arg_number]);
+ } catch (std::exception& exc) {
+ throw std::exception();
+ };
+ break;
+ case STRING:
+ std::cout << this_option->printval << std::endl;
+ break;
+ }
+ }
+ }
+ if (std::string("--help") == std::string(argv[arg_number]) ||
+ std::string("-h") == std::string(argv[arg_number])) {
+ d_present_options.insert(std::pair<std::string, int>("--help", 1));
+ help();
+ }
+ }
+}
+
+bool option_list::present(std::string option_name)
+{
+ if (d_present_options.count("--" + option_name)) {
+ return true;
+ } else {
+ return false;
+ }
+}
+
+void option_list::help()
+{
+ std::cout << d_program_name << std::endl;
+ std::cout << " -h [ --help ] \t\tdisplay this help message" << std::endl;
+ for (std::vector<option_t>::iterator this_option = d_internal_list.begin();
+ this_option != d_internal_list.end();
+ this_option++) {
+ std::string help_line(" ");
+ if (this_option->shortform == "-") {
+ help_line += this_option->longform + " ";
+ } else {
+ help_line += this_option->shortform + " [ " + this_option->longform + " ]";
+ }
+
+ // Intentional fall-through: pad with tabs so the help messages line up.
+ switch (help_line.size() / 8) {
+ case 0:
+ help_line += "\t";
+ case 1:
+ help_line += "\t";
+ case 2:
+ help_line += "\t";
+ case 3:
+ help_line += "\t";
+ }
+ help_line += this_option->msg;
+ std::cout << help_line << std::endl;
+ }
+}
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2018-2020 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef VOLK_VOLK_OPTION_HELPERS_H
+#define VOLK_VOLK_OPTION_HELPERS_H
+
+#include <limits.h>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+
+typedef enum {
+ VOID_CALLBACK,
+ INT_CALLBACK,
+ BOOL_CALLBACK,
+ STRING_CALLBACK,
+ FLOAT_CALLBACK,
+ STRING,
+} VOLK_OPTYPE;
+
+class option_t
+{
+public:
+ option_t(std::string t_longform,
+ std::string t_shortform,
+ std::string t_msg,
+ void (*t_callback)());
+ option_t(std::string t_longform,
+ std::string t_shortform,
+ std::string t_msg,
+ void (*t_callback)(int));
+ option_t(std::string t_longform,
+ std::string t_shortform,
+ std::string t_msg,
+ void (*t_callback)(float));
+ option_t(std::string t_longform,
+ std::string t_shortform,
+ std::string t_msg,
+ void (*t_callback)(bool));
+ option_t(std::string t_longform,
+ std::string t_shortform,
+ std::string t_msg,
+ void (*t_callback)(std::string));
+ option_t(std::string t_longform,
+ std::string t_shortform,
+ std::string t_msg,
+ std::string t_printval);
+
+ std::string longform;
+ std::string shortform;
+ std::string msg;
+ VOLK_OPTYPE option_type;
+ std::string printval;
+ void (*callback)();
+};
+
+class option_list
+{
+public:
+ option_list(std::string program_name);
+ bool present(std::string option_name);
+
+ void add(option_t opt);
+
+ void parse(int argc, char** argv);
+
+ void help();
+
+private:
+ std::string d_program_name;
+ std::vector<option_t> d_internal_list;
+ std::map<std::string, int> d_present_options;
+};
+
+
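+// Usage sketch (illustrative; mirrors volk-config-info.cc, all names below are
+// examples only):
+//
+//     void print_version() { std::cout << "x.y.z" << std::endl; }
+//
+//     option_list options("my-tool");
+//     options.add(option_t("version", "v", "print the version", print_version));
+//     options.parse(argc, argv);
+//     if (options.present("version")) { /* ... */ }
+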
+#endif // VOLK_VOLK_OPTION_HELPERS_H
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012-2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#if HAS_STD_FILESYSTEM_EXPERIMENTAL
+#include <experimental/filesystem>
+#else
+#include <filesystem>
+#endif
+#include <stddef.h> // for size_t
+#include <sys/stat.h> // for stat
+#include <volk/volk_prefs.h> // for volk_get_config_path
+#include <fstream> // IWYU pragma: keep
+#include <iostream> // for operator<<, basic_ostream
+#include <map> // for map, map<>::iterator
+#include <utility> // for pair
+#include <vector> // for vector, vector<>::const_...
+
+#include "kernel_tests.h" // for init_test_list
+#include "qa_utils.h" // for volk_test_results_t, vol...
+#include "volk/volk_complex.h" // for lv_32fc_t
+#include "volk_option_helpers.h" // for option_list, option_t
+#include "volk_profile.h"
+
+#if HAS_STD_FILESYSTEM_EXPERIMENTAL
+namespace fs = std::experimental::filesystem;
+#else
+namespace fs = std::filesystem;
+#endif
+
+volk_test_params_t test_params(1e-6f, 327.f, 131071, 1987, false, "");
+
+void set_benchmark(bool val) { test_params.set_benchmark(val); }
+void set_tolerance(float val) { test_params.set_tol(val); }
+void set_vlen(int val) { test_params.set_vlen((unsigned int)val); }
+void set_iter(int val) { test_params.set_iter((unsigned int)val); }
+void set_substr(std::string val) { test_params.set_regex(val); }
+bool update_mode = false;
+void set_update(bool val) { update_mode = val; }
+bool dry_run = false;
+void set_dryrun(bool val) { dry_run = val; }
+std::string json_filename("");
+void set_json(std::string val) { json_filename = val; }
+std::string volk_config_path("");
+void set_volk_config(std::string val) { volk_config_path = val; }
+
+int main(int argc, char* argv[])
+{
+
+ option_list profile_options("volk_profile");
+ profile_options.add(
+ option_t("benchmark", "b", "Run all kernels (benchmark mode)", set_benchmark));
+ profile_options.add(
+ option_t("tol", "t", "Set the default tolerance for all tests", set_tolerance));
+ profile_options.add(
+ option_t("vlen", "v", "Set the default vector length for tests", set_vlen));
+ profile_options.add(option_t(
+ "iter", "i", "Set the default number of test iterations per kernel", set_iter));
+ profile_options.add(
+ option_t("tests-substr", "R", "Run tests matching substring", set_substr));
+ profile_options.add(
+ option_t("update", "u", "Run only kernels missing from config", set_update));
+ profile_options.add(option_t("dry-run",
+ "n",
+ "Dry run. Respect other options, but don't write to file",
+ set_dryrun));
+ profile_options.add(option_t(
+ "json", "j", "Write results to JSON file named as argument value", set_json));
+ profile_options.add(
+ option_t("path", "p", "Specify the volk_config path", set_volk_config));
+ profile_options.parse(argc, argv);
+
+ if (profile_options.present("help")) {
+ return 0;
+ }
+
+ if (dry_run) {
+ std::cout << "Warning: this IS a dry-run. Config will not be written!"
+ << std::endl;
+ }
+
+ // Set up output destinations based on the provided options
+ std::ofstream json_file;
+ std::string config_file;
+
+ if (json_filename != "") {
+ json_file.open(json_filename.c_str());
+ }
+
+ if (volk_config_path != "") {
+ config_file = volk_config_path + "/volk_config";
+ }
+
+ // Run tests
+ std::vector<volk_test_results_t> results;
+ if (update_mode) {
+ if (config_file != "")
+ read_results(&results, config_file);
+ else
+ read_results(&results);
+ }
+
+ // Initialize the list of tests
+ std::vector<volk_test_case_t> test_cases = init_test_list(test_params);
+
+ // Iterate through list of tests running each one
+ std::string substr_to_match(test_params.kernel_regex());
+ for (unsigned int ii = 0; ii < test_cases.size(); ++ii) {
+ bool regex_match = true;
+
+ volk_test_case_t test_case = test_cases[ii];
+ // run the test only if the kernel name contains the requested substring
+ std::string test_case_name = test_case.name();
+ if (test_case_name.find(substr_to_match) == std::string::npos) {
+ regex_match = false;
+ }
+
+ // if we are in update mode check if we've already got results
+ // if we have any, then no need to test that kernel
+ bool update = true;
+ if (update_mode) {
+ for (unsigned int jj = 0; jj < results.size(); ++jj) {
+ if (results[jj].name == test_case.name() ||
+ results[jj].name == test_case.puppet_master_name()) {
+ update = false;
+ break;
+ }
+ }
+ }
+
+ if (regex_match && update) {
+ try {
+ run_volk_tests(test_case.desc(),
+ test_case.kernel_ptr(),
+ test_case.name(),
+ test_case.test_parameters(),
+ &results,
+ test_case.puppet_master_name());
+ } catch (std::string& error) {
+ std::cerr << "Caught Exception in 'run_volk_tests': " << error
+ << std::endl;
+ }
+ }
+ }
+
+
+ // Output results according to provided options
+ if (json_filename != "") {
+ write_json(json_file, results);
+ json_file.close();
+ }
+
+ if (!dry_run) {
+ if (config_file != "")
+ write_results(&results, false, config_file);
+ else
+ write_results(&results, false);
+ } else {
+ std::cout << "Warning: this was a dry-run. Config not generated" << std::endl;
+ }
+ return 0;
+}
+
+void read_results(std::vector<volk_test_results_t>* results)
+{
+ char path[1024];
+ volk_get_config_path(path, true);
+ if (path[0] == 0) {
+ std::cout << "No prior test results found ..." << std::endl;
+ return;
+ }
+
+ read_results(results, std::string(path));
+}
+
+void read_results(std::vector<volk_test_results_t>* results, std::string path)
+{
+ struct stat buffer;
+ bool config_status = (stat(path.c_str(), &buffer) == 0);
+
+ if (config_status) {
+ // a config exists and we are reading results from it
+ std::ifstream config(path.c_str());
+ char config_line[256];
+ while (config.getline(config_line, 255)) {
+ // tokenize the input line by kernel_name unaligned aligned
+ // then push back in the results vector with fields filled in
+
+ std::vector<std::string> single_kernel_result;
+ std::string config_str(config_line);
+ std::size_t str_size = config_str.size();
+ std::size_t found = config_str.find(' ');
+
+ // Split line by spaces
+ while (found && found < str_size) {
+ found = config_str.find(' ');
+ // kernel names MUST be less than 128 chars, which is
+ // a length restricted by volk/volk_prefs.c
+ // on the last token in the parsed string we won't find a space
+ // so make sure we copy at most 128 chars.
+ if (found > 127) {
+ found = 127;
+ }
+ str_size = config_str.size();
+ char line_buffer[128] = { '\0' };
+ config_str.copy(line_buffer, found + 1, 0);
+ line_buffer[found] = '\0';
+ single_kernel_result.push_back(std::string(line_buffer));
+ config_str.erase(0, found + 1);
+ }
+
+ if (single_kernel_result.size() == 3) {
+ volk_test_results_t kernel_result;
+ kernel_result.name = std::string(single_kernel_result[0]);
+ kernel_result.config_name = std::string(single_kernel_result[0]);
+ kernel_result.best_arch_u = std::string(single_kernel_result[1]);
+ kernel_result.best_arch_a = std::string(single_kernel_result[2]);
+ results->push_back(kernel_result);
+ }
+ }
+ }
+}
+
+void write_results(const std::vector<volk_test_results_t>* results, bool update_result)
+{
+ char path[1024];
+ volk_get_config_path(path, false);
+ if (path[0] == 0) {
+ std::cout << "Aborting 'No config save path found' ..." << std::endl;
+ return;
+ }
+
+ write_results(results, update_result, std::string(path));
+}
+
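+// The config file written below holds one kernel per line: the kernel name
+// followed by the best aligned and best unaligned implementation, e.g.
+// (illustrative) "volk_32f_x2_add_32f a_avx u_avx".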
+void write_results(const std::vector<volk_test_results_t>* results,
+ bool update_result,
+ const std::string path)
+{
+ // Make sure the directory that will hold the config file exists.
+ const fs::path config_path(path);
+ if (!fs::exists(config_path.parent_path())) {
+ std::cout << "Creating " << config_path.parent_path() << "..." << std::endl;
+ fs::create_directories(config_path.parent_path());
+ }
+
+ std::ofstream config;
+ if (update_result) {
+ std::cout << "Updating " << path << "..." << std::endl;
+ config.open(path.c_str(), std::ofstream::app);
+ if (!config.is_open()) { // either we don't have write access or we don't have the
+ // dir yet
+ std::cout << "Error opening file " << path << std::endl;
+ }
+ } else {
+ std::cout << "Writing " << path << "..." << std::endl;
+ config.open(path.c_str());
+ if (!config.is_open()) { // either we don't have write access or we don't have the
+ // dir yet
+ std::cout << "Error opening file " << path << std::endl;
+ }
+
+ config << "\
+#this file is generated by volk_profile.\n\
+#the function name is followed by the preferred architecture.\n\
+";
+ }
+
+ std::vector<volk_test_results_t>::const_iterator profile_results;
+ for (profile_results = results->begin(); profile_results != results->end();
+ ++profile_results) {
+ config << profile_results->config_name << " " << profile_results->best_arch_a
+ << " " << profile_results->best_arch_u << std::endl;
+ }
+ config.close();
+}
+
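+// Illustrative shape of the JSON emitted below (names and values are examples
+// only; vlen/iter match the defaults in test_params above):
+//
+// {
+//   "volk_tests": [
+//     {
+//       "name": "volk_32f_x2_add_32f",
+//       "vlen": 131071,
+//       "iter": 1987,
+//       "best_arch_a": "a_avx",
+//       "best_arch_u": "u_avx",
+//       "results": {
+//         "generic": { "name": "generic", "time": 1.23, "units": "ms" }
+//       }
+//     }
+//   ]
+// }
+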
+void write_json(std::ofstream& json_file, std::vector<volk_test_results_t> results)
+{
+ json_file << "{" << std::endl;
+ json_file << " \"volk_tests\": [" << std::endl;
+ size_t len = results.size();
+ size_t i = 0;
+ std::vector<volk_test_results_t>::iterator result;
+ for (result = results.begin(); result != results.end(); ++result) {
+ json_file << " {" << std::endl;
+ json_file << " \"name\": \"" << result->name << "\"," << std::endl;
+ json_file << " \"vlen\": " << (int)(result->vlen) << "," << std::endl;
+ json_file << " \"iter\": " << result->iter << "," << std::endl;
+ json_file << " \"best_arch_a\": \"" << result->best_arch_a << "\","
+ << std::endl;
+ json_file << " \"best_arch_u\": \"" << result->best_arch_u << "\","
+ << std::endl;
+ json_file << " \"results\": {" << std::endl;
+ size_t results_len = result->results.size();
+ size_t ri = 0;
+
+ std::map<std::string, volk_test_time_t>::iterator kernel_time_pair;
+ for (kernel_time_pair = result->results.begin();
+ kernel_time_pair != result->results.end();
+ ++kernel_time_pair) {
+ volk_test_time_t time = kernel_time_pair->second;
+ json_file << " \"" << time.name << "\": {" << std::endl;
+ json_file << " \"name\": \"" << time.name << "\"," << std::endl;
+ json_file << " \"time\": " << time.time << "," << std::endl;
+ json_file << " \"units\": \"" << time.units << "\"" << std::endl;
+ json_file << " }";
+ if (ri + 1 != results_len) {
+ json_file << ",";
+ }
+ json_file << std::endl;
+ ri++;
+ }
+ json_file << " }" << std::endl;
+ json_file << " }";
+ if (i + 1 != len) {
+ json_file << ",";
+ }
+ json_file << std::endl;
+ i++;
+ }
+ json_file << " ]" << std::endl;
+ json_file << "}" << std::endl;
+}
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012-2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef VOLK_VOLK_PROFILE_H
+#define VOLK_VOLK_PROFILE_H
+
+#include <stdbool.h> // for bool
+#include <iosfwd> // for ofstream
+#include <string> // for string
+#include <vector> // for vector
+
+class volk_test_results_t;
+
+void read_results(std::vector<volk_test_results_t>* results);
+void read_results(std::vector<volk_test_results_t>* results, std::string path);
+void write_results(const std::vector<volk_test_results_t>* results, bool update_result);
+void write_results(const std::vector<volk_test_results_t>* results,
+ bool update_result,
+ const std::string path);
+void write_json(std::ofstream& json_file, std::vector<volk_test_results_t> results);
+
+#endif // VOLK_VOLK_PROFILE_H
--- /dev/null
+#
+# Copyright 2016, 2017, 2019, 2020, 2022 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+clone_depth: 1
+image: Visual Studio 2019
+cache:
+ - packages -> appveyor.yml
+environment:
+ matrix:
+ - job_name: VS 16 2019 / python 3.8
+ APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2019
+ CMAKE_GENERATOR: Visual Studio 16 2019
+ PYTHON: "C:\\Python38-x64"
+
+install:
+ # Prepend the selected Python to the PATH of this build
+ - SET PATH=%PYTHON%;%PYTHON%\Scripts;%PATH%
+ # Display version information about selected python and pip
+ - python --version
+ - python -c "import sys, platform, struct;
+ print(sys.platform, platform.machine(), struct.calcsize('P')*8)"
+ - pip --version
+ - pip install mako
+before_build:
+ - git submodule update --init --recursive
+ - cmake -G "%CMAKE_GENERATOR%" -A x64 -DCMAKE_BUILD_TYPE:STRING=Release -DENABLE_ORC:BOOL=OFF -DENABLE_TESTING:BOOL=ON .
+build_script:
+ - cmake --build . --config Release --target INSTALL
+test_script:
+ - ctest -V --output-on-failure -C Release
+after_test:
+ - cd "c:\Program Files"
+ - 7z a "c:\libvolk-x64-%VC_VERSION%.zip" volk
+ - mkdir dlls
+ - cd dlls
+ - 7z a "c:\libvolk-x64-deps-%VC_VERSION%.zip" *
+ - appveyor PushArtifact c:\libvolk-x64-%VC_VERSION%.zip
+ - appveyor PushArtifact c:\libvolk-x64-deps-%VC_VERSION%.zip
--- /dev/null
+# Copyright 2014, 2018 Free Software Foundation, Inc.
+#
+# This file is part of VOLK.
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+# CMAKE_PARSE_ARGUMENTS(<prefix> <options> <one_value_keywords> <multi_value_keywords> args...)
+#
+# CMAKE_PARSE_ARGUMENTS() is intended to be used in macros or functions for
+# parsing the arguments given to that macro or function.
+# It processes the arguments and defines a set of variables which hold the
+# values of the respective options.
+#
+# The <options> argument contains all options for the respective macro,
+# i.e. keywords which can be used when calling the macro without any value
+# following, like e.g. the OPTIONAL keyword of the install() command.
+#
+# The <one_value_keywords> argument contains all keywords for this macro
+# which are followed by one value, like e.g. DESTINATION keyword of the
+# install() command.
+#
+# The <multi_value_keywords> argument contains all keywords for this macro
+# which can be followed by more than one value, like e.g. the TARGETS or
+# FILES keywords of the install() command.
+#
+# When done, CMAKE_PARSE_ARGUMENTS() will have defined for each of the
+# keywords listed in <options>, <one_value_keywords> and
+# <multi_value_keywords> a variable composed of the given <prefix>
+# followed by "_" and the name of the respective keyword.
+# These variables will then hold the respective value from the argument list.
+# For the <options> keywords this will be TRUE or FALSE.
+#
+# All remaining arguments are collected in a variable
+# <prefix>_UNPARSED_ARGUMENTS, this can be checked afterwards to see whether
+# your macro was called with unrecognized parameters.
+#
+# As an example, here is a my_install() macro, which takes similar arguments to the
+# real install() command:
+#
+# function(MY_INSTALL)
+# set(options OPTIONAL FAST)
+# set(oneValueArgs DESTINATION RENAME)
+# set(multiValueArgs TARGETS CONFIGURATIONS)
+# cmake_parse_arguments(MY_INSTALL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} )
+# ...
+#
+# Assume my_install() has been called like this:
+# my_install(TARGETS foo bar DESTINATION bin OPTIONAL blub)
+#
+# After the cmake_parse_arguments() call the macro will have set the following
+# variables:
+# MY_INSTALL_OPTIONAL = TRUE
+# MY_INSTALL_FAST = FALSE (this option was not used when calling my_install())
+# MY_INSTALL_DESTINATION = "bin"
+# MY_INSTALL_RENAME = "" (was not used)
+# MY_INSTALL_TARGETS = "foo;bar"
+# MY_INSTALL_CONFIGURATIONS = "" (was not used)
+# MY_INSTALL_UNPARSED_ARGUMENTS = "blub" (no value was expected after "OPTIONAL")
+#
+# You can then continue and process these variables.
+#
+# Keywords terminate lists of values, e.g. if directly after a one_value_keyword
+# another recognized keyword follows, this is interpreted as the beginning of
+# the new option.
+# E.g. my_install(TARGETS foo DESTINATION OPTIONAL) would NOT set
+# MY_INSTALL_DESTINATION to "OPTIONAL"; instead MY_INSTALL_DESTINATION would
+# be empty and MY_INSTALL_OPTIONAL would be set to TRUE.
+
+#=============================================================================
+# Copyright 2010 Alexander Neundorf <neundorf@kde.org>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+# License text for the above reference.)
+
+
+if(__CMAKE_PARSE_ARGUMENTS_INCLUDED)
+ return()
+endif()
+set(__CMAKE_PARSE_ARGUMENTS_INCLUDED TRUE)
+
+
+function(CMAKE_PARSE_ARGUMENTS prefix _optionNames _singleArgNames _multiArgNames)
+ # first set all result variables to empty/FALSE
+ foreach(arg_name ${_singleArgNames} ${_multiArgNames})
+ set(${prefix}_${arg_name})
+ endforeach(arg_name)
+
+ foreach(option ${_optionNames})
+ set(${prefix}_${option} FALSE)
+ endforeach(option)
+
+ set(${prefix}_UNPARSED_ARGUMENTS)
+
+ set(insideValues FALSE)
+ set(currentArgName)
+
+ # now iterate over all arguments and fill the result variables
+ foreach(currentArg ${ARGN})
+ # if the current argument is a recognized keyword, it terminates the value
+ # list of the keyword that preceded it
+ list(FIND _optionNames "${currentArg}" optionIndex)
+ list(FIND _singleArgNames "${currentArg}" singleArgIndex)
+ list(FIND _multiArgNames "${currentArg}" multiArgIndex)
+
+ if(${optionIndex} EQUAL -1 AND ${singleArgIndex} EQUAL -1 AND ${multiArgIndex} EQUAL -1)
+ if(insideValues)
+ if("${insideValues}" STREQUAL "SINGLE")
+ set(${prefix}_${currentArgName} ${currentArg})
+ set(insideValues FALSE)
+ elseif("${insideValues}" STREQUAL "MULTI")
+ list(APPEND ${prefix}_${currentArgName} ${currentArg})
+ endif()
+ else(insideValues)
+ list(APPEND ${prefix}_UNPARSED_ARGUMENTS ${currentArg})
+ endif(insideValues)
+ else()
+ if(NOT ${optionIndex} EQUAL -1)
+ set(${prefix}_${currentArg} TRUE)
+ set(insideValues FALSE)
+ elseif(NOT ${singleArgIndex} EQUAL -1)
+ set(currentArgName ${currentArg})
+ set(${prefix}_${currentArgName})
+ set(insideValues "SINGLE")
+ elseif(NOT ${multiArgIndex} EQUAL -1)
+ set(currentArgName ${currentArg})
+ set(${prefix}_${currentArgName})
+ set(insideValues "MULTI")
+ endif()
+ endif()
+
+ endforeach(currentArg)
+
+ # propagate the result variables to the caller:
+ foreach(arg_name ${_singleArgNames} ${_multiArgNames} ${_optionNames})
+ set(${prefix}_${arg_name} ${${prefix}_${arg_name}} PARENT_SCOPE)
+ endforeach(arg_name)
+ set(${prefix}_UNPARSED_ARGUMENTS ${${prefix}_UNPARSED_ARGUMENTS} PARENT_SCOPE)
+
+endfunction(CMAKE_PARSE_ARGUMENTS _options _singleArgs _multiArgs)
--- /dev/null
+# Copyright 2019 Free Software Foundation, Inc.
+#
+# This file is part of VOLK.
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+# Original code from https://github.com/vector-of-bool/CMakeCM and modified
+# by C. Fernandez. The original code is distributed under the OSI-approved
+# BSD 3-Clause License. See https://cmake.org/licensing for details.
+
+#[=======================================================================[.rst:
+
+FindFILESYSTEM
+##############
+
+This module supports the C++17 standard library's filesystem utilities. Use the
+:imp-target:`std::filesystem` imported target to link against the filesystem library.
+
+Options
+*******
+
+The ``COMPONENTS`` argument to this module supports the following values:
+
+.. find-component:: Experimental
+ :name: fs.Experimental
+
+ Allows the module to find the "experimental" Filesystem TS version of the
+ Filesystem library. This is the library that should be used with the
+ ``std::experimental::filesystem`` namespace.
+
+.. find-component:: Final
+ :name: fs.Final
+
+ Finds the final C++17 standard version of the filesystem library.
+
+If no components are provided, behaves as if the
+:find-component:`fs.Final` component was specified.
+
+If both :find-component:`fs.Experimental` and :find-component:`fs.Final` are
+provided, first looks for ``Final``, and falls back to ``Experimental`` in case
+of failure. If ``Final`` is found, :imp-target:`std::filesystem` and all
+:ref:`variables <fs.variables>` will refer to the ``Final`` version.
+
+
+Imported Targets
+****************
+
+.. imp-target:: std::filesystem
+
+ The ``std::filesystem`` imported target is defined when any requested
+ version of the C++ filesystem library has been found, whether it is
+ *Experimental* or *Final*.
+
+ If no version of the filesystem library is available, this target will not
+ be defined.
+
+ .. note::
+ This target has ``cxx_std_17`` as an ``INTERFACE``
+ :ref:`compile language standard feature <req-lang-standards>`. Linking
+ to this target will automatically enable C++17 if no later standard
+ version is already required on the linking target.
+
+
+.. _fs.variables:
+
+Variables
+*********
+
+.. variable:: CXX_FILESYSTEM_IS_EXPERIMENTAL
+
+ Set to ``TRUE`` when the :find-component:`fs.Experimental` version of C++
+ filesystem library was found, otherwise ``FALSE``.
+
+.. variable:: CXX_FILESYSTEM_HAVE_FS
+
+ Set to ``TRUE`` when a filesystem header was found.
+
+.. variable:: CXX_FILESYSTEM_HEADER
+
+ Set to either ``filesystem`` or ``experimental/filesystem`` depending on
+ whether :find-component:`fs.Final` or :find-component:`fs.Experimental` was
+ found.
+
+.. variable:: CXX_FILESYSTEM_NAMESPACE
+
+ Set to either ``std::filesystem`` or ``std::experimental::filesystem``
+ depending on whether :find-component:`fs.Final` or
+ :find-component:`fs.Experimental` was found.
+
+
+Examples
+********
+
+Using `find_package(FILESYSTEM)` with no component arguments:
+
+.. code-block:: cmake
+
+ find_package(FILESYSTEM REQUIRED)
+
+ add_executable(my-program main.cpp)
+ target_link_libraries(my-program PRIVATE std::filesystem)
+
+
+#]=======================================================================]
+
+
+if(TARGET std::filesystem)
+ # This module has already been processed. Don't do it again.
+ return()
+endif()
+
+include(CMakePushCheckState)
+include(CheckIncludeFileCXX)
+include(CheckCXXSourceCompiles)
+
+cmake_push_check_state()
+
+set(CMAKE_REQUIRED_QUIET ${FILESYSTEM_FIND_QUIETLY})
+
+# All of our tests require C++17 or later
+set(OLD_CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD})
+set(CMAKE_CXX_STANDARD 17)
+if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "8.0.0"))
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "8.99")
+ set(UNDEFINED_BEHAVIOR_WITHOUT_LINKING TRUE)
+ endif()
+ set(CMAKE_REQUIRED_FLAGS "-std=c++17")
+endif()
+if((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") AND NOT (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "8.99"))
+ set(CMAKE_REQUIRED_FLAGS "-std=c++17")
+endif()
+if((CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") AND NOT (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "11"))
+ set(CMAKE_REQUIRED_FLAGS "-std=c++17")
+endif()
+if(MSVC AND NOT (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "18"))
+ set(CMAKE_REQUIRED_FLAGS "/std:c++17")
+endif()
+
+# Normalize and check the component list we were given
+set(want_components ${FILESYSTEM_FIND_COMPONENTS})
+if(FILESYSTEM_FIND_COMPONENTS STREQUAL "")
+ set(want_components Final)
+endif()
+
+# Warn on any unrecognized components
+set(extra_components ${want_components})
+list(REMOVE_ITEM extra_components Final Experimental)
+foreach(component IN LISTS extra_components)
+ message(WARNING "Extraneous find_package component for FILESYSTEM: ${component}")
+endforeach()
+
+# Detect which of Experimental and Final we should look for
+set(find_experimental TRUE)
+set(find_final TRUE)
+if(NOT "Final" IN_LIST want_components)
+ set(find_final FALSE)
+endif()
+if(NOT "Experimental" IN_LIST want_components)
+ set(find_experimental FALSE)
+endif()
+
+if(find_final)
+ check_include_file_cxx("filesystem" _CXX_FILESYSTEM_HAVE_HEADER)
+ mark_as_advanced(_CXX_FILESYSTEM_HAVE_HEADER)
+ if(_CXX_FILESYSTEM_HAVE_HEADER)
+ # We found the non-experimental header. Don't bother looking for the
+ # experimental one.
+ set(find_experimental FALSE)
+ endif()
+else()
+ set(_CXX_FILESYSTEM_HAVE_HEADER FALSE)
+endif()
+
+if(find_experimental)
+ check_include_file_cxx("experimental/filesystem" _CXX_FILESYSTEM_HAVE_EXPERIMENTAL_HEADER)
+ mark_as_advanced(_CXX_FILESYSTEM_HAVE_EXPERIMENTAL_HEADER)
+else()
+ set(_CXX_FILESYSTEM_HAVE_EXPERIMENTAL_HEADER FALSE)
+endif()
+
+if(_CXX_FILESYSTEM_HAVE_HEADER)
+ set(_have_fs TRUE)
+ set(_fs_header filesystem)
+ set(_fs_namespace std::filesystem)
+elseif(_CXX_FILESYSTEM_HAVE_EXPERIMENTAL_HEADER)
+ set(_have_fs TRUE)
+ set(_fs_header experimental/filesystem)
+ set(_fs_namespace std::experimental::filesystem)
+else()
+ set(_have_fs FALSE)
+endif()
+
+set(CXX_FILESYSTEM_HAVE_FS ${_have_fs} CACHE BOOL "TRUE if we have the C++ filesystem headers")
+set(CXX_FILESYSTEM_HEADER ${_fs_header} CACHE STRING "The header that should be included to obtain the filesystem APIs")
+set(CXX_FILESYSTEM_NAMESPACE ${_fs_namespace} CACHE STRING "The C++ namespace that contains the filesystem APIs")
+
+set(_found FALSE)
+
+if(CXX_FILESYSTEM_HAVE_FS)
+ # We have some filesystem library available. Do link checks
+ string(CONFIGURE [[
+ #include <@CXX_FILESYSTEM_HEADER@>
+
+ int main() {
+ auto cwd = @CXX_FILESYSTEM_NAMESPACE@::current_path();
+ return static_cast<int>(cwd.string().size());
+ }
+ ]] code @ONLY)
+
+ # Try to compile a simple filesystem program without any linker flags
+ if(NOT UNDEFINED_BEHAVIOR_WITHOUT_LINKING)
+ check_cxx_source_compiles("${code}" CXX_FILESYSTEM_NO_LINK_NEEDED)
+ set(can_link ${CXX_FILESYSTEM_NO_LINK_NEEDED})
+ endif()
+
+ if(NOT CXX_FILESYSTEM_NO_LINK_NEEDED)
+ set(prev_libraries ${CMAKE_REQUIRED_LIBRARIES})
+ set(CMAKE_REQUIRED_FLAGS "-std=c++17")
+ # Add the libstdc++ flag
+ set(CMAKE_REQUIRED_LIBRARIES ${prev_libraries} -lstdc++fs)
+ check_cxx_source_compiles("${code}" CXX_FILESYSTEM_STDCPPFS_NEEDED)
+ set(can_link ${CXX_FILESYSTEM_STDCPPFS_NEEDED})
+ if(NOT CXX_FILESYSTEM_STDCPPFS_NEEDED)
+ # Try the libc++ flag
+ set(CMAKE_REQUIRED_LIBRARIES ${prev_libraries} -lc++fs)
+ check_cxx_source_compiles("${code}" CXX_FILESYSTEM_CPPFS_NEEDED)
+ set(can_link ${CXX_FILESYSTEM_CPPFS_NEEDED})
+ endif()
+ endif()
+
+ if(can_link)
+ if(CMAKE_VERSION VERSION_LESS 3.12)
+ add_library(std::filesystem INTERFACE IMPORTED GLOBAL)
+ else()
+ add_library(std::filesystem INTERFACE IMPORTED)
+ target_compile_features(std::filesystem INTERFACE cxx_std_17)
+ endif()
+ set(_found TRUE)
+
+ if(CXX_FILESYSTEM_NO_LINK_NEEDED)
+ # Nothing to add...
+ elseif(CXX_FILESYSTEM_STDCPPFS_NEEDED)
+ target_link_libraries(std::filesystem INTERFACE -lstdc++fs)
+ elseif(CXX_FILESYSTEM_CPPFS_NEEDED)
+ target_link_libraries(std::filesystem INTERFACE -lc++fs)
+ endif()
+ endif()
+endif()
+
+if(NOT ${_found})
+ set(CMAKE_CXX_STANDARD ${OLD_CMAKE_CXX_STANDARD})
+endif()
+
+cmake_pop_check_state()
+
+set(FILESYSTEM_FOUND ${_found} CACHE BOOL "TRUE if we can compile and link a program using std::filesystem" FORCE)
+
+if(FILESYSTEM_FIND_REQUIRED AND NOT FILESYSTEM_FOUND)
+ message(FATAL_ERROR "Cannot compile a simple program using std::filesystem")
+endif()
--- /dev/null
+# Copyright 2014, 2019, 2020 Free Software Foundation, Inc.
+#
+# This file is part of VOLK.
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
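+# This module locates the ORC (Oil Runtime Compiler) library and the orcc
+# compiler. The variables set below and checked by
+# FIND_PACKAGE_HANDLE_STANDARD_ARGS are: ORC_FOUND, ORC_INCLUDE_DIRS,
+# ORC_LIBRARIES, ORC_LIBRARY_DIRS, ORC_LIBRARIES_STATIC and ORCC_EXECUTABLE.
+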
+FIND_PACKAGE(PkgConfig)
+PKG_CHECK_MODULES(PC_ORC "orc-0.4 > 0.4.11")
+
+FIND_PROGRAM(ORCC_EXECUTABLE orcc
+ HINTS ${PC_ORC_TOOLSDIR}
+ PATHS ${ORC_ROOT}/bin ${CMAKE_INSTALL_PREFIX}/bin)
+
+FIND_PATH(ORC_INCLUDE_DIR NAMES orc/orc.h
+ HINTS ${PC_ORC_INCLUDEDIR}
+ PATHS ${ORC_ROOT}/include ${CMAKE_INSTALL_PREFIX}/include
+ PATH_SUFFIXES orc-0.4)
+
+
+FIND_PATH(ORC_LIBRARY_DIR NAMES ${CMAKE_SHARED_LIBRARY_PREFIX}orc-0.4${CMAKE_SHARED_LIBRARY_SUFFIX}
+ HINTS ${PC_ORC_LIBDIR}
+ PATHS ${ORC_ROOT}/lib${LIB_SUFFIX} ${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX})
+
+FIND_LIBRARY(ORC_LIB orc-0.4
+ HINTS ${PC_ORC_LIBRARY_DIRS}
+ PATHS ${ORC_ROOT}/lib${LIB_SUFFIX} ${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX})
+
+FIND_LIBRARY(ORC_LIBRARY_STATIC liborc-0.4.a
+ HINTS ${PC_ORC_LIBRARY_DIRS}
+ PATHS ${ORC_ROOT}/lib${LIB_SUFFIX} ${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX})
+
+LIST(APPEND ORC_LIBRARY
+ ${ORC_LIB}
+)
+
+
+SET(ORC_INCLUDE_DIRS ${ORC_INCLUDE_DIR})
+SET(ORC_LIBRARIES ${ORC_LIBRARY})
+SET(ORC_LIBRARY_DIRS ${ORC_LIBRARY_DIR})
+SET(ORC_LIBRARIES_STATIC ${ORC_LIBRARY_STATIC})
+
+INCLUDE(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(ORC "orc files" ORC_LIBRARY ORC_INCLUDE_DIR ORCC_EXECUTABLE)
+
+mark_as_advanced(ORC_INCLUDE_DIR ORC_LIBRARY ORCC_EXECUTABLE)
--- /dev/null
+# Copyright 2015 Free Software Foundation, Inc.
+#
+# This file is part of VOLK.
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+if(DEFINED __INCLUDED_VOLK_ADD_TEST)
+ return()
+endif()
+set(__INCLUDED_VOLK_ADD_TEST TRUE)
+
+########################################################################
+# Generate a test executable which can be used in ADD_TEST to call
+# various subtests.
+#
+# SOURCES - sources for the test
+# TARGET_DEPS - build target dependencies (e.g., libraries)
+########################################################################
+
+function(VOLK_GEN_TEST executable_name)
+ include(CMakeParseArgumentsCopy)
+ CMAKE_PARSE_ARGUMENTS(VOLK_TEST "" "" "SOURCES;TARGET_DEPS;EXTRA_LIB_DIRS;ENVIRONS;ARGS" ${ARGN})
+ add_executable(${executable_name} ${VOLK_TEST_SOURCES})
+ target_link_libraries(${executable_name} ${VOLK_TEST_TARGET_DEPS})
+endfunction()
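+
+# Usage sketch (illustrative; target, kernel and source names are placeholders):
+#   volk_gen_test("volk_test_all"
+#       SOURCES testqa.cc qa_utils.cc
+#       TARGET_DEPS volk
+#   )
+#   volk_add_test(volk_32f_x2_add_32f "volk_test_all")   # see VOLK_ADD_TEST below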
+
+########################################################################
+# Add a unit test and setup the environment for it.
+# Encloses ADD_TEST, with additional functionality to create a shell
+# script that sets the environment to gain access to in-build binaries
+# properly. A test executable has to be generated with VOLK_GEN_TEST
+# beforehand, and its name has to be passed as an argument.
+# The following variables are used to pass in settings:
+#
+# NAME - the test name
+# TARGET_DEPS - build target dependencies (e.g., libraries)
+# EXTRA_LIB_DIRS - other directories for the library path
+# ENVIRONS - other environment key/value pairs
+# ARGS - arguments for the test
+########################################################################
+function(VOLK_ADD_TEST test_name executable_name)
+
+ #parse the arguments for component names
+ include(CMakeParseArgumentsCopy)
+ CMAKE_PARSE_ARGUMENTS(VOLK_TEST "" "" "TARGET_DEPS;EXTRA_LIB_DIRS;ENVIRONS;ARGS" ${ARGN})
+
+ #set the initial environs to use
+ set(environs ${VOLK_TEST_ENVIRONS})
+
+ #create the initial library path
+ file(TO_NATIVE_PATH "${VOLK_TEST_EXTRA_LIB_DIRS}" libpath)
+
+ #set the source directory, which is mostly FYI
+ file(TO_NATIVE_PATH ${CMAKE_CURRENT_SOURCE_DIR} srcdir)
+ list(APPEND environs "srcdir=\"${srcdir}\"")
+
+ #http://www.cmake.org/pipermail/cmake/2009-May/029464.html
+ #Replaced this add test + set environs code with the shell script generation.
+ #It's nicer to be able to manually run the shell script to diagnose problems.
+ if(UNIX)
+ if(APPLE)
+ set(LD_PATH_VAR "DYLD_LIBRARY_PATH")
+ else()
+ set(LD_PATH_VAR "LD_LIBRARY_PATH")
+ endif()
+
+ #create a list of target directories to be determined by the
+ #"add_test" command, via the $<FOO:BAR> operator; make sure the
+ #test's directory is first, since it ($1) is prepended to PATH.
+ unset(TARGET_DIR_LIST)
+ foreach(target ${executable_name} ${VOLK_TEST_TARGET_DEPS})
+ list(APPEND TARGET_DIR_LIST "\$<TARGET_FILE_DIR:${target}>")
+ endforeach()
+
+ #augment the PATH to start with the directory of the test
+ set(binpath "\"$1:\$PATH\"")
+ list(APPEND environs "PATH=${binpath}")
+
+ #set the shell to use
+ if(CMAKE_CROSSCOMPILING)
+ set(SHELL "/bin/sh")
+ else()
+ find_program(SHELL sh)
+ endif()
+
+ #check to see if the shell supports "$*" expansion with IFS
+ if(NOT TESTED_SHELL_SUPPORTS_IFS)
+ set(TESTED_SHELL_SUPPORTS_IFS TRUE CACHE BOOL "")
+ set(sh_file ${CMAKE_CURRENT_BINARY_DIR}/ifs_test.sh)
+ file(WRITE ${sh_file} "#!${SHELL}\n")
+ file(APPEND ${sh_file} "export IFS=:\n")
+ file(APPEND ${sh_file} "echo \"$*\"\n")
+ #make the shell file executable
+ execute_process(COMMAND chmod +x ${sh_file})
+
+ #execute the shell script
+ execute_process(COMMAND ${sh_file} "a" "b" "c"
+ OUTPUT_VARIABLE output OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+
+ #check the output to see if it is correct
+ string(COMPARE EQUAL ${output} "a:b:c" SHELL_SUPPORTS_IFS)
+ set(SHELL_SUPPORTS_IFS ${SHELL_SUPPORTS_IFS} CACHE BOOL
+ "Set this value to TRUE if the shell supports IFS argument expansion"
+ )
+ endif()
+ unset(testlibpath)
+ if(SHELL_SUPPORTS_IFS)
+ #"$*" expands in the shell into a list of all of the arguments
+ #to the shell script, concatenated using the character provided
+ #in ${IFS}.
+ list(APPEND testlibpath "$*")
+ else()
+ #shell does not support IFS expansion; use a loop instead
+ list(APPEND testlibpath "\${LL}")
+ endif()
+
+ #finally: add in the current library path variable
+ list(INSERT libpath 0 ${testlibpath})
+ list(APPEND libpath "$${LD_PATH_VAR}")
+
+ #replace list separator with the path separator
+ string(REPLACE ";" ":" libpath "${libpath}")
+ list(APPEND environs "${LD_PATH_VAR}=\"${libpath}\"")
+
+ #generate a shell script file that sets the environment and runs the test
+ set(sh_file ${CMAKE_CURRENT_BINARY_DIR}/${test_name}_test.sh)
+ file(WRITE ${sh_file} "#!${SHELL}\n")
+ if(SHELL_SUPPORTS_IFS)
+ file(APPEND ${sh_file} "export IFS=:\n")
+ else()
+ file(APPEND ${sh_file} "LL=\"$1\" && for tf in \"\$@\"; do LL=\"\${LL}:\${tf}\"; done\n")
+ endif()
+
+ #each line sets an environment variable
+ foreach(environ ${environs})
+ file(APPEND ${sh_file} "export ${environ}\n")
+ endforeach(environ)
+
+ set(VOLK_TEST_ARGS "${test_name}")
+
+ #redo the test args to have a space between each
+ string(REPLACE ";" " " VOLK_TEST_ARGS "${VOLK_TEST_ARGS}")
+
+ #finally: append the test name to execute
+ file(APPEND ${sh_file} "${CMAKE_CROSSCOMPILING_EMULATOR} ${executable_name} ${VOLK_TEST_ARGS}\n")
+
+ #make the shell file executable
+ execute_process(COMMAND chmod +x ${sh_file})
+
+ #add the shell file as the test to execute;
+ #use the form that allows for $<FOO:BAR> substitutions,
+ #then combine the script arguments inside the script.
+ add_test(NAME qa_${test_name}
+ COMMAND ${SHELL} ${sh_file} ${TARGET_DIR_LIST}
+ )
+
+ endif(UNIX)
+
+ if(WIN32)
+ #In the land of windows, all libraries must be in the PATH. Since
+ #the dependent libraries are not yet installed, we must manually
+ #set them in the PATH to run tests. The following appends the
+ #path of a target dependency.
+ #
+ #create a list of target directories to be determined by the
+ #"add_test" command, via the $<FOO:BAR> operator; make sure the
+ #test's directory is first, since it ($1) is prepended to PATH.
+ unset(TARGET_DIR_LIST)
+ foreach(target ${executable_name} ${VOLK_TEST_TARGET_DEPS})
+ list(APPEND TARGET_DIR_LIST "$<TARGET_FILE_DIR:${target}>")
+ endforeach()
+ #replace list separator with the path separator (escaped)
+ string(REPLACE ";" "\\\\;" TARGET_DIR_LIST "${TARGET_DIR_LIST}")
+
+ #add command line argument (TARGET_DIR_LIST) to path and append current path
+ list(INSERT libpath 0 "%1")
+ list(APPEND libpath "%PATH%")
+
+ #replace list separator with the path separator (escaped)
+ string(REPLACE ";" "\\;" libpath "${libpath}")
+ list(APPEND environs "PATH=${libpath}")
+
+ #generate a bat file that sets the environment and runs the test
+ set(bat_file ${CMAKE_CURRENT_BINARY_DIR}/${test_name}_test.bat)
+ file(WRITE ${bat_file} "@echo off\n")
+
+ #each line sets an environment variable
+ foreach(environ ${environs})
+ file(APPEND ${bat_file} "SET ${environ}\n")
+ endforeach(environ)
+
+ set(VOLK_TEST_ARGS "${test_name}")
+
+ #redo the test args to have a space between each
+ string(REPLACE ";" " " VOLK_TEST_ARGS "${VOLK_TEST_ARGS}")
+
+ #finally: append the test name to execute
+ file(APPEND ${bat_file} "${executable_name} ${VOLK_TEST_ARGS}\n")
+ file(APPEND ${bat_file} "\n")
+
+ add_test(NAME qa_${test_name}
+ COMMAND ${bat_file} ${TARGET_DIR_LIST}
+ )
+ endif(WIN32)
+
+endfunction(VOLK_ADD_TEST)
--- /dev/null
+# Copyright 2014 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+if(DEFINED __INCLUDED_VOLK_BUILD_TYPES_CMAKE)
+ return()
+endif()
+set(__INCLUDED_VOLK_BUILD_TYPES_CMAKE TRUE)
+
+# Standard CMake Build Types and their basic CFLAGS:
+# - None: nothing set
+# - Debug: -g
+# - Release: -O3 -DNDEBUG
+# - RelWithDebInfo: -O2 -g -DNDEBUG
+# - MinSizeRel: -Os -DNDEBUG
+
+# Additional Build Types, defined below:
+# - NoOptWithASM: -O0 -g -save-temps
+# - O2WithASM: -O2 -g -save-temps
+# - O3WithASM: -O3 -g -save-temps
+# - DebugParanoid: -O0 -g -Wall -Wextra
+# - ASAN: -O2 -g -fsanitize=address -fno-omit-frame-pointer
+
+# Defines the list of acceptable cmake build types. When adding a new
+# build type below, make sure to add it to this list.
+list(APPEND AVAIL_BUILDTYPES
+ None Debug Release RelWithDebInfo MinSizeRel
+ DebugParanoid NoOptWithASM O2WithASM O3WithASM
+ ASAN
+)
+
+########################################################################
+# VOLK_CHECK_BUILD_TYPE(build type)
+#
+# Use this to check that the build type set in CMAKE_BUILD_TYPE on the
+# commandline is one of the valid build types used by this project. It
+# checks the value set in the cmake interface against the list of
+# known build types in AVAIL_BUILDTYPES. If the build type is found,
+# the function exits immediately. If nothing is found by the end of
+# checking all available build types, we exit with an error and list
+# the available build types.
+########################################################################
+function(VOLK_CHECK_BUILD_TYPE settype)
+ STRING(TOUPPER ${settype} _settype)
+ foreach(btype ${AVAIL_BUILDTYPES})
+ STRING(TOUPPER ${btype} _btype)
+ if(${_settype} STREQUAL ${_btype})
+ return() # found it; exit cleanly
+ endif(${_settype} STREQUAL ${_btype})
+ endforeach(btype)
+ # Build type not found; error out
+ message(FATAL_ERROR "Build type '${settype}' not valid, must be one of: ${AVAIL_BUILDTYPES}")
+endfunction(VOLK_CHECK_BUILD_TYPE)
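+
+# Typical call site (illustrative):
+#   if(CMAKE_BUILD_TYPE)
+#       volk_check_build_type(${CMAKE_BUILD_TYPE})
+#   endif()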
+
+########################################################################
+# For GCC and Clang, we can set a build type:
+#
+# -DCMAKE_BUILD_TYPE=DebugParanoid
+#
+# This type uses no optimization (-O0), outputs debug symbols (-g), and
+# enables extra warnings (-Wall -Wextra).
+# NOTE: This is not defined on Windows systems.
+########################################################################
+if(NOT WIN32)
+ SET(CMAKE_CXX_FLAGS_DEBUGPARANOID "-Wall -Wextra -g -O0" CACHE STRING
+ "Flags used by the C++ compiler during DebugParanoid builds." FORCE)
+ SET(CMAKE_C_FLAGS_DEBUGPARANOID "-Wall -Wextra -g -O0" CACHE STRING
+ "Flags used by the C compiler during DebugParanoid builds." FORCE)
+ SET(CMAKE_EXE_LINKER_FLAGS_DEBUGPARANOID
+ "-Wl,--warn-unresolved-symbols,--warn-once" CACHE STRING
+ "Flags used for linking binaries during DebugParanoid builds." FORCE)
+ SET(CMAKE_SHARED_LINKER_FLAGS_DEBUGPARANOID
+ "-Wl,--warn-unresolved-symbols,--warn-once" CACHE STRING
+ "Flags used by the shared lib linker during DebugParanoid builds." FORCE)
+
+ MARK_AS_ADVANCED(
+ CMAKE_CXX_FLAGS_DEBUGPARANOID
+ CMAKE_C_FLAGS_DEBUGPARANOID
+ CMAKE_EXE_LINKER_FLAGS_DEBUGPARANOID
+ CMAKE_SHARED_LINKER_FLAGS_DEBUGPARANOID)
+endif(NOT WIN32)
+
+
+########################################################################
+# For GCC and Clang, we can set a build type:
+#
+# -DCMAKE_BUILD_TYPE=NoOptWithASM
+#
+# This type uses no optimization (-O0), outputs debug symbols (-g) and
+# outputs all intermediary files the build system produces, including
+# all assembly (.s) files. Look in the build directory for these
+# files.
+# NOTE: This is not defined on Windows systems.
+########################################################################
+if(NOT WIN32)
+ SET(CMAKE_CXX_FLAGS_NOOPTWITHASM "-save-temps -g -O0" CACHE STRING
+ "Flags used by the C++ compiler during NoOptWithASM builds." FORCE)
+ SET(CMAKE_C_FLAGS_NOOPTWITHASM "-save-temps -g -O0" CACHE STRING
+ "Flags used by the C compiler during NoOptWithASM builds." FORCE)
+ SET(CMAKE_EXE_LINKER_FLAGS_NOOPTWITHASM
+ "-Wl,--warn-unresolved-symbols,--warn-once" CACHE STRING
+ "Flags used for linking binaries during NoOptWithASM builds." FORCE)
+ SET(CMAKE_SHARED_LINKER_FLAGS_NOOPTWITHASM
+ "-Wl,--warn-unresolved-symbols,--warn-once" CACHE STRING
+ "Flags used by the shared lib linker during NoOptWithASM builds." FORCE)
+
+ MARK_AS_ADVANCED(
+ CMAKE_CXX_FLAGS_NOOPTWITHASM
+ CMAKE_C_FLAGS_NOOPTWITHASM
+ CMAKE_EXE_LINKER_FLAGS_NOOPTWITHASM
+ CMAKE_SHARED_LINKER_FLAGS_NOOPTWITHASM)
+endif(NOT WIN32)
+
+
+########################################################################
+# For GCC and Clang, we can set a build type:
+#
+# -DCMAKE_BUILD_TYPE=O2WithASM
+#
+# This type uses level 2 optimization (-O2), outputs debug symbols
+# (-g) and outputs all intermediary files the build system produces,
+# including all assembly (.s) files. Look in the build directory for
+# these files.
+# NOTE: This is not defined on Windows systems.
+########################################################################
+
+if(NOT WIN32)
+ SET(CMAKE_CXX_FLAGS_O2WITHASM "-save-temps -g -O2" CACHE STRING
+ "Flags used by the C++ compiler during O2WithASM builds." FORCE)
+ SET(CMAKE_C_FLAGS_O2WITHASM "-save-temps -g -O2" CACHE STRING
+ "Flags used by the C compiler during O2WithASM builds." FORCE)
+ SET(CMAKE_EXE_LINKER_FLAGS_O2WITHASM
+ "-Wl,--warn-unresolved-symbols,--warn-once" CACHE STRING
+ "Flags used for linking binaries during O2WithASM builds." FORCE)
+ SET(CMAKE_SHARED_LINKER_FLAGS_O2WITHASM
+ "-Wl,--warn-unresolved-symbols,--warn-once" CACHE STRING
+ "Flags used by the shared lib linker during O2WithASM builds." FORCE)
+
+ MARK_AS_ADVANCED(
+ CMAKE_CXX_FLAGS_O2WITHASM
+ CMAKE_C_FLAGS_O2WITHASM
+ CMAKE_EXE_LINKER_FLAGS_O2WITHASM
+ CMAKE_SHARED_LINKER_FLAGS_O2WITHASM)
+endif(NOT WIN32)
+
+
+########################################################################
+# For GCC and Clang, we can set a build type:
+#
+# -DCMAKE_BUILD_TYPE=O3WithASM
+#
+# This type uses level 3 optimization (-O3), outputs debug symbols
+# (-g) and outputs all intermediary files the build system produces,
+# including all assembly (.s) files. Look in the build directory for
+# these files.
+# NOTE: This is not defined on Windows systems.
+########################################################################
+
+if(NOT WIN32)
+ SET(CMAKE_CXX_FLAGS_O3WITHASM "-save-temps -g -O3" CACHE STRING
+ "Flags used by the C++ compiler during O3WithASM builds." FORCE)
+ SET(CMAKE_C_FLAGS_O3WITHASM "-save-temps -g -O3" CACHE STRING
+ "Flags used by the C compiler during O3WithASM builds." FORCE)
+ SET(CMAKE_EXE_LINKER_FLAGS_O3WITHASM
+ "-Wl,--warn-unresolved-symbols,--warn-once" CACHE STRING
+ "Flags used for linking binaries during O3WithASM builds." FORCE)
+ SET(CMAKE_SHARED_LINKER_FLAGS_O3WITHASM
+ "-Wl,--warn-unresolved-symbols,--warn-once" CACHE STRING
+ "Flags used by the shared lib linker during O3WithASM builds." FORCE)
+
+ MARK_AS_ADVANCED(
+ CMAKE_CXX_FLAGS_O3WITHASM
+ CMAKE_C_FLAGS_O3WITHASM
+ CMAKE_EXE_LINKER_FLAGS_O3WITHASM
+ CMAKE_SHARED_LINKER_FLAGS_O3WITHASM)
+endif(NOT WIN32)
+
+########################################################################
+# For GCC and Clang, we can set a build type:
+#
+# -DCMAKE_BUILD_TYPE=ASAN
+#
+# This type creates an address sanitized build (-fsanitize=address)
+# and defaults to the DebugParanoid linker flags.
+# NOTE: This is not defined on Windows systems.
+########################################################################
+if(NOT WIN32)
+ SET(CMAKE_CXX_FLAGS_ASAN "-Wall -Wextra -g -O2 -fsanitize=address -fno-omit-frame-pointer" CACHE STRING
+ "Flags used by the C++ compiler during Address Sanitized builds." FORCE)
+ SET(CMAKE_C_FLAGS_ASAN "-Wall -Wextra -g -O2 -fsanitize=address -fno-omit-frame-pointer" CACHE STRING
+ "Flags used by the C compiler during Address Sanitized builds." FORCE)
+ MARK_AS_ADVANCED(
+ CMAKE_CXX_FLAGS_ASAN
+ CMAKE_C_FLAGS_ASAN
+ CMAKE_EXE_LINKER_FLAGS_DEBUGPARANOID
+ CMAKE_SHARED_LINKER_FLAGS_DEBUGPARANOID)
+endif(NOT WIN32)
--- /dev/null
+# Copyright 2016, 2018 - 2020 Free Software Foundation, Inc.
+#
+# This file is part of VOLK.
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+
+get_filename_component(VOLK_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
+
+if(NOT TARGET Volk::volk)
+ include("${VOLK_CMAKE_DIR}/VolkTargets.cmake")
+endif()
+
+# Set VOLK_FOUND globally to indicate whether a compatible Volk was
+# found -- either a specific-enough version or any version, depending
+# on how find_package was called.
+if(NOT TARGET Volk::volk)
+ set(VOLK_FOUND FALSE)
+else()
+ set(VOLK_FOUND TRUE)
+endif()
+
+# cache whether a compatible Volk was found for
+# use anywhere in the calling project
+set(VOLK_FOUND ${VOLK_FOUND} CACHE BOOL "Whether a compatible Volk was found" FORCE)
+
+if(VOLK_FOUND)
+ # Use the new target library. Regardless of whether the using project is
+ # new or old style, we still need to set a variable with the library name
+ # so that there is a variable to reference in its CMake scripts.
+ set(VOLK_LIBRARIES Volk::volk CACHE STRING "Volk Library" FORCE)
+
+ # INTERFACE_INCLUDE_DIRECTORIES should always be set
+ get_target_property(VOLK_INCLUDE_DIRS Volk::volk INTERFACE_INCLUDE_DIRECTORIES)
+ set(VOLK_INCLUDE_DIRS ${VOLK_INCLUDE_DIRS} CACHE STRING "Volk Include Directories" FORCE)
+
+ # for backward compatibility with old-CMake non-target project finding
+ include(FindPackageHandleStandardArgs)
+ find_package_handle_standard_args(Volk DEFAULT_MSG VOLK_LIBRARIES VOLK_INCLUDE_DIRS)
+ mark_as_advanced(VOLK_LIBRARIES VOLK_INCLUDE_DIRS)
+endif(VOLK_FOUND)
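+
+# Downstream usage sketch (illustrative; "my_app" is a placeholder target):
+#   find_package(Volk REQUIRED)
+#   target_link_libraries(my_app PRIVATE Volk::volk)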
--- /dev/null
+# Copyright 2014, 2015, 2018, 2020 Free Software Foundation, Inc.
+#
+# This file is part of VOLK.
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+
+set(MAJOR_VERSION @VERSION_INFO_MAJOR_VERSION@)
+set(MINOR_VERSION @VERSION_INFO_MINOR_VERSION@)
+set(MAINT_VERSION @VERSION_INFO_MAINT_VERSION@)
+
+set(PACKAGE_VERSION ${MAJOR_VERSION}.${MINOR_VERSION}.${MAINT_VERSION})
+
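+# Compatibility rule implemented below: the requested major and minor versions
+# must match the installed ones exactly, and the requested patch level must not
+# exceed the installed one. For example (illustrative), with an installed 2.5.2,
+# find_package(Volk 2.5.1) is accepted while find_package(Volk 2.6.0) is not.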
+if(${PACKAGE_FIND_VERSION_MAJOR} EQUAL ${MAJOR_VERSION})
+ if(${PACKAGE_FIND_VERSION_MINOR} EQUAL ${MINOR_VERSION})
+ if(NOT ${PACKAGE_FIND_VERSION_PATCH} GREATER ${MAINT_VERSION})
+ set(PACKAGE_VERSION_EXACT 1) # exact match for API version
+ set(PACKAGE_VERSION_COMPATIBLE 1) # compat for minor/patch version
+ endif()
+ endif()
+endif()
--- /dev/null
+# Copyright 2010-2011,2013 Free Software Foundation, Inc.
+#
+# This file is part of VOLK.
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+if(DEFINED __INCLUDED_VOLK_PYTHON_CMAKE)
+ return()
+endif()
+set(__INCLUDED_VOLK_PYTHON_CMAKE TRUE)
+
+########################################################################
+# Setup the python interpreter:
+# This allows the user to specify a specific interpreter,
+# or finds the interpreter via the built-in cmake module.
+########################################################################
+#this allows the user to override PYTHON_EXECUTABLE
+
+# `PythonInterp` is deprecated in 3.12, we should use `Python3` instead.
+# The `Python3` CMake module was introduced in CMake 3.12, but e.g. Ubuntu 18.04 only ships CMake 3.10.
+# https://cmake.org/cmake/help/latest/module/FindPythonInterp.html
+# https://cmake.org/cmake/help/latest/module/FindPython3.html#module:FindPython3
+
+# FUTURE TODO: With CMake 3.12+ we can simply do:
+#if(PYTHON_EXECUTABLE)
+# set(Python_EXECUTABLE ${PYTHON_EXECUTABLE})
+#else(PYTHON_EXECUTABLE)
+# find_package(Python3 COMPONENTS Interpreter)
+#endif(PYTHON_EXECUTABLE)
+#
+##make the path to the executable appear in the cmake gui
+#set(PYTHON_EXECUTABLE ${Python_EXECUTABLE} CACHE FILEPATH "python interpreter")
+# END FUTURE TODO: Stick with following version as long as we set CMake 3.8 minimum.
+
+if(PYTHON_EXECUTABLE)
+
+ set(PYTHONINTERP_FOUND TRUE)
+
+#otherwise if not set, try to automatically find it
+else(PYTHON_EXECUTABLE)
+
+ #use the built-in find script
+ set(Python_ADDITIONAL_VERSIONS 3.4 3.5 3.6 3.7 3.8)
+ find_package(PythonInterp 3)
+
+ #and if that fails use the find program routine
+ if(NOT PYTHONINTERP_FOUND)
+ find_program(PYTHON_EXECUTABLE NAMES python3 python)
+ if(PYTHON_EXECUTABLE)
+ set(PYTHONINTERP_FOUND TRUE)
+ endif(PYTHON_EXECUTABLE)
+ endif(NOT PYTHONINTERP_FOUND)
+
+endif(PYTHON_EXECUTABLE)
+
+#make the path to the executable appear in the cmake gui
+set(PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE} CACHE FILEPATH "python interpreter")
+
+
+########################################################################
+# Check for the existence of a python module:
+# - desc a string description of the check
+# - mod the name of the module to import
+# - cmd an additional command to run
+# - have the result variable to set
+########################################################################
+macro(VOLK_PYTHON_CHECK_MODULE desc mod cmd have)
+ message(STATUS "")
+ message(STATUS "Python checking for ${desc}")
+ execute_process(
+ COMMAND ${PYTHON_EXECUTABLE} -c "
+#########################################
+try: import ${mod}
+except:
+ try: ${mod}
+ except: exit(-1)
+try: assert ${cmd}
+except: exit(-1)
+#########################################"
+ RESULT_VARIABLE ${have}
+ )
+ if(${have} EQUAL 0)
+ message(STATUS "Python checking for ${desc} - found")
+ set(${have} TRUE)
+ else(${have} EQUAL 0)
+ message(STATUS "Python checking for ${desc} - not found")
+ set(${have} FALSE)
+ endif(${have} EQUAL 0)
+endmacro(VOLK_PYTHON_CHECK_MODULE)
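+
+# Example call (illustrative, mirroring how such checks are typically used):
+#   volk_python_check_module("python >= 3.4" sys "sys.version_info >= (3, 4)" PYTHON_MIN_VER_FOUND)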
+
+########################################################################
+# Sets the python installation directory VOLK_PYTHON_DIR
+# cf. https://github.com/gnuradio/gnuradio/blob/master/cmake/Modules/GrPython.cmake
+# From https://github.com/pothosware/SoapySDR/tree/master/python
+# https://github.com/pothosware/SoapySDR/blob/master/LICENSE_1_0.txt
+########################################################################
+if(NOT DEFINED VOLK_PYTHON_DIR)
+execute_process(
+ COMMAND ${PYTHON_EXECUTABLE} -c "import os
+import sysconfig
+import site
+
+install_dir = None
+# The next line passes a CMake variable into our script.
+prefix = '${CMAKE_INSTALL_PREFIX}'
+
+# We use `site` to identify if our chosen prefix is a default one.
+# https://docs.python.org/3/library/site.html
+try:
+ # https://docs.python.org/3/library/site.html#site.getsitepackages
+ paths = [p for p in site.getsitepackages() if p.startswith(prefix)]
+ if len(paths) == 1: install_dir = paths[0]
+except AttributeError: pass
+
+# If we found a default install path, `install_dir` is set.
+if not install_dir:
+ # We use a custom install prefix!
+ # Determine the correct install path in that prefix on the current platform.
+ # For Python 3.11+, we could use the 'venv' scheme for all platforms
+ # https://docs.python.org/3.11/library/sysconfig.html#installation-paths
+ if os.name == 'nt':
+ scheme = 'nt'
+ else:
+ scheme = 'posix_prefix'
+ install_dir = sysconfig.get_path('platlib', scheme)
+ prefix = sysconfig.get_path('data', scheme)
+
+#strip the prefix to return a relative path
+print(os.path.relpath(install_dir, prefix))"
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ OUTPUT_VARIABLE VOLK_PYTHON_DIR
+)
+endif()
+file(TO_CMAKE_PATH ${VOLK_PYTHON_DIR} VOLK_PYTHON_DIR)
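+# Note: on a typical Linux prefix the relative path computed above resolves to
+# something like "lib/python3.X/site-packages" (illustrative; the exact value depends
+# on the interpreter, platform and install prefix).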
+
+########################################################################
+# Create an always-built target with a unique name
+# Usage: VOLK_UNIQUE_TARGET(<description> <dependencies list>)
+########################################################################
+function(VOLK_UNIQUE_TARGET desc)
+ file(RELATIVE_PATH reldir ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+ execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import re, hashlib
+unique = hashlib.sha256(b'${reldir}${ARGN}').hexdigest()[:5]
+print(re.sub('\\W', '_', '${desc} ${reldir} ' + unique))"
+ OUTPUT_VARIABLE _target OUTPUT_STRIP_TRAILING_WHITESPACE)
+ add_custom_target(${_target} ALL DEPENDS ${ARGN})
+endfunction(VOLK_UNIQUE_TARGET)
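+
+# Example usage (illustrative; "my_generated_files" is a hypothetical list variable):
+#   VOLK_UNIQUE_TARGET("pygen" ${my_generated_files})
+# The generated target name hashes the current directory and the dependency list,
+# so the same description can safely be reused in different subdirectories.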
+
+########################################################################
+# Install python sources (also builds and installs byte-compiled python)
+########################################################################
+function(VOLK_PYTHON_INSTALL)
+ include(CMakeParseArgumentsCopy)
+ CMAKE_PARSE_ARGUMENTS(VOLK_PYTHON_INSTALL "" "DESTINATION;COMPONENT" "FILES;PROGRAMS" ${ARGN})
+
+ ####################################################################
+ if(VOLK_PYTHON_INSTALL_FILES)
+ ####################################################################
+ install(${ARGN}) #installs regular python files
+
+ #create a list of all generated files
+ unset(pysrcfiles)
+ unset(pycfiles)
+ unset(pyofiles)
+ foreach(pyfile ${VOLK_PYTHON_INSTALL_FILES})
+ get_filename_component(pyfile ${pyfile} ABSOLUTE)
+ list(APPEND pysrcfiles ${pyfile})
+
+ #determine if this file is in the source or binary directory
+ file(RELATIVE_PATH source_rel_path ${CMAKE_CURRENT_SOURCE_DIR} ${pyfile})
+ string(LENGTH "${source_rel_path}" source_rel_path_len)
+ file(RELATIVE_PATH binary_rel_path ${CMAKE_CURRENT_BINARY_DIR} ${pyfile})
+ string(LENGTH "${binary_rel_path}" binary_rel_path_len)
+
+ #and set the generated path appropriately
+ if(${source_rel_path_len} GREATER ${binary_rel_path_len})
+ set(pygenfile ${CMAKE_CURRENT_BINARY_DIR}/${binary_rel_path})
+ else()
+ set(pygenfile ${CMAKE_CURRENT_BINARY_DIR}/${source_rel_path})
+ endif()
+ list(APPEND pycfiles ${pygenfile}c)
+ list(APPEND pyofiles ${pygenfile}o)
+
+ #ensure generation path exists
+ get_filename_component(pygen_path ${pygenfile} PATH)
+ file(MAKE_DIRECTORY ${pygen_path})
+
+ endforeach(pyfile)
+
+ #the command to generate the pyc files
+ add_custom_command(
+ DEPENDS ${pysrcfiles} OUTPUT ${pycfiles}
+ COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_BINARY_DIR}/python_compile_helper.py ${pysrcfiles} ${pycfiles}
+ )
+
+ #the command to generate the pyo files
+ add_custom_command(
+ DEPENDS ${pysrcfiles} OUTPUT ${pyofiles}
+ COMMAND ${PYTHON_EXECUTABLE} -O ${CMAKE_BINARY_DIR}/python_compile_helper.py ${pysrcfiles} ${pyofiles}
+ )
+
+ #create install rule and add generated files to target list
+ set(python_install_gen_targets ${pycfiles} ${pyofiles})
+ install(FILES ${python_install_gen_targets}
+ DESTINATION ${VOLK_PYTHON_INSTALL_DESTINATION}
+ COMPONENT ${VOLK_PYTHON_INSTALL_COMPONENT}
+ )
+
+
+ ####################################################################
+ elseif(VOLK_PYTHON_INSTALL_PROGRAMS)
+ ####################################################################
+ file(TO_NATIVE_PATH ${PYTHON_EXECUTABLE} pyexe_native)
+
+ if (CMAKE_CROSSCOMPILING)
+ set(pyexe_native "/usr/bin/env python")
+ endif()
+
+ foreach(pyfile ${VOLK_PYTHON_INSTALL_PROGRAMS})
+ get_filename_component(pyfile_name ${pyfile} NAME)
+ get_filename_component(pyfile ${pyfile} ABSOLUTE)
+ string(REPLACE "${CMAKE_SOURCE_DIR}" "${CMAKE_BINARY_DIR}" pyexefile "${pyfile}.exe")
+ list(APPEND python_install_gen_targets ${pyexefile})
+
+ get_filename_component(pyexefile_path ${pyexefile} PATH)
+ file(MAKE_DIRECTORY ${pyexefile_path})
+
+ add_custom_command(
+ OUTPUT ${pyexefile} DEPENDS ${pyfile}
+ COMMAND ${PYTHON_EXECUTABLE} -c
+ "open('${pyexefile}','w').write(r'\#!${pyexe_native}'+'\\n'+open('${pyfile}').read())"
+ COMMENT "Shebangin ${pyfile_name}"
+ VERBATIM
+ )
+
+ #on windows, python files need an extension to execute
+ get_filename_component(pyfile_ext ${pyfile} EXT)
+ if(WIN32 AND NOT pyfile_ext)
+ set(pyfile_name "${pyfile_name}.py")
+ endif()
+
+ install(PROGRAMS ${pyexefile} RENAME ${pyfile_name}
+ DESTINATION ${VOLK_PYTHON_INSTALL_DESTINATION}
+ COMPONENT ${VOLK_PYTHON_INSTALL_COMPONENT}
+ )
+ endforeach(pyfile)
+
+ endif()
+
+ VOLK_UNIQUE_TARGET("pygen" ${python_install_gen_targets})
+
+endfunction(VOLK_PYTHON_INSTALL)
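+
+# Example usage (illustrative sketch; the file names, destination and component are hypothetical):
+#   VOLK_PYTHON_INSTALL(
+#       FILES __init__.py extension.py
+#       DESTINATION ${VOLK_PYTHON_DIR}/volk_example
+#       COMPONENT "volk"
+#   )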
+
+########################################################################
+# Write the python helper script that generates byte code files
+########################################################################
+file(WRITE ${CMAKE_BINARY_DIR}/python_compile_helper.py "
+import sys, py_compile
+files = sys.argv[1:]
+srcs, gens = files[:len(files)//2], files[len(files)//2:]
+for src, gen in zip(srcs, gens):
+ py_compile.compile(file=src, cfile=gen, doraise=True)
+")
--- /dev/null
+# Copyright 2014 Free Software Foundation, Inc.
+#
+# This file is part of VOLK.
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+if(DEFINED __INCLUDED_VOLK_VERSION_CMAKE)
+ return()
+endif()
+set(__INCLUDED_VOLK_VERSION_CMAKE TRUE)
+
+#eventually, replace version.sh and fill in the variables below
+set(MAJOR_VERSION ${VERSION_INFO_MAJOR_VERSION})
+set(MINOR_VERSION ${VERSION_INFO_MINOR_VERSION})
+set(MAINT_VERSION ${VERSION_INFO_MAINT_VERSION})
+
+########################################################################
+# Extract the version string from git describe.
+########################################################################
+find_package(Git)
+
+if(GIT_FOUND AND EXISTS ${CMAKE_SOURCE_DIR}/.git)
+ message(STATUS "Extracting version information from git describe...")
+ execute_process(
+ COMMAND ${GIT_EXECUTABLE} describe --always --abbrev=8 --long
+ OUTPUT_VARIABLE GIT_DESCRIBE OUTPUT_STRIP_TRAILING_WHITESPACE
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+ )
+else()
+ if(NOT VOLK_GIT_COUNT)
+ set(VOLK_GIT_COUNT "0")
+ endif()
+
+ if(NOT VOLK_GIT_HASH)
+ set(VOLK_GIT_HASH "unknown")
+ endif()
+
+ set(GIT_DESCRIBE "v${MAJOR_VERSION}.${MINOR_VERSION}-${VOLK_GIT_COUNT}-${VOLK_GIT_HASH}")
+endif()
+
+########################################################################
+# Use the logic below to set the version constants
+########################################################################
+if("${MINOR_VERSION}" STREQUAL "git")
+ # VERSION: 1.0git-xxx-gxxxxxxxx
+ # DOCVER: 1.0git
+ # SOVERSION: 1.0git
+ set(VERSION "${GIT_DESCRIBE}")
+ set(DOCVER "${MAJOR_VERSION}.0${MINOR_VERSION}")
+ set(SOVERSION "${MAJOR_VERSION}.0${MINOR_VERSION}")
+ set(RC_MINOR_VERSION "0")
+ set(RC_MAINT_VERSION "0")
+elseif("${MAINT_VERSION}" STREQUAL "git")
+ # VERSION: 1.xgit-xxx-gxxxxxxxx
+ # DOCVER: 1.xgit
+ # SOVERSION: 1.xgit
+ set(VERSION "${GIT_DESCRIBE}")
+ set(DOCVER "${MAJOR_VERSION}.${MINOR_VERSION}${MAINT_VERSION}")
+ set(SOVERSION "${MAJOR_VERSION}.${MINOR_VERSION}${MAINT_VERSION}")
+ math(EXPR RC_MINOR_VERSION "${MINOR_VERSION} - 1")
+ set(RC_MAINT_VERSION "0")
+else()
+ # This is a numbered release.
+ # VERSION: 1.1{.x}
+ # DOCVER: 1.1{.x}
+ # SOVERSION: 1.1.0
+ set(VERSION "${MAJOR_VERSION}.${MINOR_VERSION}.${MAINT_VERSION}")
+ set(DOCVER "${VERSION}")
+ set(SOVERSION "${MAJOR_VERSION}.${MINOR_VERSION}")
+ set(RC_MINOR_VERSION ${MINOR_VERSION})
+ set(RC_MAINT_VERSION ${MAINT_VERSION})
+endif()
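+
+# Worked example (illustrative): for a numbered release with MAJOR_VERSION=3,
+# MINOR_VERSION=0 and MAINT_VERSION=0, the branch above yields
+# VERSION=3.0.0, DOCVER=3.0.0 and SOVERSION=3.0.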
--- /dev/null
+#
+# Copyright 2018, 2020 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
+
+if(MINGW OR CYGWIN OR WIN32)
+ set(UTIL_SEARCH_CMD where)
+elseif(UNIX OR APPLE)
+ set(UTIL_SEARCH_CMD which)
+endif()
+
+set(TOOLCHAIN_PREFIX aarch64-linux-gnu-)
+
+execute_process(
+ COMMAND ${UTIL_SEARCH_CMD} ${TOOLCHAIN_PREFIX}gcc
+ OUTPUT_VARIABLE BINUTILS_PATH
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+
+get_filename_component(ARM_TOOLCHAIN_DIR ${BINUTILS_PATH} DIRECTORY)
+
+# The following is not needed on Debian.
+# On other systems, CMake may fail its test compilation check without this flag:
+#set(CMAKE_EXE_LINKER_FLAGS_INIT "--specs=nosys.specs")
+
+set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}gcc)
+set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER})
+set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}g++)
+
+set(CMAKE_OBJCOPY ${ARM_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}objcopy CACHE INTERNAL "objcopy tool")
+set(CMAKE_SIZE_UTIL ${ARM_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}size CACHE INTERNAL "size tool")
+
+set(CMAKE_FIND_ROOT_PATH ${BINUTILS_PATH})
+
+set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-aarch64 -L /usr/aarch64-linux-gnu/")
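+
+# Typical invocation (illustrative; assumes the aarch64-linux-gnu cross toolchain and
+# qemu-aarch64 are installed on the build host):
+#   cmake -DCMAKE_TOOLCHAIN_FILE=<path to this file> <source directory>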
--- /dev/null
+#
+# Copyright 2018 - 2020 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR arm)
+
+if(MINGW OR CYGWIN OR WIN32)
+ set(UTIL_SEARCH_CMD where)
+elseif(UNIX OR APPLE)
+ set(UTIL_SEARCH_CMD which)
+endif()
+
+set(TOOLCHAIN_PREFIX arm-linux-gnueabihf-)
+
+execute_process(
+ COMMAND ${UTIL_SEARCH_CMD} ${TOOLCHAIN_PREFIX}gcc
+ OUTPUT_VARIABLE BINUTILS_PATH
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+
+get_filename_component(ARM_TOOLCHAIN_DIR ${BINUTILS_PATH} DIRECTORY)
+
+# The following is not needed on Debian.
+# On other systems, CMake may fail its test compilation check without this flag:
+#set(CMAKE_EXE_LINKER_FLAGS_INIT "--specs=nosys.specs")
+
+set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}gcc)
+set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER})
+set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}g++)
+## the following is needed for CheckCSourceCompiles used in lib/CMakeLists.txt
+set(CMAKE_C_FLAGS "-mfpu=neon" CACHE STRING "" FORCE)
+set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" FORCE)
+
+set(CMAKE_OBJCOPY ${ARM_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}objcopy CACHE INTERNAL "objcopy tool")
+set(CMAKE_SIZE_UTIL ${ARM_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}size CACHE INTERNAL "size tool")
+
+set(CMAKE_FIND_ROOT_PATH ${BINUTILS_PATH})
+
+set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-arm -L /usr/arm-linux-gnueabihf/")
--- /dev/null
+#
+# Copyright 2014, 2018, 2019 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+
+########################################################################
+# Toolchain file for building natively on an ARM Cortex A15 w/ NEON
+# Usage: cmake -DCMAKE_TOOLCHAIN_FILE=<this file> <source directory>
+########################################################################
+set(CMAKE_CXX_COMPILER g++)
+set(CMAKE_C_COMPILER gcc)
+set(CMAKE_CXX_FLAGS "-march=armv7-a -mtune=cortex-a15 -mfpu=neon -mfloat-abi=hard" CACHE STRING "" FORCE)
+set(CMAKE_C_FLAGS ${CMAKE_CXX_FLAGS} CACHE STRING "" FORCE) #same flags for C sources
+set(CMAKE_ASM_FLAGS "${CMAKE_CXX_FLAGS} -g" CACHE STRING "" FORCE) #same flags for asm sources
\ No newline at end of file
--- /dev/null
+#
+# Copyright 2014, 2018, 2019, 2021 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+########################################################################
+# Toolchain file for building natively on an ARM Cortex A53 w/ NEON
+# Usage: cmake -DCMAKE_TOOLCHAIN_FILE=<this file> <source directory>
+########################################################################
+set(CMAKE_CXX_COMPILER g++)
+set(CMAKE_C_COMPILER gcc)
+set(CMAKE_CXX_FLAGS "-march=armv8-a -mtune=cortex-a53 -mfpu=neon-fp-armv8 -mfloat-abi=hard" CACHE STRING "" FORCE)
+set(CMAKE_C_FLAGS ${CMAKE_CXX_FLAGS} CACHE STRING "" FORCE) #same flags for C sources
+set(CMAKE_ASM_FLAGS "${CMAKE_CXX_FLAGS} -mthumb -g" CACHE STRING "" FORCE) #same flags for asm sources
--- /dev/null
+#
+# Copyright 2014, 2018, 2019 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+########################################################################
+# Toolchain file for building natively on an ARM Cortex A72 w/ NEON
+# Usage: cmake -DCMAKE_TOOLCHAIN_FILE=<this file> <source directory>
+########################################################################
+set(CMAKE_CXX_COMPILER g++)
+set(CMAKE_C_COMPILER gcc)
+set(CMAKE_CXX_FLAGS "-march=armv8-a -mtune=cortex-a72 -mfpu=neon-fp-armv8 -mfloat-abi=hard" CACHE STRING "" FORCE)
+set(CMAKE_C_FLAGS ${CMAKE_CXX_FLAGS} CACHE STRING "" FORCE) #same flags for C sources
+set(CMAKE_ASM_FLAGS "${CMAKE_CXX_FLAGS} -mthumb -g" CACHE STRING "" FORCE) #same flags for asm sources
--- /dev/null
+#
+# Copyright 2014, 2018, 2019 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+########################################################################
+# Toolchain file for building natively on an ARM Cortex A8 w/ NEON
+# Usage: cmake -DCMAKE_TOOLCHAIN_FILE=<this file> <source directory>
+########################################################################
+set(CMAKE_CXX_COMPILER g++)
+set(CMAKE_C_COMPILER gcc)
+set(CMAKE_CXX_FLAGS "-march=armv7-a -mtune=cortex-a8 -mfpu=neon -mfloat-abi=hard" CACHE STRING "" FORCE)
+set(CMAKE_C_FLAGS ${CMAKE_CXX_FLAGS} CACHE STRING "" FORCE) #same flags for C sources
+set(CMAKE_ASM_FLAGS "${CMAKE_CXX_FLAGS} -g" CACHE STRING "" FORCE) #same flags for asm sources
\ No newline at end of file
--- /dev/null
+#
+# Copyright 2014, 2019 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+########################################################################
+# Toolchain file for building natively on an ARM Cortex A8 w/ NEON
+# Usage: cmake -DCMAKE_TOOLCHAIN_FILE=<this file> <source directory>
+########################################################################
+set(CMAKE_CXX_COMPILER g++)
+set(CMAKE_C_COMPILER gcc)
+set(CMAKE_CXX_FLAGS "-march=armv7-a -mtune=cortex-a8 -mfpu=neon -mfloat-abi=softfp" CACHE STRING "" FORCE)
+set(CMAKE_C_FLAGS ${CMAKE_CXX_FLAGS} CACHE STRING "" FORCE) #same flags for C sources
+set(CMAKE_ASM_FLAGS "${CMAKE_CXX_FLAGS} -g" CACHE STRING "" FORCE) #same flags for asm sources
\ No newline at end of file
--- /dev/null
+#
+# Copyright 2014, 2018, 2019 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+########################################################################
+# Toolchain file for building natively on an ARM Cortex A9 w/ NEON
+# Usage: cmake -DCMAKE_TOOLCHAIN_FILE=<this file> <source directory>
+########################################################################
+set(CMAKE_CXX_COMPILER g++)
+set(CMAKE_C_COMPILER gcc)
+set(CMAKE_CXX_FLAGS "-march=armv7-a -mtune=cortex-a9 -mfpu=neon -mfloat-abi=hard" CACHE STRING "" FORCE)
+set(CMAKE_C_FLAGS ${CMAKE_CXX_FLAGS} CACHE STRING "" FORCE) #same flags for C sources
+set(CMAKE_ASM_FLAGS "${CMAKE_CXX_FLAGS} -g" CACHE STRING "" FORCE) #same flags for asm sources
\ No newline at end of file
--- /dev/null
+#
+# Copyright 2019 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=knl")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=knl")
+set(CMAKE_CROSSCOMPILING_EMULATOR "$ENV{TRAVIS_BUILD_DIR}/cache/$ENV{SDE_VERSION}/sde64 -knl --")
--- /dev/null
+#
+# Copyright 2014 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+set( CMAKE_SYSTEM_NAME Linux )
+#set( CMAKE_C_COMPILER $ENV{CC} )
+#set( CMAKE_CXX_COMPILER $ENV{CXX} )
+string(REGEX MATCH "sysroots/([a-zA-Z0-9]+)" CMAKE_SYSTEM_PROCESSOR $ENV{SDKTARGETSYSROOT})
+string(REGEX REPLACE "sysroots/" "" CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR})
+set( CMAKE_CXX_FLAGS $ENV{CXXFLAGS} CACHE STRING "" FORCE )
+set( CMAKE_C_FLAGS $ENV{CFLAGS} CACHE STRING "" FORCE ) #same flags for C sources
+set( CMAKE_LDFLAGS_FLAGS ${CMAKE_CXX_FLAGS} CACHE STRING "" FORCE ) #same flags for the linker
+set( CMAKE_LIBRARY_PATH ${OECORE_TARGET_SYSROOT}/usr/lib )
+set( CMAKE_FIND_ROOT_PATH $ENV{OECORE_TARGET_SYSROOT} $ENV{OECORE_NATIVE_SYSROOT} )
+set( CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER )
+set( CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY )
+set( CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY )
+set ( ORC_INCLUDE_DIRS $ENV{OECORE_TARGET_SYSROOT}/usr/include/orc-0.4 )
+set ( ORC_LIBRARY_DIRS $ENV{OECORE_TARGET_SYSROOT}/usr/lib )
--- /dev/null
+# Copyright 2014 Free Software Foundation, Inc.
+#
+# This file is part of VOLK.
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+
+# http://www.vtk.org/Wiki/CMake_FAQ#Can_I_do_.22make_uninstall.22_with_CMake.3F
+
+IF(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+ MESSAGE(FATAL_ERROR "Cannot find install manifest: \"@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt\"")
+ENDIF(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+
+FILE(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files)
+STRING(REGEX REPLACE "\n" ";" files "${files}")
+FOREACH(file ${files})
+ MESSAGE(STATUS "Uninstalling \"$ENV{DESTDIR}${file}\"")
+ IF(EXISTS "$ENV{DESTDIR}${file}")
+ EXEC_PROGRAM(
+ "@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\""
+ OUTPUT_VARIABLE rm_out
+ RETURN_VALUE rm_retval
+ )
+ IF(NOT "${rm_retval}" STREQUAL 0)
+ MESSAGE(FATAL_ERROR "Problem when removing \"$ENV{DESTDIR}${file}\"")
+ ENDIF(NOT "${rm_retval}" STREQUAL 0)
+ ELSEIF(IS_SYMLINK "$ENV{DESTDIR}${file}")
+ EXEC_PROGRAM(
+ "@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\""
+ OUTPUT_VARIABLE rm_out
+ RETURN_VALUE rm_retval
+ )
+ IF(NOT "${rm_retval}" STREQUAL 0)
+ MESSAGE(FATAL_ERROR "Problem when removing \"$ENV{DESTDIR}${file}\"")
+ ENDIF(NOT "${rm_retval}" STREQUAL 0)
+ ELSE(EXISTS "$ENV{DESTDIR}${file}")
+ MESSAGE(STATUS "File \"$ENV{DESTDIR}${file}\" does not exist.")
+ ENDIF(EXISTS "$ENV{DESTDIR}${file}")
+ENDFOREACH(file)
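+
+# This script is normally driven by a custom "uninstall" target added by the main build,
+# e.g. (illustrative sketch, following the CMake FAQ linked above):
+#   add_custom_target(uninstall
+#       COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake)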
--- /dev/null
+/*
+ * Copyright 2012, 2017, 2020 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef _MSC_CONFIG_H_ // [
+#define _MSC_CONFIG_H_
+
+////////////////////////////////////////////////////////////////////////
+// enable inline functions for C code
+////////////////////////////////////////////////////////////////////////
+#ifndef __cplusplus
+#define inline __inline
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// signed size_t
+////////////////////////////////////////////////////////////////////////
+#include <stddef.h>
+typedef ptrdiff_t ssize_t;
+
+////////////////////////////////////////////////////////////////////////
+// rint functions
+////////////////////////////////////////////////////////////////////////
+#if _MSC_VER < 1800
+#include <math.h>
+static inline long lrint(double x) { return (long)(x > 0.0 ? x + 0.5 : x - 0.5); }
+static inline long lrintf(float x) { return (long)(x > 0.0f ? x + 0.5f : x - 0.5f); }
+static inline long long llrint(double x)
+{
+ return (long long)(x > 0.0 ? x + 0.5 : x - 0.5);
+}
+static inline long long llrintf(float x)
+{
+ return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f);
+}
+static inline double rint(double x) { return (x > 0.0) ? floor(x + 0.5) : ceil(x - 0.5); }
+static inline float rintf(float x)
+{
+ return (x > 0.0f) ? floorf(x + 0.5f) : ceilf(x - 0.5f);
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// math constants
+////////////////////////////////////////////////////////////////////////
+#if _MSC_VER < 1800
+#include <math.h>
+#define INFINITY HUGE_VAL
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// random and srandom
+////////////////////////////////////////////////////////////////////////
+#include <stdlib.h>
+static inline long int random(void) { return rand(); }
+static inline void srandom(unsigned int seed) { srand(seed); }
+
+#endif // _MSC_CONFIG_H_ ]
--- /dev/null
+/*
+ * Copyright 2018, 2020 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef _MSC_SYS_TIME_H_
+#define _MSC_SYS_TIME_H_
+
+// prevent windows.h from clobbering min and max functions with macros
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+
+// http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668
+#include <time.h>
+#include <windows.h>
+#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS)
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64
+#else
+#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
+#endif
+
+#if _MSC_VER < 1900
+struct timespec {
+ time_t tv_sec;  /* Seconds since 00:00:00 GMT, 1 January 1970 */
+ long tv_nsec;   /* Additional nanoseconds since tv_sec */
+};
+#endif
+
+struct timezone {
+ int tz_minuteswest; /* minutes W of Greenwich */
+ int tz_dsttime; /* type of dst correction */
+};
+
+static inline int gettimeofday(struct timeval* tv, struct timezone* tz)
+{
+ FILETIME ft;
+ unsigned __int64 tmpres = 0;
+ static int tzflag;
+
+ if (NULL != tv) {
+ GetSystemTimeAsFileTime(&ft);
+
+ tmpres |= ft.dwHighDateTime;
+ tmpres <<= 32;
+ tmpres |= ft.dwLowDateTime;
+
+ /* convert the file time (100-nanosecond intervals since 1601-01-01)
+    into microseconds since the unix epoch */
+ tmpres /= 10;
+ tmpres -= DELTA_EPOCH_IN_MICROSECS;
+ tv->tv_sec = (long)(tmpres / 1000000UL);
+ tv->tv_usec = (long)(tmpres % 1000000UL);
+ }
+
+ if (NULL != tz) {
+ if (!tzflag) {
+ _tzset();
+ tzflag++;
+ }
+ tz->tz_minuteswest = _timezone / 60;
+ tz->tz_dsttime = _daylight;
+ }
+
+ return 0;
+}
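+
+/* Illustrative usage (a minimal sketch; assumes this header stands in for <sys/time.h>
+ * on MSVC builds):
+ *
+ *     struct timeval tv;
+ *     gettimeofday(&tv, NULL);
+ *     double seconds = (double)tv.tv_sec + (double)tv.tv_usec * 1e-6;
+ */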
+
+#endif //_MSC_SYS_TIME_H_
--- /dev/null
+# List of authors resubmitting their contributions under LGPL-3.0-or-later
+
+VOLK is migrating from GNU General Public License v3.0 or later (GPL-3.0-or-later)
+to GNU Lesser General Public License v3.0 or later (LGPL-3.0-or-later).
+
+This file is a list of the authors who agreed to resubmit their code
+under the LGPL-3.0-or-later license to this repository.
+In case the affected code is licensed differently than VOLK's current license (GPL-3.0-or-later),
+this gives the VOLK project the right to use the current contributions under both LGPL-3.0-or-later and that other license.
+Future contributions by these authors are, however,
+licensed under LGPL-3.0-or-later, unless explicitly stated otherwise.
+
+## Current status
+
+We'd like to thank everyone who helped to re-license VOLK under LGPL!
+Being technical: There are 3 people left (out of 74) who we haven't been able to get in contact with (at all), for a total of 4 (out of 1092) commits, 13 (of 282822) additions, and 7 (of 170421) deletions. We have reviewed these commits; all are simple changes (e.g., a one-line change) and most are no longer relevant (e.g., to a file that no longer exists). The VOLK maintainers are in agreement that this combination -- small numbers of changes per committer, simple changes per commit, commits no longer relevant -- means that we can proceed with relicensing without the approval of these remaining contributors. We will keep trying to reach out to them periodically, but we believe it is unlikely that we will get a reply.
+
+## How to add myself to the table of authors
+There are several options. Choose the one you prefer.
+
+1. Open a PR and add yourself to the table.
+2. Add a Github comment that clearly states your wish to be added to the table.
+3. Write an email to the maintainers that clearly states your wish to be added to the table.
+
+### Information for options 2 and 3
+
+If you go with option 2 or 3, the maintainers will add your statement to the corresponding git commit message for documentation purposes.
+
+Suggested wording
+
+> I, {contributor name}, hereby resubmit all my contributions to the VOLK project and repository under the terms of the LGPLv3+. My GitHub handle is {github handle}, my email addresses used for contributions are: {email address}, ... .
+
+
+## Table of authors
+
+Together with the date of agreement, these authors are:
+
+| Date | Author (as used in commits) | GitHub handle | Email address(es) |
+|------------|-----------------------------|-----------------|---------------------------------------------------------------------|
+| 2021-07-01 | Marcus Müller | marcusmueller | marcus@hostalia.de, mmueller@gnuradio.org, mueller@kit.edu |
+| 2021-08-01 | Johannes Demel | jdemel | ufcsy@student.kit.edu, demel@uni-bremen.de, demel@ant.uni-bremen.de |
+| 2021-08-25 | Martin Braun | mbr0wn | martin@gnuradio.org, martin.braun@ettus.com |
+| 2021-08-27 | Carles Fernandez | carlesfernandez | carles.fernandez@cttc.es, carles.fernandez@gmail.com, carlesfernandez@gmail.com |
+| 2021-08-27 | Magnus Lundmark | Ka-zam | magnus@skysense.io |
+| 2021-09-04 | Michael Dickens | michaelld | mlk@alum.mit.edu, michael.dickens@ettus.com |
+| 2021-09-05 | Andrej Rode | noc0lour | mail@andrejro.de |
+| 2021-09-06 | rear1019 | rear1019 | rear1019@posteo.de |
+| 2021-09-08 | Federico 'Larroca' La Rocca | git-artes | flarroca@fing.edu.uy |
+| 2021-09-08 | Bernhard M. Wiedemann | bmwiedemann | bwiedemann@suse.de |
+| 2021-09-08 | A. Maitland Bottoms | maitbot | bottoms@debian.org |
+| 2021-09-08 | Paul Cercueil | pcercuei | paul.cercueil@analog.com |
+| 2021-09-08 | Jeison Cardoso | jsonzilla, 0unit| cardoso.jeison@gmail.com |
+| 2021-09-08 | Brennan Ashton | btashton | bashton@brennanashton.com |
+| 2021-09-08 | Ryan Volz | ryanvolz | ryan.volz@gmail.com, rvolz@mit.edu |
+| 2021-09-08 | Douglas Anderson | djanderson | douglas.j.anderson@gmail.com, djanderson@users.noreply.github.com |
+| 2021-09-08 | Nicholas McCarthy | namccart | namccart@gmail.com |
+| 2021-09-09 | Jaroslav Škarvada | yarda | jskarvad@redhat.com |
+| 2021-09-09 | Vasil Velichkov | velichkov | vvvelichkov@gmail.com |
+| 2021-09-09 | Takehiro Sekine | bstalk | takehiro.sekine@ps23.jp |
+| 2021-09-10 | Vanya Sergeev | vsergeev | vsergeev@gmail.com |
+| 2021-09-10 | Ben Hilburn | bhilburn | bhilburn@gnuradio.org, bhilburn@gmail.com, ben@ettus.com, ben@hilburn.dev |
+| 2021-09-09 | Philip Balister | balister | philip@balister.org, root@usrp-e1xx.(none) |
+| 2021-09-12 | Andrey Rodionov | dernasherbrezon | rodionovamp@mail.ru |
+| 2021-09-13 | Clayton Smith | argilo | argilo@gmail.com |
+| 2021-09-14 | Martin Kaesberger | mkaesberger | git@skipfish.de |
+| 2021-09-08 | Karl Semich | xloem | 0xloem@gmail.com |
+| 2021-09-23 | Mike Piscopo | ghostop14 | ghostop14@gmail.com |
+| 2021-09-13 | Amr Bekhit | amrbekhit | amr@helmpcb.com, amrbekhit@gmail.com |
+| 2021-09-19 | Eric Blossom | eblossom | eb@comsec.com |
+| 2021-09-20 | Douglas Geiger | douggeiger | doug.geiger@bioradiation.net,douglas.geiger@nrl.navy.mil |
+| 2021-09-21 | Zlika | Zlika | zlika_ese@hotmail.com |
+| 2021-09-24 | Ron Economos | drmpeg | w6rz@comcast.net |
+| 2021-09-26 | Marc Lichtman | 777arc | marcll@vt.edu |
+| 2021-09-28 | Stefan Wunsch | stwunsch | stefan.wunsch@student.kit.edu |
+| 2021-10-02 | Albin Stigö | ast | albin.stigo@gmail.com |
+| 2021-10-09 | Florian Ritterhoff | fritterhoff | ritterho@hm.edu |
+| 2021-10-06 | Sylvain Munaut | smunaut | tnt@246tNt.com, 246tnt@gmail.com |
+| 2021-10-10 | Dan Robertson | dlrobertson | dan@dlrobertson.com |
+| 2021-10-10 | Steve Markgraf | steve-m | steve@steve-m.de |
+| 2021-10-10 | Gwenhael Goavec-Merou | trabucayre | gwenhael.goavec-merou@trabucayre.com |
+| 2021-10-10 | Doron Behar | doronbehar | doron.behar@gmail.com |
+| 2021-10-10 | Brandon Enochs | brandonenochs | brandon.enochs@nrl.navy.mil |
+| 2021-10-10 | Thomas Rondeau | trondeau | tom@trondeau.com, trondeau@vt.edu |
+| 2021-10-11 | Tim O'Shea | oshtim | tim.oshea753@gmail.com |
+| 2021-10-13 | Alexey Slokva | Alesha72003 | Alesha72003@ya.ru |
+| 2021-10-17 | Josh Blum | guruofquality | josh@joshknows.com |
+| 2021-10-14 | Nicholas Corgan | ncorgan | n.corgan@gmail.com, nick.corgan@ettus.com |
+| 2021-10-24 | Andy Walls | awalls-cx18 | andy@silverblocksystems.net, awalls.cx18@gmail.com, awalls@md.metrocast.net |
+| 2021-10-26 | Ettus Research | N/A | nick.corgan@ettus.com, nick@ettus.com, ben.hilburn@ettus.com, michael.dickens@ettus.com |
+| 2021-10-27 | Pascal Giard | evilynux | evilynux@gmail.com, pascal.giard@lacime.etsmtl.ca, pascal.giard@etsmtl.ca, pascal.giard@mail.mcgill.ca |
+| 2021-10-27 | The Aerospace Corporation | N/A | oss@aero.org, damian.miralles@aero.org, jessica.iwamoto@aero.org, kyle.a.logue@aero.org |
+| 2021-10-29 | Christoph Mayer | hcab14 | hcab14@gmail.com, Christoph.Mayer@cern.ch |
+| 2021-10-29 | Valerii Zapodovnikov | ValZapod | val.zapod.vz@gmail.com |
+| 2021-10-29 | Stefan Oltmanns | Stefan-Olt | stefan-oltmanns@gmx.net |
+| 2021-10-29 | Mathieu Rene | mrene | mrene@avgs.ca |
+| 2021-10-29 | Steven Behnke | sbehnke | steven_behnke@me.com |
+| 2021-10-29 | Adam Thompson | awthomp | adamt@nvidia.com |
+| 2021-10-29 | Nick Foster | bistromath | bistromath@gmail.com, nick@nerdnetworks.org |
+| 2021-10-29 | Roy Thompson | roythompson | rthompso@gmail.com |
+| 2021-10-29 | luzpaz | luzpaz | luzpaz@users.noreply.github.com |
+| 2021-10-29 | Damian Miralles | dmiralles2009 | dmiralles2009@gmail.com, damian.miralles@colorado.edu |
+| 2021-11-06 | Julien Olivain | jolivain | julien.olivain@lsv.ens-cachan.fr |
+| 2021-11-07 | Jam M. Hernandez Quiceno | JamMarHer | jamarck96@gmail.com, jam_quiceno@partech.com |
+| 2021-11-19 | Abhishek Bhowmick | abhowmick22 | abhowmick22@gmail.com |
+| 2021-11-29 | Aang23 (Alan) | Aang23 | qwerty15@gmx.fr, aang23@altillimity.com |
+| 2022-02-06 | Nathan West | n-west | nwest@deepsig.io, nate.ewest@gmail.com, nathan.west@gnuradio.org, nathan.west@nrl.navy.mil, nathan.west@okstate.edu, nathan@pepper |
+| 2022-04-01 | Albert Holguin | hades- | aholguin_77@yahoo.com, codename.hash@gmail.com |
+| 2022-04-14 | Elliot Briggs | N/A | elliot.briggs@gmail.com |
+| 2022-05-31 | Geoffrey Nieboer | gnieboer | gnieboer@gcndevelopment.com, gnieboer@corpcomm.net |
+| 2022-06-28 | Moritz Fischer | N/A | gnuradio@pure-entropy.org |
+| 2022-06-29 | Johnathan Corgan | jmcorgan | jcorgan@corganenterprises.com, johnathan@corganlabs.com |
--- /dev/null
+# Changelog
+All notable changes to VOLK will be documented in this file.
+
+The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
+and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html), starting with version 2.0.0.
+
+
+## [2.0.0] - 2019-08-06
+
+This is the first version to use semantic versioning. It starts the logging of changes.
+
+
+## [2.1.0] - 2019-12-22
+
+Hi everyone,
+
+we would like to announce that Michael Dickens and Johannes Demel are the new VOLK maintainers. We want to review and merge PRs in a timely manner, as well as comment on issues in order to resolve them.
+
+We want to thank all contributors. This release wouldn't have been possible without them.
+
+We're curious about VOLK users, especially those who use VOLK outside of GNU Radio.
+
+If you have ideas for VOLK enhancements, let us know. Start with an issue to discuss your idea. We'll be happy to see new features get merged into VOLK.
+
+### Highlights
+
+VOLK v2.1.0 is a collection of really cool changes. We'd like to highlight some of them.
+
+- The AVX FMA rotator bug is fixed
+- VOLK offers `volk::vector<>` for C++ to follow RAII
+- Move towards modern dependencies
+ - CMake 3.8
+ - Prefer Python3
+ - We will drop Python2 support in a future release!
+ - Use C++17 `std::filesystem`
+ - This enables VOLK to be built without Boost when `std::filesystem` is available!
+- more stable CI
+- lots of bugfixes
+- more optimized kernels, especially more NEON versions
+
+### Contributors
+
+* Albin Stigo <albin.stigo@gmail.com>
+* Amr Bekhit <amr@helmpcb.com>
+* Andrej Rode <mail@andrejro.de>
+* Carles Fernandez <carles.fernandez@gmail.com>
+* Christoph Mayer <hcab14@gmail.com>
+* Clayton Smith <argilo@gmail.com>
+* Damian Miralles <dmiralles2009@gmail.com>
+* Johannes Demel <demel@ant.uni-bremen.de> <demel@uni-bremen.de>
+* Marcus Müller <marcus@hostalia.de>
+* Michael Dickens <michael.dickens@ettus.com>
+* Philip Balister <philip@balister.org>
+* Takehiro Sekine <takehiro.sekine@ps23.jp>
+* Vasil Velichkov <vvvelichkov@gmail.com>
+* luz.paz <luzpaz@users.noreply.github.com>
+
+
+### Changes
+
+* Usage
+ - Update README to reflect how to build on Raspberry Pi and the importance of running volk_profile
+
+* Toolchain
+ - Add toolchain file for Raspberry Pi 3
+ - Update Raspberry 4 toolchain file
+
+* Kernels
+ - Add neonv7 to volk_16ic_magnitude_16i
+ - Add neonv7 to volk_32fc_index_max_32u
+ - Add neonv7 to volk_32fc_s32f_power_spectrum_32f
+ - Add NEONv8 to volk_32f_64f_add_64f
+ - Add Neonv8 to volk_32fc_deinterleave_64f_x2
+ - Add volk_32fc_x2_s32fc_multiply_conjugate_add_32fc
+ - Add NEONv8 to volk_32fc_convert_16ic
+
+* CI
+ - Fix AVX FMA rotator
+ - appveyor: Enable testing on windows
+ - Fixes for flaky kernels for more reliable CI
+ - volk_32f_log2_32f
+ - volk_32f_x3_sum_of_poly_32f
+ - volk_32f_index_max_{16,32}u
+ - volk_32f_8u_polarbutterflypuppet_32f
+ - volk_8u_conv_k7_r2puppet_8u
+ - volk_32fc_convert_16ic
+ - volk_32fc_s32f_magnitude_16i
+ - volk_32f_s32f_convert_{8,16,32}i
+ - volk_16ic_magnitude_16i
+ - volk_32f_64f_add_64f
+ - Use Intel SDE to test all kernels
+ - TravisCI
+ - Add native tests on arm64
+ - Add native tests on s390x and ppc64le (allow failure)
+
+* Build
+ - Build Volk without Boost if C++17 std::filesystem or std::experimental::filesystem is available
+ - Update to more modern CMake
+ - Prevent CMake from choosing previously installed VOLK headers
+ - CMake
+ - bump minimum version to 3.8
+ - Use sha256 instead of md5 for unique target name hash
+ - Python: Prefer Python3 over Python2 if available
+
+* C++
+ - VOLK C++ allocator and C++11 std::vector type alias added
+
+## [2.2.0] - 2020-02-16
+
+Hi everyone,
+
+we have a new VOLK release v2.2.0!
+
+We want to thank all contributors. This release wouldn't have been possible without them.
+
+We're curious about VOLK users, especially those who use VOLK outside of GNU Radio.
+
+If you have ideas for VOLK enhancements, let us know. Start with an issue to discuss your idea. We'll be happy to see new features get merged into VOLK.
+
+The v2.1.0 release was rather large because we had a lot of backlog. We aim for more incremental releases in order to get new features out there.
+
+### Highlights
+
+VOLK v2.2.0 updates our build tools and adds support functionality to make it easier to use VOLK in your projects.
+
+* Dropped Python 2 build support
+ - Removed Python six module dependency
+* Use C11 aligned_alloc whenever possible
+ - MacOS `posix_memalign` fall-back
+ - MSVC `_aligned_malloc` fall-back
+* Add VOLK version in `volk_version.h` (included in `volk.h`)
+* Improved CMake code
+* Improved code with lots of refactoring and performance tweaks
+
+### Contributors
+
+* Carles Fernandez <carles.fernandez@gmail.com>
+* Gwenhael Goavec-Merou <gwenhael.goavec-merou@trabucayre.com>
+* Albin Stigo <albin.stigo@gmail.com>
+* Johannes Demel <demel@ant.uni-bremen.de> <demel@uni-bremen.de>
+* Michael Dickens <michael.dickens@ettus.com>
+* Valerii Zapodovnikov <val.zapod.vz@gmail.com>
+* Vasil Velichkov <vvvelichkov@gmail.com>
+* ghostop14 <ghostop14@gmail.com>
+* rear1019 <rear1019@posteo.de>
+
+### Changes
+
+* CMake
+ - Fix detection of AVX and NEON
+ - Fix for macOS
+ - lib/CMakeLists: use __asm__ instead of asm for ARM tests
+ - lib/CMakeLists: fix detection when the compiler supports NEON but neither neonv7 nor neonv8
+ - lib/CMakeLists.txt: use __VOLK_ASM instead of __asm__
+ - lib/CMakeLists.txt: let VOLK choose preferred neon version when both are supported
+ - lib/CMakeLists.txt: simplify neon test support. Unset neon version if not supported
+ - For attribute, change from clang to "clang but not MSC"
+* Readme
+ - logo: Add logo at top of README.md
+* Build dependencies
+ - python: Drop Python2 support
+ - python: Reduce six usage
+ - python: Move to Python3 syntax and modules
+ - six: Remove build dependency on python six
+* Allocation
+ - alloc: Use C11 aligned_alloc
+ - alloc: Implement fall backs for C11 aligned_alloc
+ - alloc: Fix for incomplete MSVC standard compliance
+ - alloc: update to reflect alloc changes
+* Library usage
+ - Fixup VolkConfigVersion
+ - add volk_version.h
+* Refactoring
+ - qa_utils.cc: fix always false expression
+ - volk_prefs.c: check null realloc and use temporary pointer
+ - volk_profile.cc: double assignment and return 0
+ - volk_32f_x2_pow_32f.h: do not need to _mm256_setzero_ps()
+ - volk_8u_conv_k7_r2puppet_8u.h: d_polys[i] is positive
+ - kernels: change one iteration for's to if's
+ - kernels: get rid of some assignments
+ - qa_utils.cc: actually throw something
+ - qa_utils.cc: fix always true code
+ - rotator: Refactor AVX kernels
+ - rotator: Remove unnecessary variable
+ - kernel: Refactor square_dist_scalar_mult
+ - square_dist_scalar_mult: Speed-Up AVX, Add unaligned
+ - square_dist_scalar_mult: refactor AVX2 kernel
+ - kernel: create AVX2 meta intrinsics
+* CI
+ - appveyor: Test with python 3.4 and 3.8
+ - appveyor: Add job names
+ - appveyor: Make ctest more verbose
+* Performance
+ - Improve performance of generic kernels with complex multiply
+ - square_dist_scalar_mult: Add SSE version
+ - Adds NEON versions of cos, sin and tan
+
+## [2.2.1] - 2020-02-24
+
+Hi everyone,
+
+with VOLK 2.2.0, we introduced another AVX rotator bug which is fixed with this release.
+In the process 2 more bugs were identified and fixed. Further, we saw some documentation improvements.
+
+
+### Contributors
+
+* Clayton Smith <argilo@gmail.com>
+* Michael Dickens <michael.dickens@ettus.com>
+
+
+### Changes
+
+
+* Fix loop bound in AVX rotator
+* Fix out-of-bounds read in AVX2 square dist kernel
+* Fix length checks in AVX2 index max kernels
+* includes: rearrange attributes to simplify macros; whitespace cleanup
+* kernels: fix usage in header comments
+
+## [2.3.0] - 2020-05-09
+
+Hi everyone!
+
+VOLK 2.3 is out! We want to thank all contributors. This release wouldn't have been possible without them. We saw lots of great improvements.
+
+On GNU Radio 'master' VOLK was finally removed as a submodule.
+
+Currently we see ongoing discussions on how to improve CPU feature detection because VOLK is not as reliable as we'd like it to be in that department. We'd like to benefit from other open source projects and don't want to reinvent the wheel. Thus, one approach would be to include `cpu_features` as a submodule.
+
+### Highlights
+
+* Better reproducible builds
+* CMake improvements
+ - ORC is removed from the public interface where it was never supposed to be.
+ - CMake fixes for better usability
+* Updated and new CI chain
+ - TravisCI moved to new distro and does more tests for newer GCC/Clang
+ - Github Actions
+ - Add Action to check proper code formatting.
+ - Add CI that also runs on MacOS with XCode.
+* Enforce C/C++ coding style via clang-format
+* Kernel fixes
+ - Add puppet for `power_spectral_density` kernel
+ - Treat the `mod_range` puppet as a puppet for correct use with `volk_profile`
+ - Fix `index_max` kernels
+ - Fix `rotator`. We hope that we finally found the root cause of the issue.
+* Kernel optimizations
+ - Updated log10 calcs to use faster log2 approach
+ - Optimize `complexmultiplyconjugate`
+* New kernels
+ - accurate exp kernel is finally merged after years
+ - Add 32f_s32f_add_32f kernel to perform vector + scalar float operation
+
+### Contributors
+
+* Bernhard M. Wiedemann <bwiedemann@suse.de>
+* Clayton Smith <argilo@gmail.com>
+* Johannes Demel <demel@ant.uni-bremen.de> <demel@uni-bremen.de>
+* Michael Dickens <michael.dickens@ettus.com>
+* Tom Rondeau <tom@trondeau.com>
+* Vasil Velichkov <vvvelichkov@gmail.com>
+* ghostop14 <ghostop14@gmail.com>
+
+### Changes
+
+* Reproducible builds
+ - Drop compile-time CPU detection
+ - Drop another instance of compile-time CPU detection
+* CI updates
+ - ci: Add Github CI Action
+ - ci: Remove Ubuntu 16.04 GCC5 test on TravisCI
+ - TravisCI: Update CI to bionic distro
+ - TravisCI: Add GCC 8 test
+ - TravisCI: Structure CI file
+ - TravisCI: Clean-up .travis.yml
+* Enforce C/C++ coding style
+ - clang-format: Apply clang-format
+ - clang-format: Update PR with GitHub Action
+ - clang-format: Rebase onto current master
+* Fix compiler warnings
+ - shadow: rebase kernel fixes
+ - shadow: rebase volk_profile fix
+* CMake
+ - cmake: Remove the ORC from the VOLK public link interface
+ - versioning: Remove .Maint from libvolk version
+ - Fix endif macro name in comment
+* Kernels
+ - volk: accurate exp kernel
+ - exp: Rename SSE4.1 to SSE2 kernel
+ - Add 32f_s32f_add_32f kernel
+ - This kernel adds in vector + scalar functionality
+ - Fix the broken index max kernels
+ - Treat the mod_range puppet as such
+ - Add puppet for power spectral density kernel
+ - Updated log10 calcs to use faster log2 approach
+ - fix: Use unaligned load
+ - divide: Optimize complexmultiplyconjugate
+
+
+## [2.4.0] - 2020-11-22
+
+Hi everyone!
+
+We have another VOLK release! We're happy to announce VOLK v2.4.0! We want to thank all contributors. This release wouldn't have been possible without them.
+
+We introduce `cpu_features` as a private submodule in this release because we now use it to detect CPU features at runtime. This should enable more reliable feature detection. Further, it takes away the burden of implementing platform-specific code. As a result, optimized VOLK kernels now build out of the box for MacOS and Windows with AppleClang/MSVC.
+
+
+### Highlights
+
+* Documentation
+ - Update README to be more verbose and to improve usefulness.
+
+* Compilers
+ - MSVC: Handle compiler flags and thus architecture specific kernels correctly. This enables optimized kernels with MSVC builds.
+ - AppleClang: Treat AppleClang as Clang.
+ - Paired with the `cpu_features` introduction, this enables us to use architecture specific kernels on a broader set of platforms.
+* CI
+ - Update to newest version of the Intel SDE
+ - Test the rotator kernel with a realistic scalar
+ - Introduce more test cases with newer GCC and newer Clang versions.
+* CMake
+ - Add an option to not install `volk_modtool`.
+ - Remove "find_package_handle_standard_args" warning.
+* cpu_features
+ - Use `cpu_features` v0.6.0 as a private submodule to detect available CPU features.
+ - Fix incorrect feature detection for newer AVX versions.
+ - Circumvent platform specific feature detection.
+ - Enable more architecture specific kernels on more platforms.
+* Kernels
+ - Disable slow and broken SSE4.1 kernel in `volk_32fc_x2_dot_prod_32fc`
+ - Adjust min/max for `32f_s32f_convert_8i` kernel
+ - Use `INT8_*` instead of `CHAR_*`
+
+
+### Contributors
+
+* Adam Thompson <adamt@nvidia.com>
+* Andrej Rode <mail@andrejro.de>
+* Christoph Mayer <hcab14@gmail.com>
+* Clayton Smith <argilo@gmail.com>
+* Doron Behar <doron.behar@gmail.com>
+* Johannes Demel <demel@ant.uni-bremen.de>, <demel@uni-bremen.de>
+* Martin Kaesberger <git@skipfish.de>
+* Michael Dickens <michael.dickens@ettus.com>
+* Ron Economos <w6rz@comcast.net>
+
+
+### Changes
+
+* Documentation
+ - Update README to include ldconfig upon volk build and install completion
+ - Update README based on review
+ - readme: Fix wording
+ - docs: Fix conversion inaccuracy
+
+* MSVC
+ - archs: MSVC 2013 and greater don't have a SSE flag
+
+* CI
+ - update to newest version of the Intel SDE
+ - Test the rotator kernel with a realistic scalar
+
+* CMake
+ - build: Enable to not install volk_modtool
+ - cmake: Remove "find_package_handle_standard_args" warning.
+ - cmake: Ensure that cpu_features is built as a static library.
+
+* cpu_features
+ - readme: Add section on supported platforms
+ - readme: Make supported compiler section more specific
+ - travis: Add GCC 9 test on focal
+ - travis: Add tests for clang 8, 9, 10
+ - travis: Fix incorrect CXX compiler assignment
+ - cpu_features: Remove unused feature checks
+ - ci: Update TravisCI for cpu_features
+ - cpu_features: Fix MSVC build
+ - pic: Fix BUILD_PIC issue
+ - ci: Update CI system configuration
+ - cpu_features: Bump submodule pointer to v0.6.0
+ - docs: Add hints how to handle required submodules
+ - cpu_features: Switch to cpu_features
+ - ci: Update CI system for cpu_features
+ - cpu_features: Force PIC build flag
+ - appveyor: Add recursive clone command
+ - cpu_features: Remove xgetbv checks
+ - pic: Cache and force BUILD_PIC
+ - ci: Remove explicit BUILD_PIC from cmake args
+ - ci: Remove BUILD_PIC from CI cmake args
+ - cpu_features: Remove commented code
+ - cpu_features: Assume AppleClang == Clang
+ - cpu_features: Remove obsolete code in archs.xml
+ - fix for ARM cross-compiling CI
+ - ARM CI: remove unneeded environment variables
+
+* Housekeeping
+ - structure: Move release scripts to scripts folder
+
+* Kernels
+ - emit an emms instruction after using the mmx extension
+ - volk_32fc_x2_dot_prod_32fc: disable slow & broken SSE4.1 kernel
+ - fix: Adjust min/max for 32f_s32f_convert_8i kernel
+ - fix: Use INT8_* instead of CHAR_*
+
+
+## [2.4.1] - 2020-12-17
+
+Hi everyone!
+
+We have a new VOLK bugfix release! We are happy to announce VOLK v2.4.1! We want to thank all contributors. This release wouldn't have been possible without them.
+
+Our v2.4.0 release introduced quite a lot of changes under the hood. With this bugfix release, we want to make sure that everything works as expected again.
+
+
+### Contributors
+
+* A. Maitland Bottoms <bottoms@debian.org>
+* Johannes Demel <demel@uni-bremen.de>
+* Michael Dickens <michael.dickens@ettus.com>
+* Philip Balister <philip@balister.org>
+* Ron Economos <w6rz@comcast.net>
+* Ryan Volz <ryan.volz@gmail.com>
+
+
+### Changes
+
+* Build
+ - cpu_features CMake option
+ - Add cpu_features to static library build.
+ - Use static liborc-0.4 library for static library build.
+ - cmake: Detect if cpu_features submodule is present.
+
+* Install
+ - Check for lib64 versus lib and set LIB_SUFFIX accordingly.
+
+* CI
+ - Add CI test for static library build.
+
+* Releases
+ - project: Include git submodules (i.e. cpu_features) in release tarball.
+ - scripts: Add GPG signature to release script
+
+* Other
+ - readme: Update TravisCI status badge
+
+
+## [2.5.0] - 2021-06-05
+
+Hi everyone!
+
+We have a new VOLK release! We are happy to announce VOLK v2.5.0! We want to thank all contributors. This release wouldn't have been possible without them.
+
+This release adds new kernel implementations and fixes. Some of these were longstanding PRs that could only be merged recently thanks to our switch from CLA to DCO.
+
+### Announcements
+
+I would like to point out one upcoming change. After this release, we will rename our development branch to `main` as discussed in [issue #461](https://github.com/gnuradio/volk/issues/461).
+
+
+I'd like to point the community to this [VOLK relicensing GREP](https://github.com/gnuradio/greps/pull/33).
+This is an ongoing effort to relicense VOLK under LGPLv3.
+We're looking for people and organizations that are interested in leading this effort.
+
+### Contributors
+
+* Aang23 <qwerty15@gmx.fr>
+* Carles Fernandez <carles.fernandez@gmail.com>
+* Florian Ritterhoff <ritterho@hm.edu>
+* Jam M. Hernandez Quiceno <jamarck96@gmail.com>, <jam_quiceno@partech.com>
+* Jaroslav Škarvada <jskarvad@redhat.com>
+* Johannes Demel <demel@uni-bremen.de>
+* Magnus Lundmark <magnus@skysense.io>
+* Michael Dickens <michael.dickens@ettus.com>
+* Steven Behnke <steven_behnke@me.com>
+* alesha72003 <alesha72003@ya.ru>
+* dernasherbrezon <rodionovamp@mail.ru>
+* rear1019 <rear1019@posteo.de>
+
+
+### Changes
+
+* Kernels
+ - volk_32f_stddev_and_mean_32f_x2: implemented Young and Cramer's algorithm
+ - volk_32fc_accumulator_s32fc: Add new kernel
+ - volk_16ic_x2_dot_prod_16ic_u_avx2: Fix Typo, was `_axv2`.
+ - Remove _mm256_zeroupper() calls
+ - Enforce consistent function prototypes
+ - 32fc_index_max: Improve speed of AVX2 version
+ - conv_k7_r2: Disable broken AVX2 code
+ - improve volk_8i_s32f_convert_32f for ARM NEON
+ - Calculate cos in AVX512F
+ - Calculate sin using AVX512F
+
+
+* Compilers
+ - MSVC
+ - Fix MSVC builds
+ - GCC
+ - Fix segmentation fault when using GCC 8
+ - MinGW
+ - add support and test for MinGW/MSYS2
+
+* The README has received several improvements
+
+* Build
+ - Fix python version detection
+ - cmake: Check that 'distutils' is available
+ - c11: Remove pre-C11 preprocessor instructions
+
+* CI
+ - Add more CI to GitHub Actions
+ - Remove redundant tests from TravisCI
+ - Add non-x86 GitHub Actions
+ - Update compiler names in CI
+ - Disable fail-fast CI
+ - Add more debug output to tests
+
+* Contributing
+ - contributing: Add CONTRIBUTING.md and DCO.txt
+
+
+## [2.5.1] - 2022-02-12
+
+Hi everyone!
+
+We have a new VOLK release! We are happy to announce VOLK v2.5.1! We want to thank all contributors. This release wouldn't have been possible without them.
+
+The list of contributors is pretty long this time due to a lot of support for relicensing VOLK under LGPL. Currently, we are "almost there" but need a few more approvals; please support us. We thank everyone for their support in this effort.
+
+We have been using `cpu_features` for a while now. This maintenance release should make it easier for package maintainers, FreeBSD users, and M1 users to use VOLK. Package maintainers should be able to use an external `cpu_features` module. For everyone else, `cpu_features` received support for M1 and FreeBSD.
+
+You can find [VOLK on Zenodo DOI: 10.5281/zenodo.3360942](https://doi.org/10.5281/zenodo.3360942).
+We started to actively support Zenodo now via a `.zenodo.json` file. This might come in handy for people who use VOLK in publications. As a contributor, if you want more information about yourself added to VOLK, feel free to add your ORCiD and affiliation.
+
+In the past, we relied on Boost for several tasks in `volk_profile`. For years, we minimized Boost usage to `boost::filesystem`. We mostly switched to C++17 `std::filesystem` years ago. The last distribution in our CI system that required Boost to build VOLK was Ubuntu 14.04. Thus, now is the time to remove the Boost dependency completely and rely on C++17 features.
+
+Some VOLK kernels have been untested for years. We decided to deprecate these kernels but assume that nobody uses them anyway. If your compiler spits out a warning that you use a deprecated kernel, get in touch. Besides, we received fixes for various kernels. Especially FEC kernels are notoriously difficult to debug because issues often pop up as performance regressions.
+
+Finally, we saw a lot of housekeeping in different areas: scripts to support us in our LGPL relicensing effort, updated docs, and an updated code of conduct. We were able to remove some duplicate entries in our QA system and fixed a `volk_malloc` bug that ASAN reported.
+Finally, we switched to the Python `sysconfig` module in our build system to ensure Python 3.12+ does not break our builds.
+
+
+
+### Contributors
+
+* A. Maitland Bottoms <bottoms@debian.org>
+* Aang23 <qwerty15@gmx.fr>
+* AlexandreRouma <alexandre.rouma@gmail.com>
+* Andrej Rode <mail@andrejro.de>
+* Ben Hilburn <ben@hilburn.dev>
+* Bernhard M. Wiedemann <bwiedemann@suse.de>
+* Brennan Ashton <bashton@brennanashton.com>
+* Carles Fernandez <carles.fernandez@gmail.com>
+* Clayton Smith <argilo@gmail.com>
+* Doug <douggeiger@users.noreply.github.com>
+* Douglas Anderson <djanderson@users.noreply.github.com>
+* Florian Ritterhoff <ritterho@hm.edu>
+* Jaroslav Škarvada <jskarvad@redhat.com>
+* Johannes Demel <demel@uni-bremen.de>
+* Josh Blum <josh@joshknows.com>
+* Kyle A Logue <kyle.a.logue@aero.org>
+* Luigi Cruz <luigifcruz@gmail.com>
+* Magnus Lundmark <magnus@skysense.io>
+* Marc L <marcll@vt.edu>
+* Marcus Müller <marcus@hostalia.de>
+* Martin Kaesberger <git@skipfish.de>
+* Michael Dickens <michael.dickens@ettus.com>
+* Nathan West <nwest@deepsig.io>
+* Paul Cercueil <paul.cercueil@analog.com>
+* Philip Balister <philip@balister.org>
+* Ron Economos <w6rz@comcast.net>
+* Ryan Volz <ryan.volz@gmail.com>
+* Sylvain Munaut <tnt@246tNt.com>
+* Takehiro Sekine <takehiro.sekine@ps23.jp>
+* Vanya Sergeev <vsergeev@gmail.com>
+* Vasil Velichkov <vvvelichkov@gmail.com>
+* Zlika <zlika_ese@hotmail.com>
+* namccart <namccart@gmail.com>
+* dernasherbrezon <rodionovamp@mail.ru>
+* rear1019 <rear1019@posteo.de>
+
+
+### Changes
+
+* Kernels
+ - Fixup underperforming GENERIC kernel for volk_8u_x4_conv_k7_r2_8u
+ - volk_32fc_x2_conjugate_dot_prod_32fc: New generic implementation
+ - Add volk_32f(c)_index_min_16/32u
+ - Fix volk_32fc_index_min_32u_neon
+
+* Misc
+ - Fix volk_malloc alignment bug
+ - qa: Remove repeating tests
+ - python: Switch to sysconfig module
+ - deprecate: Add attribute deprecated
+ - deprecate: Exclude warnings on Windows
+ - docs: Update docs
+ - Add the list of contributors agreeing to LGPL licensing
+ - Add a script to count the lines that are pending resubmission
+ - Testing: Add test for LGPL licensing
+ - Update CODE_OF_CONDUCT file
+
+* Boost
+ - boost: Remove boost dependency
+ - c++: Require C++17 for std::filesystem
+
+* cpu_features
+ - cpu_features: Update submodule pointer
+ - cpu_features: Make cpu_features submodule optional
+
+* Zenodo
+ - zenodo: Add metadata file
+ - zenodo: Re-organize .zenodo.json
+
+## [2.5.2] - 2022-09-04
+
+Hi everyone!
+
+We have a new VOLK release! We are happy to announce VOLK v2.5.2! We want to thank all contributors. This release wouldn't have been possible without them.
+
+We are happy to announce that our re-licensing effort is complete. This has been a long and challenging journey. Being technical: There are 3 people left (out of 74) who we haven't been able to get in contact with (at all), for a total of 4 (out of 1092) commits, 13 (of 282822) additions, and 7 (of 170421) deletions. We have reviewed these commits; all are simple changes (e.g., a one-line change) and most are no longer relevant (e.g., to a file that no longer exists). The VOLK maintainers are in agreement that this combination -- small numbers of changes per committer, simple changes per commit, commits no longer relevant -- means that we can proceed with re-licensing without the approval of these remaining contributors. We will keep trying to reach out to them periodically, but we believe it is unlikely that we will get a reply.
+
+This maintenance release is intended to be the last VOLK 2.x release. Now that we have completed our re-licensing effort, we want to make a VOLK 3.0 release soon. VOLK 3.0 will be fully compatible with VOLK 2.x on a technical level. However, VOLK 3+ will be released under LGPL. We are convinced a license change justifies a major release.
+
+### Contributors
+
+* Aang23 <qwerty15@gmx.fr>
+* Clayton Smith <argilo@gmail.com>
+* Johannes Demel <demel@ant.uni-bremen.de>, <demel@uni-bremen.de>
+* Michael Dickens <michael.dickens@ettus.com>
+* Michael Roe <michael-roe@users.noreply.github.com>
+
+### Changes
+
+* Android
+ - Add Android CI
+ - Fix armeabi-v7a on Android
+* CI
+ - Update all test jobs to more recent actions
+* volk_8u_x4_conv_k7_r2_8u
+ - Add NEON implementation `neonspiral` via `sse2neon.h`
+* Fixes
+ - Fix out-of-bounds reads
+ - Fix broken neon kernels
+ - Fix float to int conversion
+* CMake
+ - Suppress superfluous warning
+ - Fix Python install path calculation and documentation
+* cpu_features
+ - Update submodule pointer
+* VOLK 3.0 release preparations
+ - Use SPDX license identifiers everywhere
+ - Re-arrange files in top-level folder
+ - Update Doxygen and all Doxygen related tasks into `docs`
+
+## [3.0.0] - 2023-01-14
+
+Hi everyone!
+
+This is the VOLK v3.0.0 major release! This release marks the conclusion of a long-lasting effort to complete [GREP 23](https://github.com/gnuradio/greps/blob/main/grep-0023-relicense-volk.md), which proposes changing the VOLK license to LGPLv3+. We would like to thank all VOLK contributors for allowing this re-licensing effort to be completed. This release wouldn't have been possible without them.
+
+For VOLK users it is important to note that the VOLK API does NOT change in this major release. After a series of discussions, we are convinced that a license change justifies a major release. Thus, you may switch to VOLK 3 and enjoy the additional freedom the LGPL offers.
+
+### Motivation for the switch to LGPLv3+
+
+We want to remove barriers to using VOLK. As a result, we expect greater adoption and a growing user and contributor base for VOLK. This move helps spread the value of Free and Open Source Software in the SIMD community, which so far is dominated by non-FOSS software. Moreover, we recognize the desire of our long-term contributors to be able to use VOLK, together with their contributions, in their own projects. This may explicitly include proprietary projects. We want to enable all contributors to use VOLK wherever they want. At the same time, we want to make sure that improvements to VOLK itself are easily obtained by everyone, i.e. to strike a balance between permissiveness and strict copyleft.
+
+Since VOLK is a library, it should choose a fitting license. If we see greater adoption of VOLK in more projects, we are confident that we will receive more contributions, be it bug fixes, new kernels, or even contributions to core features.
+
+Historically, VOLK was part of GNU Radio. Thus, it made sense to use GPLv3+, just like GNU Radio. Since then, VOLK has become its own project with its own repository and leadership. While it is still a core dependency of GNU Radio and considers GNU Radio its main user, others may want to use it too. In particular, long-term VOLK contributors may now be able to use VOLK in a broader set of projects.
+
+After a fruitful series of discussions we settled on the LGPLv3+. We believe this license strikes a good balance between permissiveness and strict copyleft for VOLK. We hope to foster contributions to VOLK. Furthermore, we hope to see VOLK usage in a wider set of projects.
+
+### Contributors
+
+The VOLK 3.0.0 release was only made possible by the help of all of our contributors. Thus, we omit the list of contributors who contributed since the last release.
+Instead, we want to thank everyone again!
+
+### Changes
+
+* License switch to LGPLv3+
+* Fix build for 32 bit arm with neon
+* Add experimental support for MIPS and RISC-V
--- /dev/null
+# -*- coding: utf-8 -*-
+#
+# Copyright 2022 Johannes Demel.
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+
+find_package(Doxygen)
+if(DOXYGEN_FOUND)
+
+message(STATUS "Doxygen found. Building docs ...")
+
+configure_file(
+    ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in
+    ${CMAKE_BINARY_DIR}/Doxyfile
+    @ONLY)
+
+add_custom_target(volk_doc
+    ${DOXYGEN_EXECUTABLE} ${CMAKE_BINARY_DIR}/Doxyfile
+    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+    COMMENT "Generating documentation with Doxygen" VERBATIM
+)
+
+endif(DOXYGEN_FOUND)
--- /dev/null
+# VOLK Code of Conduct
+
+This Code of Conduct is adapted from the GNU Radio Code of Conduct,
+available in the [gr-governance repository][GRCoC], dated 2021-07-12.
+
+## Our Pledge
+
+We as members, contributors, and leaders strive to make participation
+in our community a harassment-free experience for everyone. Qualities
+and characteristics we note specifically include age,
+body size, visible or invisible disability, visible or invisible
+enhancement, ethnicity, sex characteristics, gender identity, gender
+expression, sexual identity, sexual orientation, level of experience,
+education, socio-economic status, nationality, personal appearance,
+race, caste, color, religion, and military status -- but lists like
+these cannot be complete and hence we note "**for everyone**" in order
+to be as inclusive as possible.
+
+We pledge to act and interact in ways that contribute to an open,
+welcoming, diverse, inclusive, and healthy community.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment for
+our community include:
+
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+ and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the
+ overall community
+
+Examples of unacceptable behavior include:
+
+* The use of sexualized language or imagery, and sexual attention or
+ advances of any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email
+ address, without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+ professional setting
+
+## Enforcement Responsibilities
+
+Project maintainers are responsible for clarifying and enforcing our
+standards of acceptable behavior and will take appropriate and fair
+corrective action in response to any behavior that they deem
+inappropriate, threatening, offensive, or harmful.
+
+Project maintainers have the right and responsibility to remove, edit,
+or reject comments, commits, code, wiki edits, issues, and other
+contributions that are not aligned to this Code of Conduct, and will
+communicate reasons for moderation decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also
+applies when an individual is officially representing the community in
+public spaces. Examples of representing our community include using
+an official e-mail address, posting via an official social media
+account, or acting as an appointed representative at an online or
+offline event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior
+may be reported by contacting the project team at
+**conduct@gnuradio.org** . All complaints will be reviewed and
+investigated promptly and fairly.
+
+All complaint handlers are obligated to respect the privacy and
+security of the reporter of any incident.
+
+## Enforcement Guidelines
+
+Project maintainers will follow these Community Impact Guidelines in
+determining the consequences for any action they deem in violation of
+this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior
+deemed unprofessional or unwelcome in the community.
+
+**Consequence**: A private, written warning from Project maintainers,
+providing clarity around the nature of the violation and an
+explanation of why the behavior was inappropriate. A public apology
+may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series
+of actions.
+
+**Consequence**: A warning with consequences for continued
+behavior. No interaction with the people involved, including
+unsolicited interaction with those enforcing the Code of Conduct, for
+a specified period of time. This includes avoiding interactions in
+community spaces as well as external channels like social
+media. Violating these terms may lead to a temporary or permanent ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards,
+including sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or
+public communication with the community for a specified period of
+time. No public or private interaction with the people involved,
+including unsolicited interaction with those enforcing the Code of
+Conduct, is allowed during this period. Violating these terms may
+lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of
+community standards, including sustained inappropriate behavior,
+harassment of an individual, or aggression toward or disparagement of
+classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction
+within the community.
+
+[GRCoC]: https://github.com/gnuradio/gr-governance/blob/main/CODE_OF_CONDUCT.md
--- /dev/null
+# Contributing to VOLK
+
+Welcome! You are reading about how to contribute code to VOLK. First of
+all, we are very happy that you're about to contribute, and welcome your
+submissions! We hope many more will come.
+
+In this document, we will explain the main things to consider when submitting
+pull requests against VOLK. Reading this first will help a lot with
+streamlining the process of getting your code merged.
+
+There is also a [wiki-based version of this file][wikicontrib], which contains
+more detail. VOLK is part of the GNU Radio project and as such, it follows the
+same contribution guidelines. This file is an [adapted GNU Radio checklist][gnuradiocontrib].
+
+## What about non-code contributions?
+
+Those are at least as important as code contributions: Emails to the mailing
+list, answers on Stack Overflow, Wiki page edits, examples... We very much
+appreciate those. However, this document is specifically about contributing
+code.
+
+## DCO Signed?
+
+Any code contributions going into VOLK will become part of an LGPL-licensed,
+open source repository (former contributions are GPL-licensed). It is therefore
+imperative that code submissions belong to the authors, and that submitters have
+the authority to merge that code into the public VOLK codebase.
+
+For that purpose, we use the [Developer's Certificate of Origin](DCO.txt). It
+is the same document used by other projects. Signing the DCO states that there
+are no legal reasons to not merge your code.
+
+To sign the DCO, suffix your git commits with a "Signed-off-by" line. When
+using the command line, you can use `git commit -s` to automatically add this
+line. If there were multiple authors of the code, or other types of
+stakeholders, make sure that all are listed, each with a separate Signed-off-by
+line.
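+
+For illustration, the sign-off is simply a trailer at the very end of the commit
+message, in the following form (the name and e-mail address here are
+hypothetical):
+
+```
+Signed-off-by: Jane Doe <jane.doe@example.com>
+```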
+
+## Coding Guidelines
+
+We have codified our coding guidelines in [GNU Radio GREP1][grep1]. Please read them,
+and stick to them. For C/C++ code, use clang-format. For Python, PEP8 is your friend
+(but again, check the actual coding guidelines).
+
+## Git commit messages are very important
+
+We follow standard git commit message guidelines, similar to many other open
+source projects. See the [coding guidelines][grep1] for more details. In a
+nutshell:
+- Keep the lines below 72 characters
+- Subject line has the component prepended (e.g., `kernelname:`)
+- Avoid empty git commit messages
+- The git commit message explains the change, the code only explains the current
+ state
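+
+As a sketch, a commit message that follows these rules (the kernel name is real,
+but the change described is made up purely for illustration) could look like
+this:
+
+```
+volk_32fc_x2_dot_prod_32fc: Fix unaligned load in AVX2 implementation
+
+The AVX2 implementation assumed 32-byte aligned input buffers. Use
+unaligned loads so the kernel works for arbitrary input pointers.
+
+Signed-off-by: Jane Doe <jane.doe@example.com>
+```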
+
+## Unit Tests
+
+VOLK unit tests compare the results of each kernel version to the generic version.
+Keep the generic kernel version as simple as possible and verify your optimized
+kernels against the generic version.
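+
+As a minimal, self-contained sketch of this idea (the function names below are
+invented and deliberately simplified; they do not follow the real VOLK kernel
+API): a plain generic reference implementation plus an "optimized" variant whose
+output is checked against it.
+
+```c
+#include <math.h>
+#include <stdio.h>
+
+/* Generic reference: keep it as simple and readable as possible. */
+static void add_32f_generic(float* out, const float* a, const float* b, unsigned int n)
+{
+    for (unsigned int i = 0; i < n; i++)
+        out[i] = a[i] + b[i];
+}
+
+/* Stand-in for an optimized kernel (manually unrolled here instead of SIMD). */
+static void add_32f_unrolled(float* out, const float* a, const float* b, unsigned int n)
+{
+    unsigned int i = 0;
+    for (; i + 4 <= n; i += 4) {
+        out[i] = a[i] + b[i];
+        out[i + 1] = a[i + 1] + b[i + 1];
+        out[i + 2] = a[i + 2] + b[i + 2];
+        out[i + 3] = a[i + 3] + b[i + 3];
+    }
+    for (; i < n; i++) /* leftover samples */
+        out[i] = a[i] + b[i];
+}
+
+int main(void)
+{
+    enum { N = 10 };
+    float a[N], b[N], ref[N], opt[N];
+    for (unsigned int i = 0; i < N; i++) {
+        a[i] = 0.5f * (float)i;
+        b[i] = 2.0f - (float)i;
+    }
+    add_32f_generic(ref, a, b, N);
+    add_32f_unrolled(opt, a, b, N);
+    for (unsigned int i = 0; i < N; i++) {
+        if (fabsf(ref[i] - opt[i]) > 1e-6f) {
+            printf("mismatch at index %u\n", i);
+            return 1;
+        }
+    }
+    printf("optimized variant matches the generic reference\n");
+    return 0;
+}
+```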
+
+## The Buddy Principle: Submit One, Review One
+
+When you've submitted a pull request, please take the time to review another
+one. This helps make sure that there are always a number of reviews at least
+equal to the number of pull requests, which means the maintainers don't get
+overwhelmed when a lot is being contributed.
+
+## Standard command line options
+
+When writing programs that are executable from the command line,
+please follow existing examples regarding their command line arguments, and
+reuse them.
+
+[grep1]: https://github.com/gnuradio/greps/blob/master/grep-0001-coding-guidelines.md
+[wikicontrib]: https://wiki.gnuradio.org/index.php/Development
+[gr-devs]: https://github.com/orgs/gnuradio/teams/gr-devs
+[gnuradiocontrib]: https://github.com/gnuradio/gnuradio/blob/master/CONTRIBUTING.md
--- /dev/null
+Developer Certificate of Origin
+Version 1.1
+
+Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
+1 Letterman Drive
+Suite D4700
+San Francisco, CA, 94129
+
+Everyone is permitted to copy and distribute verbatim copies of this
+license document, but changing it is not allowed.
+
+
+Developer's Certificate of Origin 1.1
+
+By making a contribution to this project, I certify that:
+
+(a) The contribution was created in whole or in part by me and I
+ have the right to submit it under the open source license
+ indicated in the file; or
+
+(b) The contribution is based upon previous work that, to the best
+ of my knowledge, is covered under an appropriate open source
+ license and I have the right under that license to submit that
+ work with modifications, whether created in whole or in part
+ by me, under the same open source license (unless I am
+ permitted to submit under a different license), as indicated
+ in the file; or
+
+(c) The contribution was provided directly to me by some other
+ person who certified (a), (b) or (c) and I have not modified
+ it.
+
+(d) I understand and agree that this project and the contribution
+ are public and that a record of the contribution (including all
+ personal information I submit with it, including my sign-off) is
+ maintained indefinitely and may be redistributed consistent with
+ this project or the open source license(s) involved.
--- /dev/null
+# Doxyfile 1.8.6
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all text
+# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
+# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
+# for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME = "Vector Optimized Library of Kernels"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER = @VERSION@
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF = "Architecture-tuned implementations of math kernels"
+
+# With the PROJECT_LOGO tag one can specify an logo or icon that is included in
+# the documentation. The maximum height of the logo should not exceed 55 pixels
+# and the maximum width should not exceed 200 pixels. Doxygen will copy the logo
+# to the output directory.
+
+PROJECT_LOGO = @CMAKE_SOURCE_DIR@/docs/volk_logo_small.png
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+# TODO: configure this to be a special docs directory. nw tried, but running
+# make doc won't create the directory, but with doxygen it will. why?
+
+OUTPUT_DIRECTORY =
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise causes
+# performance problems for the file system.
+# The default value is: NO.
+
+CREATE_SUBDIRS = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity):The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF = "The $name class" \
+ "The $name widget" \
+ "The $name file" \
+ is \
+ provides \
+ specifies \
+ contains \
+ represents \
+ a \
+ an \
+ the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES doxygen will prepend the full path
+# before files name in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
+
+FULL_PATH_NAMES = YES
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful if your file system doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce a
+# new page for each member. If set to NO, the documentation of a member will be
+# part of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE = 4
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:\n"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". You can put \n's in the value part of an alias to insert
+# newlines.
+
+ALIASES =
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding "class=itcl::class"
+# will allow you to use the command class in the itcl::class meaning.
+
+TCL_SUBST =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
+# C#, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C.
+#
+# Note For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibilities issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT = YES
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word
+# or globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also make the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen to replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL = YES
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. When set to YES local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous namespace
+# are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS = YES
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO these classes will be included in the various overviews. This option has
+# no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES = YES
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# (class|struct|union) declarations. If set to NO these declarations will be
+# included in the documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
+# names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES = NO
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING = NO
+
+# The GENERATE_TODOLIST tag can be used to enable ( YES) or disable ( NO) the
+# todo list. This list is created by putting \todo commands in the
+# documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable ( YES) or disable ( NO) the
+# test list. This list is created by putting \test commands in the
+# documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable ( YES) or disable ( NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable ( YES) or disable ( NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES the list
+# will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command command input-file, where command is the value of the
+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE = @CMAKE_CURRENT_SOURCE_DIR@/DoxygenLayout.xml
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. Do not use file names with spaces, bibtex cannot handle them. See
+# also \cite for info how to create references.
+
+CITE_BIB_FILES =
+
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error ( stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES, then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some parameters
+# in a documented function, or documenting parameters that don't exist or using
+# markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO doxygen will only warn about wrong or incomplete parameter
+# documentation, but not about the absence of documentation.
+# The default value is: NO.
+
+WARN_NO_PARAMDOC = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr).
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces.
+# Note: If this tag is empty the current directory is searched.
+
+INPUT = @CMAKE_SOURCE_DIR@
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see: http://www.gnu.org/software/libiconv) for the list of
+# possible encodings.
+# The default value is: UTF-8.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank the
+# following patterns are tested:*.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii,
+# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp,
+# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown,
+# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf,
+# *.qsf, *.as and *.js.
+
+FILE_PATTERNS = *.c \
+ *.cc \
+ *.cxx \
+ *.cpp \
+ *.c++ \
+ *.java \
+ *.ii \
+ *.ixx \
+ *.ipp \
+ *.i++ \
+ *.inl \
+ *.idl \
+ *.ddl \
+ *.odl \
+ *.h \
+ *.hh \
+ *.hxx \
+ *.hpp \
+ *.h++ \
+ *.cs \
+ *.d \
+ *.php \
+ *.php4 \
+ *.php5 \
+ *.phtml \
+ *.inc \
+ *.m \
+ *.markdown \
+ *.md \
+ *.mm \
+ *.dox \
+ *.py \
+ *.f90 \
+ *.f \
+ *.for \
+ *.tcl \
+ *.vhd \
+ *.vhdl \
+ *.ucf \
+ *.qsf \
+ *.as \
+ *.js
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE = @CMAKE_BINARY_DIR@ @CMAKE_SOURCE_DIR@/cpu_features @CMAKE_SOURCE_DIR@/README.md @CMAKE_SOURCE_DIR@/docs/AUTHORS_RESUBMITTING_UNDER_LGPL_LICENSE.md
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER ) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# function all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES, then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see http://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX = YES
+
+# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
+# which the alphabetical index list will be split.
+# Minimum value: 1, maximum value: 20, default value: 5.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES doxygen will generate HTML output.
+# The default value is: YES.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# For the HTML to be valid, the header file must include any scripts and style
+# sheets that doxygen needs, which depend on the configuration options used
+# (e.g. the setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER =
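+
+# Example only, not used here (file names follow the command shown above): after
+# generating the default templates with
+#   doxygen -w html new_header.html new_footer.html new_stylesheet.css \
+#           YourConfigFile
+# an edited copy of the header would be referenced as
+#   HTML_HEADER = new_header.html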
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional user-
+# defined cascading style sheet that is included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet file to the output directory. For an example
+# see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES =
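+
+# Illustrative example only (file names are hypothetical): an extra image could
+# be copied to the HTML output and referenced from a custom header through the
+# $relpath^ marker, e.g.
+#   HTML_EXTRA_FILES = doc/extra_logo.png
+# together with <img src="$relpath^extra_logo.png"/> inside the HTML_HEADER file.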
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the stylesheet and background images according to
+# this color. Hue is specified as an angle on a colorwheel, see
+# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use grayscales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP = NO
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries to 1 will produce a fully collapsed tree by default. 0 is a special
+# value representing an infinite number of entries and will result in a fully
+# expanded tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see: http://developer.apple.com/tools/xcode/), introduced with
+# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# Makefile in the HTML output directory. Running make will produce the docset in
+# that directory and running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# Windows.
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION =
+
+# The GENERATE_CHI flag controls whether a separate .chi index file is generated
+# (YES) or included in the master .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
+# folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS =
+
+# The QHG_LOCATION tag can be used to specify the location of Qt's
+# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
+# generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated that, together with the HTML files, form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files need
+# to be copied into the plugins directory of Eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying, Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX = YES
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom stylesheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine-tune the look of the index. As an example, the default style
+# sheet generated by doxygen has an example that shows how to put an image at
+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
+# the same information as the tab index, you could consider setting
+# DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW = YES
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# http://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using prerendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want the formulas to look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from http://www.mathjax.org before deployment.
+# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
+# the search. The filter options can be selected when the cursor is inside the
+# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
+# to select a filter and <Enter> or <escape> to activate or cancel the filter
+# option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using Javascript. There
+# are two flavours of web server based searching depending on the
+# EXTERNAL_SEARCH setting. When disabled, doxygen will generate a PHP script for
+# searching and an index file used by the script. When EXTERNAL_SEARCH is
+# enabled the indexing and searching needs to be provided by external tools. See
+# the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/). See the section "External Indexing and
+# Searching" for details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL =
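+
+# Hypothetical example (the URL is a placeholder): pointing doxygen at an
+# externally hosted doxysearch.cgi instance would look like
+#   SEARCHENGINE_URL = http://example.org/cgi-bin/doxysearch.cgi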
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id to a
+# relative location where the documentation can be found. The format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS =
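+
+# Hypothetical example (project ids and paths are made up): two projects sharing
+# one external search index could be mapped as
+#   EXTRA_SEARCH_MAPPINGS = projA=../projA/html projB=../projB/html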
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when enabling USE_PDFLATEX this option is only used for generating
+# bitmaps for formulas in the HTML output, but not in the Makefile that is
+# written to the output directory.
+# The default file is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. To get the times font for
+# instance you can specify
+# EXTRA_PACKAGES=times
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
+# generated LaTeX document. The header should contain everything until the first
+# chapter. If it is left blank doxygen will generate a standard header. See
+# section "Doxygen usage" for information on how to let doxygen write the
+# default header to a separate file.
+#
+# Note: Only use a user-defined header if you know what you are doing! The
+# following commands have a special meaning inside the header: $title,
+# $datetime, $date, $doxygenversion, $projectname, $projectnumber. Doxygen will
+# replace them by respectively the title of the page, the current date and time,
+# only the current date, the version number of doxygen, the project name (see
+# PROJECT_NAME), or the project number (see PROJECT_NUMBER).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
+# generated LaTeX document. The footer should contain everything after the last
+# chapter. If it is left blank doxygen will generate a standard footer.
+#
+# Note: Only use a user-defined footer if you know what you are doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
+# the PDF file directly from the LaTeX files. Set this option to YES to get
+# higher quality PDF documentation.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help. This option is also used
+# when generating formulas in HTML.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES = NO
+
+# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
+# code with syntax highlighting in the LaTeX output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_SOURCE_CODE = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE = plain
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's config
+# file, i.e. a series of assignments. You only have to provide replacements,
+# missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's config file. A template extensions file can be generated
+# using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION = .3
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES doxygen will generate an XML file that
+# captures the structure of the code including all documentation.
+# The default value is: NO.
+
+GENERATE_XML = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: xml.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_OUTPUT = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema, which can be used by a
+# validating XML parser to check the syntax of the XML files.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_SCHEMA =
+
+# The XML_DTD tag can be used to specify an XML DTD, which can be used by a
+# validating XML parser to check the syntax of the XML files.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_DTD =
+
+# If the XML_PROGRAMLISTING tag is set to YES doxygen will dump the program
+# listings (including syntax highlighting and cross-referencing information) to
+# the XML output. Note that enabling this will significantly increase the size
+# of the XML output.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_PROGRAMLISTING = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT = docbook
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES doxygen will generate an AutoGen
+# Definitions (see http://autogen.sf.net) file that captures the structure of
+# the code including all documentation. Note that this feature is still
+# experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES doxygen will expand all macro names
+# in the source code. If set to NO only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF = NO
+
+# If the SEARCH_INCLUDES tag is set to YES the includes files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS =
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+PREDEFINED = LV_HAVE_GENERIC \
+ LV_HAVE_SSE \
+ LV_HAVE_SSE2 \
+ LV_HAVE_NEON \
+ LV_HAVE_AVX \
+ LV_HAVE_SSE4_2 \
+ LV_HAVE_SSE3 \
+ LV_HAVE_SSSE3
+
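+# Illustrative note, not part of the active PREDEFINED list above: a macro may
+# also be given a fixed definition (name=definition), and the := operator keeps
+# it from being undefined or recursively expanded, e.g.
+#   PREDEFINED += __attribute__(x):=
+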
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have an
+# all uppercase name, and do not end with a semicolon. Such function macros are
+# typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
+
+TAGFILES =
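+
+# Hypothetical example (paths are made up): linking against the documentation of
+# another project through its tag file could look like
+#   TAGFILES = ../otherproject/otherproject.tag=../otherproject/html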
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE =
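+
+# Sketch only (the file name is hypothetical): letting other projects link back
+# to this documentation would mean emitting a tag file next to the HTML output:
+#   GENERATE_TAGFILE = html/volk.tag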
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed in the
+# class index. If set to NO only the inherited external classes will be listed.
+# The default value is: NO.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed in
+# the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS = YES
+
+# If the EXTERNAL_PAGES tag is set to YES all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+EXTERNAL_PAGES = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of 'which perl').
+# The default file (with absolute path) is: /usr/bin/perl.
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES doxygen will generate a class diagram
+# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
+# NO turns the diagrams off. Note that this option also works with HAVE_DOT
+# disabled, but it is recommended to install and use dot, since it yields more
+# powerful graphs.
+# The default value is: YES.
+
+CLASS_DIAGRAMS = NO
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see:
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
+
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
+
+DIA_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO
+# The default value is: NO.
+
+HAVE_DOT = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS = 0
+
+# When you want a differently looking font in the dot files that doxygen
+# generates you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
+# each documented class showing the direct and indirect inheritance relations.
+# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LIMIT_NUM_FIELDS = 10
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS = NO
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH = YES
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will show a
+# graphical hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot.
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, jpg, gif and svg.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG = NO
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS =
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+DIAFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that doxygen if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lie
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS = NO
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES doxygen will remove the intermediate dot
+# files that are used to generate the various graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_CLEANUP = YES
--- /dev/null
+<doxygenlayout version="1.0">
+ <!-- Generated by doxygen 1.8.6 -->
+ <!-- Navigation index tabs for HTML output -->
+ <navindex>
+ <tab type="mainpage" visible="yes" title=""/>
+ <tab type="pages" visible="yes" title="" intro=""/>
+ <tab type="modules" visible="yes" title="" intro=""/>
+ <!--
+ <tab type="namespaces" visible="yes" title="">
+ <tab type="namespacelist" visible="yes" title="" intro=""/>
+ <tab type="namespacemembers" visible="yes" title="" intro=""/>
+ </tab>
+ <tab type="classes" visible="yes" title="">
+ <tab type="classlist" visible="yes" title="" intro=""/>
+ <tab type="classindex" visible="$ALPHABETICAL_INDEX" title=""/>
+ <tab type="hierarchy" visible="yes" title="" intro=""/>
+ <tab type="classmembers" visible="yes" title="" intro=""/>
+ </tab>
+ -->
+ <tab type="files" visible="yes" title="">
+ <tab type="filelist" visible="yes" title="" intro=""/>
+ <tab type="globals" visible="yes" title="" intro=""/>
+ </tab>
+ <tab type="examples" visible="yes" title="" intro=""/>
+ </navindex>
+
+ <!-- Layout definition for a class page -->
+ <class>
+ <briefdescription visible="yes"/>
+ <includes visible="$SHOW_INCLUDE_FILES"/>
+ <inheritancegraph visible="$CLASS_GRAPH"/>
+ <collaborationgraph visible="$COLLABORATION_GRAPH"/>
+ <memberdecl>
+ <nestedclasses visible="yes" title=""/>
+ <publictypes title=""/>
+ <services title=""/>
+ <interfaces title=""/>
+ <publicslots title=""/>
+ <signals title=""/>
+ <publicmethods title=""/>
+ <publicstaticmethods title=""/>
+ <publicattributes title=""/>
+ <publicstaticattributes title=""/>
+ <protectedtypes title=""/>
+ <protectedslots title=""/>
+ <protectedmethods title=""/>
+ <protectedstaticmethods title=""/>
+ <protectedattributes title=""/>
+ <protectedstaticattributes title=""/>
+ <packagetypes title=""/>
+ <packagemethods title=""/>
+ <packagestaticmethods title=""/>
+ <packageattributes title=""/>
+ <packagestaticattributes title=""/>
+ <properties title=""/>
+ <events title=""/>
+ <privatetypes title=""/>
+ <privateslots title=""/>
+ <privatemethods title=""/>
+ <privatestaticmethods title=""/>
+ <privateattributes title=""/>
+ <privatestaticattributes title=""/>
+ <friends title=""/>
+ <related title="" subtitle=""/>
+ <membergroups visible="yes"/>
+ </memberdecl>
+ <detaileddescription title=""/>
+ <memberdef>
+ <inlineclasses title=""/>
+ <typedefs title=""/>
+ <enums title=""/>
+ <services title=""/>
+ <interfaces title=""/>
+ <constructors title=""/>
+ <functions title=""/>
+ <related title=""/>
+ <variables title=""/>
+ <properties title=""/>
+ <events title=""/>
+ </memberdef>
+ <allmemberslink visible="yes"/>
+ <usedfiles visible="$SHOW_USED_FILES"/>
+ <authorsection visible="yes"/>
+ </class>
+
+ <!-- Layout definition for a namespace page -->
+ <namespace>
+ <briefdescription visible="yes"/>
+ <memberdecl>
+ <nestednamespaces visible="yes" title=""/>
+ <constantgroups visible="yes" title=""/>
+ <classes visible="yes" title=""/>
+ <typedefs title=""/>
+ <enums title=""/>
+ <functions title=""/>
+ <variables title=""/>
+ <membergroups visible="yes"/>
+ </memberdecl>
+ <detaileddescription title=""/>
+ <memberdef>
+ <inlineclasses title=""/>
+ <typedefs title=""/>
+ <enums title=""/>
+ <functions title=""/>
+ <variables title=""/>
+ </memberdef>
+ <authorsection visible="yes"/>
+ </namespace>
+
+ <!-- Layout definition for a file page -->
+ <file>
+ <briefdescription visible="yes"/>
+ <includes visible="$SHOW_INCLUDE_FILES"/>
+ <includegraph visible="$INCLUDE_GRAPH"/>
+ <includedbygraph visible="$INCLUDED_BY_GRAPH"/>
+ <sourcelink visible="yes"/>
+ <memberdecl>
+ <classes visible="yes" title=""/>
+ <namespaces visible="yes" title=""/>
+ <constantgroups visible="yes" title=""/>
+ <defines title=""/>
+ <typedefs title=""/>
+ <enums title=""/>
+ <functions title=""/>
+ <variables title=""/>
+ <membergroups visible="yes"/>
+ </memberdecl>
+ <detaileddescription title=""/>
+ <memberdef>
+ <inlineclasses title=""/>
+ <defines title=""/>
+ <typedefs title=""/>
+ <enums title=""/>
+ <functions title=""/>
+ <variables title=""/>
+ </memberdef>
+ <authorsection/>
+ </file>
+
+ <!-- Layout definition for a group page -->
+ <group>
+ <briefdescription visible="yes"/>
+ <groupgraph visible="$GROUP_GRAPHS"/>
+ <memberdecl>
+ <nestedgroups visible="yes" title=""/>
+ <dirs visible="yes" title=""/>
+ <files visible="yes" title=""/>
+ <namespaces visible="yes" title=""/>
+ <classes visible="yes" title=""/>
+ <defines title=""/>
+ <typedefs title=""/>
+ <enums title=""/>
+ <enumvalues title=""/>
+ <functions title=""/>
+ <variables title=""/>
+ <signals title=""/>
+ <publicslots title=""/>
+ <protectedslots title=""/>
+ <privateslots title=""/>
+ <events title=""/>
+ <properties title=""/>
+ <friends title=""/>
+ <membergroups visible="yes"/>
+ </memberdecl>
+ <detaileddescription title=""/>
+ <memberdef>
+ <pagedocs/>
+ <inlineclasses title=""/>
+ <defines title=""/>
+ <typedefs title=""/>
+ <enums title=""/>
+ <enumvalues title=""/>
+ <functions title=""/>
+ <variables title=""/>
+ <signals title=""/>
+ <publicslots title=""/>
+ <protectedslots title=""/>
+ <privateslots title=""/>
+ <events title=""/>
+ <properties title=""/>
+ <friends title=""/>
+ </memberdef>
+ <authorsection visible="yes"/>
+ </group>
+
+ <!-- Layout definition for a directory page -->
+ <directory>
+ <briefdescription visible="yes"/>
+ <directorygraph visible="yes"/>
+ <memberdecl>
+ <dirs visible="yes"/>
+ <files visible="yes"/>
+ </memberdecl>
+ <detaileddescription title=""/>
+ </directory>
+</doxygenlayout>
--- /dev/null
+/*! \page extending_volk Extending VOLK
+
+There are two primary routes for extending VOLK for your own use. The
+preferred route is by writing kernels and proto-kernels as part of this
+repository and sending patches upstream. The alternative is creating your
+own VOLK module. The information for modifying this repository will be
+useful in both cases.
+
+## Modifying this repository
+
+### Adding kernels
+
+Adding a kernel means introducing a new function to the VOLK API that is,
+presumably, a useful math function or operation. The first step is to create
+the kernel header in volk/kernels/volk, following the naming scheme described on
+the VOLK concepts, terms, and techniques page. Then write the generic protokernel.
+
+The generic protokernel should be written in plain C using explicitly sized
+types from stdint.h or volk_complex.h when appropriate. volk_complex.h
+includes explicitly sized complex types for floats and ints. The name of
+the generic kernel should be volk_signature_from_file_generic. If multiple
+versions of the generic kernel exist then a description can be appended to
+generic_, but it is not required to use alignment flags in the generic
+protokernel name. It is required to surround the entire generic function
+with preprocessor ifdef fences on the symbol LV_HAVE_GENERIC.
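+
+As an illustration, a minimal generic protokernel could look like the following
+sketch, here reusing the signature of the existing volk_32f_s32f_multiply_32f
+kernel; a new kernel will of course have its own signature and body:
+
+\code
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_s32f_multiply_32f_generic(float* cVector,
+                                                      const float* aVector,
+                                                      const float scalar,
+                                                      unsigned int num_points)
+{
+    // Plain C loop over the whole vector; no architecture-specific code here.
+    unsigned int number;
+    for (number = 0; number < num_points; number++) {
+        cVector[number] = aVector[number] * scalar;
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+\endcode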
+
+Finally, add the kernel to the list of test cases in volk/lib/kernel_tests.h.
+Many kernels should be able to use the default test parameters, but if yours
+requires a lower tolerance, specific vector length, or other test parameters
+just create a new instance of volk_test_params_t for your kernel.
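+
+As a rough sketch (the helper macros and the volk_test_params_t constructor
+arguments below are recalled from the existing test list in lib/kernel_tests.h
+and should be checked against it), a new entry might look like:
+
+\code
+// most kernels can reuse the shared default test parameters
+QA(VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params))
+
+// a kernel that needs a looser tolerance gets its own parameters
+// (tolerance, scalar, vector length, iterations, benchmark mode, kernel regex)
+volk_test_params_t test_params_inacc(1e-2f, 327.f, 131071, 1987, false, "");
+QA(VOLK_INIT_TEST(volk_32f_log2_32f, test_params_inacc))
+\endcode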
+
+### Adding protokernels
+
+The primary purpose of VOLK is to have multiple implementations of an operation
+tuned for a specific CPU architecture. Ideally there is at least one
+protokernel of each kernel for every architecture that VOLK supports.
+The pattern for protokernel naming is volk_kernel_signature_architecture_nick.
+The architecture should be one of the supported VOLK architectures. The nick is
+an optional name to distinguish between multiple implementations for a
+particular architecture.
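+
+For example, combined with the a_/u_ alignment markers used throughout this
+documentation, protokernel names following this pattern look like (illustrative):
+
+\code
+volk_32f_s32f_multiply_32f_generic   // portable C protokernel
+volk_32f_s32f_multiply_32f_u_neon    // unaligned NEON protokernel
+volk_32f_s32f_multiply_32f_a_sse     // aligned SSE protokernel
+\endcode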
+
+Architecture specific protokernels can be written in one of three ways.
+The first approach should always be to use compiler intrinsic functions.
+The second and third approaches are using either in-line assembly or
+assembly with .S files. Both methods of writing assembly exist in VOLK and
+should yield equivalent performance; which method you might choose is a
+matter of opinion. Regardless of the method chosen, the public function should
+be declared in the kernel header, surrounded by ifdef fences on the symbol that
+matches the architecture of the implementation.
+
+#### Compiler Intrinsics
+
+Compiler intrinsics should be treated as functions that map to a specific
+assembly instruction. Most VOLK kernels take the form of a loop that iterates
+through a vector. Form a loop that iterates on a number of items that is natural
+for the architecture and then use compiler intrinsics to do the math for your
+operation or algorithm. Include the appropriate header inside the ifdef fences,
+but before your protokernel declaration.
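+
+As a sketch, an aligned SSE protokernel for the same scalar-multiply kernel used
+above might look like this (the intrinsics are standard SSE from xmmintrin.h and
+the signature matches the generic protokernel):
+
+\code
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector,
+                                                    const float* aVector,
+                                                    const float scalar,
+                                                    unsigned int num_points)
+{
+    unsigned int number;
+    const unsigned int quarterPoints = num_points / 4;
+    const __m128 scalarVal = _mm_set_ps1(scalar); // broadcast scalar to all 4 lanes
+
+    for (number = 0; number < quarterPoints; number++) {
+        __m128 aVal = _mm_load_ps(aVector + 4 * number); // aligned load of 4 floats
+        __m128 cVal = _mm_mul_ps(aVal, scalarVal);       // 4 multiplies at once
+        _mm_store_ps(cVector + 4 * number, cVal);        // aligned store
+    }
+    // finish the remaining elements in plain C
+    for (number = quarterPoints * 4; number < num_points; number++) {
+        cVector[number] = aVector[number] * scalar;
+    }
+}
+#endif /* LV_HAVE_SSE */
+\endcode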
+
+
+#### In-line Assembly
+
+In-line assembly uses a compiler macro to include verbatim assembly with C code.
+The process of in-line assembly protokernels is very similar to protokernels
+based on intrinsics.
+
+#### Assembly with .S files
+
+To write pure assembly protokernels, first declare the function name in the
+kernel header file the same way as any other protokernel, but include the extern
+keyword. Second, create a file (one for each protokernel) in
+volk/kernels/volk/asm/$arch. Disassemble another protokernel and copy the
+disassembled code into this file to bootstrap a working implementation. Often
+the disassembled code can be hand-tuned to improve performance.
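+
+For illustration, the header declaration for such a protokernel might look like
+the following (the _a_neonasm suffix is just an example of an assembly
+implementation name and may differ for your architecture):
+
+\code
+extern void volk_32f_s32f_multiply_32f_a_neonasm(float* cVector,
+                                                 const float* aVector,
+                                                 const float scalar,
+                                                 unsigned int num_points);
+\endcode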
+
+## VOLK Modules
+
+VOLK has a concept of modules. Each module is an independent VOLK tree. Modules
+can be managed with the volk_modtool application. At a high level the module is
+a clone of all of the VOLK machinery without kernels. volk_modtool also makes it
+easy to copy kernels to a module.
+
+Kernels and protokernels are added to your own VOLK module the same way they are
+added to this repository, which was described in the previous section.
+
+*/
+
--- /dev/null
+/*! \page kernels Kernels
+
+\li \subpage volk_16i_32fc_dot_prod_32fc
+\li \subpage volk_16i_branch_4_state_8
+\li \subpage volk_16ic_convert_32fc
+\li \subpage volk_16ic_deinterleave_16i_x2
+\li \subpage volk_16ic_deinterleave_real_16i
+\li \subpage volk_16ic_deinterleave_real_8i
+\li \subpage volk_16ic_magnitude_16i
+\li \subpage volk_16i_convert_8i
+\li \subpage volk_16ic_s32f_deinterleave_32f_x2
+\li \subpage volk_16ic_s32f_deinterleave_real_32f
+\li \subpage volk_16ic_s32f_magnitude_32f
+\li \subpage volk_16ic_x2_dot_prod_16ic
+\li \subpage volk_16ic_x2_multiply_16ic
+\li \subpage volk_16i_max_star_16i
+\li \subpage volk_16i_max_star_horizontal_16i
+\li \subpage volk_16i_permute_and_scalar_add
+\li \subpage volk_16i_s32f_convert_32f
+\li \subpage volk_16i_x4_quad_max_star_16i
+\li \subpage volk_16i_x5_add_quad_16i_x4
+\li \subpage volk_16u_byteswap
+\li \subpage volk_32f_64f_add_64f
+\li \subpage volk_32f_64f_multiply_64f
+\li \subpage volk_32f_8u_polarbutterfly_32f
+\li \subpage volk_32f_accumulator_s32f
+\li \subpage volk_32f_acos_32f
+\li \subpage volk_32f_asin_32f
+\li \subpage volk_32f_atan_32f
+\li \subpage volk_32f_binary_slicer_32i
+\li \subpage volk_32f_binary_slicer_8i
+\li \subpage volk_32fc_32f_add_32fc
+\li \subpage volk_32fc_32f_dot_prod_32fc
+\li \subpage volk_32fc_32f_multiply_32fc
+\li \subpage volk_32fc_accumulator_s32fc
+\li \subpage volk_32fc_conjugate_32fc
+\li \subpage volk_32fc_convert_16ic
+\li \subpage volk_32fc_deinterleave_32f_x2
+\li \subpage volk_32fc_deinterleave_64f_x2
+\li \subpage volk_32fc_deinterleave_imag_32f
+\li \subpage volk_32fc_deinterleave_real_32f
+\li \subpage volk_32fc_deinterleave_real_64f
+\li \subpage volk_32fc_index_max_16u
+\li \subpage volk_32fc_index_max_32u
+\li \subpage volk_32fc_index_min_16u
+\li \subpage volk_32fc_index_min_32u
+\li \subpage volk_32fc_magnitude_32f
+\li \subpage volk_32fc_magnitude_squared_32f
+\li \subpage volk_32f_convert_64f
+\li \subpage volk_32f_cos_32f
+\li \subpage volk_32fc_s32f_atan2_32f
+\li \subpage volk_32fc_s32fc_multiply_32fc
+\li \subpage volk_32fc_s32fc_x2_rotator_32fc
+\li \subpage volk_32fc_s32f_deinterleave_real_16i
+\li \subpage volk_32fc_s32f_magnitude_16i
+\li \subpage volk_32fc_s32f_power_32fc
+\li \subpage volk_32fc_s32f_power_spectrum_32f
+\li \subpage volk_32fc_s32f_x2_power_spectral_density_32f
+\li \subpage volk_32fc_x2_add_32fc
+\li \subpage volk_32fc_x2_conjugate_dot_prod_32fc
+\li \subpage volk_32fc_x2_divide_32fc
+\li \subpage volk_32fc_x2_dot_prod_32fc
+\li \subpage volk_32fc_x2_multiply_32fc
+\li \subpage volk_32fc_x2_multiply_conjugate_32fc
+\li \subpage volk_32fc_x2_s32fc_multiply_conjugate_add_32fc
+\li \subpage volk_32fc_x2_s32f_square_dist_scalar_mult_32f
+\li \subpage volk_32fc_x2_square_dist_32f
+\li \subpage volk_32f_exp_32f
+\li \subpage volk_32f_expfast_32f
+\li \subpage volk_32f_index_max_16u
+\li \subpage volk_32f_index_max_32u
+\li \subpage volk_32f_index_min_16u
+\li \subpage volk_32f_index_min_32u
+\li \subpage volk_32f_invsqrt_32f
+\li \subpage volk_32f_log2_32f
+\li \subpage volk_32f_s32f_32f_fm_detect_32f
+\li \subpage volk_32f_s32f_add_32f
+\li \subpage volk_32f_s32f_calc_spectral_noise_floor_32f
+\li \subpage volk_32f_s32f_convert_16i
+\li \subpage volk_32f_s32f_convert_32i
+\li \subpage volk_32f_s32f_convert_8i
+\li \subpage volk_32f_s32f_multiply_32f
+\li \subpage volk_32f_s32f_normalize
+\li \subpage volk_32f_s32f_power_32f
+\li \subpage volk_32f_s32f_s32f_mod_range_32f
+\li \subpage volk_32f_s32f_stddev_32f
+\li \subpage volk_32f_sin_32f
+\li \subpage volk_32f_sqrt_32f
+\li \subpage volk_32f_stddev_and_mean_32f_x2
+\li \subpage volk_32f_tan_32f
+\li \subpage volk_32f_tanh_32f
+\li \subpage volk_32f_x2_add_32f
+\li \subpage volk_32f_x2_divide_32f
+\li \subpage volk_32f_x2_dot_prod_16i
+\li \subpage volk_32f_x2_dot_prod_32f
+\li \subpage volk_32f_x2_interleave_32fc
+\li \subpage volk_32f_x2_max_32f
+\li \subpage volk_32f_x2_min_32f
+\li \subpage volk_32f_x2_multiply_32f
+\li \subpage volk_32f_x2_pow_32f
+\li \subpage volk_32f_x2_s32f_interleave_16ic
+\li \subpage volk_32f_x2_subtract_32f
+\li \subpage volk_32f_x3_sum_of_poly_32f
+\li \subpage volk_32i_s32f_convert_32f
+\li \subpage volk_32i_x2_and_32i
+\li \subpage volk_32i_x2_or_32i
+\li \subpage volk_32u_byteswap
+\li \subpage volk_32u_popcnt
+\li \subpage volk_32u_reverse_32u
+\li \subpage volk_64f_convert_32f
+\li \subpage volk_64f_x2_add_64f
+\li \subpage volk_64f_x2_max_64f
+\li \subpage volk_64f_x2_min_64f
+\li \subpage volk_64f_x2_multiply_64f
+\li \subpage volk_64u_byteswap
+\li \subpage volk_64u_popcnt
+\li \subpage volk_8ic_deinterleave_16i_x2
+\li \subpage volk_8ic_deinterleave_real_16i
+\li \subpage volk_8ic_deinterleave_real_8i
+\li \subpage volk_8i_convert_16i
+\li \subpage volk_8ic_s32f_deinterleave_32f_x2
+\li \subpage volk_8ic_s32f_deinterleave_real_32f
+\li \subpage volk_8ic_x2_s32f_multiply_conjugate_32fc
+\li \subpage volk_8i_s32f_convert_32f
+\li \subpage volk_8u_x3_encodepolar_8u
+\li \subpage volk_8u_x4_conv_k7_r2_8u
+
+*/
--- /dev/null
+/*! \mainpage VOLK
+
+Welcome to VOLK!
+
+VOLK is the Vector-Optimized Library of Kernels. It is a library that contains
+kernels of hand-written SIMD code for different mathematical operations. Since
+each SIMD architecture can be very different and no compiler has yet come along
+to handle vectorization properly or highly efficiently, VOLK approaches the
+problem differently.
+
+For each architecture or platform that a developer wishes to vectorize for, a
+new proto-kernel is added to VOLK. At runtime, VOLK will select the correct
+proto-kernel. In this way, users of VOLK call a single kernel for the operation
+they want to perform, and that call is platform/architecture agnostic. This
+allows us to write portable SIMD code.
+
+*/
--- /dev/null
+/*! \page concepts_terms_and_techniques Concepts, Terms, and Techniques
+
+This page is primarily a list of definitions and brief overview of successful
+techniques previously used to develop VOLK protokernels.
+
+## Definitions and Concepts
+
+### SIMD
+
+SIMD stands for Single Instruction Multiple Data. Leveraging SIMD instructions
+is the primary optimization in VOLK.
+
+### Architecture
+
+A VOLK architecture is normally called an Instruction Set Architecture (ISA).
+The architectures we target in VOLK usually have SIMD instructions.
+
+### Vector
+
+A vector in VOLK is the same as a C array. It sometimes, but not always,
+coincides with the mathematical definition of a vector.
+
+### Kernel
+
+The 'kernel' part of the VOLK name comes from the high performance computing
+use of the word. In this context it is the inner loop of a vector operation.
+Since we don't use the word vector in the math sense, a vector operation is an
+operation that is performed on a C array.
+
+### Protokernel
+
+A protokernel is an implementation of a kernel. Every kernel has a 'generic'
+protokernel that is implemented in C. Other protokernels are optimized for a
+particular architecture.
+
+
+## Techniques
+
+### New Kernels
+
+Add new kernels to the list in lib/kernel_tests.h. This adds the kernel to
+VOLK's QA tool as well as the volk profiler. Many kernels are able to
+share test parameters, but new kernels might need new ones.
+
+If the VOLK kernel does not 'fit' the standard set of function parameters
+expected by the volk_profile structure, you need to create a VOLK puppet
+function to help the profiler call the kernel. This is essentially due to the
+function run_volk_tests, which supports only a limited number of function
+prototypes; a sketch of such a puppet is shown below.
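+
+For illustration only, a puppet for a hypothetical kernel whose extra scalar
+argument the profiler cannot supply might fix that argument internally and expose
+the standard output/input/num_points prototype:
+
+\code
+#ifdef LV_HAVE_GENERIC
+// Hypothetical puppet wrapping volk_32f_s32f_multiply_32f_generic
+static inline void volk_32f_s32f_multiplypuppet_32f_generic(float* output,
+                                                            const float* input,
+                                                            unsigned int num_points)
+{
+    const float scalar = 2.5f; // the parameter the profiler cannot supply
+    volk_32f_s32f_multiply_32f_generic(output, input, scalar, num_points);
+}
+#endif /* LV_HAVE_GENERIC */
+\endcode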
+
+### Protokernels
+
+Adding new proto-kernels (implementations of VOLK kernels for specific
+architectures) is relatively easy. In the relevant kernel header,
+volk/include/volk/volk_<input-fingerprint>_<function-name>_<output-fingerprint>.h,
+add a new #ifdef/#endif block for the LV_HAVE_<arch> symbol corresponding
+to the <arch> you are working on (e.g. SSE, AVX, NEON, etc.).
+
+For example, for volk_32f_s32f_multiply_32f_u_neon:
+
+\code
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+ \brief Scalar float multiply
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param scalar the scalar value
+ \param num_points The number of values in aVector to be multiplied by the scalar and stored into cVector
+*/
+static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector, const float* aVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const float* inputPtr = aVector;
+ float* outputPtr = cVector;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float32x4_t aVal, cVal;
+
+ for(number = 0; number < quarterPoints; number++){
+ aVal = vld1q_f32(inputPtr); // Load into NEON regs
+ cVal = vmulq_n_f32 (aVal, scalar); // Do the multiply
+ vst1q_f32(outputPtr, cVal); // Store results back to output
+ inputPtr += 4;
+ outputPtr += 4;
+ }
+ for(number = quarterPoints * 4; number < num_points; number++){
+ *outputPtr++ = (*inputPtr++) * scalar;
+ }
+}
+#endif /* LV_HAVE_NEON */
+\endcode
+
+### Allocating Memory
+
+SIMD code can be very sensitive to the alignment of the vectors, which is
+generally something like a 16-byte or 32-byte alignment requirement. The
+VOLK dispatcher functions, which are what we normally call as users of
+VOLK, make sure that the correct aligned or unaligned version is called
+depending on the alignment of the vectors passed to them. However, things
+typically work faster and more efficiently when the vectors are aligned. As
+such, VOLK has memory allocation and free functions to provide us with properly
+aligned vectors. We can also ask VOLK for the current machine's alignment
+requirement, which makes our job even easier when porting code.
+
+To get the machine's alignment, simply call size_t volk_get_alignment().
+
+Allocate memory using void* volk_malloc(size_t size, size_t alignment).
+
+Make sure that any memory allocated by volk_malloc is also freed by VOLK with volk_free(void* p).
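+
+Putting these together, a typical allocation pattern looks like the following
+minimal sketch (error checking omitted):
+
+\code
+#include <volk/volk.h>
+
+// query the alignment required by the current machine
+size_t alignment = volk_get_alignment();
+
+// allocate an aligned vector of 1024 floats
+float* vec = (float*)volk_malloc(1024 * sizeof(float), alignment);
+
+/* ... fill vec and pass it to VOLK kernels ... */
+
+// memory from volk_malloc must be released with volk_free
+volk_free(vec);
+\endcode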
+
+
+*/
--- /dev/null
+/*! \page using_volk Using VOLK
+
+Using VOLK in your code requires proper linking and including the correct headers. VOLK currently supports both C and C++ bindings.
+
+VOLK provides both a pkgconfig file and a CMake module to help with configuration
+and linking. The pkgconfig file is installed to
+$install_prefix/lib/pkgconfig/volk.pc. The CMake configuration module is in
+$install_prefix/lib/cmake/volk/VolkConfig.cmake.
+
+The VOLK include directory (includedir in the pkgconfig file, VOLK_INCLUDE_DIRS
+in the CMake module) contains the header volk/volk.h, which defines all of the
+symbols exposed by VOLK. Individual kernel headers are also available in the
+same location.
+
+In most cases it is sufficient to call the dispatcher for the kernel you are using.
+For example, the following code computes the dot product between the taps and the
+input vector as part of a FIR filter.
+\code
+// assume there is sufficient history in the input buffer
+for(unsigned int ii=0; ii < input_length-ntaps; ++ii) {
+ volk_32fc_32f_dot_prod_32fc(&filter_output[ii], &input[ii - ntaps], taps, ntaps);
+}
+\endcode
+
+*/
+
--- /dev/null
+Versioning
+==========
+
+Starting with version 2.0.0, VOLK adopts [Semantic Versioning](https://semver.org) as its versioning scheme.
+You'll find the full text of Semver 2.0.0 below.
+
+To highlight a few core aspects as relevant to VOLK:
+
+ * Adding a new kernel (e.g. `volk_32fc_prove_riemann_hypothesis`) is a backwards-compatible functionality addition. That requires increasing the MINOR version (clause 7).
+ * Adding a new proto-kernel (e.g. `volk_32fc_prove_riemann_hypothesis_a_neon`) is _also_ a backwards-compatible functionality addition, and hence also increases the MINOR version. It's appreciated if you bundle these additions.
+ * Fixing a bug in a proto-kernel that doesn't change the function signature nor intended behavior only requires PATCH to be increased.
+ * Changing what a kernel intentionally does, changing aspects of how the dispatcher is used, adding a new architecture, or making other fundamental changes increases the MAJOR version.
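+
+For example, starting from a hypothetical release 2.4.1: adding a new kernel leads
+to 2.5.0, fixing a protokernel bug without changing the intended behavior leads to
+2.4.2, and changing a kernel's intended behavior leads to 3.0.0.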
+
+--------------------------------------------------
+Beginning of Creative Commons - CC BY 3.0 document from https://semver.org
+--------------------------------------------------
+
+Semantic Versioning 2.0.0
+==============================
+
+Summary
+-------
+
+Given a version number MAJOR.MINOR.PATCH, increment the:
+
+1. MAJOR version when you make incompatible API changes,
+1. MINOR version when you add functionality in a backwards-compatible
+ manner, and
+1. PATCH version when you make backwards-compatible bug fixes.
+
+Additional labels for pre-release and build metadata are available as extensions to the MAJOR.MINOR.PATCH format.
+
+Introduction
+------------
+
+In the world of software management there exists a dread place called
+"dependency hell." The bigger your system grows and the more packages you
+integrate into your software, the more likely you are to find yourself, one
+day, in this pit of despair.
+
+In systems with many dependencies, releasing new package versions can quickly
+become a nightmare. If the dependency specifications are too tight, you are in
+danger of version lock (the inability to upgrade a package without having to
+release new versions of every dependent package). If dependencies are
+specified too loosely, you will inevitably be bitten by version promiscuity
+(assuming compatibility with more future versions than is reasonable).
+Dependency hell is where you are when version lock and/or version promiscuity
+prevent you from easily and safely moving your project forward.
+
+As a solution to this problem, I propose a simple set of rules and
+requirements that dictate how version numbers are assigned and incremented.
+These rules are based on but not necessarily limited to pre-existing
+widespread common practices in use in both closed and open-source software.
+For this system to work, you first need to declare a public API. This may
+consist of documentation or be enforced by the code itself. Regardless, it is
+important that this API be clear and precise. Once you identify your public
+API, you communicate changes to it with specific increments to your version
+number. Consider a version format of X.Y.Z (Major.Minor.Patch). Bug fixes not
+affecting the API increment the patch version, backwards compatible API
+additions/changes increment the minor version, and backwards incompatible API
+changes increment the major version.
+
+I call this system "Semantic Versioning." Under this scheme, version numbers
+and the way they change convey meaning about the underlying code and what has
+been modified from one version to the next.
+
+
+Semantic Versioning Specification (SemVer)
+------------------------------------------
+
+The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",
+"SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be
+interpreted as described in [RFC 2119](http://tools.ietf.org/html/rfc2119).
+
+1. Software using Semantic Versioning MUST declare a public API. This API
+could be declared in the code itself or exist strictly in documentation.
+However it is done, it should be precise and comprehensive.
+
+1. A normal version number MUST take the form X.Y.Z where X, Y, and Z are
+non-negative integers, and MUST NOT contain leading zeroes. X is the
+major version, Y is the minor version, and Z is the patch version.
+Each element MUST increase numerically. For instance: 1.9.0 -> 1.10.0 -> 1.11.0.
+
+1. Once a versioned package has been released, the contents of that version
+MUST NOT be modified. Any modifications MUST be released as a new version.
+
+1. Major version zero (0.y.z) is for initial development. Anything may change
+at any time. The public API should not be considered stable.
+
+1. Version 1.0.0 defines the public API. The way in which the version number
+is incremented after this release is dependent on this public API and how it
+changes.
+
+1. Patch version Z (x.y.Z | x > 0) MUST be incremented if only backwards
+compatible bug fixes are introduced. A bug fix is defined as an internal
+change that fixes incorrect behavior.
+
+1. Minor version Y (x.Y.z | x > 0) MUST be incremented if new, backwards
+compatible functionality is introduced to the public API. It MUST be
+incremented if any public API functionality is marked as deprecated. It MAY be
+incremented if substantial new functionality or improvements are introduced
+within the private code. It MAY include patch level changes. Patch version
+MUST be reset to 0 when minor version is incremented.
+
+1. Major version X (X.y.z | X > 0) MUST be incremented if any backwards
+incompatible changes are introduced to the public API. It MAY include minor
+and patch level changes. Patch and minor version MUST be reset to 0 when major
+version is incremented.
+
+1. A pre-release version MAY be denoted by appending a hyphen and a
+series of dot separated identifiers immediately following the patch
+version. Identifiers MUST comprise only ASCII alphanumerics and hyphen
+[0-9A-Za-z-]. Identifiers MUST NOT be empty. Numeric identifiers MUST
+NOT include leading zeroes. Pre-release versions have a lower
+precedence than the associated normal version. A pre-release version
+indicates that the version is unstable and might not satisfy the
+intended compatibility requirements as denoted by its associated
+normal version. Examples: 1.0.0-alpha, 1.0.0-alpha.1, 1.0.0-0.3.7,
+1.0.0-x.7.z.92.
+
+1. Build metadata MAY be denoted by appending a plus sign and a series of dot
+separated identifiers immediately following the patch or pre-release version.
+Identifiers MUST comprise only ASCII alphanumerics and hyphen [0-9A-Za-z-].
+Identifiers MUST NOT be empty. Build metadata SHOULD be ignored when determining
+version precedence. Thus two versions that differ only in the build metadata,
+have the same precedence. Examples: 1.0.0-alpha+001, 1.0.0+20130313144700,
+1.0.0-beta+exp.sha.5114f85.
+
+1. Precedence refers to how versions are compared to each other when ordered.
+Precedence MUST be calculated by separating the version into major, minor, patch
+and pre-release identifiers in that order (Build metadata does not figure
+into precedence). Precedence is determined by the first difference when
+comparing each of these identifiers from left to right as follows: Major, minor,
+and patch versions are always compared numerically. Example: 1.0.0 < 2.0.0 <
+2.1.0 < 2.1.1. When major, minor, and patch are equal, a pre-release version has
+lower precedence than a normal version. Example: 1.0.0-alpha < 1.0.0. Precedence
+for two pre-release versions with the same major, minor, and patch version MUST
+be determined by comparing each dot separated identifier from left to right
+until a difference is found as follows: identifiers consisting of only digits
+are compared numerically and identifiers with letters or hyphens are compared
+lexically in ASCII sort order. Numeric identifiers always have lower precedence
+than non-numeric identifiers. A larger set of pre-release fields has a higher
+precedence than a smaller set, if all of the preceding identifiers are equal.
+Example: 1.0.0-alpha < 1.0.0-alpha.1 < 1.0.0-alpha.beta < 1.0.0-beta <
+1.0.0-beta.2 < 1.0.0-beta.11 < 1.0.0-rc.1 < 1.0.0.
+
+Why Use Semantic Versioning?
+----------------------------
+
+This is not a new or revolutionary idea. In fact, you probably do something
+close to this already. The problem is that "close" isn't good enough. Without
+compliance to some sort of formal specification, version numbers are
+essentially useless for dependency management. By giving a name and clear
+definition to the above ideas, it becomes easy to communicate your intentions
+to the users of your software. Once these intentions are clear, flexible (but
+not too flexible) dependency specifications can finally be made.
+
+A simple example will demonstrate how Semantic Versioning can make dependency
+hell a thing of the past. Consider a library called "Firetruck." It requires a
+Semantically Versioned package named "Ladder." At the time that Firetruck is
+created, Ladder is at version 3.1.0. Since Firetruck uses some functionality
+that was first introduced in 3.1.0, you can safely specify the Ladder
+dependency as greater than or equal to 3.1.0 but less than 4.0.0. Now, when
+Ladder version 3.1.1 and 3.2.0 become available, you can release them to your
+package management system and know that they will be compatible with existing
+dependent software.
+
+As a responsible developer you will, of course, want to verify that any
+package upgrades function as advertised. The real world is a messy place;
+there's nothing we can do about that but be vigilant. What you can do is let
+Semantic Versioning provide you with a sane way to release and upgrade
+packages without having to roll new versions of dependent packages, saving you
+time and hassle.
+
+If all of this sounds desirable, all you need to do to start using Semantic
+Versioning is to declare that you are doing so and then follow the rules. Link
+to this website from your README so others know the rules and can benefit from
+them.
+
+
+FAQ
+---
+
+### How should I deal with revisions in the 0.y.z initial development phase?
+
+The simplest thing to do is start your initial development release at 0.1.0
+and then increment the minor version for each subsequent release.
+
+### How do I know when to release 1.0.0?
+
+If your software is being used in production, it should probably already be
+1.0.0. If you have a stable API on which users have come to depend, you should
+be 1.0.0. If you're worrying a lot about backwards compatibility, you should
+probably already be 1.0.0.
+
+### Doesn't this discourage rapid development and fast iteration?
+
+Major version zero is all about rapid development. If you're changing the API
+every day you should either still be in version 0.y.z or on a separate
+development branch working on the next major version.
+
+### If even the tiniest backwards incompatible changes to the public API require a major version bump, won't I end up at version 42.0.0 very rapidly?
+
+This is a question of responsible development and foresight. Incompatible
+changes should not be introduced lightly to software that has a lot of
+dependent code. The cost that must be incurred to upgrade can be significant.
+Having to bump major versions to release incompatible changes means you'll
+think through the impact of your changes, and evaluate the cost/benefit ratio
+involved.
+
+### Documenting the entire public API is too much work!
+
+It is your responsibility as a professional developer to properly document
+software that is intended for use by others. Managing software complexity is a
+hugely important part of keeping a project efficient, and that's hard to do if
+nobody knows how to use your software, or what methods are safe to call. In
+the long run, Semantic Versioning, and the insistence on a well defined public
+API can keep everyone and everything running smoothly.
+
+### What do I do if I accidentally release a backwards incompatible change as a minor version?
+
+As soon as you realize that you've broken the Semantic Versioning spec, fix
+the problem and release a new minor version that corrects the problem and
+restores backwards compatibility. Even under this circumstance, it is
+unacceptable to modify versioned releases. If it's appropriate,
+document the offending version and inform your users of the problem so that
+they are aware of the offending version.
+
+### What should I do if I update my own dependencies without changing the public API?
+
+That would be considered compatible since it does not affect the public API.
+Software that explicitly depends on the same dependencies as your package
+should have their own dependency specifications and the author will notice any
+conflicts. Determining whether the change is a patch level or minor level
+modification depends on whether you updated your dependencies in order to fix
+a bug or introduce new functionality. I would usually expect additional code
+for the latter instance, in which case it's obviously a minor level increment.
+
+### What if I inadvertently alter the public API in a way that is not compliant with the version number change (i.e. the code incorrectly introduces a major breaking change in a patch release)
+
+Use your best judgment. If you have a huge audience that will be drastically
+impacted by changing the behavior back to what the public API intended, then
+it may be best to perform a major version release, even though the fix could
+strictly be considered a patch release. Remember, Semantic Versioning is all
+about conveying meaning by how the version number changes. If these changes
+are important to your users, use the version number to inform them.
+
+### How should I handle deprecating functionality?
+
+Deprecating existing functionality is a normal part of software development and
+is often required to make forward progress. When you deprecate part of your
+public API, you should do two things: (1) update your documentation to let
+users know about the change, (2) issue a new minor release with the deprecation
+in place. Before you completely remove the functionality in a new major release
+there should be at least one minor release that contains the deprecation so
+that users can smoothly transition to the new API.
+
+### Does semver have a size limit on the version string?
+
+No, but use good judgment. A 255 character version string is probably overkill,
+for example. Also, specific systems may impose their own limits on the size of
+the string.
+
+About
+-----
+
+The Semantic Versioning specification is authored by [Tom
+Preston-Werner](http://tom.preston-werner.com), inventor of Gravatars and
+cofounder of GitHub.
+
+If you'd like to leave feedback, please [open an issue on
+GitHub](https://github.com/mojombo/semver/issues).
+
+
+License
+-------
+
+Creative Commons - CC BY 3.0
+http://creativecommons.org/licenses/by/3.0/
--- /dev/null
+<!-- archs appear in order of significance for blind, de-facto version ordering -->
+<grammar>
+
+<arch name="generic"> <!-- name is required-->
+</arch>
+
+<arch name="softfp">
+ <flag compiler="gnu">-mfloat-abi=softfp</flag>
+ <flag compiler="clang">-mfloat-abi=softfp</flag>
+</arch>
+
+<arch name="hardfp">
+ <flag compiler="gnu">-mfloat-abi=hard</flag>
+ <flag compiler="clang">-mfloat-abi=hard</flag>
+</arch>
+
+<arch name="neon">
+ <flag compiler="gnu">-funsafe-math-optimizations</flag>
+ <flag compiler="clang">-funsafe-math-optimizations</flag>
+ <alignment>16</alignment>
+ <check name="neon"></check>
+</arch>
+
+<arch name="neonv7">
+ <flag compiler="gnu">-mfpu=neon</flag>
+ <flag compiler="gnu">-funsafe-math-optimizations</flag>
+ <flag compiler="clang">-mfpu=neon</flag>
+ <flag compiler="clang">-funsafe-math-optimizations</flag>
+ <alignment>16</alignment>
+ <check name="neon"></check>
+</arch>
+
+<arch name="neonv8">
+ <flag compiler="gnu">-funsafe-math-optimizations</flag>
+ <flag compiler="clang">-funsafe-math-optimizations</flag>
+ <alignment>16</alignment>
+ <check name="neon"></check>
+</arch>
+
+<arch name="32">
+ <flag compiler="gnu">-m32</flag>
+ <flag compiler="clang">-m32</flag>
+</arch>
+
+<arch name="64">
+ <!-- <check name="check_extended_cpuid"> -->
+ <!-- <param>0x80000001</param>
+ </check> -->
+ <!--<check name="cpuid_x86_bit"> checks to see if a bit is set -->
+ <!-- <param>3</param> eax, ebx, ecx, [edx] -->
+ <!-- <param>0x80000001</param> cpuid operation -->
+ <!-- <param>29</param> bit shift -->
+ <!-- </check> -->
+ <flag compiler="gnu">-m64</flag>
+ <flag compiler="clang">-m64</flag>
+</arch>
+
+<arch name="popcount">
+ <check name="popcnt"></check>
+ <flag compiler="gnu">-mpopcnt</flag>
+ <flag compiler="clang">-mpopcnt</flag>
+ <flag compiler="msvc">/arch:AVX</flag>
+</arch>
+
+<arch name="mmx">
+ <check name="mmx"></check>
+ <flag compiler="gnu">-mmmx</flag>
+ <flag compiler="clang">-mmmx</flag>
+ <alignment>8</alignment>
+</arch>
+
+<arch name="fma">
+ <check name="fma3"></check>
+ <flag compiler="gnu">-mfma</flag>
+ <flag compiler="clang">-mfma</flag>
+ <flag compiler="msvc">/arch:AVX2</flag>
+ <alignment>32</alignment>
+</arch>
+
+<arch name="sse">
+ <check name="sse"></check>
+ <flag compiler="gnu">-msse</flag>
+ <flag compiler="clang">-msse</flag>
+ <environment>_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);</environment>
+ <include>xmmintrin.h</include>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="sse2">
+ <check name="sse2"></check>
+ <flag compiler="gnu">-msse2</flag>
+ <flag compiler="clang">-msse2</flag>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="orc">
+</arch>
+
+<!-- it's here for overrule stuff. -->
+<arch name="norc">
+</arch>
+
+<arch name="sse3">
+ <check name="sse3"></check>
+ <flag compiler="gnu">-msse3</flag>
+ <flag compiler="clang">-msse3</flag>
+ <flag compiler="msvc">/arch:AVX</flag>
+ <environment>_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);</environment>
+ <include>pmmintrin.h</include>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="ssse3">
+ <check name="ssse3"></check>
+ <flag compiler="gnu">-mssse3</flag>
+ <flag compiler="clang">-mssse3</flag>
+ <flag compiler="msvc">/arch:AVX</flag>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="sse4_a">
+ <check name="sse4a"></check>
+ <flag compiler="gnu">-msse4a</flag>
+ <flag compiler="clang">-msse4a</flag>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="sse4_1">
+ <check name="sse4_1"></check>
+ <flag compiler="gnu">-msse4.1</flag>
+ <flag compiler="clang">-msse4.1</flag>
+ <flag compiler="msvc">/arch:AVX</flag>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="sse4_2">
+ <check name="sse4_2"></check>
+ <flag compiler="gnu">-msse4.2</flag>
+ <flag compiler="clang">-msse4.2</flag>
+ <flag compiler="msvc">/arch:AVX</flag>
+ <alignment>16</alignment>
+</arch>
+
+<arch name="avx">
+ <check name="avx"></check>
+ <flag compiler="gnu">-mavx</flag>
+ <flag compiler="clang">-mavx</flag>
+ <flag compiler="msvc">/arch:AVX</flag>
+ <alignment>32</alignment>
+</arch>
+
+<arch name="avx2">
+ <check name="avx2"></check>
+ <flag compiler="gnu">-mavx2</flag>
+ <flag compiler="clang">-mavx2</flag>
+ <flag compiler="msvc">/arch:AVX2</flag>
+ <alignment>32</alignment>
+</arch>
+
+<arch name="avx512f">
+ <check name="avx512f"></check>
+ <flag compiler="gnu">-mavx512f</flag>
+ <flag compiler="clang">-mavx512f</flag>
+ <flag compiler="msvc">/arch:AVX512F</flag>
+ <alignment>64</alignment>
+</arch>
+
+<arch name="avx512cd">
+ <check name="avx512cd"></check>
+ <flag compiler="gnu">-mavx512cd</flag>
+ <flag compiler="clang">-mavx512cd</flag>
+ <flag compiler="msvc">/arch:AVX512CD</flag>
+ <alignment>64</alignment>
+</arch>
+
+</grammar>
--- /dev/null
+<grammar>
+
+<machine name="generic">
+<archs>generic orc|</archs>
+</machine>
+
+<machine name="neon">
+<archs>generic neon orc|</archs>
+</machine>
+
+<machine name="neonv7">
+<archs>generic neon neonv7 softfp|hardfp orc|</archs>
+</machine>
+
+<machine name="neonv8">
+<archs>generic neon neonv8</archs>
+</machine>
+
+<!-- trailing | bar means generate without either for MSVC -->
+<machine name="sse2">
+<archs>generic 32|64| mmx| sse sse2 orc|</archs>
+</machine>
+
+<machine name="sse3">
+<archs>generic 32|64| mmx| sse sse2 sse3 orc|</archs>
+</machine>
+
+<machine name="ssse3">
+<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 orc|</archs>
+</machine>
+
+<machine name="sse4_a">
+<archs>generic 32|64| mmx| sse sse2 sse3 sse4_a popcount orc|</archs>
+</machine>
+
+<machine name="sse4_1">
+<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 orc|</archs>
+</machine>
+
+<machine name="sse4_2">
+<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount orc|</archs>
+</machine>
+
+<!-- trailing | bar means generate without either for MSVC -->
+<machine name="avx">
+<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx orc|</archs>
+</machine>
+
+<!-- trailing | bar means generate without either for MSVC -->
+<machine name="avx2">
+<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 orc|</archs>
+</machine>
+
+<!-- trailing | bar means generate without either for MSVC -->
+<machine name="avx512f">
+<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f orc|</archs>
+</machine>
+
+<!-- trailing | bar means generate without either for MSVC -->
+<machine name="avx512cd">
+<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512cd orc|</archs>
+</machine>
+
+</grammar>
--- /dev/null
+#!/usr/bin/env python
+# Copyright 2012 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+
+archs = list()
+arch_dict = dict()
+
+class arch_class(object):
+ def __init__(self, flags, checks, **kwargs):
+ for key, cast, failval in (
+ ('name', str, None),
+ ('environment', str, None),
+ ('include', str, None),
+ ('alignment', int, 1)
+ ):
+ try: setattr(self, key, cast(kwargs[key]))
+ except: setattr(self, key, failval)
+ self.checks = checks
+ assert(self.name)
+ self._flags = flags
+
+ def is_supported(self, compiler):
+ if not self._flags.keys(): return True
+ return compiler in self._flags.keys()
+
+ def get_flags(self, compiler):
+ try: return self._flags[compiler]
+ except KeyError: return list()
+
+ def __repr__(self): return self.name
+
+def register_arch(**kwargs):
+ arch = arch_class(**kwargs)
+ archs.append(arch)
+ arch_dict[arch.name] = arch
+
+########################################################################
+# register the arches
+########################################################################
+#TODO skip the XML and put it here
+from xml.dom import minidom
+import os
+gendir = os.path.dirname(__file__)
+archs_xml = minidom.parse(os.path.join(gendir, 'archs.xml')).getElementsByTagName('arch')
+for arch_xml in archs_xml:
+ kwargs = dict()
+ for attr in arch_xml.attributes.keys():
+ kwargs[attr] = arch_xml.attributes[attr].value
+ for node in arch_xml.childNodes:
+ try:
+ name = node.tagName
+ val = arch_xml.getElementsByTagName(name)[0].firstChild.data
+ kwargs[name] = val
+ except: pass
+ checks = list()
+ for check_xml in arch_xml.getElementsByTagName("check"):
+ name = check_xml.attributes["name"].value
+ params = list()
+ for param_xml in check_xml.getElementsByTagName("param"):
+ params.append(param_xml.firstChild.data)
+ checks.append([name, params])
+ flags = dict()
+ for flag_xml in arch_xml.getElementsByTagName("flag"):
+ name = flag_xml.attributes["compiler"].value
+ if name not in flags: flags[name] = list()
+ flags[name].append(flag_xml.firstChild.data)
+ register_arch(flags=flags, checks=checks, **kwargs)
+
+if __name__ == '__main__':
+ print(archs)
+
--- /dev/null
+#!/usr/bin/env python
+# Copyright 2012 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+import argparse
+import volk_arch_defs
+import volk_machine_defs
+
+
+def do_arch_flags_list(compiler):
+ output = list()
+ for arch in volk_arch_defs.archs:
+ if not arch.is_supported(compiler): continue
+ fields = [arch.name] + arch.get_flags(compiler)
+ output.append(','.join(fields))
+ print(';'.join(output))
+
+
+def do_machines_list(arch_names):
+ output = list()
+ for machine in volk_machine_defs.machines:
+ machine_arch_set = set(machine.arch_names)
+ if set(arch_names).intersection(machine_arch_set) == machine_arch_set:
+ output.append(machine.name)
+ print(';'.join(output))
+
+
+def do_machine_flags_list(compiler, machine_name):
+ output = list()
+ machine = volk_machine_defs.machine_dict[machine_name]
+ for arch in machine.archs:
+ output.extend(arch.get_flags(compiler))
+ print(' '.join(output))
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--mode', type=str)
+ parser.add_argument('--compiler', type=str)
+ parser.add_argument('--archs', type=str)
+ parser.add_argument('--machine', type=str)
+ args = parser.parse_args()
+
+ if args.mode == 'arch_flags': return do_arch_flags_list(args.compiler.lower())
+ if args.mode == 'machines': return do_machines_list(args.archs.split(';'))
+ if args.mode == 'machine_flags': return do_machine_flags_list(args.compiler.lower(), args.machine)
+
+if __name__ == '__main__':
+ main()
+
--- /dev/null
+#!/usr/bin/env python
+# Copyright 2011-2012 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+import os
+import re
+import sys
+import glob
+
+########################################################################
+# Strip comments from a c/cpp file.
+# Input is code string, output is code string without comments.
+# http://stackoverflow.com/questions/241327/python-snippet-to-remove-c-and-c-comments
+########################################################################
+def comment_remover(text):
+ def replacer(match):
+ s = match.group(0)
+ if s.startswith('/'):
+ return ""
+ else:
+ return s
+ pattern = re.compile(
+ r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+ re.DOTALL | re.MULTILINE
+ )
+ return re.sub(pattern, replacer, text)
+
+########################################################################
+# Split code into nested sections according to ifdef preprocessor macros
+########################################################################
+def split_into_nested_ifdef_sections(code):
+ sections = list()
+ section = ''
+ header = 'text'
+ in_section_depth = 0
+ for i, line in enumerate(code.splitlines()):
+ m = re.match(r'^(\s*)#(\s*)(\w+)(.*)$', line)
+ line_is = 'normal'
+ if m:
+ p0, p1, fcn, stuff = m.groups()
+ if fcn in ('if', 'ifndef', 'ifdef'): line_is = 'if'
+ if fcn in ('else', 'elif'): line_is = 'else'
+ if fcn in ('endif',): line_is = 'end'
+
+ if line_is == 'if': in_section_depth += 1
+ if line_is == 'end': in_section_depth -= 1
+
+ if in_section_depth == 1 and line_is == 'if':
+ sections.append((header, section))
+ section = ''
+ header = line
+ continue
+
+ if in_section_depth == 1 and line_is == 'else':
+ sections.append((header, section))
+ section = ''
+ header = line
+ continue
+
+ if in_section_depth == 0 and line_is == 'end':
+ sections.append((header, section))
+ section = ''
+ header = 'text'
+ continue
+
+ section += line + '\n'
+
+ sections.append((header, section)) #and pack remainder into sections
+ sections = [sec for sec in sections if sec[1].strip()] #filter empty sections
+
+ #recurse into non-text sections to fill subsections
+ for i, (header, section) in enumerate(sections):
+ if header == 'text': continue
+ sections[i] = (header, split_into_nested_ifdef_sections(section))
+
+ return sections
+
+########################################################################
+# Recursive print of sections to test code above
+########################################################################
+def print_sections(sections, indent = ' '):
+ for header, body in sections:
+ if header == 'text':
+ print(indent, ('\n'+indent).join(body.splitlines()))
+ continue
+ print(indent.replace(' ', '-') + '>', header)
+ print_sections(body, indent + ' ')
+
+########################################################################
+# Flatten a section to just body text
+########################################################################
+def flatten_section_text(sections):
+ output = ''
+ for hdr, bdy in sections:
+ if hdr != 'text': output += flatten_section_text(bdy)
+ else: output += bdy
+ return output
+
+########################################################################
+# Extract kernel info from section, represent as an implementation
+########################################################################
+class impl_class(object):
+ def __init__(self, kern_name, header, body):
+ #extract LV_HAVE_*
+ self.deps = set(res.lower() for res in re.findall(r'LV_HAVE_(\w+)', header))
+ #extract function suffix and args
+ body = flatten_section_text(body)
+ try:
+ fcn_matcher = re.compile(r'^.*(%s\w*)\s*\((.*)$'%kern_name, re.DOTALL | re.MULTILINE)
+ body = body.split('{')[0].rsplit(')', 1)[0] #get the part before the open ){ bracket
+ m = fcn_matcher.match(body)
+ impl_name, the_rest = m.groups()
+ self.name = impl_name.replace(kern_name+'_', '')
+ self.args = list()
+ fcn_args = the_rest.split(',')
+ for fcn_arg in fcn_args:
+ arg_matcher = re.compile(r'^\s*(.*\W)\s*(\w+)\s*$', re.DOTALL | re.MULTILINE)
+ m = arg_matcher.match(fcn_arg)
+ arg_type, arg_name = m.groups()
+ self.args.append((arg_type, arg_name))
+ except Exception as ex:
+ raise Exception('I can\'t parse the function prototype from: %s in %s\n%s'%(kern_name, body, ex))
+
+ assert self.name
+ self.is_aligned = self.name.startswith('a_')
+
+ def __repr__(self):
+ return self.name
+
+########################################################################
+# Get sets of LV_HAVE_* from the code
+########################################################################
+def extract_lv_haves(code):
+ haves = list()
+ for line in code.splitlines():
+ if not line.strip().startswith('#'): continue
+ have_set = set(res.lower() for res in re.findall(r'LV_HAVE_(\w+)', line))
+ if have_set: haves.append(have_set)
+ return haves
+
+########################################################################
+# Represent a processing kernel, parse from file
+########################################################################
+class kernel_class(object):
+ def __init__(self, kernel_file):
+ self.name = os.path.splitext(os.path.basename(kernel_file))[0]
+ self.pname = self.name.replace('volk_', 'p_')
+ code = open(kernel_file, 'rb').read().decode("utf-8")
+ code = comment_remover(code)
+ sections = split_into_nested_ifdef_sections(code)
+ self._impls = list()
+ for header, section in sections:
+ if 'ifndef' not in header.lower(): continue
+ for sub_hdr, body in section:
+ if 'if' not in sub_hdr.lower(): continue
+ if 'LV_HAVE_' not in sub_hdr: continue
+ self._impls.append(impl_class(
+ kern_name=self.name, header=sub_hdr, body=body,
+ ))
+ assert(self._impls)
+ self.has_dispatcher = False
+ for impl in self._impls:
+ if impl.name == 'dispatcher':
+ self._impls.remove(impl)
+ self.has_dispatcher = True
+ break
+ self.args = self._impls[0].args
+ self.arglist_types = ', '.join([a[0] for a in self.args])
+ self.arglist_full = ', '.join(['%s %s'%a for a in self.args])
+ self.arglist_names = ', '.join([a[1] for a in self.args])
+
+ def get_impls(self, archs):
+ archs = set(archs)
+ impls = list()
+ for impl in self._impls:
+ if impl.deps.intersection(archs) == impl.deps:
+ impls.append(impl)
+ return impls
+
+ def __repr__(self):
+ return self.name
+
+########################################################################
+# Extract information from the VOLK kernels
+########################################################################
+__file__ = os.path.abspath(__file__)
+srcdir = os.path.dirname(os.path.dirname(__file__))
+kernel_files = sorted(glob.glob(os.path.join(srcdir, "kernels", "volk", "*.h")))
+kernels = list(map(kernel_class, kernel_files))
+
+if __name__ == '__main__':
+ print(kernels)
+
--- /dev/null
+#!/usr/bin/env python
+# Copyright 2012 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+from volk_arch_defs import arch_dict
+
+machines = list()
+machine_dict = dict()
+
+class machine_class(object):
+ def __init__(self, name, archs):
+ self.name = name
+ self.archs = list()
+ self.arch_names = list()
+ for arch_name in archs:
+ if not arch_name: continue
+ arch = arch_dict[arch_name]
+ self.archs.append(arch)
+ self.arch_names.append(arch_name)
+ self.alignment = max([a.alignment for a in self.archs])
+
+ def __repr__(self): return self.name
+
+def register_machine(name, archs):
+ for i, arch_name in enumerate(archs):
+ if '|' in arch_name: #handle special arch names with the '|'
+ for arch_sub in arch_name.split('|'):
+ if arch_sub:
+ register_machine(name+'_'+arch_sub, archs[:i] + [arch_sub] + archs[i+1:])
+ else:
+ register_machine(name, archs[:i] + archs[i+1:])
+ return
+ machine = machine_class(name=name, archs=archs)
+ machines.append(machine)
+ machine_dict[machine.name] = machine
+
+########################################################################
+# register the machines
+########################################################################
+#TODO skip the XML and put it here
+from xml.dom import minidom
+import os
+gendir = os.path.dirname(__file__)
+machines_xml = minidom.parse(os.path.join(gendir, 'machines.xml')).getElementsByTagName('machine')
+for machine_xml in machines_xml:
+ kwargs = dict()
+ for attr in machine_xml.attributes.keys():
+ kwargs[attr] = machine_xml.attributes[attr].value
+ for node in machine_xml.childNodes:
+ try:
+ name = node.tagName
+ val = machine_xml.getElementsByTagName(name)[0].firstChild.data
+ kwargs[name] = val
+ except: pass
+ kwargs['archs'] = kwargs['archs'].split()
+ register_machine(**kwargs)
+
+if __name__ == '__main__':
+ print(machines)
--- /dev/null
+#!/usr/bin/env python
+# Copyright 2012 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+import os
+import re
+import sys
+import argparse
+import volk_arch_defs
+import volk_machine_defs
+import volk_kernel_defs
+from mako.template import Template
+
+
+def __parse_tmpl(_tmpl, **kwargs):
+ defs = {
+ 'archs': volk_arch_defs.archs,
+ 'arch_dict': volk_arch_defs.arch_dict,
+ 'machines': volk_machine_defs.machines,
+ 'machine_dict': volk_machine_defs.machine_dict,
+ 'kernels': volk_kernel_defs.kernels,
+ }
+ defs.update(kwargs)
+ _tmpl = """
+
+/* this file was generated by volk template utils, do not edit! */
+
+""" + _tmpl
+ return str(Template(_tmpl).render(**defs))
+
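+# For illustration, a template fed through __parse_tmpl can loop over the
+# injected definitions; the snippet below is assumed, not taken from the repo:
+#
+#   % for kern in kernels:
+#   void ${kern.name}_example(${kern.arglist_full});
+#   % endfor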
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--input', type=str)
+ parser.add_argument('--output', type=str)
+ args, extras = parser.parse_known_args()
+
+ output = __parse_tmpl(open(args.input).read(), args=extras)
+ if args.output: open(args.output, 'w').write(output)
+ else: print(output)
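+# For illustration (script and file names assumed):
+#   python volk_tmpl_utils.py --input volk.tmpl.h --output volk.h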
+
+
+if __name__ == '__main__':
+ main()
+
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2006,2009,2013 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_VOLK_CONSTANTS_H
+#define INCLUDED_VOLK_CONSTANTS_H
+
+#include <volk/volk_common.h>
+
+__VOLK_DECL_BEGIN
+
+VOLK_API const char* volk_prefix();
+VOLK_API const char* volk_version();
+VOLK_API const char* volk_c_compiler();
+VOLK_API const char* volk_compiler_flags();
+VOLK_API const char* volk_available_machines();
+
+__VOLK_DECL_END
+
+#endif /* INCLUDED_VOLK_CONSTANTS_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2016 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+
+#ifndef INCLUDED_volk_saturation_arithmetic_H_
+#define INCLUDED_volk_saturation_arithmetic_H_
+
+#include <limits.h>
+
+static inline int16_t sat_adds16i(int16_t x, int16_t y)
+{
+ int32_t res = (int32_t)x + (int32_t)y;
+
+ if (res < SHRT_MIN)
+ res = SHRT_MIN;
+ if (res > SHRT_MAX)
+ res = SHRT_MAX;
+
+ return res;
+}
+
+static inline int16_t sat_muls16i(int16_t x, int16_t y)
+{
+ int32_t res = (int32_t)x * (int32_t)y;
+
+ if (res < SHRT_MIN)
+ res = SHRT_MIN;
+ if (res > SHRT_MAX)
+ res = SHRT_MAX;
+
+ return res;
+}
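+
+/* For illustration: sat_adds16i(30000, 10000) clamps 40000 to SHRT_MAX
+ * (32767), and sat_muls16i(-300, 200) clamps -60000 to SHRT_MIN (-32768). */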
+
+#endif /* INCLUDED_volk_saturation_arithmetic_H_ */
--- /dev/null
+/*
+ * Copyright 2022 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * This file is from:
+ * https://github.com/DLTcollab/sse2neon
+ */
+
+// Turn off clang-format here, since reformatting this imported header would
+// make diffs against upstream sse2neon a lot harder to compare.
+// clang-format off
+#ifndef SSE2NEON_H
+#define SSE2NEON_H
+
+// This header file provides a simple API translation layer
+// between SSE intrinsics and their corresponding Arm/AArch64 NEON versions.
+//
+// This header file does not yet translate all of the SSE intrinsics.
+//
+// Contributors to this work are:
+// John W. Ratcliff <jratcliffscarab@gmail.com>
+// Brandon Rowlett <browlett@nvidia.com>
+// Ken Fast <kfast@gdeb.com>
+// Eric van Beurden <evanbeurden@nvidia.com>
+// Alexander Potylitsin <apotylitsin@nvidia.com>
+// Hasindu Gamaarachchi <hasindu2008@gmail.com>
+// Jim Huang <jserv@biilabs.io>
+// Mark Cheng <marktwtn@biilabs.io>
+// Malcolm James MacLeod <malcolm@gulden.com>
+// Devin Hussey (easyaspi314) <husseydevin@gmail.com>
+// Sebastian Pop <spop@amazon.com>
+// Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
+// Danila Kutenin <danilak@google.com>
+// François Turban (JishinMaster) <francois.turban@gmail.com>
+// Pei-Hsuan Hung <afcidk@gmail.com>
+// Yang-Hao Yuan <yanghau@biilabs.io>
+// Syoyo Fujita <syoyo@lighttransport.com>
+// Brecht Van Lommel <brecht@blender.org>
+
+/*
+ * sse2neon is freely redistributable under the MIT License.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Tunable configurations */
+
+/* Enable precise implementations of math operations.
+ * This slows the computation down a bit, but gives results consistent with
+ * x86 SSE (e.g. it can fix a hole or NaN pixel in a rendering result).
+ */
+/* _mm_min|max_ps|ss|pd|sd */
+#ifndef SSE2NEON_PRECISE_MINMAX
+#define SSE2NEON_PRECISE_MINMAX (0)
+#endif
+/* _mm_rcp_ps and _mm_div_ps */
+#ifndef SSE2NEON_PRECISE_DIV
+#define SSE2NEON_PRECISE_DIV (0)
+#endif
+/* _mm_sqrt_ps and _mm_rsqrt_ps */
+#ifndef SSE2NEON_PRECISE_SQRT
+#define SSE2NEON_PRECISE_SQRT (0)
+#endif
+/* _mm_dp_pd */
+#ifndef SSE2NEON_PRECISE_DP
+#define SSE2NEON_PRECISE_DP (0)
+#endif
+
+/* compiler specific definitions */
+#if defined(__GNUC__) || defined(__clang__)
+#pragma push_macro("FORCE_INLINE")
+#pragma push_macro("ALIGN_STRUCT")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
+#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
+#else /* non-GNU / non-clang compilers */
+#warning "Macro name collisions may happen with unsupported compilers."
+#ifndef FORCE_INLINE
+#define FORCE_INLINE static inline
+#endif
+#ifndef ALIGN_STRUCT
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+#define _sse2neon_likely(x) (x)
+#define _sse2neon_unlikely(x) (x)
+#endif
+
+/* C language does not allow initializing a variable with a function call. */
+#ifdef __cplusplus
+#define _sse2neon_const static const
+#else
+#define _sse2neon_const const
+#endif
+
+#include <stdint.h>
+#include <stdlib.h>
+
+/* Architecture-specific build options */
+/* FIXME: #pragma GCC push_options is only available on GCC */
+#if defined(__GNUC__)
+#if defined(__arm__) && __ARM_ARCH == 7
+/* According to the ARM C Language Extensions (ACLE) specification,
+ * __ARM_NEON is defined to a value indicating the level of Advanced SIMD
+ * (NEON) architecture support.
+ */
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
+#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
+#endif
+#if !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("fpu=neon")
+#endif
+#elif defined(__aarch64__)
+#if !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("+simd")
+#endif
+#elif __ARM_ARCH == 8
+#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
+#error \
+ "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
+#endif
+#if !defined(__clang__)
+#pragma GCC push_options
+#endif
+#else
+#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
+#endif
+#endif
+
+#include <arm_neon.h>
+#if !defined(__aarch64__) && (__ARM_ARCH == 8)
+#if defined __has_include && __has_include(<arm_acle.h>)
+#include <arm_acle.h>
+#endif
+#endif
+
+/* Rounding functions require either AArch64 instructions or a libm fallback */
+#if !defined(__aarch64__)
+#include <math.h>
+#endif
+
+/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
+ * or not accessible at all in user mode.
+ * To read or write these registers from user mode,
+ * we have to go through a system call instead.
+ */
+#if !defined(__aarch64__)
+#include <sys/time.h>
+#endif
+
+/* "__has_builtin" can be used to query support for built-in functions
+ * provided by gcc/clang and other compilers that support it.
+ */
+#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
+/* Compatibility with gcc <= 9 */
+#if defined(__GNUC__) && (__GNUC__ <= 9)
+#define __has_builtin(x) HAS##x
+#define HAS__builtin_popcount 1
+#define HAS__builtin_popcountll 1
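+/* With this fallback, __has_builtin(__builtin_popcount) token-pastes to
+ * HAS__builtin_popcount (defined as 1 above), while other names paste to
+ * undefined identifiers, which evaluate to 0 in #if conditions. */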
+#else
+#define __has_builtin(x) 0
+#endif
+#endif
+
+/**
+ * MACRO for shuffle parameter for _mm_shuffle_ps().
+ * Argument fp3 is a digit[0123] that represents the fp from argument "b"
+ * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
+ * for fp2 in result. fp1 is a digit[0123] that represents the fp from
+ * argument "a" of mm_shuffle_ps that will be placed in fp1 of result.
+ * fp0 is the same for fp0 of result.
+ */
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
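+// For illustration: _MM_SHUFFLE(3, 2, 1, 0) evaluates to 0xE4, so
+// _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0)) yields { a0, a1, b2, b3 }.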
+
+/* Rounding mode macros. */
+#define _MM_FROUND_TO_NEAREST_INT 0x00
+#define _MM_FROUND_TO_NEG_INF 0x01
+#define _MM_FROUND_TO_POS_INF 0x02
+#define _MM_FROUND_TO_ZERO 0x03
+#define _MM_FROUND_CUR_DIRECTION 0x04
+#define _MM_FROUND_NO_EXC 0x08
+#define _MM_FROUND_RAISE_EXC 0x00
+#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
+#define _MM_ROUND_NEAREST 0x0000
+#define _MM_ROUND_DOWN 0x2000
+#define _MM_ROUND_UP 0x4000
+#define _MM_ROUND_TOWARD_ZERO 0x6000
+/* Flush zero mode macros. */
+#define _MM_FLUSH_ZERO_MASK 0x8000
+#define _MM_FLUSH_ZERO_ON 0x8000
+#define _MM_FLUSH_ZERO_OFF 0x0000
+/* Denormals are zeros mode macros. */
+#define _MM_DENORMALS_ZERO_MASK 0x0040
+#define _MM_DENORMALS_ZERO_ON 0x0040
+#define _MM_DENORMALS_ZERO_OFF 0x0000
+
+/* indicate immediate constant argument in a given range */
+#define __constrange(a, b) const
+
+/* A few intrinsics accept traditional data types like ints or floats, but
+ * most operate on data types that are specific to SSE.
+ * If a vector type ends in d, it contains doubles, and if it does not have
+ * a suffix, it contains floats. An integer vector type can contain any type
+ * of integer, from chars to shorts to unsigned long longs.
+ */
+typedef int64x1_t __m64;
+typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
+// On ARM 32-bit architecture, the float64x2_t is not supported.
+// The data type __m128d should be represented in a different way for related
+// intrinsic conversion.
+#if defined(__aarch64__)
+typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
+#else
+typedef float32x4_t __m128d;
+#endif
+typedef int64x2_t __m128i; /* 128-bit vector containing integers */
+
+// __int64 is defined in the Intel Intrinsics Guide; it maps to a different
+// data type depending on the data model
+#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
+#if (defined(__x86_64__) || defined(__i386__))
+#define __int64 long long
+#else
+#define __int64 int64_t
+#endif
+#endif
+
+/* type-safe casting between types */
+
+#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
+#define vreinterpretq_m128_f32(x) (x)
+#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
+
+#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
+#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
+#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
+#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
+#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
+#define vreinterpretq_f32_m128(x) (x)
+#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
+
+#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
+#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
+#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
+#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
+#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
+#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
+#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
+#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
+#define vreinterpretq_m128i_s64(x) (x)
+
+#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
+#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
+#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
+#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
+
+#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
+#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
+
+#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
+#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
+#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
+#define vreinterpretq_s64_m128i(x) (x)
+
+#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
+#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
+#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
+#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
+
+#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
+#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
+#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
+#define vreinterpret_m64_s64(x) (x)
+
+#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
+#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
+#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
+#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
+
+#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
+#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
+#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
+
+#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
+#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
+#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
+#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
+
+#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
+#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
+#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
+#define vreinterpret_s64_m64(x) (x)
+
+#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
+
+#if defined(__aarch64__)
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
+
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
+
+#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
+#define vreinterpretq_m128d_f64(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
+
+#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
+
+#define vreinterpretq_f64_m128d(x) (x)
+#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
+#else
+#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128d_f32(x) (x)
+
+#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_f32_m128d(x) (x)
+#endif
+
+// A union called 'SIMDVec' is defined in this header file; it can be used by
+// applications which attempt to access the contents of an __m128 struct
+// directly. It is important to note that accessing the __m128 struct directly
+// is considered bad coding practice by Microsoft: @see:
+// https://docs.microsoft.com/en-us/cpp/cpp/m128
+//
+// However, some legacy source code may try to access the contents of an __m128
+// struct directly; in that case the developer can use SIMDVec as an alias. Any
+// casting must be done manually by the developer, as you cannot cast or
+// otherwise alias the base NEON data type for intrinsic operations.
+//
+// union intended to allow direct access to an __m128 variable using the names
+// that the MSVC compiler provides. This union should really only be used when
+// trying to access the members of the vector as integer values. GCC/clang
+// allow native access to the float members through a simple array access
+// operator (in C since 4.6, in C++ since 4.8).
+//
+// Ideally direct accesses to SIMD vectors should not be used since it can cause
+// a performance hit. If it really is needed however, the original __m128
+// variable can be aliased with a pointer to this union and used to access
+// individual components. The use of this union should be hidden behind a macro
+// that is used throughout the codebase to access the members instead of always
+// declaring this type of variable.
+typedef union ALIGN_STRUCT(16) SIMDVec {
+ float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
+ int8_t m128_i8[16]; // as signed 8-bit integers.
+ int16_t m128_i16[8]; // as signed 16-bit integers.
+ int32_t m128_i32[4]; // as signed 32-bit integers.
+ int64_t m128_i64[2]; // as signed 64-bit integers.
+ uint8_t m128_u8[16]; // as unsigned 8-bit integers.
+ uint16_t m128_u16[8]; // as unsigned 16-bit integers.
+ uint32_t m128_u32[4]; // as unsigned 32-bit integers.
+ uint64_t m128_u64[2]; // as unsigned 64-bit integers.
+} SIMDVec;
+
+// casting using SIMDVec
+#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
+#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
+#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
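+// For illustration: given __m128i v, vreinterpretq_nth_u32_m128i(v, 1) reads
+// lane 1 of v as a uint32_t through the SIMDVec union.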
+
+/* SSE macros */
+#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
+#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
+#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
+#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
+
+// Function declaration
+// SSE
+FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
+FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
+FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
+FORCE_INLINE __m128 _mm_set_ps1(float);
+FORCE_INLINE __m128 _mm_setzero_ps(void);
+// SSE2
+FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_castps_si128(__m128);
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
+FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
+FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
+FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
+FORCE_INLINE __m128d _mm_set_pd(double, double);
+FORCE_INLINE __m128i _mm_set1_epi32(int);
+FORCE_INLINE __m128i _mm_setzero_si128();
+// SSE4.1
+FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
+FORCE_INLINE __m128 _mm_ceil_ps(__m128);
+FORCE_INLINE __m128d _mm_floor_pd(__m128d);
+FORCE_INLINE __m128 _mm_floor_ps(__m128);
+FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
+FORCE_INLINE __m128 _mm_round_ps(__m128, int);
+// SSE4.2
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
+
+/* Backwards compatibility for compilers with lack of specific type support */
+
+// Older gcc does not define vld1q_u8_x4 type
+#if defined(__GNUC__) && !defined(__clang__) && \
+ ((__GNUC__ <= 12 && defined(__arm__)) || \
+ (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
+ (__GNUC__ <= 9 && defined(__aarch64__)))
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+ uint8x16x4_t ret;
+ ret.val[0] = vld1q_u8(p + 0);
+ ret.val[1] = vld1q_u8(p + 16);
+ ret.val[2] = vld1q_u8(p + 32);
+ ret.val[3] = vld1q_u8(p + 48);
+ return ret;
+}
+#else
+// Wraps vld1q_u8_x4
+FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
+{
+ return vld1q_u8_x4(p);
+}
+#endif
+
+/* Function Naming Conventions
+ * The naming convention of SSE intrinsics is straightforward. A generic SSE
+ * intrinsic function is given as follows:
+ * _mm_<name>_<data_type>
+ *
+ * The parts of this format are given as follows:
+ * 1. <name> describes the operation performed by the intrinsic
+ * 2. <data_type> identifies the data type of the function's primary arguments
+ *
+ * This last part, <data_type>, is a little complicated. It identifies the
+ * content of the input values, and can be set to any of the following values:
+ * + ps - vectors contain floats (ps stands for packed single-precision)
+ * + pd - vectors contain doubles (pd stands for packed double-precision)
+ * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ * signed integers
+ * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ * unsigned integers
+ * + si128 - unspecified 128-bit integer vector
+ * + m128/m128i/m128d - identifies input vector types when they are different
+ * than the type of the returned vector
+ *
+ * For example, _mm_setzero_ps. The _mm implies that the function returns
+ * a 128-bit vector. The _ps at the end implies that the argument vectors
+ * contain floats.
+ *
+ * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
+ * // Set packed 16-bit integers: 128 bits, 8 shorts, 16 bits each
+ * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ * // Set packed 8-bit integers
+ * // 128 bits, 16 chars, 8 bits each
+ * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
+ * 4, 5, 12, 13, 6, 7, 14, 15);
+ * // Shuffle packed 8-bit integers
+ * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
+ *
+ * Data (Number, Binary, Byte Index):
+ +------+------+------+------+------+------+------+------+
+ | 1 | 2 | 3 | 4 | Number
+ +------+------+------+------+------+------+------+------+
+ | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
+ +------+------+------+------+------+------+------+------+
+ | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index
+ +------+------+------+------+------+------+------+------+
+
+ +------+------+------+------+------+------+------+------+
+ | 5 | 6 | 7 | 8 | Number
+ +------+------+------+------+------+------+------+------+
+ | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
+ +------+------+------+------+------+------+------+------+
+ | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index
+ +------+------+------+------+------+------+------+------+
+ * Index (Byte Index):
+ +------+------+------+------+------+------+------+------+
+ | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 |
+ +------+------+------+------+------+------+------+------+
+
+ +------+------+------+------+------+------+------+------+
+ | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 |
+ +------+------+------+------+------+------+------+------+
+ * Result:
+ +------+------+------+------+------+------+------+------+
+ | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index
+ +------+------+------+------+------+------+------+------+
+ | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
+ +------+------+------+------+------+------+------+------+
+ | 256 | 2 | 5 | 6 | Number
+ +------+------+------+------+------+------+------+------+
+
+ +------+------+------+------+------+------+------+------+
+ | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index
+ +------+------+------+------+------+------+------+------+
+ | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
+ +------+------+------+------+------+------+------+------+
+ | 3 | 7 | 4 | 8 | Number
+ +------+------+------+------+------+------+------+------+
+ */
+
+/* Constants for use with _mm_prefetch. */
+enum _mm_hint {
+ _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
+ _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */
+ _MM_HINT_T1 = 2, /* load data to L2 cache only */
+ _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */
+ _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
+ _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */
+ _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */
+ _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */
+};
+
+// The bit field mapping to the FPCR(floating-point control register)
+typedef struct {
+ uint16_t res0;
+ uint8_t res1 : 6;
+ uint8_t bit22 : 1;
+ uint8_t bit23 : 1;
+ uint8_t bit24 : 1;
+ uint8_t res2 : 7;
+#if defined(__aarch64__)
+ uint32_t res3;
+#endif
+} fpcr_bitfield;
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of b and places it into the high end of the result.
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
+{
+ float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+ float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+ return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
+}
+
+// Takes the lower two 32-bit values from a, swaps them, and places them in the
+// low end of the result; takes the upper two 32-bit values from b, swaps them,
+// and places them in the high end of the result.
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
+{
+ float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+ float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
+ return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
+{
+ float32x2_t a21 = vget_high_f32(
+ vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+ float32x2_t b03 = vget_low_f32(
+ vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+ return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
+{
+ float32x2_t a03 = vget_low_f32(
+ vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+ float32x2_t b21 = vget_high_f32(
+ vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+ return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
+{
+ float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+ float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+ return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
+{
+ float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+ float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+ return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
+{
+ float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+ float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
+ return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
+}
+
+// Keeps the low 64 bits of a in the low half and puts the high 64 bits of b in
+// the high half.
+FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
+{
+ float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+ float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+ return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
+{
+ float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
+ float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+ return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
+{
+ float32x2_t a22 =
+ vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+ float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+ return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
+{
+ float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
+ float32x2_t b22 =
+ vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
+ return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
+{
+ float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+ float32x2_t a22 =
+ vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+ float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
+ float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+ return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
+{
+ float32x2_t a33 =
+ vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
+ float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
+ return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
+{
+ float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+ float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
+ float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+ float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+ return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
+{
+ float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+ float32_t b2 = vgetq_lane_f32(b, 2);
+ float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+ float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+ return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
+}
+
+FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
+{
+ float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+ float32_t b2 = vgetq_lane_f32(b, 2);
+ float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+ float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+ return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
+}
+
+// Kahan summation for accurate summation of floating-point numbers.
+// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
+FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
+{
+ y -= *c;
+ float t = *sum + y;
+ *c = (t - *sum) - y;
+ *sum = t;
+}
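+// For illustration, a compensated accumulation loop looks like:
+//   float sum = 0.0f, c = 0.0f;
+//   for (size_t i = 0; i < n; i++)
+//       _sse2neon_kadd_f32(&sum, &c, x[i]);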
+
+#if defined(__ARM_FEATURE_CRYPTO) && \
+ (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))
+// Wraps vmull_p64
+FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+ poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
+ poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
+ return vreinterpretq_u64_p128(vmull_p64(a, b));
+}
+#else // ARMv7 polyfill
+// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
+//
+// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
+// 64-bit->128-bit polynomial multiply.
+//
+// It needs some work and is somewhat slow, but it is still faster than all
+// known scalar methods.
+//
+// Algorithm adapted to C from
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
+// from "Fast Software Polynomial Multiplication on ARM Processors Using the
+// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
+// (https://hal.inria.fr/hal-01506572)
+static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+ poly8x8_t a = vreinterpret_p8_u64(_a);
+ poly8x8_t b = vreinterpret_p8_u64(_b);
+
+ // Masks
+ uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
+ vcreate_u8(0x00000000ffffffff));
+ uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
+ vcreate_u8(0x0000000000000000));
+
+ // Do the multiplies, rotating with vext to get all combinations
+ uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
+ uint8x16_t e =
+ vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
+ uint8x16_t f =
+ vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
+ uint8x16_t g =
+ vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
+ uint8x16_t h =
+ vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
+ uint8x16_t i =
+ vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
+ uint8x16_t j =
+ vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
+ uint8x16_t k =
+ vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4
+
+ // Add cross products
+ uint8x16_t l = veorq_u8(e, f); // L = E + F
+ uint8x16_t m = veorq_u8(g, h); // M = G + H
+ uint8x16_t n = veorq_u8(i, j); // N = I + J
+
+ // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
+ // instructions.
+#if defined(__aarch64__)
+ uint8x16_t lm_p0 = vreinterpretq_u8_u64(
+ vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+ uint8x16_t lm_p1 = vreinterpretq_u8_u64(
+ vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+ uint8x16_t nk_p0 = vreinterpretq_u8_u64(
+ vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+ uint8x16_t nk_p1 = vreinterpretq_u8_u64(
+ vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+#else
+ uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
+ uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
+ uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
+ uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
+#endif
+ // t0 = (L) (P0 + P1) << 8
+ // t1 = (M) (P2 + P3) << 16
+ uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
+ uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
+ uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
+
+ // t2 = (N) (P4 + P5) << 24
+ // t3 = (K) (P6 + P7) << 32
+ uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
+ uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
+ uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
+
+ // De-interleave
+#if defined(__aarch64__)
+ uint8x16_t t0 = vreinterpretq_u8_u64(
+ vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+ uint8x16_t t1 = vreinterpretq_u8_u64(
+ vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+ uint8x16_t t2 = vreinterpretq_u8_u64(
+ vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+ uint8x16_t t3 = vreinterpretq_u8_u64(
+ vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+#else
+ uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
+ uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
+ uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
+ uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
+#endif
+ // Shift the cross products
+ uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
+ uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
+ uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
+ uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
+
+ // Accumulate the products
+ uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
+ uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
+ uint8x16_t mix = veorq_u8(d, cross1);
+ uint8x16_t r = veorq_u8(mix, cross2);
+ return vreinterpretq_u64_u8(r);
+}
+#endif // ARMv7 polyfill
+
+// C equivalent:
+// __m128i _mm_shuffle_epi32_default(__m128i a,
+// __constrange(0, 255) int imm) {
+// __m128i ret;
+// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
+// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
+// return ret;
+// }
+#define _mm_shuffle_epi32_default(a, imm) \
+ __extension__({ \
+ int32x4_t ret; \
+ ret = vmovq_n_s32( \
+ vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \
+ ret = vsetq_lane_s32( \
+ vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
+ ret, 1); \
+ ret = vsetq_lane_s32( \
+ vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
+ ret, 2); \
+ ret = vsetq_lane_s32( \
+ vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
+ ret, 3); \
+ vreinterpretq_m128i_s32(ret); \
+ })
+
+// Takes the upper 64 bits of a and places it in the low end of the result
+// Takes the lower 64 bits of a and places it into the high end of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
+{
+ int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+ int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+ return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
+}
+
+// Takes the lower two 32-bit values from a, swaps them, and places them in the
+// low end of the result; takes the upper two 32-bit values from a, swaps them,
+// and places them in the high end of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
+{
+ int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+ int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
+ return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
+}
+
+// rotates the least significant 32 bits into the most significant 32 bits, and
+// shifts the rest down
+FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
+{
+ return vreinterpretq_m128i_s32(
+ vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
+}
+
+// rotates the most significant 32 bits into the least significant 32 bits, and
+// shifts the rest up
+FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
+{
+ return vreinterpretq_m128i_s32(
+ vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
+}
+
+// Gets the lower 64 bits of a and places them in the upper 64 bits;
+// gets the lower 64 bits of a and places them in the lower 64 bits.
+FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
+{
+ int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+ return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
+}
+
+// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places them in
+// the lower 64 bits; gets the lower 64 bits of a and places them in the upper
+// 64 bits.
+FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
+{
+ int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+ int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+ return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
+}
+
+// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places them in
+// the upper 64 bits; gets the lower 64 bits of a, swaps the 0 and 1 elements,
+// and places them in the lower 64 bits.
+FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
+{
+ int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+ return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
+{
+ int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
+ int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+ return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
+{
+ int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+ int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+ return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
+{
+ int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+ int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
+ return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
+}
+
+// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
+// int imm)
+#if defined(__aarch64__)
+#define _mm_shuffle_epi32_splat(a, imm) \
+ __extension__({ \
+ vreinterpretq_m128i_s32( \
+ vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
+ })
+#else
+#define _mm_shuffle_epi32_splat(a, imm) \
+ __extension__({ \
+ vreinterpretq_m128i_s32( \
+ vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
+ })
+#endif
+
+// NEON does not support a general purpose permute intrinsic
+// Selects four specific single-precision, floating-point values from a and b,
+// based on the mask i.
+//
+// C equivalent:
+// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
+// __constrange(0, 255) int imm) {
+// __m128 ret;
+// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
+// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
+// return ret;
+// }
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
+#define _mm_shuffle_ps_default(a, b, imm) \
+ __extension__({ \
+ float32x4_t ret; \
+ ret = vmovq_n_f32( \
+ vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \
+ ret = vsetq_lane_f32( \
+ vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
+ ret, 1); \
+ ret = vsetq_lane_f32( \
+ vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
+ ret, 2); \
+ ret = vsetq_lane_f32( \
+ vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
+ ret, 3); \
+ vreinterpretq_m128_f32(ret); \
+ })
+
+// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
+// by imm.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
+// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
+// __constrange(0,255) int
+// imm)
+#define _mm_shufflelo_epi16_function(a, imm) \
+ __extension__({ \
+ int16x8_t ret = vreinterpretq_s16_m128i(a); \
+ int16x4_t lowBits = vget_low_s16(ret); \
+ ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
+ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
+ 1); \
+ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
+ 2); \
+ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
+ 3); \
+ vreinterpretq_m128i_s16(ret); \
+ })
+
+// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
+// by imm.
+// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
+// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
+// __constrange(0,255) int
+// imm)
+#define _mm_shufflehi_epi16_function(a, imm) \
+ __extension__({ \
+ int16x8_t ret = vreinterpretq_s16_m128i(a); \
+ int16x4_t highBits = vget_high_s16(ret); \
+ ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
+ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
+ 5); \
+ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
+ 6); \
+ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
+ 7); \
+ vreinterpretq_m128i_s16(ret); \
+ })
+
+/* MMX */
+
+//_mm_empty is a no-op on arm
+FORCE_INLINE void _mm_empty(void) {}
+
+/* SSE */
+
+// Adds the four single-precision, floating-point values of a and b.
+//
+// r0 := a0 + b0
+// r1 := a1 + b1
+// r2 := a2 + b2
+// r3 := a3 + b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_f32(
+ vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// adds the scalar single-precision floating point values of a and b.
+// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
+{
+ float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+ float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
+ // the upper values in the result must be the remnants of <a>.
+ return vreinterpretq_m128_f32(vaddq_f32(a, value));
+}
+
+// Computes the bitwise AND of the four single-precision, floating-point values
+// of a and b.
+//
+// r0 := a0 & b0
+// r1 := a1 & b1
+// r2 := a2 & b2
+// r3 := a3 & b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_s32(
+ vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Computes the bitwise AND-NOT of the four single-precision, floating-point
+// values of a and b.
+//
+// r0 := ~a0 & b0
+// r1 := ~a1 & b1
+// r2 := ~a2 & b2
+// r3 := ~a3 & b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_s32(
+ vbicq_s32(vreinterpretq_s32_m128(b),
+ vreinterpretq_s32_m128(a))); // *NOTE* argument swap
+}
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
+FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_u16(
+ vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+//
+// FOR j := 0 to 7
+// i := j*8
+// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
+FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_u8(
+ vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compares for equality.
+// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_u32(
+ vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for equality.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
+}
+
+// Compares for greater than or equal.
+// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_u32(
+ vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for greater than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmpge_ps(a, b));
+}
+
+// Compares for greater than.
+//
+// r0 := (a0 > b0) ? 0xffffffff : 0x0
+// r1 := (a1 > b1) ? 0xffffffff : 0x0
+// r2 := (a2 > b2) ? 0xffffffff : 0x0
+// r3 := (a3 > b3) ? 0xffffffff : 0x0
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_u32(
+ vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for greater than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
+}
+
+// Compares for less than or equal.
+//
+// r0 := (a0 <= b0) ? 0xffffffff : 0x0
+// r1 := (a1 <= b1) ? 0xffffffff : 0x0
+// r2 := (a2 <= b2) ? 0xffffffff : 0x0
+// r3 := (a3 <= b3) ? 0xffffffff : 0x0
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_u32(
+ vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for less than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
+FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmple_ps(a, b));
+}
+
+// Compares for less than
+// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_u32(
+ vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for less than
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
+FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmplt_ps(a, b));
+}
+
+// Compares for inequality.
+// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_u32(vmvnq_u32(
+ vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compares for inequality.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
+}
+
+// Compares for not greater than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_u32(vmvnq_u32(
+ vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compares for not greater than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
+}
+
+// Compares for not greater than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_u32(vmvnq_u32(
+ vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compares for not greater than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
+}
+
+// Compares for not less than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_u32(vmvnq_u32(
+ vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compares for not less than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
+}
+
+// Compares for not less than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_u32(vmvnq_u32(
+ vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
+}
+
+// Compares for not less than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
+}
+
+// Compares the four 32-bit floats in a and b to check if any values are NaN.
+// Ordered compare between each value returns true for "orderable" and false for
+// "not orderable" (NaN).
+// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
+// also:
+// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
+// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
+FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
+{
+ // Note: NEON does not have ordered compare builtin
+ // Need to compare a eq a and b eq b to check for NaN
+ // Do AND of results to get final
+ uint32x4_t ceqaa =
+ vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+ uint32x4_t ceqbb =
+ vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+ return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
+}
+
+// Compares for ordered.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmpord_ps(a, b));
+}
+
+// Compares for unordered.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
+{
+ uint32x4_t f32a =
+ vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+ uint32x4_t f32b =
+ vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+ return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
+}
+
+// Compares for unordered.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using an equality operation. :
+// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
+FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
+{
+ uint32x4_t a_eq_b =
+ vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+ return vgetq_lane_u32(a_eq_b, 0) & 0x1;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a greater than or equal operation. :
+// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
+FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
+{
+ uint32x4_t a_ge_b =
+ vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+ return vgetq_lane_u32(a_ge_b, 0) & 0x1;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a greater than operation. :
+// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
+FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
+{
+ uint32x4_t a_gt_b =
+ vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+ return vgetq_lane_u32(a_gt_b, 0) & 0x1;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a less than or equal operation. :
+// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
+FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
+{
+ uint32x4_t a_le_b =
+ vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+ return vgetq_lane_u32(a_le_b, 0) & 0x1;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a less than operation. :
+// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx
+// Important note: the MSDN documentation is incorrect. If either of the values
+// is NaN, the docs say you will get a one, but in fact it will return a zero.
+FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
+{
+ uint32x4_t a_lt_b =
+ vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+ return vgetq_lane_u32(a_lt_b, 0) & 0x1;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using an inequality operation. :
+// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
+FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
+{
+ return !_mm_comieq_ss(a, b);
+}
+
+// Convert packed signed 32-bit integers in b to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, and copy the upper 2 packed elements from a to the upper elements of
+// dst.
+//
+// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
+// dst[95:64] := a[95:64]
+// dst[127:96] := a[127:96]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
+FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
+{
+ return vreinterpretq_m128_f32(
+ vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+ vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := 32*j
+// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
+FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+ return vreinterpret_m64_s32(
+ vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
+#else
+ return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
+ vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
+#endif
+}
+
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+//
+// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
+FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
+{
+ return vreinterpretq_m128_f32(
+ vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
+FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+ return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
+ 0);
+#else
+ float32_t data = vgetq_lane_f32(
+ vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
+ return (int32_t) data;
+#endif
+}
+
+// Convert packed 16-bit integers in a to packed single-precision (32-bit)
+// floating-point elements, and store the results in dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// m := j*32
+// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
+FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
+{
+ return vreinterpretq_m128_f32(
+ vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
+}
+
+// Convert packed 32-bit integers in b to packed single-precision (32-bit)
+// floating-point elements, store the results in the lower 2 elements of dst,
+// and copy the upper 2 packed elements from a to the upper elements of dst.
+//
+// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
+// dst[95:64] := a[95:64]
+// dst[127:96] := a[127:96]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
+FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
+{
+ return vreinterpretq_m128_f32(
+ vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+ vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert packed signed 32-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, then convert the packed signed 32-bit integers in b to
+// single-precision (32-bit) floating-point element, and store the results in
+// the upper 2 elements of dst.
+//
+// dst[31:0] := Convert_Int32_To_FP32(a[31:0])
+// dst[63:32] := Convert_Int32_To_FP32(a[63:32])
+// dst[95:64] := Convert_Int32_To_FP32(b[31:0])
+// dst[127:96] := Convert_Int32_To_FP32(b[63:32])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
+FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
+{
+ return vreinterpretq_m128_f32(vcvtq_f32_s32(
+ vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
+}
+
+// Convert the lower packed 8-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+//
+// FOR j := 0 to 3
+// i := j*8
+// m := j*32
+// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
+FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
+{
+ return vreinterpretq_m128_f32(vcvtq_f32_s32(
+ vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 16-bit integers, and store the results in dst. Note: this intrinsic
+// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
+// 0x7FFFFFFF.
+//
+// FOR j := 0 to 3
+// i := 16*j
+// k := 32*j
+// IF a[k+31:k] >= FP32(0x7FFF) && a[k+31:k] <= FP32(0x7FFFFFFF)
+// dst[i+15:i] := 0x7FFF
+// ELSE
+// dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
+FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
+{
+ const __m128 i16Min = _mm_set_ps1((float) INT16_MIN);
+ const __m128 i16Max = _mm_set_ps1((float) INT16_MAX);
+ const __m128 i32Max = _mm_set_ps1((float) INT32_MAX);
+ const __m128i maxMask = _mm_castps_si128(
+ _mm_and_ps(_mm_cmpge_ps(a, i16Max), _mm_cmple_ps(a, i32Max)));
+ const __m128i betweenMask = _mm_castps_si128(
+ _mm_and_ps(_mm_cmpgt_ps(a, i16Min), _mm_cmplt_ps(a, i16Max)));
+ const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
+ _mm_setzero_si128());
+ __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT16_MAX));
+ __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT16_MIN));
+ __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
+ __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
+ return vreinterpret_m64_s16(vmovn_s32(vreinterpretq_s32_m128i(res32)));
+}
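+
+// Illustrative usage sketch (hypothetical values, not part of the original
+// source): in-range inputs simply convert per lane, e.g.
+//   __m64 r = _mm_cvtps_pi16(_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));
+// packs the 16-bit values 1, 2, 3, 4 (lowest lane first), while inputs in the
+// documented [0x7FFF, 0x7FFFFFFF] range saturate to 0x7FFF instead of wrapping.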
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := 32*j
+// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
+#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 8-bit integers, and store the results in lower 4 elements of dst.
+// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
+// between 0x7F and 0x7FFFFFFF.
+//
+// FOR j := 0 to 3
+// i := 8*j
+// k := 32*j
+// IF a[k+31:k] >= FP32(0x7F) && a[k+31:k] <= FP32(0x7FFFFFFF)
+// dst[i+7:i] := 0x7F
+// ELSE
+// dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi8
+FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
+{
+ const __m128 i8Min = _mm_set_ps1((float) INT8_MIN);
+ const __m128 i8Max = _mm_set_ps1((float) INT8_MAX);
+ const __m128 i32Max = _mm_set_ps1((float) INT32_MAX);
+ const __m128i maxMask = _mm_castps_si128(
+ _mm_and_ps(_mm_cmpge_ps(a, i8Max), _mm_cmple_ps(a, i32Max)));
+ const __m128i betweenMask = _mm_castps_si128(
+ _mm_and_ps(_mm_cmpgt_ps(a, i8Min), _mm_cmplt_ps(a, i8Max)));
+ const __m128i minMask = _mm_cmpeq_epi32(_mm_or_si128(maxMask, betweenMask),
+ _mm_setzero_si128());
+ __m128i max = _mm_and_si128(maxMask, _mm_set1_epi32(INT8_MAX));
+ __m128i min = _mm_and_si128(minMask, _mm_set1_epi32(INT8_MIN));
+ __m128i cvt = _mm_and_si128(betweenMask, _mm_cvtps_epi32(a));
+ __m128i res32 = _mm_or_si128(_mm_or_si128(max, min), cvt);
+ int16x4_t res16 = vmovn_s32(vreinterpretq_s32_m128i(res32));
+ int8x8_t res8 = vmovn_s16(vcombine_s16(res16, res16));
+ static const uint32_t bitMask[2] = {0xFFFFFFFF, 0};
+ int8x8_t mask = vreinterpret_s8_u32(vld1_u32(bitMask));
+
+ return vreinterpret_m64_s8(vorr_s8(vand_s8(mask, res8), vdup_n_s8(0)));
+}
+
+// Convert packed unsigned 16-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, and store the results in dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// m := j*32
+// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
+FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
+{
+ return vreinterpretq_m128_f32(
+ vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
+}
+
+// Convert the lower packed unsigned 8-bit integers in a to packed
+// single-precision (32-bit) floating-point elements, and store the results in
+// dst.
+//
+// FOR j := 0 to 3
+// i := j*8
+// m := j*32
+// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
+FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
+{
+ return vreinterpretq_m128_f32(vcvtq_f32_u32(
+ vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
+}
+
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+//
+// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
+#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
+
+// Convert the signed 64-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+//
+// dst[31:0] := Convert_Int64_To_FP32(b[63:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
+FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
+{
+ return vreinterpretq_m128_f32(
+ vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+}
+
+// Copy the lower single-precision (32-bit) floating-point element of a to dst.
+//
+// dst[31:0] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
+FORCE_INLINE float _mm_cvtss_f32(__m128 a)
+{
+ return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+//
+// dst[31:0] := Convert_FP32_To_Int32(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
+#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+//
+// dst[63:0] := Convert_FP32_To_Int64(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
+FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+ return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
+#else
+ float32_t data = vgetq_lane_f32(
+ vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
+ return (int64_t) data;
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := 32*j
+// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
+FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
+{
+ return vreinterpret_m64_s32(
+ vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+//
+// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
+FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
+{
+ return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := 32*j
+// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
+#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+//
+// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
+#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+//
+// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
+FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
+{
+ return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+}
+
+// Divides the four single-precision, floating-point values of a and b.
+//
+// r0 := a0 / b0
+// r1 := a1 / b1
+// r2 := a2 / b2
+// r3 := a3 / b3
+//
+// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
+ return vreinterpretq_m128_f32(
+ vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+ float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
+ recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Newton-Raphson iteration for accuracy
+ recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#endif
+ return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
+#endif
+}
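+
+// Note on the fallback path above: vrecpsq_f32(x, b) computes (2.0 - b * x),
+// so each 'recip = vmulq_f32(recip, vrecpsq_f32(recip, b))' line is one
+// Newton-Raphson step x' = x * (2 - b * x) toward 1/b; the optional extra
+// iteration roughly doubles the number of accurate bits.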
+
+// Divides the scalar single-precision floating point value of a by b.
+// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
+{
+ float32_t value =
+ vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
+ return vreinterpretq_m128_f32(
+ vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16
+#define _mm_extract_pi16(a, imm) \
+ (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
+
+// Free aligned memory that was allocated with _mm_malloc.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free
+FORCE_INLINE void _mm_free(void *addr)
+{
+ free(addr);
+}
+
+// Macro: Get the flush zero bits from the MXCSR control and status register.
+// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
+// _MM_FLUSH_ZERO_OFF
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE
+FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
+{
+ union {
+ fpcr_bitfield field;
+#if defined(__aarch64__)
+ uint64_t value;
+#else
+ uint32_t value;
+#endif
+ } r;
+
+#if defined(__aarch64__)
+ __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+ __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
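+    // FPCR/FPSCR bit 24 is the FZ (flush-to-zero) control bit, read back here.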
+ return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
+}
+
+// Macro: Get the rounding mode bits from the MXCSR control and status register.
+// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
+// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE
+FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
+{
+ union {
+ fpcr_bitfield field;
+#if defined(__aarch64__)
+ uint64_t value;
+#else
+ uint32_t value;
+#endif
+ } r;
+
+#if defined(__aarch64__)
+ __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+ __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+ if (r.field.bit22) {
+ return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
+ } else {
+ return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
+ }
+}
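+
+// For reference, the bit22/bit23 tests above decode the FPCR/FPSCR RMode field
+// (bits 23:22): 0b00 selects round-to-nearest, 0b01 round-toward-plus-infinity
+// (_MM_ROUND_UP), 0b10 round-toward-minus-infinity (_MM_ROUND_DOWN), and 0b11
+// round-toward-zero.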
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
+#define _mm_insert_pi16(a, b, imm) \
+ __extension__({ \
+ vreinterpret_m64_s16( \
+ vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
+ })
+
+// Loads four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_load_ps(const float *p)
+{
+ return vreinterpretq_m128_f32(vld1q_f32(p));
+}
+
+// Load a single-precision (32-bit) floating-point element from memory into all
+// elements of dst.
+//
+// dst[31:0] := MEM[mem_addr+31:mem_addr]
+// dst[63:32] := MEM[mem_addr+31:mem_addr]
+// dst[95:64] := MEM[mem_addr+31:mem_addr]
+// dst[127:96] := MEM[mem_addr+31:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
+#define _mm_load_ps1 _mm_load1_ps
+
+// Loads a single-precision, floating-point value into the low word and
+// clears the upper three words.
+// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_load_ss(const float *p)
+{
+ return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
+}
+
+// Loads a single-precision, floating-point value, copying it into all
+// four words.
+// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_load1_ps(const float *p)
+{
+ return vreinterpretq_m128_f32(vld1q_dup_f32(p));
+}
+
+// Sets the upper two single-precision, floating-point values with 64
+// bits of data loaded from the address p; the lower two values are passed
+// through from a.
+//
+// r0 := a0
+// r1 := a1
+// r2 := *p0
+// r3 := *p1
+//
+// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
+FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
+{
+ return vreinterpretq_m128_f32(
+ vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
+}
+
+// Sets the lower two single-precision, floating-point values with 64
+// bits of data loaded from the address p; the upper two values are passed
+// through from a.
+//
+// Return Value
+// r0 := *p0
+// r1 := *p1
+// r2 := a2
+// r3 := a3
+//
+// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
+{
+ return vreinterpretq_m128_f32(
+ vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
+}
+
+// Load 4 single-precision (32-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+// dst[31:0] := MEM[mem_addr+127:mem_addr+96]
+// dst[63:32] := MEM[mem_addr+95:mem_addr+64]
+// dst[95:64] := MEM[mem_addr+63:mem_addr+32]
+// dst[127:96] := MEM[mem_addr+31:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
+FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
+{
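+    // vrev64q_f32 swaps the two lanes inside each 64-bit half ({p1,p0,p3,p2});
+    // rotating that by two lanes with vextq_f32 yields the fully reversed
+    // {p3,p2,p1,p0}.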
+ float32x4_t v = vrev64q_f32(vld1q_f32(p));
+ return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
+}
+
+// Loads four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
+{
+    // For NEON, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
+    // equivalent.
+ return vreinterpretq_m128_f32(vld1q_f32(p));
+}
+
+// Load unaligned 16-bit integer from memory into the first element of dst.
+//
+// dst[15:0] := MEM[mem_addr+15:mem_addr]
+// dst[MAX:16] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
+FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
+{
+ return vreinterpretq_m128i_s16(
+ vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
+}
+
+// Load unaligned 64-bit integer from memory into the first element of dst.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[MAX:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
+FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
+{
+ return vreinterpretq_m128i_s64(
+ vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
+}
+
+// Allocate aligned blocks of memory.
+// https://software.intel.com/en-us/
+// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
+FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
+{
+ void *ptr;
+ if (align == 1)
+ return malloc(size);
+ if (align == 2 || (sizeof(void *) == 8 && align == 4))
+ align = sizeof(void *);
+ if (!posix_memalign(&ptr, align, size))
+ return ptr;
+ return NULL;
+}
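+
+// Illustrative usage sketch (hypothetical buffer, not part of the original
+// source): memory from _mm_malloc must be released with _mm_free, e.g.
+//   float *buf = (float *) _mm_malloc(64 * sizeof(float), 16);
+//   if (buf) {
+//       /* ... use the 16-byte aligned buffer ... */
+//       _mm_free(buf);
+//   }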
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64
+FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
+{
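+    // Arithmetic right shift by 7 replicates each byte's sign bit, turning the
+    // per-element MSB of 'mask' into an all-ones/all-zeros byte selector for
+    // the vbsl_s8 below.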
+ int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
+ __m128 b = _mm_load_ps((const float *) mem_addr);
+ int8x8_t masked =
+ vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
+ vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
+ vst1_s8((int8_t *) mem_addr, masked);
+}
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq
+#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
+FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_s16(
+ vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Computes the maximums of the four single-precision, floating-point values of
+// a and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
+{
+#if SSE2NEON_PRECISE_MINMAX
+ float32x4_t _a = vreinterpretq_f32_m128(a);
+ float32x4_t _b = vreinterpretq_f32_m128(b);
+ return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
+#else
+ return vreinterpretq_m128_f32(
+ vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#endif
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+// FOR j := 0 to 7
+// i := j*8
+// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
+FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_u8(
+ vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Computes the maximum of the two lower scalar single-precision floating point
+// values of a and b.
+// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
+{
+ float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
+ return vreinterpretq_m128_f32(
+ vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
+FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_s16(
+ vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Computes the minima of the four single-precision, floating-point values of a
+// and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
+{
+#if SSE2NEON_PRECISE_MINMAX
+ float32x4_t _a = vreinterpretq_f32_m128(a);
+ float32x4_t _b = vreinterpretq_f32_m128(b);
+ return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
+#else
+ return vreinterpretq_m128_f32(
+ vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#endif
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+//
+// FOR j := 0 to 7
+// i := j*8
+// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
+FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_u8(
+ vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Computes the minimum of the two lower scalar single-precision floating point
+// values of a and b.
+// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
+{
+ float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
+ return vreinterpretq_m128_f32(
+ vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Sets the low word to the single-precision, floating-point value of b
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
+FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_f32(
+ vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
+ vreinterpretq_f32_m128(a), 0));
+}
+
+// Moves the upper two values of B into the lower two values of A.
+//
+// r3 := a3
+// r2 := a2
+// r1 := b3
+// r0 := b2
+FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
+{
+ float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
+ float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
+ return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
+}
+
+// Moves the lower two values of B into the upper two values of A.
+//
+// r3 := b1
+// r2 := b0
+// r1 := a1
+// r0 := a0
+FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
+{
+ float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
+ float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
+ return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+// Create mask from the most significant bit of each 8-bit element in a, and
+// store the result in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8
+FORCE_INLINE int _mm_movemask_pi8(__m64 a)
+{
+ uint8x8_t input = vreinterpret_u8_m64(a);
+#if defined(__aarch64__)
+ static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
+ uint8x8_t tmp = vshr_n_u8(input, 7);
+ return vaddv_u8(vshl_u8(tmp, shift));
+#else
+    // Refer to the implementation of `_mm_movemask_epi8`.
+ uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
+ uint32x2_t paired16 =
+ vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
+ uint8x8_t paired32 =
+ vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
+ return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
+#endif
+}
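+
+// Illustrative example (hypothetical values, not part of the original source):
+// result bit i is the most significant bit of byte lane i, so a vector whose
+// lanes 0 and 2 have their top bits set (and all other lanes clear) produces
+// the mask 0x5.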
+
+// NEON does not provide this method
+// Creates a 4-bit mask from the most significant bits of the four
+// single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
+FORCE_INLINE int _mm_movemask_ps(__m128 a)
+{
+ uint32x4_t input = vreinterpretq_u32_m128(a);
+#if defined(__aarch64__)
+ static const int32x4_t shift = {0, 1, 2, 3};
+ uint32x4_t tmp = vshrq_n_u32(input, 31);
+ return vaddvq_u32(vshlq_u32(tmp, shift));
+#else
+ // Uses the exact same method as _mm_movemask_epi8, see that for details.
+ // Shift out everything but the sign bits with a 32-bit unsigned shift
+ // right.
+ uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
+ // Merge the two pairs together with a 64-bit unsigned shift right + add.
+ uint8x16_t paired =
+ vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
+ // Extract the result.
+ return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
+#endif
+}
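+
+// Illustrative example (hypothetical values, not part of the original source):
+//   int m = _mm_movemask_ps(_mm_set_ps(-4.0f, 3.0f, -2.0f, 1.0f));
+// collects the four sign bits, giving m == 0xA (lanes 1 and 3 are negative).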
+
+// Multiplies the four single-precision, floating-point values of a and b.
+//
+// r0 := a0 * b0
+// r1 := a1 * b1
+// r2 := a2 * b2
+// r3 := a3 * b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_f32(
+ vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Multiply the lower single-precision (32-bit) floating-point element in a and
+// b, store the result in the lower element of dst, and copy the upper 3 packed
+// elements from a to the upper elements of dst.
+//
+// dst[31:0] := a[31:0] * b[31:0]
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
+FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_mul_ps(a, b));
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
+FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_u16(vshrn_n_u32(
+ vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
+}
+
+// Computes the bitwise OR of the four single-precision, floating-point values
+// of a and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_s32(
+ vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+//
+// FOR j := 0 to 7
+// i := j*8
+// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
+#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
+
+// Average packed unsigned 16-bit integers in a and b, and store the results in
+// dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
+#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
+#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
+#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
+#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
+#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
+#define _m_pminsw(a, b) _mm_min_pi16(a, b)
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
+#define _m_pminub(a, b) _mm_min_pu8(a, b)
+
+// Create mask from the most significant bit of each 8-bit element in a, and
+// store the result in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb
+#define _m_pmovmskb(a) _mm_movemask_pi8(a)
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
+#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
+
+// Loads one cache line of data from address p to a location closer to the
+// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
+FORCE_INLINE void _mm_prefetch(const void *p, int i)
+{
+ (void) i;
+ __builtin_prefetch(p);
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw
+#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
+
+// Shuffle 16-bit integers in a using the control in imm8, and store the results
+// in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw
+#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
+
+// Compute the approximate reciprocal of packed single-precision (32-bit)
+// floating-point elements in a, and store the results in dst. The maximum
+// relative error for this approximation is less than 1.5*2^-12.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
+FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
+{
+ float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
+ recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Newton-Raphson iteration for accuracy
+ recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#endif
+ return vreinterpretq_m128_f32(recip);
+}
+
+// Compute the approximate reciprocal of the lower single-precision (32-bit)
+// floating-point element in a, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
+//
+// dst[31:0] := (1.0 / a[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
+FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
+{
+ return _mm_move_ss(a, _mm_rcp_ps(a));
+}
+
+// Computes the approximations of the reciprocal square roots of the four
+// single-precision floating point values of in.
+// The current precision is 1% error.
+// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
+{
+ float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+#if SSE2NEON_PRECISE_SQRT
+    // Additional Newton-Raphson iteration for accuracy
+ out = vmulq_f32(
+ out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+ out = vmulq_f32(
+ out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+#endif
+ return vreinterpretq_m128_f32(out);
+}
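+
+// Note on the refinement above: vrsqrtsq_f32(p, q) computes (3.0 - p * q) / 2,
+// so each 'out = vmulq_f32(out, vrsqrtsq_f32(vmulq_f32(in, out), out))' line
+// is one Newton-Raphson step x' = x * (3 - in * x * x) / 2 toward 1/sqrt(in).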
+
+// Compute the approximate reciprocal square root of the lower single-precision
+// (32-bit) floating-point element in a, store the result in the lower element
+// of dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
+FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
+{
+ return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
+FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
+{
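+    // Widen and pairwise-add the eight 8-bit absolute differences down to one
+    // 64-bit sum, then place it in the low 16-bit lane with the upper lanes
+    // zeroed, matching PSADBW's output layout.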
+ uint64x1_t t = vpaddl_u32(vpaddl_u16(
+ vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
+ return vreinterpret_m64_u16(
+ vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
+}
+
+// Macro: Set the flush zero bits of the MXCSR control and status register to
+// the value in unsigned 32-bit integer a. The flush zero may contain any of the
+// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE
+FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
+{
+ // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
+ // regardless of the value of the FZ bit.
+ union {
+ fpcr_bitfield field;
+#if defined(__aarch64__)
+ uint64_t value;
+#else
+ uint32_t value;
+#endif
+ } r;
+
+#if defined(__aarch64__)
+ __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+ __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+ r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
+
+#if defined(__aarch64__)
+ __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
+#else
+ __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
+#endif
+}
+
+// Sets the four single-precision, floating-point values to the four inputs.
+// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
+{
+ float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
+ return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Sets the four single-precision, floating-point values to w.
+// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_set_ps1(float _w)
+{
+ return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// Macro: Set the rounding mode bits of the MXCSR control and status register to
+// the value in unsigned 32-bit integer a. The rounding mode may contain any of
+// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
+// _MM_ROUND_TOWARD_ZERO
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
+FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
+{
+ union {
+ fpcr_bitfield field;
+#if defined(__aarch64__)
+ uint64_t value;
+#else
+ uint32_t value;
+#endif
+ } r;
+
+#if defined(__aarch64__)
+ __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+ __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+ switch (rounding) {
+ case _MM_ROUND_TOWARD_ZERO:
+ r.field.bit22 = 1;
+ r.field.bit23 = 1;
+ break;
+ case _MM_ROUND_DOWN:
+ r.field.bit22 = 0;
+ r.field.bit23 = 1;
+ break;
+ case _MM_ROUND_UP:
+ r.field.bit22 = 1;
+ r.field.bit23 = 0;
+ break;
+ default: //_MM_ROUND_NEAREST
+ r.field.bit22 = 0;
+ r.field.bit23 = 0;
+ }
+
+#if defined(__aarch64__)
+ __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
+#else
+ __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
+#endif
+}
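+
+// Illustrative usage sketch (hypothetical values, not part of the original
+// source): the mode set here is what the cvt conversions honour, e.g.
+//   _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
+//   int i = _mm_cvt_ss2si(_mm_set_ss(1.7f)); // i == 1 under truncation
+//   _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); // restore the default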
+
+// Copy single-precision (32-bit) floating-point element a to the lower element
+// of dst, and zero the upper 3 elements.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
+FORCE_INLINE __m128 _mm_set_ss(float a)
+{
+ float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
+ return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Sets the four single-precision, floating-point values to w.
+//
+// r0 := r1 := r2 := r3 := w
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_set1_ps(float _w)
+{
+ return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
+FORCE_INLINE void _mm_setcsr(unsigned int a)
+{
+ _MM_SET_ROUNDING_MODE(a);
+}
+
+// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
+FORCE_INLINE unsigned int _mm_getcsr()
+{
+ return _MM_GET_ROUNDING_MODE();
+}
+
+// Sets the four single-precision, floating-point values to the four inputs in
+// reverse order.
+// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
+{
+ float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
+ return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Clears the four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_setzero_ps(void)
+{
+ return vreinterpretq_m128_f32(vdupq_n_f32(0));
+}
+
+// Shuffle 16-bit integers in a using the control in imm8, and store the results
+// in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_pi16(a, imm) \
+ __extension__({ \
+ vreinterpret_m64_s16(__builtin_shufflevector( \
+ vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
+ ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \
+ })
+#else
+#define _mm_shuffle_pi16(a, imm) \
+ __extension__({ \
+ int16x4_t ret; \
+ ret = \
+ vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
+ ret = vset_lane_s16( \
+ vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \
+ 1); \
+ ret = vset_lane_s16( \
+ vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \
+ 2); \
+ ret = vset_lane_s16( \
+ vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \
+ 3); \
+ vreinterpret_m64_s16(ret); \
+ })
+#endif
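+
+// Illustrative example (hypothetical values, not part of the original source):
+// each 2-bit field of imm8 selects a source lane, lowest field first, so
+//   _mm_shuffle_pi16(a, _MM_SHUFFLE(0, 1, 2, 3))
+// reverses the four 16-bit lanes of a.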
+
+// Guarantees that every preceding store is globally visible before any
+// subsequent store.
+// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_sfence(void)
+{
+ __sync_synchronize();
+}
+
+// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
+// int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_ps(a, b, imm) \
+ __extension__({ \
+ float32x4_t _input1 = vreinterpretq_f32_m128(a); \
+ float32x4_t _input2 = vreinterpretq_f32_m128(b); \
+ float32x4_t _shuf = __builtin_shufflevector( \
+ _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+ (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
+ vreinterpretq_m128_f32(_shuf); \
+ })
+#else // generic
+#define _mm_shuffle_ps(a, b, imm) \
+ __extension__({ \
+ __m128 ret; \
+ switch (imm) { \
+ case _MM_SHUFFLE(1, 0, 3, 2): \
+ ret = _mm_shuffle_ps_1032((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(2, 3, 0, 1): \
+ ret = _mm_shuffle_ps_2301((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(0, 3, 2, 1): \
+ ret = _mm_shuffle_ps_0321((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(2, 1, 0, 3): \
+ ret = _mm_shuffle_ps_2103((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(1, 0, 1, 0): \
+ ret = _mm_movelh_ps((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(1, 0, 0, 1): \
+ ret = _mm_shuffle_ps_1001((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(0, 1, 0, 1): \
+ ret = _mm_shuffle_ps_0101((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(3, 2, 1, 0): \
+ ret = _mm_shuffle_ps_3210((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(0, 0, 1, 1): \
+ ret = _mm_shuffle_ps_0011((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(0, 0, 2, 2): \
+ ret = _mm_shuffle_ps_0022((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(2, 2, 0, 0): \
+ ret = _mm_shuffle_ps_2200((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(3, 2, 0, 2): \
+ ret = _mm_shuffle_ps_3202((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(3, 2, 3, 2): \
+ ret = _mm_movehl_ps((b), (a)); \
+ break; \
+ case _MM_SHUFFLE(1, 1, 3, 3): \
+ ret = _mm_shuffle_ps_1133((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(2, 0, 1, 0): \
+ ret = _mm_shuffle_ps_2010((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(2, 0, 0, 1): \
+ ret = _mm_shuffle_ps_2001((a), (b)); \
+ break; \
+ case _MM_SHUFFLE(2, 0, 3, 2): \
+ ret = _mm_shuffle_ps_2032((a), (b)); \
+ break; \
+ default: \
+ ret = _mm_shuffle_ps_default((a), (b), (imm)); \
+ break; \
+ } \
+ ret; \
+ })
+#endif
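+
+// Illustrative example (hypothetical values, not part of the original source):
+// the two low result lanes come from a and the two high lanes from b, so
+//   _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0))
+// yields {a0, a1, b2, b3} and is the identity shuffle when b == a.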
+
+// Computes the approximations of square roots of the four single-precision,
+// floating-point values of a. First computes reciprocal square roots and then
+// reciprocals of the four values.
+//
+// r0 := sqrt(a0)
+// r1 := sqrt(a1)
+// r2 := sqrt(a2)
+// r3 := sqrt(a3)
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
+{
+#if SSE2NEON_PRECISE_SQRT
+ float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+
+ // Test for vrsqrteq_f32(0) -> positive infinity case.
+ // Change to zero, so that s * 1/sqrt(s) result is zero too.
+ const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
+ const uint32x4_t div_by_zero =
+ vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
+ recip = vreinterpretq_f32_u32(
+ vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
+
+    // Additional Newton-Raphson iteration for accuracy
+ recip = vmulq_f32(
+ vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+ recip);
+ recip = vmulq_f32(
+ vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+ recip);
+
+ // sqrt(s) = s * 1/sqrt(s)
+ return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
+#elif defined(__aarch64__)
+ return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
+#else
+ float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+ float32x4_t sq = vrecpeq_f32(recipsq);
+ return vreinterpretq_m128_f32(sq);
+#endif
+}
+
+// Computes the approximation of the square root of the scalar single-precision
+// floating point value of in.
+// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
+{
+ float32_t value =
+ vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
+ return vreinterpretq_m128_f32(
+ vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
+}
+
+// Stores four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
+FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
+{
+ vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+//
+// MEM[mem_addr+31:mem_addr] := a[31:0]
+// MEM[mem_addr+63:mem_addr+32] := a[31:0]
+// MEM[mem_addr+95:mem_addr+64] := a[31:0]
+// MEM[mem_addr+127:mem_addr+96] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
+FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
+{
+ float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+ vst1q_f32(p, vdupq_n_f32(a0));
+}
+
+// Stores the lower single-precision, floating-point value.
+// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
+FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
+{
+ vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+//
+// MEM[mem_addr+31:mem_addr] := a[31:0]
+// MEM[mem_addr+63:mem_addr+32] := a[31:0]
+// MEM[mem_addr+95:mem_addr+64] := a[31:0]
+// MEM[mem_addr+127:mem_addr+96] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
+#define _mm_store1_ps _mm_store_ps1
+
+// Stores the upper two single-precision, floating-point values of a to the
+// address p.
+//
+// *p0 := a2
+// *p1 := a3
+//
+// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
+FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
+{
+ *p = vreinterpret_m64_f32(vget_high_f32(a));
+}
+
+// Stores the lower two single-precision floating point values of a to the
+// address p.
+//
+// *p0 := a0
+// *p1 := a1
+//
+// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
+FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
+{
+ *p = vreinterpret_m64_f32(vget_low_f32(a));
+}
+
+// Store 4 single-precision (32-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+// MEM[mem_addr+31:mem_addr] := a[127:96]
+// MEM[mem_addr+63:mem_addr+32] := a[95:64]
+// MEM[mem_addr+95:mem_addr+64] := a[63:32]
+// MEM[mem_addr+127:mem_addr+96] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
+FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
+{
+ float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
+ float32x4_t rev = vextq_f32(tmp, tmp, 2);
+ vst1q_f32(p, rev);
+}
+
+// Stores four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
+FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
+{
+ vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Stores 16-bits of integer data a at the address p.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16
+FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
+{
+ vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
+}
+
+// Stores 64-bits of integer data a at the address p.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64
+FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
+{
+ vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
+}
+
+// Store 64-bits of integer data from a into memory using a non-temporal memory
+// hint.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi
+FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
+{
+ vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
+// point elements) from a into memory using a non-temporal memory hint.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
+FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+ __builtin_nontemporal_store(a, (float32x4_t *) p);
+#else
+ vst1q_f32(p, vreinterpretq_f32_m128(a));
+#endif
+}
+
+// Subtracts the four single-precision, floating-point values of a and b.
+//
+// r0 := a0 - b0
+// r1 := a1 - b1
+// r2 := a2 - b2
+// r3 := a3 - b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_f32(
+ vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Subtract the lower single-precision (32-bit) floating-point element in b from
+// the lower single-precision (32-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper 3 packed elements from
+// a to the upper elements of dst.
+//
+// dst[31:0] := a[31:0] - b[31:0]
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
+FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_sub_ps(a, b));
+}
+
+// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
+// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
+// transposed matrix in these vectors (row0 now contains column 0, etc.).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
+ do { \
+ float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
+ float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
+ row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
+ vget_low_f32(ROW23.val[0])); \
+ row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
+ vget_low_f32(ROW23.val[1])); \
+ row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
+ vget_high_f32(ROW23.val[0])); \
+ row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
+ vget_high_f32(ROW23.val[1])); \
+ } while (0)
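+
+// Illustrative usage sketch (hypothetical values, not part of the original
+// source):
+//   __m128 r0 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
+//   __m128 r1 = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
+//   __m128 r2 = _mm_setr_ps(9.0f, 10.0f, 11.0f, 12.0f);
+//   __m128 r3 = _mm_setr_ps(13.0f, 14.0f, 15.0f, 16.0f);
+//   _MM_TRANSPOSE4_PS(r0, r1, r2, r3); // r0 now holds {1, 5, 9, 13}, etc.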
+
+// According to the documentation, these intrinsics behave the same as the
+// non-'u' versions, so we simply alias them here.
+#define _mm_ucomieq_ss _mm_comieq_ss
+#define _mm_ucomige_ss _mm_comige_ss
+#define _mm_ucomigt_ss _mm_comigt_ss
+#define _mm_ucomile_ss _mm_comile_ss
+#define _mm_ucomilt_ss _mm_comilt_ss
+#define _mm_ucomineq_ss _mm_comineq_ss
+
+// Return vector of type __m128i with undefined elements.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_undefined_si128
+FORCE_INLINE __m128i _mm_undefined_si128(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+ __m128i a;
+ return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+// Return vector of type __m128 with undefined elements.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
+FORCE_INLINE __m128 _mm_undefined_ps(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+ __m128 a;
+ return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+// Selects and interleaves the upper two single-precision, floating-point values
+// from a and b.
+//
+// r0 := a2
+// r1 := b2
+// r2 := a3
+// r3 := b3
+//
+// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128_f32(
+ vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+ float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
+ float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
+ float32x2x2_t result = vzip_f32(a1, b1);
+ return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#endif
+}
+
+// Selects and interleaves the lower two single-precision, floating-point values
+// from a and b.
+//
+// r0 := a0
+// r1 := b0
+// r2 := a1
+// r3 := b1
+//
+// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128_f32(
+ vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+ float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
+ float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
+ float32x2x2_t result = vzip_f32(a1, b1);
+ return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#endif
+}
+
+// Computes bitwise EXOR (exclusive-or) of the four single-precision,
+// floating-point values of a and b.
+// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
+{
+ return vreinterpretq_m128_s32(
+ veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
+}
+
+/* SSE2 */
+
+// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
+// unsigned 16-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s16(
+ vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
+// unsigned 32-bit integers in b.
+//
+// r0 := a0 + b0
+// r1 := a1 + b1
+// r2 := a2 + b2
+// r3 := a3 + b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s32(
+ vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
+// unsigned 64-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s64(
+ vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
+// unsigned 8-bit integers in b.
+// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
+FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s8(
+ vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Add packed double-precision (64-bit) floating-point elements in a and b, and
+// store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
+FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ double *da = (double *) &a;
+ double *db = (double *) &b;
+ double c[2];
+ c[0] = da[0] + db[0];
+ c[1] = da[1] + db[1];
+ return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add the lower double-precision (64-bit) floating-point element in a and b,
+// store the result in the lower element of dst, and copy the upper element from
+// a to the upper element of dst.
+//
+// dst[63:0] := a[63:0] + b[63:0]
+// dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
+FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return _mm_move_sd(a, _mm_add_pd(a, b));
+#else
+ double *da = (double *) &a;
+ double *db = (double *) &b;
+ double c[2];
+ c[0] = da[0] + db[0];
+ c[1] = da[1];
+ return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add 64-bit integers a and b, and store the result in dst.
+//
+// dst[63:0] := a[63:0] + b[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
+FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_s64(
+ vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
+// and saturates.
+//
+// r0 := SignedSaturate(a0 + b0)
+// r1 := SignedSaturate(a1 + b1)
+// ...
+// r7 := SignedSaturate(a7 + b7)
+//
+// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s16(
+ vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Add packed signed 8-bit integers in a and b using saturation, and store the
+// results in dst.
+//
+// FOR j := 0 to 15
+// i := j*8
+// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
+FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s8(
+ vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Add packed unsigned 16-bit integers in a and b using saturation, and store
+// the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
+FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u16(
+ vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
+// b and saturates.
+// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
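+
+// Illustrative sketch (hypothetical helper, not part of the upstream sse2neon
+// API): saturating unsigned addition clamps at 255 instead of wrapping, so
+// 200 + 100 yields 255 in every byte lane here.
+FORCE_INLINE __m128i sse2neon_example_adds_epu8_demo(void)
+{
+    __m128i a = vreinterpretq_m128i_u8(vdupq_n_u8(200));
+    __m128i b = vreinterpretq_m128i_u8(vdupq_n_u8(100));
+    return _mm_adds_epu8(a, b); // every byte lane is 255 (saturated)
+}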
+
+// Compute the bitwise AND of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := j*64
+// dst[i+63:i] := a[i+63:i] AND b[i+63:i]
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
+FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
+{
+ return vreinterpretq_m128d_s64(
+ vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
+// b.
+//
+// r := a & b
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s32(
+ vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
+// elements in a and then AND with b, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := j*64
+// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
+FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
+{
+ // *NOTE* argument swap
+ return vreinterpretq_m128d_s64(
+ vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
+}
+
+// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
+// 128-bit value in a.
+//
+// r := (~a) & b
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s32(
+ vbicq_s32(vreinterpretq_s32_m128i(b),
+ vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
+}
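+
+// Illustrative sketch (hypothetical helper, not part of the upstream sse2neon
+// API): because of the argument order noted above, _mm_andnot_si128(mask,
+// value) computes value & ~mask, i.e. it clears exactly the bits of value
+// that are set in mask.
+FORCE_INLINE __m128i sse2neon_example_clear_bits(__m128i value, __m128i mask)
+{
+    return _mm_andnot_si128(mask, value); // value & ~mask
+}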
+
+// Computes the average of the 8 unsigned 16-bit integers in a and the 8
+// unsigned 16-bit integers in b and rounds.
+//
+// r0 := (a0 + b0) / 2
+// r1 := (a1 + b1) / 2
+// ...
+// r7 := (a7 + b7) / 2
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
+{
+ return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
+ vreinterpretq_u16_m128i(b));
+}
+
+// Computes the average of the 16 unsigned 8-bit integers in a and the 16
+// unsigned 8-bit integers in b and rounds.
+//
+// r0 := (a0 + b0) / 2
+// r1 := (a1 + b1) / 2
+// ...
+// r15 := (a15 + b15) / 2
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
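+
+// Illustrative sketch (hypothetical helper, not part of the upstream sse2neon
+// API): the average rounds up on ties, i.e. it computes (a + b + 1) >> 1 per
+// lane, which is exactly NEON's vrhaddq. Averaging 1 and 2 therefore gives 2.
+FORCE_INLINE __m128i sse2neon_example_avg_rounding_demo(void)
+{
+    __m128i a = vreinterpretq_m128i_u8(vdupq_n_u8(1));
+    __m128i b = vreinterpretq_m128i_u8(vdupq_n_u8(2));
+    return _mm_avg_epu8(a, b); // every byte lane is 2, not 1
+}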
+
+// Shift a left by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128
+#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
+
+// Shift a right by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128
+#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
+
+// Cast vector of type __m128d to type __m128. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
+FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
+{
+ return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
+FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
+{
+ return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
+FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
+{
+ return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
+}
+
+// Applies a type cast to reinterpret four 32-bit floating point values passed
+// in as a 128-bit parameter as packed 32-bit integers.
+// https://msdn.microsoft.com/en-us/library/bb514099.aspx
+FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
+{
+ return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
+}
+
+// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
+FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
+#else
+ return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
+#endif
+}
+
+// Applies a type cast to reinterpret four 32-bit integers passed in as a
+// 128-bit parameter as packed 32-bit floating point values.
+// https://msdn.microsoft.com/en-us/library/bb514029.aspx
+FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
+{
+ return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
+}
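+
+// Illustrative sketch (hypothetical helper, not part of the upstream sse2neon
+// API): since the casts above only reinterpret bits, they are the natural way
+// to inspect the raw IEEE-754 encoding of a float lane without converting its
+// value.
+FORCE_INLINE uint32_t sse2neon_example_float_bits(__m128 v)
+{
+    __m128i bits = _mm_castps_si128(v);
+    return vgetq_lane_u32(vreinterpretq_u32_m128i(bits), 0);
+}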
+
+// Cache line containing p is flushed and invalidated from all caches in the
+// coherency domain.
+// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
+FORCE_INLINE void _mm_clflush(void const *p)
+{
+ (void) p;
+    // No direct NEON equivalent, so this is intentionally a no-op.
+}
+
+// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
+// unsigned 16-bit integers in b for equality.
+// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u16(
+ vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed 32-bit integers in a and b for equality, and store the results
+// in dst.
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u32(
+ vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
+// unsigned 8-bit integers in b for equality.
+// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for equality, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
+FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_u64(
+ vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+ uint32x4_t cmp =
+ vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+ uint32x4_t swapped = vrev64q_u32(cmp);
+ return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for equality, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd
+FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
+{
+ return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for greater-than-or-equal, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd
+FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_u64(
+ vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+ uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+ uint64_t d[2];
+ d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+ d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+ return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for greater-than-or-equal, store the result in the lower element of dst,
+// and copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd
+FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return _mm_move_sd(a, _mm_cmpge_pd(a, b));
+#else
+ // expand "_mm_cmpge_pd()" to reduce unnecessary operations
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+ uint64_t d[2];
+ d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+ d[1] = a1;
+
+ return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
+// in b for greater than.
+//
+// r0 := (a0 > b0) ? 0xffff : 0x0
+// r1 := (a1 > b1) ? 0xffff : 0x0
+// ...
+// r7 := (a7 > b7) ? 0xffff : 0x0
+//
+// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u16(
+ vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
+// in b for greater than.
+// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u32(
+ vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
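+
+// Illustrative sketch (hypothetical helper, not part of the upstream sse2neon
+// API): the all-ones/all-zeros masks produced by these comparisons are
+// typically fed into a blend. Here a branchless per-lane maximum is built from
+// _mm_cmpgt_epi32 plus NEON's vbslq bit-select.
+FORCE_INLINE __m128i sse2neon_example_max_epi32(__m128i a, __m128i b)
+{
+    __m128i gt = _mm_cmpgt_epi32(a, b); // 0xFFFFFFFF where a > b, else 0
+    return vreinterpretq_m128i_s32(vbslq_s32(vreinterpretq_u32_m128i(gt),
+                                             vreinterpretq_s32_m128i(a),
+                                             vreinterpretq_s32_m128i(b)));
+}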
+
+// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
+// in b for greater than.
+//
+// r0 := (a0 > b0) ? 0xff : 0x0
+// r1 := (a1 > b1) ? 0xff : 0x0
+// ...
+// r15 := (a15 > b15) ? 0xff : 0x0
+//
+// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for greater-than, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd
+FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_u64(
+ vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+ uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+ uint64_t d[2];
+ d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+ d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+ return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for greater-than, store the result in the lower element of dst, and copy
+// the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd
+FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
+#else
+    // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+ uint64_t d[2];
+ d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+ d[1] = a1;
+
+ return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than-or-equal, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd
+FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_u64(
+ vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+ uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+ uint64_t d[2];
+ d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+ d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+ return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than-or-equal, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd
+FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return _mm_move_sd(a, _mm_cmple_pd(a, b));
+#else
+    // expand "_mm_cmple_pd()" to reduce unnecessary operations
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+ uint64_t d[2];
+ d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+ d[1] = a1;
+
+ return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
+// in b for less than.
+//
+// r0 := (a0 < b0) ? 0xffff : 0x0
+// r1 := (a1 < b1) ? 0xffff : 0x0
+// ...
+// r7 := (a7 < b7) ? 0xffff : 0x0
+//
+// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u16(
+ vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
+// in b for less than.
+// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u32(
+ vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
+// in b for less than.
+// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd
+FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_u64(
+ vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+ uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+ uint64_t d[2];
+ d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+ d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+ return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd
+FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return _mm_move_sd(a, _mm_cmplt_pd(a, b));
+#else
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+ uint64_t d[2];
+ d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+ d[1] = a1;
+
+ return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-equal, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd
+FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
+ vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
+#else
+ // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+ uint32x4_t cmp =
+ vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+ uint32x4_t swapped = vrev64q_u32(cmp);
+ return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-equal, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd
+FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
+{
+ return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-greater-than-or-equal, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd
+FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_u64(veorq_u64(
+ vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+ vdupq_n_u64(UINT64_MAX)));
+#else
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+ uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+ uint64_t d[2];
+ d[0] =
+ !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+ d[1] =
+ !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+ return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-greater-than-or-equal, store the result in the lower element of
+// dst, and copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd
+FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
+{
+ return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-greater-than, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_cmpngt_pd
+FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_u64(veorq_u64(
+ vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+ vdupq_n_u64(UINT64_MAX)));
+#else
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+ uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+ uint64_t d[2];
+ d[0] =
+ !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+ d[1] =
+ !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+ return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-greater-than, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd
+FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
+{
+ return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-less-than-or-equal, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd
+FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_u64(veorq_u64(
+ vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+ vdupq_n_u64(UINT64_MAX)));
+#else
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+ uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+ uint64_t d[2];
+ d[0] =
+ !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+ d[1] =
+ !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+ return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-less-than-or-equal, store the result in the lower element of dst,
+// and copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd
+FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
+{
+ return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-less-than, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd
+FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_u64(veorq_u64(
+ vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+ vdupq_n_u64(UINT64_MAX)));
+#else
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+ uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+ uint64_t d[2];
+ d[0] =
+ !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
+ d[1] =
+ !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
+
+ return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-less-than, store the result in the lower element of dst, and copy
+// the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd
+FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
+{
+ return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// to see if neither is NaN, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd
+FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ // Excluding NaNs, any two floating point numbers can be compared.
+ uint64x2_t not_nan_a =
+ vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
+ uint64x2_t not_nan_b =
+ vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
+ return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
+#else
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+ uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+ uint64_t d[2];
+ d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+ (*(double *) &b0) == (*(double *) &b0))
+ ? ~UINT64_C(0)
+ : UINT64_C(0);
+ d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
+ (*(double *) &b1) == (*(double *) &b1))
+ ? ~UINT64_C(0)
+ : UINT64_C(0);
+
+ return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b to see if neither is NaN, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd
+FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return _mm_move_sd(a, _mm_cmpord_pd(a, b));
+#else
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+ uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+ uint64_t d[2];
+ d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+ (*(double *) &b0) == (*(double *) &b0))
+ ? ~UINT64_C(0)
+ : UINT64_C(0);
+ d[1] = a1;
+
+ return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// to see if either is NaN, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd
+FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    // NaN is the only value that does not compare equal to itself.
+ uint64x2_t not_nan_a =
+ vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
+ uint64x2_t not_nan_b =
+ vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
+ return vreinterpretq_m128d_s32(
+ vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
+#else
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+ uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+ uint64_t d[2];
+ d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+ (*(double *) &b0) == (*(double *) &b0))
+ ? UINT64_C(0)
+ : ~UINT64_C(0);
+ d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
+ (*(double *) &b1) == (*(double *) &b1))
+ ? UINT64_C(0)
+ : ~UINT64_C(0);
+
+ return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b to see if either is NaN, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd
+FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
+#else
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+ uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+ uint64_t d[2];
+ d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
+ (*(double *) &b0) == (*(double *) &b0))
+ ? UINT64_C(0)
+ : ~UINT64_C(0);
+ d[1] = a1;
+
+ return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
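+
+// Illustrative sketch (hypothetical helper, not part of the upstream sse2neon
+// API): because NaN is the only value that does not compare equal to itself,
+// comparing a vector against itself with _mm_cmpunord_pd yields an all-ones
+// mask exactly in the NaN lanes.
+FORCE_INLINE __m128d sse2neon_example_isnan_pd(__m128d a)
+{
+    return _mm_cmpunord_pd(a, a);
+}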
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for greater-than-or-equal, and return the boolean result (0 or 1).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd
+FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
+#else
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+ return (*(double *) &a0 >= *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for greater-than, and return the boolean result (0 or 1).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd
+FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
+#else
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+ return (*(double *) &a0 > *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for less-than-or-equal, and return the boolean result (0 or 1).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd
+FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
+#else
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+ return (*(double *) &a0 <= *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for less-than, and return the boolean result (0 or 1).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd
+FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
+#else
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+
+ return (*(double *) &a0 < *(double *) &b0);
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for equality, and return the boolean result (0 or 1).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd
+FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
+#else
+ uint32x4_t a_not_nan =
+ vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
+ uint32x4_t b_not_nan =
+ vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
+ uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+ uint32x4_t a_eq_b =
+ vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+ uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
+ vreinterpretq_u64_u32(a_eq_b));
+ return vgetq_lane_u64(and_results, 0) & 0x1;
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point element in a and b
+// for not-equal, and return the boolean result (0 or 1).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd
+FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
+{
+ return !_mm_comieq_sd(a, b);
+}
+
+// Convert packed signed 32-bit integers in a to packed double-precision
+// (64-bit) floating-point elements, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := j*32
+// m := j*64
+// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd
+FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
+#else
+ double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
+ double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
+ return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Converts the four signed 32-bit integer values of a to single-precision,
+// floating-point values
+// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
+{
+ return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := 32*j
+// k := 64*j
+// dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32
+FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
+{
+ __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+ double d0 = ((double *) &rnd)[0];
+ double d1 = ((double *) &rnd)[1];
+ return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := 32*j
+// k := 64*j
+// dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32
+FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
+{
+ __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+ double d0 = ((double *) &rnd)[0];
+ double d1 = ((double *) &rnd)[1];
+ int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
+ return vreinterpret_m64_s32(vld1_s32(data));
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed single-precision (32-bit) floating-point elements, and store the
+// results in dst.
+//
+// FOR j := 0 to 1
+// i := 32*j
+// k := 64*j
+// dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
+// ENDFOR
+// dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
+FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
+{
+#if defined(__aarch64__)
+ float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
+ return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
+#else
+ float a0 = (float) ((double *) &a)[0];
+ float a1 = (float) ((double *) &a)[1];
+ return _mm_set_ps(0, 0, a1, a0);
+#endif
+}
+
+// Convert packed signed 32-bit integers in a to packed double-precision
+// (64-bit) floating-point elements, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := j*32
+// m := j*64
+// dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd
+FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
+#else
+ double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
+ double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
+ return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Converts the four single-precision, floating-point values of a to signed
+// 32-bit integer values.
+//
+// r0 := (int) a0
+// r1 := (int) a1
+// r2 := (int) a2
+// r3 := (int) a3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
+// *NOTE*: The default rounding mode on SSE is 'round to nearest, ties to
+// even', which ARMv7-A does not support natively. ARMv8-A does support it.
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+ switch (_MM_GET_ROUNDING_MODE()) {
+ case _MM_ROUND_NEAREST:
+ return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
+ case _MM_ROUND_DOWN:
+ return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
+ case _MM_ROUND_UP:
+ return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
+ default: // _MM_ROUND_TOWARD_ZERO
+ return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
+ }
+#else
+ float *f = (float *) &a;
+ switch (_MM_GET_ROUNDING_MODE()) {
+ case _MM_ROUND_NEAREST: {
+ uint32x4_t signmask = vdupq_n_u32(0x80000000);
+ float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
+ vdupq_n_f32(0.5f)); /* +/- 0.5 */
+ int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
+ vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
+ int32x4_t r_trunc = vcvtq_s32_f32(
+ vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
+ int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
+ vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
+ int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
+ vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
+ float32x4_t delta = vsubq_f32(
+ vreinterpretq_f32_m128(a),
+ vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
+ uint32x4_t is_delta_half =
+ vceqq_f32(delta, half); /* delta == +/- 0.5 */
+ return vreinterpretq_m128i_s32(
+ vbslq_s32(is_delta_half, r_even, r_normal));
+ }
+ case _MM_ROUND_DOWN:
+ return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
+ floorf(f[0]));
+ case _MM_ROUND_UP:
+ return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
+ ceilf(f[0]));
+ default: // _MM_ROUND_TOWARD_ZERO
+ return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
+ (int32_t) f[0]);
+ }
+#endif
+}
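+
+// Illustrative sketch (hypothetical helper, not part of the upstream sse2neon
+// API): under the default rounding mode (round to nearest, ties to even) the
+// conversion above maps 0.5f to 0 and 1.5f to 2, unlike plain truncation.
+FORCE_INLINE __m128i sse2neon_example_round_ties_to_even_demo(void)
+{
+    __m128 v = _mm_set_ps(2.5f, 1.5f, 0.5f, -0.5f);
+    return _mm_cvtps_epi32(v); // lanes are {0, 0, 2, 2} from low to high
+}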
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed double-precision (64-bit) floating-point elements, and store the
+// results in dst.
+//
+// FOR j := 0 to 1
+// i := 64*j
+// k := 32*j
+// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
+FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
+#else
+ double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+ double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+ return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Copy the lower double-precision (64-bit) floating-point element of a to dst.
+//
+// dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
+FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
+{
+#if defined(__aarch64__)
+ return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
+#else
+ return ((double *) &a)[0];
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+//
+// dst[31:0] := Convert_FP64_To_Int32(a[63:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32
+FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
+{
+#if defined(__aarch64__)
+ return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
+#else
+ __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+ double ret = ((double *) &rnd)[0];
+ return (int32_t) ret;
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+//
+// dst[63:0] := Convert_FP64_To_Int64(a[63:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64
+FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
+{
+#if defined(__aarch64__)
+ return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
+#else
+ __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+ double ret = ((double *) &rnd)[0];
+ return (int64_t) ret;
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+//
+// dst[63:0] := Convert_FP64_To_Int64(a[63:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x
+#define _mm_cvtsd_si64x _mm_cvtsd_si64
+
+// Convert the lower double-precision (64-bit) floating-point element in b to a
+// single-precision (32-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper 3 packed elements from a to the
+// upper elements of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss
+FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128_f32(vsetq_lane_f32(
+ vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
+ vreinterpretq_f32_m128(a), 0));
+#else
+ return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
+ vreinterpretq_f32_m128(a), 0));
+#endif
+}
+
+// Copy the lower 32-bit integer in a to dst.
+//
+// dst[31:0] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
+FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
+{
+ return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+//
+// dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
+FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
+{
+ return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
+#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
+
+// Convert the signed 32-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd
+FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
+#else
+ double bf = (double) b;
+ return vreinterpretq_m128d_s64(
+ vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Moves 32-bit integer a to the least significant 32 bits of an __m128i
+// object, zero extending the upper bits.
+//
+// r0 := a
+// r1 := 0x0
+// r2 := 0x0
+// r3 := 0x0
+//
+// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
+{
+ return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
+}
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd
+FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
+#else
+ double bf = (double) b;
+ return vreinterpretq_m128d_s64(
+ vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Moves 64-bit integer a to the least significant 64 bits of an __m128i
+// object, zero extending the upper bits.
+//
+// r0 := a
+// r1 := 0x0
+FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
+{
+ return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
+}
+
+// Copy 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128
+#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd
+#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
+
+// Convert the lower single-precision (32-bit) floating-point element in b to a
+// double-precision (64-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+//
+// dst[63:0] := Convert_FP32_To_FP64(b[31:0])
+// dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
+FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
+{
+ double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
+#else
+ return vreinterpretq_m128d_s64(
+ vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32
+FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
+{
+ double a0 = ((double *) &a)[0];
+ double a1 = ((double *) &a)[1];
+ return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32
+FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
+{
+ double a0 = ((double *) &a)[0];
+ double a1 = ((double *) &a)[1];
+ int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
+ return vreinterpret_m64_s32(vld1_s32(data));
+}
+
+// Converts the four single-precision, floating-point values of a to signed
+// 32-bit integer values using truncation.
+// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
+{
+ return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
+}
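+
+// Illustrative sketch (hypothetical helper, not part of the upstream sse2neon
+// API): the "tt" (truncating) conversions always chop toward zero regardless
+// of the current rounding mode, so 1.9f becomes 1 here while _mm_cvtps_epi32
+// above would round it to 2.
+FORCE_INLINE __m128i sse2neon_example_truncate_demo(void)
+{
+    __m128 v = _mm_set_ps(-1.9f, -1.1f, 1.9f, 1.1f);
+    return _mm_cvttps_epi32(v); // lanes are {1, 1, -1, -1} from low to high
+}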
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+//
+// dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32
+FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
+{
+ double ret = *((double *) &a);
+ return (int32_t) ret;
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+//
+// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
+FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
+{
+#if defined(__aarch64__)
+ return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
+#else
+ double ret = *((double *) &a);
+ return (int64_t) ret;
+#endif
+}
+
+// Convert the lower double-precision (64-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+//
+// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
+#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
+
+// Divide packed double-precision (64-bit) floating-point elements in a by
+// packed elements in b, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := 64*j
+// dst[i+63:i] := a[i+63:i] / b[i+63:i]
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd
+FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ double *da = (double *) &a;
+ double *db = (double *) &b;
+ double c[2];
+ c[0] = da[0] / db[0];
+ c[1] = da[1] / db[1];
+ return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Divide the lower double-precision (64-bit) floating-point element in a by the
+// lower double-precision (64-bit) floating-point element in b, store the result
+// in the lower element of dst, and copy the upper element from a to the upper
+// element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd
+FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ float64x2_t tmp =
+ vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
+ return vreinterpretq_m128d_f64(
+ vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
+#else
+ return _mm_move_sd(a, _mm_div_pd(a, b));
+#endif
+}
+
+// Extracts the selected signed or unsigned 16-bit integer from a and zero
+// extends.
+// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
+// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
+#define _mm_extract_epi16(a, imm) \
+ vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
+
+// Inserts the least significant 16 bits of b into the selected 16-bit integer
+// of a.
+// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
+// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
+// __constrange(0,8) int imm)
+#define _mm_insert_epi16(a, b, imm) \
+ __extension__({ \
+ vreinterpretq_m128i_s16( \
+ vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
+ })
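+
+// Illustrative sketch (hypothetical helper, not part of the upstream sse2neon
+// API): the extract/insert macros above require the lane index to be a
+// compile-time constant, e.g. replacing 16-bit lane 3 while leaving the other
+// lanes untouched.
+FORCE_INLINE __m128i sse2neon_example_set_lane3_epi16(__m128i v, int16_t x)
+{
+    return _mm_insert_epi16(v, x, 3);
+}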
+
+// Loads two double-precision floating-point values from 16-byte aligned
+// memory.
+//
+// dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
+FORCE_INLINE __m128d _mm_load_pd(const double *p)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(vld1q_f64(p));
+#else
+ const float *fp = (const float *) p;
+ float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
+ return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
+#define _mm_load_pd1 _mm_load1_pd
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower of dst, and zero the upper element. mem_addr does not need to be
+// aligned on any particular boundary.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
+FORCE_INLINE __m128d _mm_load_sd(const double *p)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
+#else
+ const float *fp = (const float *) p;
+ float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
+ return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Loads 128-bit value.
+// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
+FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
+{
+ return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
+FORCE_INLINE __m128d _mm_load1_pd(const double *p)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
+#else
+ return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
+#endif
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// upper element of dst, and copy the lower element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+//
+// dst[63:0] := a[63:0]
+// dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
+FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
+#else
+ return vreinterpretq_m128d_f32(vcombine_f32(
+ vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
+#endif
+}
+
+// Load 64-bit integer from memory into the first element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
+FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
+{
+ /* Load the lower 64 bits of the value pointed to by p into the
+ * lower 64 bits of the result, zeroing the upper 64 bits of the result.
+ */
+ return vreinterpretq_m128i_s32(
+ vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower element of dst, and copy the upper element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
+FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
+#else
+ return vreinterpretq_m128d_f32(
+ vcombine_f32(vld1_f32((const float *) p),
+ vget_high_f32(vreinterpretq_f32_m128d(a))));
+#endif
+}
+
+// Load 2 double-precision (64-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+// dst[63:0] := MEM[mem_addr+127:mem_addr+64]
+// dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
+FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
+{
+#if defined(__aarch64__)
+ float64x2_t v = vld1q_f64(p);
+ return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
+#else
+ int64x2_t v = vld1q_s64((const int64_t *) p);
+ return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
+#endif
+}
+
+// Loads two double-precision (64-bit) floating-point values from unaligned
+// memory.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
+FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
+{
+ return _mm_load_pd(p);
+}
+
+// Loads 128-bit value.
+// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
+{
+ return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load unaligned 32-bit integer from memory into the first element of dst.
+//
+// dst[31:0] := MEM[mem_addr+31:mem_addr]
+// dst[MAX:32] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
+FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
+{
+ return vreinterpretq_m128i_s32(
+ vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
+}
+
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
+// integers from b.
+//
+// r0 := (a0 * b0) + (a1 * b1)
+// r1 := (a2 * b2) + (a3 * b3)
+// r2 := (a4 * b4) + (a5 * b5)
+// r3 := (a6 * b6) + (a7 * b7)
+// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
+{
+ int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+ vget_low_s16(vreinterpretq_s16_m128i(b)));
+ int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+ vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+ int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
+ int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
+
+ return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
+}
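+
+// Usage sketch for _mm_madd_epi16 (illustrative; the input values below are
+// chosen purely for demonstration):
+//   __m128i a = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); // lanes 0..7 = 1..8
+//   __m128i b = _mm_set1_epi16(2);
+//   __m128i r = _mm_madd_epi16(a, b); // 32-bit lanes: {6, 14, 22, 30}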
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint. mem_addr does not need to be aligned
+// on any particular boundary.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128
+FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
+{
+ int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
+ __m128 b = _mm_load_ps((const float *) mem_addr);
+ int8x16_t masked =
+ vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
+ vreinterpretq_s8_m128(b));
+ vst1q_s8((int8_t *) mem_addr, masked);
+}
+
+// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
+// signed 16-bit integers from b.
+// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s16(
+ vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
+// 16 unsigned 8-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b,
+// and store packed maximum values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd
+FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+#if SSE2NEON_PRECISE_MINMAX
+ float64x2_t _a = vreinterpretq_f64_m128d(a);
+ float64x2_t _b = vreinterpretq_f64_m128d(b);
+ return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
+#else
+ return vreinterpretq_m128d_f64(
+ vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#endif
+#else
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+ uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+ uint64_t d[2];
+ d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
+ d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
+
+ return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b, store the maximum value in the lower element of dst, and copy the upper
+// element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd
+FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return _mm_move_sd(a, _mm_max_pd(a, b));
+#else
+ double *da = (double *) &a;
+ double *db = (double *) &b;
+ double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
+ return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
+#endif
+}
+
+// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
+// signed 16-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s16(
+ vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
+// 16 unsigned 8-bit integers from b.
+// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
+FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b,
+// and store packed minimum values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd
+FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+#if SSE2NEON_PRECISE_MINMAX
+ float64x2_t _a = vreinterpretq_f64_m128d(a);
+ float64x2_t _b = vreinterpretq_f64_m128d(b);
+ return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
+#else
+ return vreinterpretq_m128d_f64(
+ vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#endif
+#else
+ uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+ uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+ uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+ uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+ uint64_t d[2];
+ d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
+ d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
+ return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b, store the minimum value in the lower element of dst, and copy the upper
+// element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd
+FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return _mm_move_sd(a, _mm_min_pd(a, b));
+#else
+ double *da = (double *) &a;
+ double *db = (double *) &b;
+ double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
+ return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
+#endif
+}
+
+// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
+// upper element.
+//
+// dst[63:0] := a[63:0]
+// dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
+FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
+{
+ return vreinterpretq_m128i_s64(
+ vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
+}
+
+// Move the lower double-precision (64-bit) floating-point element from b to the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+//
+// dst[63:0] := b[63:0]
+// dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
+FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
+{
+ return vreinterpretq_m128d_f32(
+ vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
+ vget_high_f32(vreinterpretq_f32_m128d(a))));
+}
+
+// NEON does not provide a version of this function.
+// Creates a 16-bit mask from the most significant bits of the 16 signed or
+// unsigned 8-bit integers in a and zero extends the upper bits.
+// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
+FORCE_INLINE int _mm_movemask_epi8(__m128i a)
+{
+ // Use increasingly wide shifts+adds to collect the sign bits
+ // together.
+ // Since the widening shifts would be rather confusing to follow in little
+ // endian, everything will be illustrated in big endian order instead. This
+ // has a different result - the bits would actually be reversed on a big
+ // endian machine.
+
+ // Starting input (only half the elements are shown):
+ // 89 ff 1d c0 00 10 99 33
+ uint8x16_t input = vreinterpretq_u8_m128i(a);
+
+ // Shift out everything but the sign bits with an unsigned shift right.
+ //
+    // Bytes of the vector:
+ // 89 ff 1d c0 00 10 99 33
+ // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
+ // | | | | | | | |
+ // 01 01 00 01 00 00 01 00
+ //
+ // Bits of first important lane(s):
+ // 10001001 (89)
+ // \______
+ // |
+ // 00000001 (01)
+ uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
+
+ // Merge the even lanes together with a 16-bit unsigned shift right + add.
+ // 'xx' represents garbage data which will be ignored in the final result.
+ // In the important bytes, the add functions like a binary OR.
+ //
+ // 01 01 00 01 00 00 01 00
+ // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
+ // \| \| \| \|
+ // xx 03 xx 01 xx 00 xx 02
+ //
+ // 00000001 00000001 (01 01)
+ // \_______ |
+ // \|
+ // xxxxxxxx xxxxxx11 (xx 03)
+ uint32x4_t paired16 =
+ vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+
+ // Repeat with a wider 32-bit shift + add.
+ // xx 03 xx 01 xx 00 xx 02
+ // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
+ // 14))
+ // \| \|
+ // xx xx xx 0d xx xx xx 02
+ //
+ // 00000011 00000001 (03 01)
+ // \\_____ ||
+ // '----.\||
+ // xxxxxxxx xxxx1101 (xx 0d)
+ uint64x2_t paired32 =
+ vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+
+ // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
+ // lanes. xx xx xx 0d xx xx xx 02
+ // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>
+ // 28))
+ // \|
+ // xx xx xx xx xx xx xx d2
+ //
+ // 00001101 00000010 (0d 02)
+ // \ \___ | |
+ // '---. \| |
+ // xxxxxxxx 11010010 (xx d2)
+ uint8x16_t paired64 =
+ vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+
+ // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
+ // xx xx xx xx xx xx xx d2
+ // || return paired64[0]
+ // d2
+ // Note: Little endian would return the correct value 4b (01001011) instead.
+ return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
+}
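+
+// Usage sketch for _mm_movemask_epi8 (illustrative values only): bit i of the
+// result mirrors the sign bit of byte i.
+//   __m128i v = _mm_setr_epi8(-1, 0, 0, 0, 0, 0, 0, 0,
+//                             0, 0, 0, 0, 0, 0, 0, -1);
+//   int m = _mm_movemask_epi8(v); // m == 0x8001 (bits 0 and 15 set)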
+
+// Set each bit of mask dst based on the most significant bit of the
+// corresponding packed double-precision (64-bit) floating-point element in a.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd
+FORCE_INLINE int _mm_movemask_pd(__m128d a)
+{
+ uint64x2_t input = vreinterpretq_u64_m128d(a);
+ uint64x2_t high_bits = vshrq_n_u64(input, 63);
+ return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+//
+// dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
+FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
+{
+ return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
+}
+
+// Copy the 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+//
+// dst[63:0] := a[63:0]
+// dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
+FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
+{
+ return vreinterpretq_m128i_s64(
+ vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
+}
+
+// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
+// a and b, and store the unsigned 64-bit results in dst.
+//
+// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
+// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
+FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
+{
+ // vmull_u32 upcasts instead of masking, so we downcast.
+ uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
+ uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
+ return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
+}
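+
+// Usage sketch for _mm_mul_epu32 (illustrative values only): only the
+// even-indexed 32-bit elements take part in the multiplication.
+//   __m128i a = _mm_setr_epi32(2, 77, 3, 88);   // 77 and 88 are ignored
+//   __m128i b = _mm_setr_epi32(10, 77, 20, 88);
+//   __m128i r = _mm_mul_epu32(a, b); // 64-bit lanes: {20, 60}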
+
+// Multiply packed double-precision (64-bit) floating-point elements in a and b,
+// and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
+FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ double *da = (double *) &a;
+ double *db = (double *) &b;
+ double c[2];
+ c[0] = da[0] * db[0];
+ c[1] = da[1] * db[1];
+ return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Multiply the lower double-precision (64-bit) floating-point element in a and
+// b, store the result in the lower element of dst, and copy the upper element
+// from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
+FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
+{
+ return _mm_move_sd(a, _mm_mul_pd(a, b));
+}
+
+// Multiply the low unsigned 32-bit integers from a and b, and store the
+// unsigned 64-bit result in dst.
+//
+// dst[63:0] := a[31:0] * b[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
+FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_u64(vget_low_u64(
+ vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
+}
+
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
+// integers from b.
+//
+// r0 := (a0 * b0)[31:16]
+// r1 := (a1 * b1)[31:16]
+// ...
+// r7 := (a7 * b7)[31:16]
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
+{
+ /* FIXME: issue with large values because of result saturation */
+ // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
+ // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
+ // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
+ int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
+ int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
+ int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
+ int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
+ int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
+ int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
+ uint16x8x2_t r =
+ vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
+ return vreinterpretq_m128i_u16(r.val[1]);
+}
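+
+// Worked example for _mm_mulhi_epi16 (demonstration values only):
+// 0x4000 * 0x4000 = 0x10000000, whose high 16 bits are 0x1000.
+//   __m128i a = _mm_set1_epi16(0x4000);
+//   __m128i r = _mm_mulhi_epi16(a, a); // every 16-bit lane == 0x1000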
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
+FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
+{
+ uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
+ uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
+ uint32x4_t ab3210 = vmull_u16(a3210, b3210);
+#if defined(__aarch64__)
+ uint32x4_t ab7654 =
+ vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
+ uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
+ vreinterpretq_u16_u32(ab7654));
+ return vreinterpretq_m128i_u16(r);
+#else
+ uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
+ uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
+ uint32x4_t ab7654 = vmull_u16(a7654, b7654);
+ uint16x8x2_t r =
+ vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
+ return vreinterpretq_m128i_u16(r.val[1]);
+#endif
+}
+
+// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
+// unsigned 16-bit integers from b.
+//
+// r0 := (a0 * b0)[15:0]
+// r1 := (a1 * b1)[15:0]
+// ...
+// r7 := (a7 * b7)[15:0]
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s16(
+ vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compute the bitwise OR of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
+FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
+{
+ return vreinterpretq_m128d_s64(
+ vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
+//
+// r := a | b
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s32(
+ vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
+// saturates.
+// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s8(
+ vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
+ vqmovn_s16(vreinterpretq_s16_m128i(b))));
+}
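+
+// Usage sketch for _mm_packs_epi16 (illustrative values only): results
+// saturate to the signed 8-bit range [-128, 127].
+//   __m128i r = _mm_packs_epi16(_mm_set1_epi16(300), _mm_set1_epi16(-300));
+//   // bytes 0..7 == 127, bytes 8..15 == -128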
+
+// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
+// and saturates.
+//
+// r0 := SignedSaturate(a0)
+// r1 := SignedSaturate(a1)
+// r2 := SignedSaturate(a2)
+// r3 := SignedSaturate(a3)
+// r4 := SignedSaturate(b0)
+// r5 := SignedSaturate(b1)
+// r6 := SignedSaturate(b2)
+// r7 := SignedSaturate(b3)
+//
+// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s16(
+ vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
+ vqmovn_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
+// integers and saturates.
+//
+// r0 := UnsignedSaturate(a0)
+// r1 := UnsignedSaturate(a1)
+// ...
+// r7 := UnsignedSaturate(a7)
+// r8 := UnsignedSaturate(b0)
+// r9 := UnsignedSaturate(b1)
+// ...
+// r15 := UnsignedSaturate(b7)
+//
+// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
+ vqmovun_s16(vreinterpretq_s16_m128i(b))));
+}
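+
+// Usage sketch for _mm_packus_epi16 (illustrative values only): results
+// saturate to the unsigned 8-bit range [0, 255].
+//   __m128i r = _mm_packus_epi16(_mm_set1_epi16(300), _mm_set1_epi16(-5));
+//   // bytes 0..7 == 255, bytes 8..15 == 0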
+
+// Pause the processor. This is typically used in spin-wait loops and, depending
+// on the x86 processor, typical values are in the 40-100 cycle range. The
+// 'yield' instruction isn't a good fit because it's effectively a nop on most
+// Arm cores. Experience with several databases has shown an 'isb' is a
+// reasonable approximation.
+FORCE_INLINE void _mm_pause()
+{
+ __asm__ __volatile__("isb\n");
+}
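+
+// Spin-wait sketch for _mm_pause (illustrative only; the 'ready' flag and the
+// GCC/Clang __atomic_load_n builtin are assumptions for demonstration):
+//   while (!__atomic_load_n(&ready, __ATOMIC_ACQUIRE))
+//       _mm_pause();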
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce two
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of 64-bit elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
+FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
+{
+ uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
+ return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
+}
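+
+// Usage sketch for _mm_sad_epu8 (illustrative values only): each 64-bit lane
+// sums eight absolute byte differences.
+//   __m128i r = _mm_sad_epu8(_mm_set1_epi8(9), _mm_set1_epi8(7));
+//   // both 64-bit lanes == 8 * |9 - 7| == 16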
+
+// Sets the 8 signed 16-bit integer values.
+// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_set_epi16(short i7,
+ short i6,
+ short i5,
+ short i4,
+ short i3,
+ short i2,
+ short i1,
+ short i0)
+{
+ int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+ return vreinterpretq_m128i_s16(vld1q_s16(data));
+}
+
+// Sets the 4 signed 32-bit integer values.
+// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+ int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
+ return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Returns the __m128i structure with its two 64-bit integer values
+// initialized to the values of the two 64-bit integers passed in.
+// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
+{
+ return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
+}
+
+// Returns the __m128i structure with its two 64-bit integer values
+// initialized to the values of the two 64-bit integers passed in.
+// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
+{
+ return vreinterpretq_m128i_s64(
+ vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
+}
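+
+// Note on argument order for _mm_set_epi64x (illustrative values only): the
+// last argument lands in the low 64-bit lane.
+//   __m128i v = _mm_set_epi64x(1, 2); // low lane == 2, high lane == 1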
+
+// Sets the 16 signed 8-bit integer values.
+// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
+ signed char b14,
+ signed char b13,
+ signed char b12,
+ signed char b11,
+ signed char b10,
+ signed char b9,
+ signed char b8,
+ signed char b7,
+ signed char b6,
+ signed char b5,
+ signed char b4,
+ signed char b3,
+ signed char b2,
+ signed char b1,
+ signed char b0)
+{
+ int8_t ALIGN_STRUCT(16)
+ data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
+ (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
+ (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
+ (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+ return (__m128i) vld1q_s8(data);
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
+FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
+{
+ double ALIGN_STRUCT(16) data[2] = {e0, e1};
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
+#else
+ return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
+#endif
+}
+
+// Broadcast double-precision (64-bit) floating-point value a to all elements of
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
+#define _mm_set_pd1 _mm_set1_pd
+
+// Copy double-precision (64-bit) floating-point element a to the lower element
+// of dst, and zero the upper element.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
+FORCE_INLINE __m128d _mm_set_sd(double a)
+{
+ return _mm_set_pd(0, a);
+}
+
+// Sets the 8 signed 16-bit integer values to w.
+//
+// r0 := w
+// r1 := w
+// ...
+// r7 := w
+//
+// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_set1_epi16(short w)
+{
+ return vreinterpretq_m128i_s16(vdupq_n_s16(w));
+}
+
+// Sets the 4 signed 32-bit integer values to i.
+//
+// r0 := i
+// r1 := i
+// r2 := i
+// r3 := i
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set1_epi32(int _i)
+{
+ return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
+}
+
+// Sets the 2 signed 64-bit integer values to i.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
+FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
+{
+ return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
+}
+
+// Sets the 2 signed 64-bit integer values to i.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
+FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
+{
+ return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
+}
+
+// Sets the 16 signed 8-bit integer values to b.
+//
+// r0 := b
+// r1 := b
+// ...
+// r15 := b
+//
+// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
+{
+ return vreinterpretq_m128i_s8(vdupq_n_s8(w));
+}
+
+// Broadcast double-precision (64-bit) floating-point value a to all elements of
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
+FORCE_INLINE __m128d _mm_set1_pd(double d)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(vdupq_n_f64(d));
+#else
+ return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
+#endif
+}
+
+// Sets the 8 signed 16-bit integer values in reverse order.
+//
+// Return Value
+// r0 := w0
+// r1 := w1
+// ...
+// r7 := w7
+FORCE_INLINE __m128i _mm_setr_epi16(short w0,
+ short w1,
+ short w2,
+ short w3,
+ short w4,
+ short w5,
+ short w6,
+ short w7)
+{
+ int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
+ return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
+}
+
+// Sets the 4 signed 32-bit integer values in reverse order
+// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
+{
+ int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
+ return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Set packed 64-bit integers in dst with the supplied values in reverse order.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
+FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
+{
+ return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
+}
+
+// Sets the 16 signed 8-bit integer values in reverse order.
+// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
+ signed char b1,
+ signed char b2,
+ signed char b3,
+ signed char b4,
+ signed char b5,
+ signed char b6,
+ signed char b7,
+ signed char b8,
+ signed char b9,
+ signed char b10,
+ signed char b11,
+ signed char b12,
+ signed char b13,
+ signed char b14,
+ signed char b15)
+{
+ int8_t ALIGN_STRUCT(16)
+ data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
+ (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
+ (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
+ (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+ return (__m128i) vld1q_s8(data);
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values in reverse order.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
+FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
+{
+ return _mm_set_pd(e0, e1);
+}
+
+// Return vector of type __m128d with all elements set to zero.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
+FORCE_INLINE __m128d _mm_setzero_pd(void)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(vdupq_n_f64(0));
+#else
+ return vreinterpretq_m128d_f32(vdupq_n_f32(0));
+#endif
+}
+
+// Sets the 128-bit value to zero
+// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_setzero_si128(void)
+{
+ return vreinterpretq_m128i_s32(vdupq_n_s32(0));
+}
+
+// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
+// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
+// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
+// __constrange(0,255) int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_epi32(a, imm) \
+ __extension__({ \
+ int32x4_t _input = vreinterpretq_s32_m128i(a); \
+ int32x4_t _shuf = __builtin_shufflevector( \
+ _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+ ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
+ vreinterpretq_m128i_s32(_shuf); \
+ })
+#else // generic
+#define _mm_shuffle_epi32(a, imm) \
+ __extension__({ \
+ __m128i ret; \
+ switch (imm) { \
+ case _MM_SHUFFLE(1, 0, 3, 2): \
+ ret = _mm_shuffle_epi_1032((a)); \
+ break; \
+ case _MM_SHUFFLE(2, 3, 0, 1): \
+ ret = _mm_shuffle_epi_2301((a)); \
+ break; \
+ case _MM_SHUFFLE(0, 3, 2, 1): \
+ ret = _mm_shuffle_epi_0321((a)); \
+ break; \
+ case _MM_SHUFFLE(2, 1, 0, 3): \
+ ret = _mm_shuffle_epi_2103((a)); \
+ break; \
+ case _MM_SHUFFLE(1, 0, 1, 0): \
+ ret = _mm_shuffle_epi_1010((a)); \
+ break; \
+ case _MM_SHUFFLE(1, 0, 0, 1): \
+ ret = _mm_shuffle_epi_1001((a)); \
+ break; \
+ case _MM_SHUFFLE(0, 1, 0, 1): \
+ ret = _mm_shuffle_epi_0101((a)); \
+ break; \
+ case _MM_SHUFFLE(2, 2, 1, 1): \
+ ret = _mm_shuffle_epi_2211((a)); \
+ break; \
+ case _MM_SHUFFLE(0, 1, 2, 2): \
+ ret = _mm_shuffle_epi_0122((a)); \
+ break; \
+ case _MM_SHUFFLE(3, 3, 3, 2): \
+ ret = _mm_shuffle_epi_3332((a)); \
+ break; \
+ case _MM_SHUFFLE(0, 0, 0, 0): \
+ ret = _mm_shuffle_epi32_splat((a), 0); \
+ break; \
+ case _MM_SHUFFLE(1, 1, 1, 1): \
+ ret = _mm_shuffle_epi32_splat((a), 1); \
+ break; \
+ case _MM_SHUFFLE(2, 2, 2, 2): \
+ ret = _mm_shuffle_epi32_splat((a), 2); \
+ break; \
+ case _MM_SHUFFLE(3, 3, 3, 3): \
+ ret = _mm_shuffle_epi32_splat((a), 3); \
+ break; \
+ default: \
+ ret = _mm_shuffle_epi32_default((a), (imm)); \
+ break; \
+ } \
+ ret; \
+ })
+#endif
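+
+// Usage sketch for _mm_shuffle_epi32 (illustrative values only):
+// _MM_SHUFFLE(0, 1, 2, 3) reverses the four 32-bit elements.
+//   __m128i a = _mm_setr_epi32(10, 20, 30, 40);
+//   __m128i r = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
+//   // r == {40, 30, 20, 10} in element order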
+
+// Shuffle double-precision (64-bit) floating-point elements using the control
+// in imm8, and store the results in dst.
+//
+// dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
+// dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_pd(a, b, imm8) \
+ vreinterpretq_m128d_s64(__builtin_shufflevector( \
+ vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \
+ ((imm8 & 0x2) >> 1) + 2))
+#else
+#define _mm_shuffle_pd(a, b, imm8) \
+ _mm_castsi128_pd(_mm_set_epi64x( \
+ vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
+ vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
+#endif
+
+// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
+// __constrange(0,255) int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shufflehi_epi16(a, imm) \
+ __extension__({ \
+ int16x8_t _input = vreinterpretq_s16_m128i(a); \
+ int16x8_t _shuf = __builtin_shufflevector( \
+ _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
+ (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
+ (((imm) >> 6) & 0x3) + 4); \
+ vreinterpretq_m128i_s16(_shuf); \
+ })
+#else // generic
+#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
+#endif
+
+// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
+// __constrange(0,255) int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shufflelo_epi16(a, imm) \
+ __extension__({ \
+ int16x8_t _input = vreinterpretq_s16_m128i(a); \
+ int16x8_t _shuf = __builtin_shufflevector( \
+ _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
+ (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
+ vreinterpretq_m128i_s16(_shuf); \
+ })
+#else // generic
+#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
+#endif
+
+// Shift packed 16-bit integers in a left by count while shifting in zeros, and
+// store the results in dst.
+//
+// FOR j := 0 to 7
+// i := j*16
+// IF count[63:0] > 15
+// dst[i+15:i] := 0
+// ELSE
+// dst[i+15:i] := ZeroExtend16(a[i+15:i] << count[63:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi16
+FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
+{
+ uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+ if (_sse2neon_unlikely(c & ~15))
+ return _mm_setzero_si128();
+
+ int16x8_t vc = vdupq_n_s16((int16_t) c);
+ return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
+}
+
+// Shift packed 32-bit integers in a left by count while shifting in zeros, and
+// store the results in dst.
+//
+// FOR j := 0 to 3
+// i := j*32
+// IF count[63:0] > 31
+// dst[i+31:i] := 0
+// ELSE
+// dst[i+31:i] := ZeroExtend32(a[i+31:i] << count[63:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi32
+FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
+{
+ uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+ if (_sse2neon_unlikely(c & ~31))
+ return _mm_setzero_si128();
+
+ int32x4_t vc = vdupq_n_s32((int32_t) c);
+ return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
+}
+
+// Shift packed 64-bit integers in a left by count while shifting in zeros, and
+// store the results in dst.
+//
+// FOR j := 0 to 1
+// i := j*64
+// IF count[63:0] > 63
+// dst[i+63:i] := 0
+// ELSE
+// dst[i+63:i] := ZeroExtend64(a[i+63:i] << count[63:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi64
+FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
+{
+ uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+ if (_sse2neon_unlikely(c & ~63))
+ return _mm_setzero_si128();
+
+ int64x2_t vc = vdupq_n_s64((int64_t) c);
+ return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
+}
+
+// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+// FOR j := 0 to 7
+// i := j*16
+// IF imm8[7:0] > 15
+// dst[i+15:i] := 0
+// ELSE
+// dst[i+15:i] := ZeroExtend16(a[i+15:i] << imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi16
+FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
+{
+ if (_sse2neon_unlikely(imm & ~15))
+ return _mm_setzero_si128();
+ return vreinterpretq_m128i_s16(
+ vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
+}
+
+// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+// FOR j := 0 to 3
+// i := j*32
+// IF imm8[7:0] > 31
+// dst[i+31:i] := 0
+// ELSE
+// dst[i+31:i] := ZeroExtend32(a[i+31:i] << imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi32
+FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
+{
+ if (_sse2neon_unlikely(imm & ~31))
+ return _mm_setzero_si128();
+ return vreinterpretq_m128i_s32(
+ vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
+}
+
+// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+// FOR j := 0 to 1
+// i := j*64
+// IF imm8[7:0] > 63
+// dst[i+63:i] := 0
+// ELSE
+// dst[i+63:i] := ZeroExtend64(a[i+63:i] << imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi64
+FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
+{
+ if (_sse2neon_unlikely(imm & ~63))
+ return _mm_setzero_si128();
+ return vreinterpretq_m128i_s64(
+ vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
+}
+
+// Shift a left by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+//
+// tmp := imm8[7:0]
+// IF tmp > 15
+// tmp := 16
+// FI
+// dst[127:0] := a[127:0] << (tmp*8)
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_si128
+FORCE_INLINE __m128i _mm_slli_si128(__m128i a, int imm)
+{
+ if (_sse2neon_unlikely(imm & ~15))
+ return _mm_setzero_si128();
+ uint8x16_t tmp[2] = {vdupq_n_u8(0), vreinterpretq_u8_m128i(a)};
+ return vreinterpretq_m128i_u8(
+ vld1q_u8(((uint8_t const *) tmp) + (16 - imm)));
+}
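+
+// Usage sketch for _mm_slli_si128 (illustrative values only): the shift count
+// is in bytes, moving data toward the most-significant end.
+//   __m128i a = _mm_setr_epi32(1, 2, 3, 4);
+//   __m128i r = _mm_slli_si128(a, 4); // r == {0, 1, 2, 3} in element order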
+
+// Compute the square root of packed double-precision (64-bit) floating-point
+// elements in a, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd
+FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
+#else
+ double a0 = sqrt(((double *) &a)[0]);
+ double a1 = sqrt(((double *) &a)[1]);
+ return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Compute the square root of the lower double-precision (64-bit) floating-point
+// element in b, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd
+FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return _mm_move_sd(a, _mm_sqrt_pd(b));
+#else
+ return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
+#endif
+}
+
+// Shift packed 16-bit integers in a right by count while shifting in sign bits,
+// and store the results in dst.
+//
+// FOR j := 0 to 7
+// i := j*16
+// IF count[63:0] > 15
+// dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+// ELSE
+// dst[i+15:i] := SignExtend16(a[i+15:i] >> count[63:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi16
+FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
+{
+ int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+ if (_sse2neon_unlikely(c & ~15))
+ return _mm_cmplt_epi16(a, _mm_setzero_si128());
+ return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
+}
+
+// Shift packed 32-bit integers in a right by count while shifting in sign bits,
+// and store the results in dst.
+//
+// FOR j := 0 to 3
+// i := j*32
+// IF count[63:0] > 31
+// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+// ELSE
+// dst[i+31:i] := SignExtend32(a[i+31:i] >> count[63:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi32
+FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
+{
+ int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+ if (_sse2neon_unlikely(c & ~31))
+ return _mm_cmplt_epi32(a, _mm_setzero_si128());
+ return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
+}
+
+// Shift packed 16-bit integers in a right by imm8 while shifting in sign
+// bits, and store the results in dst.
+//
+// FOR j := 0 to 7
+// i := j*16
+// IF imm8[7:0] > 15
+// dst[i+15:i] := (a[i+15] ? 0xFFFF : 0x0)
+// ELSE
+// dst[i+15:i] := SignExtend16(a[i+15:i] >> imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
+FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
+{
+ const int count = (imm & ~15) ? 15 : imm;
+ return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
+}
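+
+// Usage sketch for _mm_srai_epi16 (illustrative values only): the sign bit is
+// replicated into the vacated positions.
+//   __m128i r = _mm_srai_epi16(_mm_set1_epi16(-8), 1); // every lane == -4
+//   // _mm_srli_epi16(_mm_set1_epi16(-8), 1) would instead give 0x7FFC per lane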
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
+// and store the results in dst.
+//
+// FOR j := 0 to 3
+// i := j*32
+// IF imm8[7:0] > 31
+// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+// ELSE
+// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
+// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srai_epi32(a, imm) \
+ __extension__({ \
+ __m128i ret; \
+ if (_sse2neon_unlikely((imm) == 0)) { \
+ ret = a; \
+ } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \
+ ret = vreinterpretq_m128i_s32( \
+ vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
+ } else { \
+ ret = vreinterpretq_m128i_s32( \
+ vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
+ } \
+ ret; \
+ })
+
+// Shift packed 16-bit integers in a right by count while shifting in zeros, and
+// store the results in dst.
+//
+// FOR j := 0 to 7
+// i := j*16
+// IF count[63:0] > 15
+// dst[i+15:i] := 0
+// ELSE
+// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> count[63:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi16
+FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
+{
+ uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+ if (_sse2neon_unlikely(c & ~15))
+ return _mm_setzero_si128();
+
+ int16x8_t vc = vdupq_n_s16(-(int16_t) c);
+ return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
+}
+
+// Shift packed 32-bit integers in a right by count while shifting in zeros, and
+// store the results in dst.
+//
+// FOR j := 0 to 3
+// i := j*32
+// IF count[63:0] > 31
+// dst[i+31:i] := 0
+// ELSE
+// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> count[63:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi32
+FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
+{
+ uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+ if (_sse2neon_unlikely(c & ~31))
+ return _mm_setzero_si128();
+
+ int32x4_t vc = vdupq_n_s32(-(int32_t) c);
+ return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
+}
+
+// Shift packed 64-bit integers in a right by count while shifting in zeros, and
+// store the results in dst.
+//
+// FOR j := 0 to 1
+// i := j*64
+// IF count[63:0] > 63
+// dst[i+63:i] := 0
+// ELSE
+// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> count[63:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi64
+FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
+{
+ uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+ if (_sse2neon_unlikely(c & ~63))
+ return _mm_setzero_si128();
+
+ int64x2_t vc = vdupq_n_s64(-(int64_t) c);
+ return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
+}
+
+// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+// FOR j := 0 to 7
+// i := j*16
+// IF imm8[7:0] > 15
+// dst[i+15:i] := 0
+// ELSE
+// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
+#define _mm_srli_epi16(a, imm) \
+ __extension__({ \
+ __m128i ret; \
+ if (_sse2neon_unlikely((imm) & ~15)) { \
+ ret = _mm_setzero_si128(); \
+ } else { \
+ ret = vreinterpretq_m128i_u16( \
+ vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
+ } \
+ ret; \
+ })
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+// FOR j := 0 to 3
+// i := j*32
+// IF imm8[7:0] > 31
+// dst[i+31:i] := 0
+// ELSE
+// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
+// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srli_epi32(a, imm) \
+ __extension__({ \
+ __m128i ret; \
+ if (_sse2neon_unlikely((imm) & ~31)) { \
+ ret = _mm_setzero_si128(); \
+ } else { \
+ ret = vreinterpretq_m128i_u32( \
+ vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
+ } \
+ ret; \
+ })
+
+// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+//
+// FOR j := 0 to 1
+// i := j*64
+// IF imm8[7:0] > 63
+// dst[i+63:i] := 0
+// ELSE
+// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
+#define _mm_srli_epi64(a, imm) \
+ __extension__({ \
+ __m128i ret; \
+ if (_sse2neon_unlikely((imm) & ~63)) { \
+ ret = _mm_setzero_si128(); \
+ } else { \
+ ret = vreinterpretq_m128i_u64( \
+ vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
+ } \
+ ret; \
+ })
+
+// Shift a right by imm8 bytes while shifting in zeros, and store the results in
+// dst.
+//
+// tmp := imm8[7:0]
+// IF tmp > 15
+// tmp := 16
+// FI
+// dst[127:0] := a[127:0] >> (tmp*8)
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_si128
+FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm)
+{
+ if (_sse2neon_unlikely(imm & ~15))
+ return _mm_setzero_si128();
+ uint8x16_t tmp[2] = {vreinterpretq_u8_m128i(a), vdupq_n_u8(0)};
+ return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + imm));
+}
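+
+// Usage sketch for _mm_srli_si128 (illustrative values only): the shift count
+// is in bytes, moving data toward the least-significant end.
+//   __m128i a = _mm_setr_epi32(1, 2, 3, 4);
+//   __m128i r = _mm_srli_si128(a, 4); // r == {2, 3, 4, 0} in element order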
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
+// or a general-protection exception may be generated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
+FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+ vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
+#else
+ vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
+#endif
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
+FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+ float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
+ vst1q_f64((float64_t *) mem_addr,
+ vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
+#else
+ float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
+ vst1q_f32((float32_t *) mem_addr,
+ vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
+#endif
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// memory. mem_addr does not need to be aligned on any particular boundary.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd
+FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+ vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
+#else
+ vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
+#endif
+}
+
+// Stores four 32-bit integer values (as a __m128i value) at the address p.
+// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
+FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
+{
+ vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
+#define _mm_store1_pd _mm_store_pd1
+
+// Store the upper double-precision (64-bit) floating-point element from a into
+// memory.
+//
+// MEM[mem_addr+63:mem_addr] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
+FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+ vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
+#else
+ vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
+#endif
+}
+
+// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
+// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
+{
+ vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// memory.
+//
+// MEM[mem_addr+63:mem_addr] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
+FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+ vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
+#else
+ vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
+#endif
+}
+
+// Store 2 double-precision (64-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+// MEM[mem_addr+63:mem_addr] := a[127:64]
+// MEM[mem_addr+127:mem_addr+64] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
+FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
+{
+ float32x4_t f = vreinterpretq_f32_m128d(a);
+ _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
+}
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
+FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
+{
+ _mm_store_pd(mem_addr, a);
+}
+
+// Stores 128 bits of integer data a at the address p.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128
+FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
+{
+ vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Stores 32 bits of integer data a at the address p.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32
+FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
+{
+ vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
+}
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory using a non-temporal memory hint. mem_addr must
+// be aligned on a 16-byte boundary or a general-protection exception may be
+// generated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd
+FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+ __builtin_nontemporal_store(a, (float32x4_t *) p);
+#elif defined(__aarch64__)
+ vst1q_f64(p, vreinterpretq_f64_m128d(a));
+#else
+ vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
+#endif
+}
+
+// Stores the data in a to the address p without polluting the caches. If the
+// cache line containing address p is already in the cache, the cache will be
+// updated.
+// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+ __builtin_nontemporal_store(a, p);
+#else
+ vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
+#endif
+}
+
+// Store 32-bit integer a into memory using a non-temporal hint to minimize
+// cache pollution. If the cache line containing address mem_addr is already in
+// the cache, the cache will be updated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32
+FORCE_INLINE void _mm_stream_si32(int *p, int a)
+{
+ vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
+}
+
+// Store 64-bit integer a into memory using a non-temporal hint to minimize
+// cache pollution. If the cache line containing address mem_addr is already in
+// the cache, the cache will be updated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si64
+FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
+{
+ vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
+}
+
+// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
+// store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
+FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s16(
+ vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
+// unsigned 32-bit integers of a.
+//
+// r0 := a0 - b0
+// r1 := a1 - b1
+// r2 := a2 - b2
+// r3 := a3 - b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s32(
+ vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
+// and store the results in dst.
+// r0 := a0 - b0
+// r1 := a1 - b1
+FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s64(
+ vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
+// store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
+FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s8(
+ vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtract packed double-precision (64-bit) floating-point elements in b from
+// packed double-precision (64-bit) floating-point elements in a, and store the
+// results in dst.
+//
+// FOR j := 0 to 1
+// i := j*64
+// dst[i+63:i] := a[i+63:i] - b[i+63:i]
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
+FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ double *da = (double *) &a;
+ double *db = (double *) &b;
+ double c[2];
+ c[0] = da[0] - db[0];
+ c[1] = da[1] - db[1];
+ return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Subtract the lower double-precision (64-bit) floating-point element in b from
+// the lower double-precision (64-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
+FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
+{
+ return _mm_move_sd(a, _mm_sub_pd(a, b));
+}
+
+// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
+//
+// dst[63:0] := a[63:0] - b[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
+FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_s64(
+ vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
+// of a and saturates.
+//
+// r0 := SignedSaturate(a0 - b0)
+// r1 := SignedSaturate(a1 - b1)
+// ...
+// r7 := SignedSaturate(a7 - b7)
+//
+// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s16(
+ vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
+// of a and saturates.
+//
+// r0 := SignedSaturate(a0 - b0)
+// r1 := SignedSaturate(a1 - b1)
+// ...
+// r15 := SignedSaturate(a15 - b15)
+//
+// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s8(
+ vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
+// integers of a and saturates.
+// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u16(
+ vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
+// integers of a and saturates.
+//
+// r0 := UnsignedSaturate(a0 - b0)
+// r1 := UnsignedSaturate(a1 - b1)
+// ...
+// r15 := UnsignedSaturate(a15 - b15)
+//
+// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+#define _mm_ucomieq_sd _mm_comieq_sd
+#define _mm_ucomige_sd _mm_comige_sd
+#define _mm_ucomigt_sd _mm_comigt_sd
+#define _mm_ucomile_sd _mm_comile_sd
+#define _mm_ucomilt_sd _mm_comilt_sd
+#define _mm_ucomineq_sd _mm_comineq_sd
+
+// Return vector of type __m128d with undefined elements.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd
+FORCE_INLINE __m128d _mm_undefined_pd(void)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#endif
+ __m128d a;
+ return a;
+#if defined(__GNUC__) || defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+}
+
+// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
+// upper 4 signed or unsigned 16-bit integers in b.
+//
+// r0 := a4
+// r1 := b4
+// r2 := a5
+// r3 := b5
+// r4 := a6
+// r5 := b6
+// r6 := a7
+// r7 := b7
+//
+// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s16(
+ vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+ int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
+ int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
+ int16x4x2_t result = vzip_s16(a1, b1);
+ return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
+// upper 2 signed or unsigned 32-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s32(
+ vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+ int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
+ int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
+ int32x2x2_t result = vzip_s32(a1, b1);
+ return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the upper signed or unsigned 64-bit integer in a with the
+// upper signed or unsigned 64-bit integer in b.
+//
+// r0 := a1
+// r1 := b1
+FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
+{
+ int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
+ int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
+ return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
+}
+
+// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
+// 8 signed or unsigned 8-bit integers in b.
+//
+// r0 := a8
+// r1 := b8
+// r2 := a9
+// r3 := b9
+// ...
+// r14 := a15
+// r15 := b15
+//
+// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s8(
+ vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+ int8x8_t a1 =
+ vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
+ int8x8_t b1 =
+ vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
+ int8x8x2_t result = vzip_s8(a1, b1);
+ return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave double-precision (64-bit) floating-point elements from
+// the high half of a and b, and store the results in dst.
+//
+// DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
+// dst[63:0] := src1[127:64]
+// dst[127:64] := src2[127:64]
+// RETURN dst[127:0]
+// }
+// dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd
+FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ return vreinterpretq_m128d_s64(
+ vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
+ vget_high_s64(vreinterpretq_s64_m128d(b))));
+#endif
+}
+
+// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
+// lower 4 signed or unsigned 16-bit integers in b.
+//
+// r0 := a0
+// r1 := b0
+// r2 := a1
+// r3 := b1
+// r4 := a2
+// r5 := b2
+// r6 := a3
+// r7 := b3
+//
+// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s16(
+ vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+ int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
+ int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
+ int16x4x2_t result = vzip_s16(a1, b1);
+ return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
+// lower 2 signed or unsigned 32-bit integers in b.
+//
+// r0 := a0
+// r1 := b0
+// r2 := a1
+// r3 := b1
+//
+// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s32(
+ vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+ int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
+ int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
+ int32x2x2_t result = vzip_s32(a1, b1);
+ return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
+}
+
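+// Interleaves the lower signed or unsigned 64-bit integer in a with the
+// lower signed or unsigned 64-bit integer in b.
+//
+// r0 := a0
+// r1 := b0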
+FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
+{
+ int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
+ int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
+ return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
+}
+
+// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
+// 8 signed or unsigned 8-bit integers in b.
+//
+// r0 := a0
+// r1 := b0
+// r2 := a1
+// r3 := b1
+// ...
+// r14 := a7
+// r15 := b7
+//
+// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s8(
+ vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+ int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
+ int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
+ int8x8x2_t result = vzip_s8(a1, b1);
+ return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
+}
+
+// Unpack and interleave double-precision (64-bit) floating-point elements from
+// the low half of a and b, and store the results in dst.
+//
+// DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
+// dst[63:0] := src1[63:0]
+// dst[127:64] := src2[63:0]
+// RETURN dst[127:0]
+// }
+// dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd
+FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ return vreinterpretq_m128d_s64(
+ vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
+ vget_low_s64(vreinterpretq_s64_m128d(b))));
+#endif
+}
+
+// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
+// elements in a and b, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := j*64
+// dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
+FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
+{
+ return vreinterpretq_m128d_s64(
+ veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
+}
+
+// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
+// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s32(
+ veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+/* SSE3 */
+
+// Alternatively add and subtract packed double-precision (64-bit)
+// floating-point elements in a to/from packed elements in b, and store the
+// results in dst.
+//
+// FOR j := 0 to 1
+// i := j*64
+// IF ((j & 1) == 0)
+// dst[i+63:i] := a[i+63:i] - b[i+63:i]
+// ELSE
+// dst[i+63:i] := a[i+63:i] + b[i+63:i]
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd
+FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
+{
+ _sse2neon_const __m128d mask = _mm_set_pd(1.0, -1.0);
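+ // mask = { -1.0, +1.0 }, so a + b * mask yields { a0 - b0, a1 + b1 }.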
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
+ vreinterpretq_f64_m128d(b),
+ vreinterpretq_f64_m128d(mask)));
+#else
+ return _mm_add_pd(_mm_mul_pd(b, mask), a);
+#endif
+}
+
+// Alternatively add and subtract packed single-precision (32-bit)
+// floating-point elements in a to/from packed elements in b, and store the
+// results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
+FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
+{
+ _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
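+ // mask alternates { -1, +1, -1, +1 }, so a + b * mask subtracts b in the
+ // even lanes and adds b in the odd lanes.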
+#if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
+ return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
+ vreinterpretq_f32_m128(mask),
+ vreinterpretq_f32_m128(b)));
+#else
+ return _mm_add_ps(_mm_mul_ps(b, mask), a);
+#endif
+}
+
+// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
+// elements in a and b, and pack the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd
+FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+ double *da = (double *) &a;
+ double *db = (double *) &b;
+ double c[] = {da[0] + da[1], db[0] + db[1]};
+ return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
+}
+
+// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
+// elements in a and b, and pack the results in dst.
+// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
+FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128_f32(
+ vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+ float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+ float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+ float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+ float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+ return vreinterpretq_m128_f32(
+ vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of double-precision (64-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd
+FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
+{
+#if defined(__aarch64__)
+ float64x2_t a = vreinterpretq_f64_m128d(_a);
+ float64x2_t b = vreinterpretq_f64_m128d(_b);
+ return vreinterpretq_m128d_f64(
+ vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
+#else
+ double *da = (double *) &_a;
+ double *db = (double *) &_b;
+ double c[] = {da[0] - da[1], db[0] - db[1]};
+ return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of single-precision (32-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
+FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
+{
+ float32x4_t a = vreinterpretq_f32_m128(_a);
+ float32x4_t b = vreinterpretq_f32_m128(_b);
+#if defined(__aarch64__)
+ return vreinterpretq_m128_f32(
+ vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
+#else
+ float32x4x2_t c = vuzpq_f32(a, b);
+ return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
+#endif
+}
+
+// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
+// may perform better than _mm_loadu_si128 when the data crosses a cache line
+// boundary.
+//
+// dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
+#define _mm_lddqu_si128 _mm_loadu_si128
+
+// Load a double-precision (64-bit) floating-point element from memory into both
+// elements of dst.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
+#define _mm_loaddup_pd _mm_load1_pd
+
+// Duplicate the low double-precision (64-bit) floating-point element from a,
+// and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd
+FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(
+ vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
+#else
+ return vreinterpretq_m128d_u64(
+ vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
+#endif
+}
+
+// Duplicate odd-indexed single-precision (32-bit) floating-point elements
+// from a, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
+FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
+{
+#if __has_builtin(__builtin_shufflevector)
+ return vreinterpretq_m128_f32(__builtin_shufflevector(
+ vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
+#else
+ float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+ float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
+ float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
+ return vreinterpretq_m128_f32(vld1q_f32(data));
+#endif
+}
+
+// Duplicate even-indexed single-precision (32-bit) floating-point elements
+// from a, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
+FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
+{
+#if __has_builtin(__builtin_shufflevector)
+ return vreinterpretq_m128_f32(__builtin_shufflevector(
+ vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
+#else
+ float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+ float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
+ float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
+ return vreinterpretq_m128_f32(vld1q_f32(data));
+#endif
+}
+
+/* SSSE3 */
+
+// Compute the absolute value of packed signed 16-bit integers in a, and store
+// the unsigned results in dst.
+//
+// FOR j := 0 to 7
+// i := j*16
+// dst[i+15:i] := ABS(a[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
+FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
+{
+ return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 32-bit integers in a, and store
+// the unsigned results in dst.
+//
+// FOR j := 0 to 3
+// i := j*32
+// dst[i+31:i] := ABS(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
+FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
+{
+ return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 8-bit integers in a, and store
+// the unsigned results in dst.
+//
+// FOR j := 0 to 15
+// i := j*8
+// dst[i+7:i] := ABS(a[i+7:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
+FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
+{
+ return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
+}
+
+// Compute the absolute value of packed signed 16-bit integers in a, and store
+// the unsigned results in dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// dst[i+15:i] := ABS(a[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
+FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
+{
+ return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
+}
+
+// Compute the absolute value of packed signed 32-bit integers in a, and store
+// the unsigned results in dst.
+//
+// FOR j := 0 to 1
+// i := j*32
+// dst[i+31:i] := ABS(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
+FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
+{
+ return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
+}
+
+// Compute the absolute value of packed signed 8-bit integers in a, and store
+// the unsigned results in dst.
+//
+// FOR j := 0 to 7
+// i := j*8
+// dst[i+7:i] := ABS(a[i+7:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
+FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
+{
+ return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
+}
+
+// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
+// the result right by imm8 bytes, and store the low 16 bytes in dst.
+//
+// tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
+// dst[127:0] := tmp[127:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
+FORCE_INLINE __m128i _mm_alignr_epi8(__m128i a, __m128i b, int imm)
+{
+ if (_sse2neon_unlikely(imm & ~31))
+ return _mm_setzero_si128();
+ int idx;
+ uint8x16_t tmp[2];
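+ // Lay the two selected 16-byte blocks out contiguously in tmp[]; a 16-byte
+ // load at byte offset idx then yields the concatenation shifted right by
+ // imm bytes, e.g. imm == 3 gives { b[3..15], a[0..2] }.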
+ if (imm >= 16) {
+ idx = imm - 16;
+ tmp[0] = vreinterpretq_u8_m128i(a);
+ tmp[1] = vdupq_n_u8(0);
+ } else {
+ idx = imm;
+ tmp[0] = vreinterpretq_u8_m128i(b);
+ tmp[1] = vreinterpretq_u8_m128i(a);
+ }
+ return vreinterpretq_m128i_u8(vld1q_u8(((uint8_t const *) tmp) + idx));
+}
+
+// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
+// the result right by imm8 bytes, and store the low 8 bytes in dst.
+//
+// tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
+// dst[63:0] := tmp[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
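+// Implementation note: vext_u8(lo, hi, idx) extracts the 8-byte window that
+// starts idx bytes into the concatenation hi:lo, which is exactly the
+// right-shifted result required here.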
+#define _mm_alignr_pi8(a, b, imm) \
+ __extension__({ \
+ __m64 ret; \
+ if (_sse2neon_unlikely((imm) >= 16)) { \
+ ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
+ } else { \
+ uint8x8_t tmp_low, tmp_high; \
+ if ((imm) >= 8) { \
+ const int idx = (imm) - 8; \
+ tmp_low = vreinterpret_u8_m64(a); \
+ tmp_high = vdup_n_u8(0); \
+ ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
+ } else { \
+ const int idx = (imm); \
+ tmp_low = vreinterpret_u8_m64(b); \
+ tmp_high = vreinterpret_u8_m64(a); \
+ ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
+ } \
+ } \
+ ret; \
+ })
+
+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
+// signed 16-bit results in dst.
+FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
+{
+ int16x8_t a = vreinterpretq_s16_m128i(_a);
+ int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
+#else
+ return vreinterpretq_m128i_s16(
+ vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
+ vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
+#endif
+}
+
+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
+// signed 32-bit results in dst.
+FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
+{
+ int32x4_t a = vreinterpretq_s32_m128i(_a);
+ int32x4_t b = vreinterpretq_s32_m128i(_b);
+ return vreinterpretq_m128i_s32(
+ vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
+ vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
+}
+
+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
+// signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
+FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_s16(
+ vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
+// signed 32-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
+FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
+{
+ return vreinterpret_m64_s32(
+ vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
+}
+
+// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
+// saturation, and pack the signed 16-bit results in dst.
+FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+ int16x8_t a = vreinterpretq_s16_m128i(_a);
+ int16x8_t b = vreinterpretq_s16_m128i(_b);
+ return vreinterpretq_s64_s16(
+ vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+ int32x4_t a = vreinterpretq_s32_m128i(_a);
+ int32x4_t b = vreinterpretq_s32_m128i(_b);
+ // Interleave using vshrn/vmovn
+ // [a0|a2|a4|a6|b0|b2|b4|b6]
+ // [a1|a3|a5|a7|b1|b3|b5|b7]
+ int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+ int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+ // Saturated add
+ return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
+#endif
+}
+
+// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
+// saturation, and pack the signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_pi16
+FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
+{
+ int16x4_t a = vreinterpret_s16_m64(_a);
+ int16x4_t b = vreinterpret_s16_m64(_b);
+#if defined(__aarch64__)
+ return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+#else
+ int16x4x2_t res = vuzp_s16(a, b);
+ return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
+// the signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi16
+FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
+{
+ int16x8_t a = vreinterpretq_s16_m128i(_a);
+ int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s16(
+ vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+ int16x8x2_t c = vuzpq_s16(a, b);
+ return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
+// the signed 32-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi32
+FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
+{
+ int32x4_t a = vreinterpretq_s32_m128i(_a);
+ int32x4_t b = vreinterpretq_s32_m128i(_b);
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s32(
+ vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
+#else
+ int32x4x2_t c = vuzpq_s32(a, b);
+ return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
+// the signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi16
+FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
+{
+ int16x4_t a = vreinterpret_s16_m64(_a);
+ int16x4_t b = vreinterpret_s16_m64(_b);
+#if defined(__aarch64__)
+ return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+#else
+ int16x4x2_t c = vuzp_s16(a, b);
+ return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
+// the signed 32-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_hsub_pi32
+FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
+{
+ int32x2_t a = vreinterpret_s32_m64(_a);
+ int32x2_t b = vreinterpret_s32_m64(_b);
+#if defined(__aarch64__)
+ return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
+#else
+ int32x2x2_t c = vuzp_s32(a, b);
+ return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
+// using saturation, and pack the signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
+FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
+{
+ int16x8_t a = vreinterpretq_s16_m128i(_a);
+ int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s16(
+ vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+ int16x8x2_t c = vuzpq_s16(a, b);
+ return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
+// using saturation, and pack the signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_pi16
+FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
+{
+ int16x4_t a = vreinterpret_s16_m64(_a);
+ int16x4_t b = vreinterpret_s16_m64(_b);
+#if defined(__aarch64__)
+ return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
+#else
+ int16x4x2_t c = vuzp_s16(a, b);
+ return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
+#endif
+}
+
+// Vertically multiply each unsigned 8-bit integer from a with the corresponding
+// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
+// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
+// and pack the saturated results in dst.
+//
+// FOR j := 0 to 7
+// i := j*16
+// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
+// a[i+7:i]*b[i+7:i] )
+// ENDFOR
+FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+ uint8x16_t a = vreinterpretq_u8_m128i(_a);
+ int8x16_t b = vreinterpretq_s8_m128i(_b);
+ int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
+ vmovl_s8(vget_low_s8(b)));
+ int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
+ vmovl_s8(vget_high_s8(b)));
+ return vreinterpretq_m128i_s16(
+ vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
+#else
+ // This would be much simpler if x86 would choose to zero extend OR sign
+ // extend, not both. This could probably be optimized better.
+ uint16x8_t a = vreinterpretq_u16_m128i(_a);
+ int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+ // Zero extend a
+ int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
+ int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
+
+ // Sign extend by shifting left then shifting right.
+ int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
+ int16x8_t b_odd = vshrq_n_s16(b, 8);
+
+ // multiply
+ int16x8_t prod1 = vmulq_s16(a_even, b_even);
+ int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
+
+ // saturated add
+ return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
+#endif
+}
+
+// Vertically multiply each unsigned 8-bit integer from a with the corresponding
+// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
+// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
+// pack the saturated results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_pi16
+FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
+{
+ uint16x4_t a = vreinterpret_u16_m64(_a);
+ int16x4_t b = vreinterpret_s16_m64(_b);
+
+ // Zero extend a
+ int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
+ int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
+
+ // Sign extend by shifting left then shifting right.
+ int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
+ int16x4_t b_odd = vshr_n_s16(b, 8);
+
+ // multiply
+ int16x4_t prod1 = vmul_s16(a_even, b_even);
+ int16x4_t prod2 = vmul_s16(a_odd, b_odd);
+
+ // saturated add
+ return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
+// the packed 16-bit integers in dst.
+//
+// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
+// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
+// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
+// ...
+// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
+FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
+{
+ // Has issues due to saturation
+ // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
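+ // (vqrdmulhq_s16 saturates INT16_MIN * INT16_MIN to 0x7FFF, whereas
+ // _mm_mulhrs_epi16 wraps that case to 0x8000, hence the widening sequence
+ // below.)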
+
+ // Multiply
+ int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+ vget_low_s16(vreinterpretq_s16_m128i(b)));
+ int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+ vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+ // Rounding narrowing shift right
+ // narrow = (int16_t)((mul + 16384) >> 15);
+ int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
+ int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
+
+ // Join together
+ return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Truncate each intermediate integer to the 18 most
+// significant bits, round by adding 1, and store bits [16:1] to dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_pi16
+FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
+{
+ int32x4_t mul_extend =
+ vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
+
+ // Rounding narrowing shift right
+ return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
+}
+
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
+FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
+{
+ int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
+ uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
+ uint8x16_t idx_masked =
+ vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
+#elif defined(__GNUC__)
+ int8x16_t ret;
+ // %e and %f represent the even and odd D registers
+ // respectively.
+ __asm__ __volatile__(
+ "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
+ "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
+ : [ret] "=&w"(ret)
+ : [tbl] "w"(tbl), [idx] "w"(idx_masked));
+ return vreinterpretq_m128i_s8(ret);
+#else
+ // Generic ARMv7 fallback (non-GCC toolchains) using vtbl2
+ int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
+ return vreinterpretq_m128i_s8(
+ vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
+ vtbl2_s8(a_split, vget_high_u8(idx_masked))));
+#endif
+}
+
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
+//
+// FOR j := 0 to 7
+// i := j*8
+// IF b[i+7] == 1
+// dst[i+7:i] := 0
+// ELSE
+// index[2:0] := b[i+2:i]
+// dst[i+7:i] := a[index*8+7:index*8]
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi8
+FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
+{
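+ // Keep bit 7 (the zeroing flag) and the low 3 index bits of each control
+ // byte; indices with bit 7 set fall outside the 8-byte table and vtbl1_s8
+ // returns 0 for out-of-range indices, matching the x86 behavior.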
+ const int8x8_t controlMask =
+ vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
+ int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
+ return vreinterpret_m64_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed
+// 16-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+// for i in 0..7
+// if b[i] < 0
+// r[i] := -a[i]
+// else if b[i] == 0
+// r[i] := 0
+// else
+// r[i] := a[i]
+// fi
+// done
+FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
+{
+ int16x8_t a = vreinterpretq_s16_m128i(_a);
+ int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+ // signed shift right: faster than vclt
+ // (b < 0) ? 0xFFFF : 0
+ uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
+ // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+ int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
+#else
+ int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
+#endif
+
+ // bitwise select either a or negative 'a' (vnegq_s16(a) equals negative
+ // 'a') based on ltMask
+ int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
+ // res = masked & (~zeroMask)
+ int16x8_t res = vbicq_s16(masked, zeroMask);
+ return vreinterpretq_m128i_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed
+// 32-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+// for i in 0..3
+// if b[i] < 0
+// r[i] := -a[i]
+// else if b[i] == 0
+// r[i] := 0
+// else
+// r[i] := a[i]
+// fi
+// done
+FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
+{
+ int32x4_t a = vreinterpretq_s32_m128i(_a);
+ int32x4_t b = vreinterpretq_s32_m128i(_b);
+
+ // signed shift right: faster than vclt
+ // (b < 0) ? 0xFFFFFFFF : 0
+ uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
+
+ // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+ int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
+#else
+ int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
+#endif
+
+ // bitwise select either a or negative 'a' (vnegq_s32(a) equals negative
+ // 'a') based on ltMask
+ int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
+ // res = masked & (~zeroMask)
+ int32x4_t res = vbicq_s32(masked, zeroMask);
+ return vreinterpretq_m128i_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed
+// 8-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+// for i in 0..15
+// if b[i] < 0
+// r[i] := -a[i]
+// else if b[i] == 0
+// r[i] := 0
+// else
+// r[i] := a[i]
+// fi
+// done
+FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
+{
+ int8x16_t a = vreinterpretq_s8_m128i(_a);
+ int8x16_t b = vreinterpretq_s8_m128i(_b);
+
+ // signed shift right: faster than vclt
+ // (b < 0) ? 0xFF : 0
+ uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
+
+ // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+ int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
+#else
+ int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
+#endif
+
+ // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative 'a')
+ // based on ltMask
+ int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
+ // res = masked & (~zeroMask)
+ int8x16_t res = vbicq_s8(masked, zeroMask);
+
+ return vreinterpretq_m128i_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative, and store the results in dst. Elements in dst are
+// zeroed out when the corresponding element in b is zero.
+//
+// FOR j := 0 to 3
+// i := j*16
+// IF b[i+15:i] < 0
+// dst[i+15:i] := -(a[i+15:i])
+// ELSE IF b[i+15:i] == 0
+// dst[i+15:i] := 0
+// ELSE
+// dst[i+15:i] := a[i+15:i]
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
+FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
+{
+ int16x4_t a = vreinterpret_s16_m64(_a);
+ int16x4_t b = vreinterpret_s16_m64(_b);
+
+ // signed shift right: faster than vclt
+ // (b < 0) ? 0xFFFF : 0
+ uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
+
+ // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+ int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
+#else
+ int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
+#endif
+
+ // bitwise select either a or negative 'a' (vneg_s16(a) returns negative 'a')
+ // based on ltMask
+ int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
+ // res = masked & (~zeroMask)
+ int16x4_t res = vbic_s16(masked, zeroMask);
+
+ return vreinterpret_m64_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed 32-bit
+// integer in b is negative, and store the results in dst. Elements in dst are
+// zeroed out when the corresponding element in b is zero.
+//
+// FOR j := 0 to 1
+// i := j*32
+// IF b[i+31:i] < 0
+// dst[i+31:i] := -(a[i+31:i])
+// ELSE IF b[i+31:i] == 0
+// dst[i+31:i] := 0
+// ELSE
+// dst[i+31:i] := a[i+31:i]
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
+FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
+{
+ int32x2_t a = vreinterpret_s32_m64(_a);
+ int32x2_t b = vreinterpret_s32_m64(_b);
+
+ // signed shift right: faster than vclt
+ // (b < 0) ? 0xFFFFFFFF : 0
+ uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
+
+ // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+ int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
+#else
+ int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
+#endif
+
+ // bitwise select either a or negative 'a' (vneg_s32(a) returns negative 'a')
+ // based on ltMask
+ int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
+ // res = masked & (~zeroMask)
+ int32x2_t res = vbic_s32(masked, zeroMask);
+
+ return vreinterpret_m64_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
+// in b is negative, and store the results in dst. Elements in dst are zeroed out
+// when the corresponding element in b is zero.
+//
+// FOR j := 0 to 7
+// i := j*8
+// IF b[i+7:i] < 0
+// dst[i+7:i] := -(a[i+7:i])
+// ELSE IF b[i+7:i] == 0
+// dst[i+7:i] := 0
+// ELSE
+// dst[i+7:i] := a[i+7:i]
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
+FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
+{
+ int8x8_t a = vreinterpret_s8_m64(_a);
+ int8x8_t b = vreinterpret_s8_m64(_b);
+
+ // signed shift right: faster than vclt
+ // (b < 0) ? 0xFF : 0
+ uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
+
+ // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+ int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
+#else
+ int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
+#endif
+
+ // bitwise select either a or negative 'a' (vneg_s8(a) returns negative 'a')
+ // based on ltMask
+ int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
+ // res = masked & (~zeroMask)
+ int8x8_t res = vbic_s8(masked, zeroMask);
+
+ return vreinterpret_m64_s8(res);
+}
+
+/* SSE4.1 */
+
+// Blend packed 16-bit integers from a and b using control mask imm8, and store
+// the results in dst.
+//
+// FOR j := 0 to 7
+// i := j*16
+// IF imm8[j]
+// dst[i+15:i] := b[i+15:i]
+// ELSE
+// dst[i+15:i] := a[i+15:i]
+// FI
+// ENDFOR
+// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
+// __constrange(0,255) int imm)
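+// Implemented with a per-lane mask built from the bits of imm8; vbslq_u16
+// then selects each lane from b where the mask is all ones and from a
+// elsewhere.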
+#define _mm_blend_epi16(a, b, imm) \
+ __extension__({ \
+ const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \
+ ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \
+ ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \
+ ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \
+ ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \
+ ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \
+ ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \
+ ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \
+ uint16x8_t _mask_vec = vld1q_u16(_mask); \
+ uint16x8_t _a = vreinterpretq_u16_m128i(a); \
+ uint16x8_t _b = vreinterpretq_u16_m128i(b); \
+ vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
+ })
+
+// Blend packed double-precision (64-bit) floating-point elements from a and b
+// using control mask imm8, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd
+#define _mm_blend_pd(a, b, imm) \
+ __extension__({ \
+ const uint64_t _mask[2] = { \
+ ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \
+ ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)}; \
+ uint64x2_t _mask_vec = vld1q_u64(_mask); \
+ uint64x2_t _a = vreinterpretq_u64_m128d(a); \
+ uint64x2_t _b = vreinterpretq_u64_m128d(b); \
+ vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
+ })
+
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps
+FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
+{
+ const uint32_t ALIGN_STRUCT(16)
+ data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
+ ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
+ ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
+ ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
+ uint32x4_t mask = vld1q_u32(data);
+ float32x4_t a = vreinterpretq_f32_m128(_a);
+ float32x4_t b = vreinterpretq_f32_m128(_b);
+ return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
+}
+
+// Blend packed 8-bit integers from a and b using mask, and store the results in
+// dst.
+//
+// FOR j := 0 to 15
+// i := j*8
+// IF mask[i+7]
+// dst[i+7:i] := b[i+7:i]
+// ELSE
+// dst[i+7:i] := a[i+7:i]
+// FI
+// ENDFOR
+FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
+{
+ // Use a signed shift right to create a mask with the sign bit
+ uint8x16_t mask =
+ vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
+ uint8x16_t a = vreinterpretq_u8_m128i(_a);
+ uint8x16_t b = vreinterpretq_u8_m128i(_b);
+ return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
+}
+
+// Blend packed double-precision (64-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd
+FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
+{
+ uint64x2_t mask =
+ vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
+#if defined(__aarch64__)
+ float64x2_t a = vreinterpretq_f64_m128d(_a);
+ float64x2_t b = vreinterpretq_f64_m128d(_b);
+ return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
+#else
+ uint64x2_t a = vreinterpretq_u64_m128d(_a);
+ uint64x2_t b = vreinterpretq_u64_m128d(_b);
+ return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
+#endif
+}
+
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
+FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
+{
+ // Use a signed shift right to create a mask with the sign bit
+ uint32x4_t mask =
+ vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
+ float32x4_t a = vreinterpretq_f32_m128(_a);
+ float32x4_t b = vreinterpretq_f32_m128(_b);
+ return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
+}
+
+// Round the packed double-precision (64-bit) floating-point elements in a up
+// to an integer value, and store the results as packed double-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd
+FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
+#else
+ double *f = (double *) &a;
+ return _mm_set_pd(ceil(f[1]), ceil(f[0]));
+#endif
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a up to
+// an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
+FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+ return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
+#else
+ float *f = (float *) &a;
+ return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
+#endif
+}
+
+// Round the lower double-precision (64-bit) floating-point element in b up to
+// an integer value, store the result as a double-precision floating-point
+// element in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd
+FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
+{
+ return _mm_move_sd(a, _mm_ceil_pd(b));
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b up to
+// an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
+//
+// dst[31:0] := CEIL(b[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss
+FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_ceil_ps(b));
+}
+
+// Compare packed 64-bit integers in a and b for equality, and store the results
+// in dst
+FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_u64(
+ vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
+#else
+ // ARMv7 lacks vceqq_u64
+ // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+ uint32x4_t cmp =
+ vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
+ uint32x4_t swapped = vrev64q_u32(cmp);
+ return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Converts the four signed 16-bit integers in the lower 64 bits to four signed
+// 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
+{
+ return vreinterpretq_m128i_s32(
+ vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
+}
+
+// Converts the two signed 16-bit integers in the lower 32 bits to two signed
+// 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
+{
+ int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
+ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+ int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+ return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Converts the two signed 32-bit integers in the lower 64 bits to two signed
+// 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
+{
+ return vreinterpretq_m128i_s64(
+ vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
+}
+
+// Converts the eight signed 8-bit integers in the lower 64 bits to eight
+// signed 16-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
+{
+ int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
+ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+ return vreinterpretq_m128i_s16(s16x8);
+}
+
+// Converts the four signed 8-bit integers in the lower 32 bits to four
+// signed 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
+{
+ int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
+ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
+ return vreinterpretq_m128i_s32(s32x4);
+}
+
+// Converts the two signed 8-bit integers in the lower 16 bits to two
+// signed 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
+{
+ int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
+ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
+ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+ int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+ return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Converts the four unsigned 16-bit integers in the lower 64 bits to four
+// unsigned 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
+{
+ return vreinterpretq_m128i_u32(
+ vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
+}
+
+// Converts the two unsigned 16-bit integers in the lower 32 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
+{
+ uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
+ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+ uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+ return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Converts the two unsigned 32-bit integers in the lower 64 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
+{
+ return vreinterpretq_m128i_u64(
+ vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
+}
+
+// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
+// and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16
+FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
+{
+ uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */
+ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
+ return vreinterpretq_m128i_u16(u16x8);
+}
+
+// Converts the four unsigned 8-bit integers in the lower 32 bits to four
+// unsigned 32-bit integers.
+// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
+FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
+{
+ uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
+ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
+ return vreinterpretq_m128i_u32(u32x4);
+}
+
+// Converts the two unsigned 8-bit integers in the lower 16 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
+{
+ uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
+ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
+ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+ uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+ return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Conditionally multiply the packed double-precision (64-bit) floating-point
+// elements in a and b using the high 4 bits in imm8, sum the two products, and
+// conditionally store the sum in dst using the low 4 bits of imm8.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd
+FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
+{
+ // Generate mask value from constant immediate bit value
+ const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
+ const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
+#if !SSE2NEON_PRECISE_DP
+ const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
+ const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
+#endif
+ // Conditional multiplication
+#if !SSE2NEON_PRECISE_DP
+ __m128d mul = _mm_mul_pd(a, b);
+ const __m128d mulMask =
+ _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
+ __m128d tmp = _mm_and_pd(mul, mulMask);
+#else
+#if defined(__aarch64__)
+ double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
+ vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
+ : 0;
+ double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
+ vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
+ : 0;
+#else
+ double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
+ double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
+#endif
+ __m128d tmp = _mm_set_pd(d1, d0);
+#endif
+ // Sum the products
+#if defined(__aarch64__)
+ double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
+#else
+ double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
+#endif
+ // Conditionally store the sum
+ const __m128d sumMask =
+ _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
+ __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
+ return res;
+}
+
+// Conditionally multiply the packed single-precision (32-bit) floating-point
+// elements in a and b using the high 4 bits in imm8, sum the four products,
+// and conditionally store the sum in dst using the low 4 bits of imm.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
+FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
+{
+#if defined(__aarch64__)
+ /* shortcuts */
+ if (imm == 0xFF) {
+ return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
+ }
+ if (imm == 0x7F) {
+ float32x4_t m = _mm_mul_ps(a, b);
+ m[3] = 0;
+ return _mm_set1_ps(vaddvq_f32(m));
+ }
+#endif
+
+ float s = 0, c = 0;
+ float32x4_t f32a = vreinterpretq_f32_m128(a);
+ float32x4_t f32b = vreinterpretq_f32_m128(b);
+
+ /* To improve the accuracy of floating-point summation, Kahan algorithm
+ * is used for each operation.
+ */
+ if (imm & (1 << 4))
+ _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
+ if (imm & (1 << 5))
+ _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
+ if (imm & (1 << 6))
+ _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
+ if (imm & (1 << 7))
+ _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
+ s += c;
+
+ float32x4_t res = {
+ (imm & 0x1) ? s : 0,
+ (imm & 0x2) ? s : 0,
+ (imm & 0x4) ? s : 0,
+ (imm & 0x8) ? s : 0,
+ };
+ return vreinterpretq_m128_f32(res);
+}
+
+// Extracts the selected signed or unsigned 32-bit integer from a and zero
+// extends.
+// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
+#define _mm_extract_epi32(a, imm) \
+ vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
+
+// Extracts the selected signed or unsigned 64-bit integer from a and zero
+// extends.
+// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
+#define _mm_extract_epi64(a, imm) \
+ vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
+
+// Extracts the selected signed or unsigned 8-bit integer from a and zero
+// extends.
+// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8
+#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
+
+// Extracts the selected single-precision (32-bit) floating-point element from a.
+// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
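+// Note: as with the x86 instruction, the result is the raw 32-bit pattern of
+// the selected float lane returned as an int, not a numeric conversion.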
+#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
+
+// Round the packed double-precision (64-bit) floating-point elements in a down
+// to an integer value, and store the results as packed double-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd
+FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
+#else
+ double *f = (double *) &a;
+ return _mm_set_pd(floor(f[1]), floor(f[0]));
+#endif
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a down
+// to an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
+FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+ return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
+#else
+ float *f = (float *) &a;
+ return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
+#endif
+}
+
+// Round the lower double-precision (64-bit) floating-point element in b down to
+// an integer value, store the result as a double-precision floating-point
+// element in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd
+FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
+{
+ return _mm_move_sd(a, _mm_floor_pd(b));
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b down to
+// an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
+//
+// dst[31:0] := FLOOR(b[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
+FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
+{
+ return _mm_move_ss(a, _mm_floor_ps(b));
+}
+
+// Inserts the least significant 32 bits of b into the selected 32-bit integer
+// of a.
+// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
+// __constrange(0,4) int imm)
+#define _mm_insert_epi32(a, b, imm) \
+ __extension__({ \
+ vreinterpretq_m128i_s32( \
+ vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
+ })
+
+// Inserts the least significant 64 bits of b into the selected 64-bit integer
+// of a.
+// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
+// __constrange(0,2) int imm)
+#define _mm_insert_epi64(a, b, imm) \
+ __extension__({ \
+ vreinterpretq_m128i_s64( \
+ vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
+ })
+
+// Inserts the least significant 8 bits of b into the selected 8-bit integer
+// of a.
+// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
+// __constrange(0,16) int imm)
+#define _mm_insert_epi8(a, b, imm) \
+ __extension__({ \
+ vreinterpretq_m128i_s8( \
+ vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
+ })
+
+// Copy a to tmp, then insert a single-precision (32-bit) floating-point
+// element from b into tmp using the control in imm8. Store tmp to dst using
+// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=insert_ps
+#define _mm_insert_ps(a, b, imm8) \
+ __extension__({ \
+ float32x4_t tmp1 = \
+ vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3), \
+ vreinterpretq_f32_m128(a), 0); \
+ float32x4_t tmp2 = \
+ vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
+ ((imm8 >> 4) & 0x3)); \
+ const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
+ ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
+ ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
+ ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; \
+ uint32x4_t mask = vld1q_u32(data); \
+ float32x4_t all_zeros = vdupq_n_f32(0); \
+ \
+ vreinterpretq_m128_f32( \
+ vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))); \
+ })
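+
+// Worked example for the macro above (illustrative values, not from the
+// original source): _mm_insert_ps(a, b, 0x10), i.e. imm8 = (0 << 6) | (1 << 4),
+// copies lane 0 of b into lane 1 of a and zeroes nothing, so
+// a = {1, 2, 3, 4}, b = {9, 8, 7, 6} yields {1, 9, 3, 4}.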
+
+// epi versions of min/max
+// Computes the pairwise maximums of the four signed 32-bit integer values of a
+// and b.
+//
+// A 128-bit parameter that can be defined with the following equations:
+// r0 := (a0 > b0) ? a0 : b0
+// r1 := (a1 > b1) ? a1 : b1
+// r2 := (a2 > b2) ? a2 : b2
+// r3 := (a3 > b3) ? a3 : b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s32(
+ vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
+FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s8(
+ vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
+FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u16(
+ vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u32(
+ vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Computes the pairwise minima of the four signed 32-bit integer values of a
+// and b.
+//
+// A 128-bit parameter that can be defined with the following equations:
+// r0 := (a0 < b0) ? a0 : b0
+// r1 := (a1 < b1) ? a1 : b1
+// r2 := (a2 < b2) ? a2 : b2
+// r3 := (a3 < b3) ? a3 : b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s32(
+ vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
+FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s8(
+ vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
+FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u16(
+ vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u32(
+ vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
+// in a, store the minimum and index in dst, and zero the remaining bits in dst.
+//
+// index[2:0] := 0
+// min[15:0] := a[15:0]
+// FOR j := 0 to 7
+// i := j*16
+// IF a[i+15:i] < min[15:0]
+// index[2:0] := j
+// min[15:0] := a[i+15:i]
+// FI
+// ENDFOR
+// dst[15:0] := min[15:0]
+// dst[18:16] := index[2:0]
+// dst[127:19] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
+FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
+{
+ __m128i dst;
+ uint16_t min, idx = 0;
+ // Find the minimum value
+#if defined(__aarch64__)
+ min = vminvq_u16(vreinterpretq_u16_m128i(a));
+#else
+ __m64 tmp;
+ tmp = vreinterpret_m64_u16(
+ vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
+ vget_high_u16(vreinterpretq_u16_m128i(a))));
+ tmp = vreinterpret_m64_u16(
+ vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+ tmp = vreinterpret_m64_u16(
+ vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+ min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
+#endif
+ // Get the index of the minimum value
+ int i;
+ for (i = 0; i < 8; i++) {
+ if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
+ idx = (uint16_t) i;
+ break;
+ }
+ a = _mm_srli_si128(a, 2);
+ }
+ // Generate result
+ dst = _mm_setzero_si128();
+ dst = vreinterpretq_m128i_u16(
+ vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
+ dst = vreinterpretq_m128i_u16(
+ vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
+ return dst;
+}
+
+// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
+// 8-bit integers in a compared to those in b, and store the 16-bit results in
+// dst. Eight SADs are performed using one quadruplet from b and eight
+// quadruplets from a. One quadruplet is selected from b starting at the
+// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
+// integers selected from a starting at the offset specified in imm8.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8
+FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
+{
+ uint8x16_t _a, _b;
+
+ switch (imm & 0x4) {
+ case 0:
+ // do nothing
+ _a = vreinterpretq_u8_m128i(a);
+ break;
+ case 4:
+ _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
+ vreinterpretq_u32_m128i(a), 1));
+ break;
+ default:
+#if defined(__GNUC__) || defined(__clang__)
+ __builtin_unreachable();
+#endif
+ break;
+ }
+
+ switch (imm & 0x3) {
+ case 0:
+ _b = vreinterpretq_u8_u32(
+ vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
+ break;
+ case 1:
+ _b = vreinterpretq_u8_u32(
+ vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
+ break;
+ case 2:
+ _b = vreinterpretq_u8_u32(
+ vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
+ break;
+ case 3:
+ _b = vreinterpretq_u8_u32(
+ vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
+ break;
+ default:
+#if defined(__GNUC__) || defined(__clang__)
+ __builtin_unreachable();
+#endif
+ break;
+ }
+
+ int16x8_t c04, c15, c26, c37;
+ uint8x8_t low_b = vget_low_u8(_b);
+ c04 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
+ _a = vextq_u8(_a, _a, 1);
+ c15 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
+ _a = vextq_u8(_a, _a, 1);
+ c26 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
+ _a = vextq_u8(_a, _a, 1);
+ c37 = vabsq_s16(vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(_a), low_b)));
+#if defined(__aarch64__)
+ // |0|4|2|6|
+ c04 = vpaddq_s16(c04, c26);
+ // |1|5|3|7|
+ c15 = vpaddq_s16(c15, c37);
+
+ int32x4_t trn1_c =
+ vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
+ int32x4_t trn2_c =
+ vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
+ return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
+ vreinterpretq_s16_s32(trn2_c)));
+#else
+ int16x4_t c01, c23, c45, c67;
+ c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
+ c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
+ c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
+ c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
+
+ return vreinterpretq_m128i_s16(
+ vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
+#endif
+}
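+
+// Worked example for the function above (illustrative, not from the original
+// source): with imm = 0, result word j (j = 0..7) equals
+// |a[j+0]-b[0]| + |a[j+1]-b[1]| + |a[j+2]-b[2]| + |a[j+3]-b[3]|,
+// where a[] and b[] denote the unsigned bytes of the inputs.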
+
+// Multiply the low signed 32-bit integers from each packed 64-bit element in
+// a and b, and store the signed 64-bit results in dst.
+//
+// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
+// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
+FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
+{
+ // vmull_s32 upcasts instead of masking, so we downcast.
+ int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
+ int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
+ return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
+}
+
+// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
+// unsigned 32-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_s32(
+ vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
+// integers and saturates.
+//
+// r0 := UnsignedSaturate(a0)
+// r1 := UnsignedSaturate(a1)
+// r2 := UnsignedSaturate(a2)
+// r3 := UnsignedSaturate(a3)
+// r4 := UnsignedSaturate(b0)
+// r5 := UnsignedSaturate(b1)
+// r6 := UnsignedSaturate(b2)
+// r7 := UnsignedSaturate(b3)
+FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u16(
+ vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
+ vqmovun_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Round the packed double-precision (64-bit) floating-point elements in a using
+// the rounding parameter, and store the results as packed double-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd
+FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
+{
+#if defined(__aarch64__)
+ switch (rounding) {
+ case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+ return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
+ case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+ return _mm_floor_pd(a);
+ case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+ return _mm_ceil_pd(a);
+ case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+ return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
+ default: //_MM_FROUND_CUR_DIRECTION
+ return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
+ }
+#else
+ double *v_double = (double *) &a;
+
+ if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
+ (rounding == _MM_FROUND_CUR_DIRECTION &&
+ _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
+ double res[2], tmp;
+ for (int i = 0; i < 2; i++) {
+ tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
+ double roundDown = floor(tmp); // Round down value
+ double roundUp = ceil(tmp); // Round up value
+ double diffDown = tmp - roundDown;
+ double diffUp = roundUp - tmp;
+ if (diffDown < diffUp) {
+ /* If it's closer to the round down value, then use it */
+ res[i] = roundDown;
+ } else if (diffDown > diffUp) {
+ /* If it's closer to the round up value, then use it */
+ res[i] = roundUp;
+ } else {
+ /* If it's equidistant between round up and round down value,
+ * pick the one which is an even number */
+ double half = roundDown / 2;
+ if (half != floor(half)) {
+ /* If the round down value is odd, return the round up value
+ */
+ res[i] = roundUp;
+ } else {
+ /* If the round up value is odd, return the round down value
+ */
+ res[i] = roundDown;
+ }
+ }
+ res[i] = (v_double[i] < 0) ? -res[i] : res[i];
+ }
+ return _mm_set_pd(res[1], res[0]);
+ } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
+ (rounding == _MM_FROUND_CUR_DIRECTION &&
+ _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
+ return _mm_floor_pd(a);
+ } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
+ (rounding == _MM_FROUND_CUR_DIRECTION &&
+ _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
+ return _mm_ceil_pd(a);
+ }
+ return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
+ v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
+#endif
+}
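+
+// Note on the nearest-integer fallback above: ties are rounded to the nearest
+// even integer, e.g. 2.5 becomes 2.0 and 3.5 becomes 4.0, while 2.6 still
+// becomes 3.0. This matches the round-to-nearest-even behaviour of the
+// hardware rounding modes emulated here.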
+
+// Round the packed single-precision (32-bit) floating-point elements in a using
+// the rounding parameter, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
+FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
+{
+#if defined(__aarch64__) || defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+ switch (rounding) {
+ case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+ return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
+ case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+ return _mm_floor_ps(a);
+ case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+ return _mm_ceil_ps(a);
+ case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+ return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
+ default: //_MM_FROUND_CUR_DIRECTION
+ return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
+ }
+#else
+ float *v_float = (float *) &a;
+
+ if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
+ (rounding == _MM_FROUND_CUR_DIRECTION &&
+ _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
+ uint32x4_t signmask = vdupq_n_u32(0x80000000);
+ float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
+ vdupq_n_f32(0.5f)); /* +/- 0.5 */
+ int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
+ vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
+ int32x4_t r_trunc = vcvtq_s32_f32(
+ vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
+ int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
+ vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
+ int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
+ vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
+ float32x4_t delta = vsubq_f32(
+ vreinterpretq_f32_m128(a),
+ vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
+ uint32x4_t is_delta_half =
+ vceqq_f32(delta, half); /* delta == +/- 0.5 */
+ return vreinterpretq_m128_f32(
+ vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
+ } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
+ (rounding == _MM_FROUND_CUR_DIRECTION &&
+ _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
+ return _mm_floor_ps(a);
+ } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
+ (rounding == _MM_FROUND_CUR_DIRECTION &&
+ _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
+ return _mm_ceil_ps(a);
+ }
+ return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
+ v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
+ v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
+ v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
+#endif
+}
+
+// Round the lower double-precision (64-bit) floating-point element in b using
+// the rounding parameter, store the result as a double-precision floating-point
+// element in the lower element of dst, and copy the upper element from a to the
+// upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd
+FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
+{
+ return _mm_move_sd(a, _mm_round_pd(b, rounding));
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b using
+// the rounding parameter, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst. Rounding is done according to the
+// rounding[3:0] parameter, which can be one of:
+// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, suppress exceptions
+// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, suppress exceptions
+// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, suppress exceptions
+// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, suppress exceptions
+// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss
+FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
+{
+ return _mm_move_ss(a, _mm_round_ps(b, rounding));
+}
+
+// Load 128-bits of integer data from memory into dst using a non-temporal
+// memory hint. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+// dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
+FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+ return __builtin_nontemporal_load(p);
+#else
+ return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
+#endif
+}
+
+// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
+// all 1's, and return 1 if the result is zero, otherwise return 0.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
+FORCE_INLINE int _mm_test_all_ones(__m128i a)
+{
+ return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
+ ~(uint64_t) 0;
+}
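+
+// Note: the direct vgetq_lane_s64() calls above rely on __m128i being a
+// typedef for int64x2_t earlier in this file, so no reinterpret is needed.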
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and
+// mask, and return 1 if the result is zero, otherwise return 0.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
+FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
+{
+ int64x2_t a_and_mask =
+ vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
+ return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and
+// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
+// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
+// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
+// otherwise return 0.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_test_mix_ones_zero
+FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
+{
+ uint64x2_t zf =
+ vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
+ uint64x2_t cf =
+ vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
+ uint64x2_t result = vandq_u64(zf, cf);
+ return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return the CF value.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
+FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
+{
+ int64x2_t s64 =
+ vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
+ vreinterpretq_s64_m128i(b));
+ return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
+// otherwise return 0.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128
+#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
+// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
+// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
+// otherwise set CF to 0. Return the ZF value.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
+FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
+{
+ int64x2_t s64 =
+ vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
+ return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
+}
+
+/* SSE4.2 */
+
+// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
+// in b for greater than.
+FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+ return vreinterpretq_m128i_u64(
+ vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#else
+ return vreinterpretq_m128i_s64(vshrq_n_s64(
+ vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
+ 63));
+#endif
+}
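+
+// Note on the ARMv7-A fallback above: the saturating subtraction keeps the
+// sign of (b - a) correct even when the true difference would overflow 64
+// bits, and the arithmetic shift by 63 then broadcasts that sign bit,
+// producing an all-ones mask exactly when a > b.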
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 16-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
+FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+ __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
+ : [c] "+r"(crc)
+ : [v] "r"(v));
+#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+ crc = __crc32ch(crc, v);
+#else
+ crc = _mm_crc32_u8(crc, v & 0xff);
+ crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
+#endif
+ return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 32-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
+FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+ __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
+ : [c] "+r"(crc)
+ : [v] "r"(v));
+#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+ crc = __crc32cw(crc, v);
+#else
+ crc = _mm_crc32_u16(crc, v & 0xffff);
+ crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
+#endif
+ return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 64-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
+FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+ __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
+ : [c] "+r"(crc)
+ : [v] "r"(v));
+#else
+ crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
+ crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
+#endif
+ return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 8-bit integer v.
+// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+ __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
+ : [c] "+r"(crc)
+ : [v] "r"(v));
+#elif (__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)
+ crc = __crc32cb(crc, v);
+#else
+ crc ^= v;
+ for (int bit = 0; bit < 8; bit++) {
+ if (crc & 1)
+ crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
+ else
+ crc = (crc >> 1);
+ }
+#endif
+ return crc;
+}
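+
+// Note on the bitwise fallback above: 0x82f63b78 is the bit-reflected CRC-32C
+// (Castagnoli) polynomial, the same one implemented by the SSE4.2 crc32 and
+// ARMv8 crc32c* instructions, so all three paths produce identical results.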
+
+/* AES */
+
+#if !defined(__ARM_FEATURE_CRYPTO)
+/* clang-format off */
+#define SSE2NEON_AES_DATA(w) \
+ { \
+ w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
+ w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
+ w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
+ w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
+ w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
+ w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
+ w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
+ w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
+ w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
+ w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
+ w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
+ w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
+ w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
+ w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
+ w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
+ w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
+ w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
+ w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
+ w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
+ w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
+ w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
+ w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
+ w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
+ w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
+ w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
+ w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
+ w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
+ w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
+ w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
+ w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
+ w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
+ w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
+ w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
+ w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
+ w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
+ w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
+ w(0xb0), w(0x54), w(0xbb), w(0x16) \
+ }
+/* clang-format on */
+
+/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
+#define SSE2NEON_AES_H0(x) (x)
+static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
+#undef SSE2NEON_AES_H0
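+
+// For illustration: SSE2NEON_AES_DATA(SSE2NEON_AES_H0) expands to a plain
+// initializer list { 0x63, 0x7c, 0x77, ... }, i.e. the AES S-box bytes, while
+// the ARMv7-A path further below reuses the same data with different wrapper
+// macros to build four 32-bit lookup tables.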
+
+// In the absence of crypto extensions, implement aesenc using regular NEON
+// intrinsics instead. See:
+// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
+// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
+// for more information.
+// Reproduced with permission of the author.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
+{
+#if defined(__aarch64__)
+ static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
+ 0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
+ 0xc, 0x1, 0x6, 0xb};
+ static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+ 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
+
+ uint8x16_t v;
+ uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
+
+ // shift rows
+ w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
+
+ // sub bytes
+ v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
+ v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
+
+ // mix columns
+ w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+ w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+ w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+
+ // add round key
+ return vreinterpretq_m128i_u8(w) ^ RoundKey;
+
+#else /* ARMv7-A NEON implementation */
+#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
+ (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
+ ((uint32_t) (b1) << 8) | (uint32_t) (b0))
+#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
+#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
+#define SSE2NEON_AES_U0(p) \
+ SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
+#define SSE2NEON_AES_U1(p) \
+ SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
+#define SSE2NEON_AES_U2(p) \
+ SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
+#define SSE2NEON_AES_U3(p) \
+ SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
+ static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
+ SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
+ SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
+ SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
+ SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
+ };
+#undef SSE2NEON_AES_B2W
+#undef SSE2NEON_AES_F2
+#undef SSE2NEON_AES_F3
+#undef SSE2NEON_AES_U0
+#undef SSE2NEON_AES_U1
+#undef SSE2NEON_AES_U2
+#undef SSE2NEON_AES_U3
+
+ uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
+ uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
+ uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
+ uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
+
+ __m128i out = _mm_set_epi32(
+ (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
+ aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
+ (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
+ aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
+ (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
+ aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
+ (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
+ aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
+
+ return _mm_xor_si128(out, RoundKey);
+#endif
+}
+
+// Perform the last round of an AES encryption flow on data (state) in a using
+// the round key in RoundKey, and store the result in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+ /* FIXME: optimized for NEON */
+ uint8_t v[4][4] = {
+ {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
+ {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
+ {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
+ {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
+ SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
+ };
+ for (int i = 0; i < 16; i++)
+ vreinterpretq_nth_u8_m128i(a, i) =
+ v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
+ return a;
+}
+
+// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
+// This instruction generates a round key for AES encryption. See
+// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
+// for details.
+//
+// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
+{
+ uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
+ uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
+ for (int i = 0; i < 4; ++i) {
+ ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
+ ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
+ }
+ return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
+ ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
+}
+#undef SSE2NEON_AES_DATA
+
+#else /* __ARM_FEATURE_CRYPTO */
+// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
+// AESMC and then manually applying the real key as an xor operation. This
+// unfortunately means an additional xor op; the compiler should be able to
+// optimize this away for repeated calls however. See
+// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
+// for more details.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
+{
+ return vreinterpretq_m128i_u8(
+ vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
+ vreinterpretq_u8_m128i(b));
+}
+
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
+FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
+{
+ return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
+ vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
+ RoundKey);
+}
+
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
+{
+ // AESE does ShiftRows and SubBytes on A
+ uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
+
+ uint8x16_t dest = {
+ // Undo ShiftRows step from AESE and extract X1 and X3
+ u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
+ u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
+ u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
+ u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
+ };
+ uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
+ return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
+}
+#endif
+
+/* Others */
+
+// Perform a carry-less multiplication of two 64-bit integers, selected from a
+// and b according to imm8, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
+FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
+{
+ uint64x2_t a = vreinterpretq_u64_m128i(_a);
+ uint64x2_t b = vreinterpretq_u64_m128i(_b);
+ switch (imm & 0x11) {
+ case 0x00:
+ return vreinterpretq_m128i_u64(
+ _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
+ case 0x01:
+ return vreinterpretq_m128i_u64(
+ _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
+ case 0x10:
+ return vreinterpretq_m128i_u64(
+ _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
+ case 0x11:
+ return vreinterpretq_m128i_u64(
+ _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
+ default:
+ abort();
+ }
+}
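+
+// Worked example for the function above (illustrative values): carry-less
+// multiplication XORs shifted partial products instead of adding them, so
+// 0b0011 clmul 0b0011 = 0b0101, whereas ordinary 3 * 3 = 0b1001. Bit 0 of imm
+// selects the low or high 64-bit half of a, bit 4 the half of b.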
+
+FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
+{
+ union {
+ fpcr_bitfield field;
+#if defined(__aarch64__)
+ uint64_t value;
+#else
+ uint32_t value;
+#endif
+ } r;
+
+#if defined(__aarch64__)
+ __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+ __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+ return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
+}
+
+// Count the number of bits set to 1 in unsigned 32-bit integer a, and
+// return that count in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
+FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
+{
+#if defined(__aarch64__)
+#if __has_builtin(__builtin_popcount)
+ return __builtin_popcount(a);
+#else
+ return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
+#endif
+#else
+ uint32_t count = 0;
+ uint8x8_t input_val, count8x8_val;
+ uint16x4_t count16x4_val;
+ uint32x2_t count32x2_val;
+
+ input_val = vld1_u8((uint8_t *) &a);
+ count8x8_val = vcnt_u8(input_val);
+ count16x4_val = vpaddl_u8(count8x8_val);
+ count32x2_val = vpaddl_u16(count16x4_val);
+
+ vst1_u32(&count, count32x2_val);
+ return count;
+#endif
+}
+
+// Count the number of bits set to 1 in unsigned 64-bit integer a, and
+// return that count in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
+FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
+{
+#if defined(__aarch64__)
+#if __has_builtin(__builtin_popcountll)
+ return __builtin_popcountll(a);
+#else
+ return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
+#endif
+#else
+ uint64_t count = 0;
+ uint8x8_t input_val, count8x8_val;
+ uint16x4_t count16x4_val;
+ uint32x2_t count32x2_val;
+ uint64x1_t count64x1_val;
+
+ input_val = vld1_u8((uint8_t *) &a);
+ count8x8_val = vcnt_u8(input_val);
+ count16x4_val = vpaddl_u8(count8x8_val);
+ count32x2_val = vpaddl_u16(count16x4_val);
+ count64x1_val = vpaddl_u32(count32x2_val);
+ vst1_u64(&count, count64x1_val);
+ return count;
+#endif
+}
+
+FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
+{
+ // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
+ // regardless of the value of the FZ bit.
+ union {
+ fpcr_bitfield field;
+#if defined(__aarch64__)
+ uint64_t value;
+#else
+ uint32_t value;
+#endif
+ } r;
+
+#if defined(__aarch64__)
+ __asm__ __volatile__("mrs %0, FPCR" : "=r"(r.value)); /* read */
+#else
+ __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+ r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
+
+#if defined(__aarch64__)
+ __asm__ __volatile__("msr FPCR, %0" ::"r"(r)); /* write */
+#else
+ __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
+#endif
+}
+
+// Return the current 64-bit value of the processor's time-stamp counter.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
+
+FORCE_INLINE uint64_t _rdtsc(void)
+{
+#if defined(__aarch64__)
+ uint64_t val;
+
+ /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
+ * system counter is at least 56 bits wide; from Armv8.6, the counter
+ * must be 64 bits wide. So the system counter could be less than 64
+ * bits wide, in which case it is reported with the
+ * 'cap_user_time_short' flag set.
+ */
+ asm volatile("mrs %0, cntvct_el0" : "=r"(val));
+
+ return val;
+#else
+ uint32_t pmccntr, pmuseren, pmcntenset;
+ // Read the user mode Performance Monitoring Unit (PMU)
+ // User Enable Register (PMUSERENR) access permissions.
+ asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
+ if (pmuseren & 1) { // Allows reading PMUSERENR for user mode code.
+ asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
+ if (pmcntenset & 0x80000000UL) { // Is it counting?
+ asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
+ // The counter is set up to count every 64th cycle
+ return (uint64_t) (pmccntr) << 6;
+ }
+ }
+
+ // Fallback to syscall as we can't enable PMUSERENR in user mode.
+ struct timeval tv;
+ gettimeofday(&tv, NULL);
+ return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
+#endif
+}
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma pop_macro("ALIGN_STRUCT")
+#pragma pop_macro("FORCE_INLINE")
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC pop_options
+#endif
+
+#endif
+// clang-format on
--- /dev/null
+/* -*- C++ -*- */
+/*
+ * Copyright 2019 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_VOLK_ALLOC_H
+#define INCLUDED_VOLK_ALLOC_H
+
+#include <cstdlib>
+#include <limits>
+#include <new>
+#include <vector>
+
+#include <volk/volk.h>
+
+namespace volk {
+
+/*!
+ * \brief C++11 allocator using volk_malloc and volk_free
+ *
+ * \details
+ * adapted from https://en.cppreference.com/w/cpp/named_req/Alloc
+ */
+template <class T>
+struct alloc {
+ typedef T value_type;
+
+ alloc() = default;
+
+ template <class U>
+ constexpr alloc(alloc<U> const&) noexcept
+ {
+ }
+
+ T* allocate(std::size_t n)
+ {
+ if (n > std::numeric_limits<std::size_t>::max() / sizeof(T))
+ throw std::bad_alloc();
+
+ if (auto p = static_cast<T*>(volk_malloc(n * sizeof(T), volk_get_alignment())))
+ return p;
+
+ throw std::bad_alloc();
+ }
+
+ void deallocate(T* p, std::size_t) noexcept { volk_free(p); }
+};
+
+template <class T, class U>
+bool operator==(alloc<T> const&, alloc<U> const&)
+{
+ return true;
+}
+
+template <class T, class U>
+bool operator!=(alloc<T> const&, alloc<U> const&)
+{
+ return false;
+}
+
+
+/*!
+ * \brief type alias for std::vector using volk::alloc
+ *
+ * \details
+ * example code:
+ * volk::vector<float> v(100); // vector using volk_malloc, volk_free
+ */
+template <class T>
+using vector = std::vector<T, alloc<T>>;
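+
+/*!
+ * A slightly fuller usage sketch (illustrative only; assumes the existing
+ * kernel volk_32fc_x2_multiply_32fc and its (out, in, in, n) signature):
+ *
+ *   volk::vector<lv_32fc_t> in(num_points), out(num_points);
+ *   volk_32fc_x2_multiply_32fc(out.data(), in.data(), in.data(), num_points);
+ *
+ * data() returns memory allocated via volk_malloc with volk_get_alignment(),
+ * so the aligned kernel entry points can be used directly.
+ */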
+
+} // namespace volk
+#endif // INCLUDED_VOLK_ALLOC_H
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*
+ * This file is intended to hold AVX2 intrinsics of intrinsics.
+ * They should be used in VOLK kernels to avoid copy-paste.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
+#include "volk/volk_avx_intrinsics.h"
+#include <immintrin.h>
+
+static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
+{
+ const __m128i zeros = _mm_set1_epi8(0x00);
+ const __m128i sign_extract = _mm_set1_epi8(0x80);
+ const __m256i shuffle_mask = _mm256_setr_epi8(0xff,
+ 0xff,
+ 0xff,
+ 0x00,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x01,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x02,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x03,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x04,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x05,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x06,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x07);
+ __m256i sign_bits = _mm256_setzero_si256();
+
+ fbits = _mm_cmpgt_epi8(fbits, zeros);
+ fbits = _mm_and_si128(fbits, sign_extract);
+ sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0);
+ sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1);
+ sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);
+
+ return _mm256_castsi256_ps(sign_bits);
+}
+
+static inline __m256
+_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
+{
+ // prepare sign mask for correct +-
+ __m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits);
+
+ __m256 llr0, llr1;
+ _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
+
+ // calculate result
+ llr0 = _mm256_xor_ps(llr0, sign_mask);
+ __m256 dst = _mm256_add_ps(llr0, llr1);
+ return dst;
+}
+
+static inline __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0,
+ const __m256 cplxValue1)
+{
+ const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+ const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values
+ const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the Values
+ const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
+ return _mm256_permutevar8x32_ps(complex_result, idx);
+}
+
+static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0,
+ const __m256 symbols1,
+ const __m256 points0,
+ const __m256 points1,
+ const __m256 scalar)
+{
+ /*
+ * Calculate: |y - x|^2 * SNR_lin
+ * Consider 'symbolsX' and 'pointsX' to be complex float
+ * 'symbolsX' are 'y' and 'pointsX' are 'x'
+ */
+ const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
+ const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
+ const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1);
+ return _mm256_mul_ps(norms, scalar);
+}
+
+/*
+ * The function below vectorizes the inner loop of the following code:
+ *
+ * float max_values[8] = {0.f};
+ * unsigned max_indices[8] = {0};
+ * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+ * for (unsigned i = 0; i < num_points / 8; ++i) {
+ * for (unsigned j = 0; j < 8; ++j) {
+ * float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
+ * bool compare = abs_squared > max_values[j];
+ * max_values[j] = compare ? abs_squared : max_values[j];
+ * max_indices[j] = compare ? current_indices[j] : max_indices[j];
+ * current_indices[j] += 8; // update for next outer loop iteration
+ * ++src0;
+ * }
+ * }
+ */
+static inline void vector_32fc_index_max_variant0(__m256 in0,
+ __m256 in1,
+ __m256* max_values,
+ __m256i* max_indices,
+ __m256i* current_indices,
+ __m256i indices_increment)
+{
+ in0 = _mm256_mul_ps(in0, in0);
+ in1 = _mm256_mul_ps(in1, in1);
+
+ /*
+ * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0)
+ * hadd_ps(a, b) computes
+ * (b_7 + b_6,
+ * b_5 + b_4,
+ * ---------
+ * a_7 + b_6,
+ * a_5 + a_4,
+ * ---------
+ * b_3 + b_2,
+ * b_1 + b_0,
+ * ---------
+ * a_3 + a_2,
+ * a_1 + a_0).
+ * The result is the squared absolute value of complex numbers at index
+ * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
+ * current_indices!
+ */
+ __m256 abs_squared = _mm256_hadd_ps(in0, in1);
+
+ /*
+ * Compare the recently computed squared absolute values with the
+ * previously determined maximum values. cmp_ps(a, b) determines
+ * a > b ? 0xFFFFFFFF : 0 for each element in the vectors =>
+ * compare_mask = abs_squared > max_values ? 0xFFFFFFFF : 0
+ *
+ * If either operand is NaN, 0 is returned as an “ordered” comparison is
+ * used => the blend operation will select the value from *max_values.
+ */
+ __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);
+
+ /* Select maximum by blending. This is the only line which differs from variant1 */
+ *max_values = _mm256_blendv_ps(*max_values, abs_squared, compare_mask);
+
+ /*
+ * Updates indices: blendv_ps(a, b, mask) determines mask ? b : a for
+ * each element in the vectors =>
+ * max_indices = compare_mask ? current_indices : max_indices
+ *
+ * Note: The casting of data types is required to make the compiler happy
+ * and does not change values.
+ */
+ *max_indices =
+ _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
+ _mm256_castsi256_ps(*current_indices),
+ compare_mask));
+
+ /* compute indices of complex numbers which will be loaded in the next iteration */
+ *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
+}
+
+/* See _variant0 for details */
+static inline void vector_32fc_index_max_variant1(__m256 in0,
+ __m256 in1,
+ __m256* max_values,
+ __m256i* max_indices,
+ __m256i* current_indices,
+ __m256i indices_increment)
+{
+ in0 = _mm256_mul_ps(in0, in0);
+ in1 = _mm256_mul_ps(in1, in1);
+
+ __m256 abs_squared = _mm256_hadd_ps(in0, in1);
+ __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);
+
+ /*
+ * This is the only line which differs from variant0. Using maxps instead of
+ * blendvps is faster on Intel CPUs (at least on the ones tested).
+ *
+ * Note: The order of arguments matters if a NaN is encountered in which
+ * case the value of the second argument is selected. This is consistent
+ * with the “ordered” comparison and the blend operation: The comparison
+ * returns false if a NaN is encountered and the blend operation
+ * consequently selects the value from max_indices.
+ */
+ *max_values = _mm256_max_ps(abs_squared, *max_values);
+
+ *max_indices =
+ _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
+ _mm256_castsi256_ps(*current_indices),
+ compare_mask));
+
+ *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
+}
+
+/*
+ * The function below vectorizes the inner loop of the following code:
+ *
+ * float min_values[8] = {FLT_MAX};
+ * unsigned min_indices[8] = {0};
+ * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+ * for (unsigned i = 0; i < num_points / 8; ++i) {
+ * for (unsigned j = 0; j < 8; ++j) {
+ * float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
+ * bool compare = abs_squared < min_values[j];
+ * min_values[j] = compare ? abs_squared : min_values[j];
+ * min_indices[j] = compare ? current_indices[j] : min_indices[j];
+ * current_indices[j] += 8; // update for next outer loop iteration
+ * ++src0;
+ * }
+ * }
+ */
+static inline void vector_32fc_index_min_variant0(__m256 in0,
+ __m256 in1,
+ __m256* min_values,
+ __m256i* min_indices,
+ __m256i* current_indices,
+ __m256i indices_increment)
+{
+ in0 = _mm256_mul_ps(in0, in0);
+ in1 = _mm256_mul_ps(in1, in1);
+
+ /*
+ * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0)
+ * hadd_ps(a, b) computes
+ * (b_7 + b_6,
+ * b_5 + b_4,
+ * ---------
+ * a_7 + b_6,
+ * a_5 + a_4,
+ * ---------
+ * b_3 + b_2,
+ * b_1 + b_0,
+ * ---------
+ * a_3 + a_2,
+ * a_1 + a_0).
+ * The result is the squared absolute value of complex numbers at index
+ * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
+ * current_indices!
+ */
+ __m256 abs_squared = _mm256_hadd_ps(in0, in1);
+
+ /*
+ * Compare the recently computed squared absolute values with the
+ * previously determined minimum values. cmp_ps(a, b) determines
+ * a < b ? 0xFFFFFFFF : 0 for each element in the vectors =>
+ * compare_mask = abs_squared < min_values ? 0xFFFFFFFF : 0
+ *
+ * If either operand is NaN, 0 is returned as an “ordered” comparison is
+ * used => the blend operation will select the value from *min_values.
+ */
+ __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);
+
+ /* Select minimum by blending. This is the only line which differs from variant1 */
+ *min_values = _mm256_blendv_ps(*min_values, abs_squared, compare_mask);
+
+ /*
+ * Updates indices: blendv_ps(a, b, mask) determines mask ? b : a for
+ * each element in the vectors =>
+ * min_indices = compare_mask ? current_indices : min_indices
+ *
+ * Note: The casting of data types is required to make the compiler happy
+ * and does not change values.
+ */
+ *min_indices =
+ _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
+ _mm256_castsi256_ps(*current_indices),
+ compare_mask));
+
+ /* compute indices of complex numbers which will be loaded in the next iteration */
+ *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
+}
+
+/* See _variant0 for details */
+static inline void vector_32fc_index_min_variant1(__m256 in0,
+ __m256 in1,
+ __m256* min_values,
+ __m256i* min_indices,
+ __m256i* current_indices,
+ __m256i indices_increment)
+{
+ in0 = _mm256_mul_ps(in0, in0);
+ in1 = _mm256_mul_ps(in1, in1);
+
+ __m256 abs_squared = _mm256_hadd_ps(in0, in1);
+ __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);
+
+ /*
+ * This is the only line which differs from variant0. Using minps instead of
+ * blendvps is faster on Intel CPUs (at least on the ones tested).
+ *
+ * Note: The order of arguments matters if a NaN is encountered in which
+ * case the value of the second argument is selected. This is consistent
+ * with the “ordered” comparison and the blend operation: The comparison
+ * returns false if a NaN is encountered and the blend operation
+ * consequently selects the value from min_indices.
+ */
+ *min_values = _mm256_min_ps(abs_squared, *min_values);
+
+ *min_indices =
+ _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
+ _mm256_castsi256_ps(*current_indices),
+ compare_mask));
+
+ *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
+}
+
+#endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*
+ * This file is intended to hold AVX intrinsics of intrinsics.
+ * They should be used in VOLK kernels to avoid copy-pasta.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_
+#include <immintrin.h>
+
+static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y)
+{
+ __m256 yl, yh, tmp1, tmp2;
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
+ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ...
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+ return _mm256_addsub_ps(tmp1, tmp2);
+}
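+
+// For reference: with x = a+bi and y = c+di stored as interleaved (re, im)
+// pairs, the lanes produced above are (a*c - b*d, a*d + b*c), i.e. the usual
+// complex product x*y.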
+
+static inline __m256 _mm256_conjugate_ps(__m256 x)
+{
+ const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+ return _mm256_xor_ps(x, conjugator); // conjugate y
+}
+
+static inline __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
+{
+ const __m256 nswap = _mm256_permute_ps(x, 0xb1);
+ const __m256 dreal = _mm256_moveldup_ps(y);
+ const __m256 dimag = _mm256_movehdup_ps(y);
+
+ const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+ const __m256 dimagconj = _mm256_xor_ps(dimag, conjugator);
+ const __m256 multreal = _mm256_mul_ps(x, dreal);
+ const __m256 multimag = _mm256_mul_ps(nswap, dimagconj);
+ return _mm256_add_ps(multreal, multimag);
+}
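+
+// For reference: the function above computes x * conj(y); with x = a+bi and
+// y = c+di the resulting lanes are (a*c + b*d, b*c - a*d).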
+
+static inline __m256 _mm256_normalize_ps(__m256 val)
+{
+ __m256 tmp1 = _mm256_mul_ps(val, val);
+ tmp1 = _mm256_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm256_shuffle_ps(tmp1, tmp1, _MM_SHUFFLE(3, 1, 2, 0)); // equals 0xD8
+ tmp1 = _mm256_sqrt_ps(tmp1);
+ return _mm256_div_ps(val, tmp1);
+}
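+
+// The function above scales each complex element (interleaved re/im pair) to
+// unit magnitude: the hadd/shuffle sequence duplicates |z|^2 into both the
+// real and the imaginary slot of its element before the sqrt and division.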
+
+static inline __m256 _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
+{
+ __m256 complex1, complex2;
+ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+ return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
+}
+
+static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
+{
+ return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
+}
+
+static inline __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0,
+ const __m256 symbols1,
+ const __m256 points0,
+ const __m256 points1,
+ const __m256 scalar)
+{
+ /*
+ * Calculate: |y - x|^2 * SNR_lin
+ * Consider 'symbolsX' and 'pointsX' to be complex float
+ * 'symbolsX' are 'y' and 'pointsX' are 'x'
+ */
+ const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
+ const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
+ const __m256 norms = _mm256_magnitudesquared_ps(diff0, diff1);
+ return _mm256_mul_ps(norms, scalar);
+}
+
+static inline __m256 _mm256_polar_sign_mask(__m128i fbits)
+{
+ __m256 sign_mask_dummy = _mm256_setzero_ps();
+ const __m128i zeros = _mm_set1_epi8(0x00);
+ const __m128i sign_extract = _mm_set1_epi8(0x80);
+ const __m128i shuffle_mask0 = _mm_setr_epi8(0xff,
+ 0xff,
+ 0xff,
+ 0x00,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x01,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x02,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x03);
+ const __m128i shuffle_mask1 = _mm_setr_epi8(0xff,
+ 0xff,
+ 0xff,
+ 0x04,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x05,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x06,
+ 0xff,
+ 0xff,
+ 0xff,
+ 0x07);
+
+ fbits = _mm_cmpgt_epi8(fbits, zeros);
+ fbits = _mm_and_si128(fbits, sign_extract);
+ __m128i sign_bits0 = _mm_shuffle_epi8(fbits, shuffle_mask0);
+ __m128i sign_bits1 = _mm_shuffle_epi8(fbits, shuffle_mask1);
+
+ __m256 sign_mask =
+ _mm256_insertf128_ps(sign_mask_dummy, _mm_castsi128_ps(sign_bits0), 0x0);
+ return _mm256_insertf128_ps(sign_mask, _mm_castsi128_ps(sign_bits1), 0x1);
+ // // This is the desired function call. Though it seems to be missing in GCC.
+ // // Compare: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
+ // return _mm256_set_m128(_mm_castsi128_ps(sign_bits1),
+ // _mm_castsi128_ps(sign_bits0));
+}
+
+static inline void
+_mm256_polar_deinterleave(__m256* llr0, __m256* llr1, __m256 src0, __m256 src1)
+{
+ // deinterleave values
+ __m256 part0 = _mm256_permute2f128_ps(src0, src1, 0x20);
+ __m256 part1 = _mm256_permute2f128_ps(src0, src1, 0x31);
+ *llr0 = _mm256_shuffle_ps(part0, part1, 0x88);
+ *llr1 = _mm256_shuffle_ps(part0, part1, 0xdd);
+}
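+
+// The helper above de-interleaves two registers of (even, odd) float pairs:
+// llr0 receives the even-indexed elements of src0 followed by those of src1,
+// llr1 the odd-indexed elements, with their relative order preserved.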
+
+static inline __m256 _mm256_polar_minsum_llrs(__m256 src0, __m256 src1)
+{
+ const __m256 sign_mask = _mm256_set1_ps(-0.0f);
+ const __m256 abs_mask =
+ _mm256_andnot_ps(sign_mask, _mm256_castsi256_ps(_mm256_set1_epi8(0xff)));
+
+ __m256 llr0, llr1;
+ _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
+
+ // calculate result
+ __m256 sign =
+ _mm256_xor_ps(_mm256_and_ps(llr0, sign_mask), _mm256_and_ps(llr1, sign_mask));
+ __m256 dst =
+ _mm256_min_ps(_mm256_and_ps(llr0, abs_mask), _mm256_and_ps(llr1, abs_mask));
+ return _mm256_or_ps(dst, sign);
+}
+
+static inline __m256 _mm256_polar_fsign_add_llrs(__m256 src0, __m256 src1, __m128i fbits)
+{
+ // prepare sign mask for correct +-
+ __m256 sign_mask = _mm256_polar_sign_mask(fbits);
+
+ __m256 llr0, llr1;
+ _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);
+
+ // calculate result
+ llr0 = _mm256_xor_ps(llr0, sign_mask);
+ __m256 dst = _mm256_add_ps(llr0, llr1);
+ return dst;
+}
+
+static inline __m256 _mm256_accumulate_square_sum_ps(
+ __m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
+{
+ aux = _mm256_mul_ps(aux, val);
+ aux = _mm256_sub_ps(aux, acc);
+ aux = _mm256_mul_ps(aux, aux);
+ aux = _mm256_mul_ps(aux, rec);
+ return _mm256_add_ps(sq_acc, aux);
+}
+
+#endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2010, 2011, 2015-2017, 2019, 2020 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_LIBVOLK_COMMON_H
+#define INCLUDED_LIBVOLK_COMMON_H
+
+////////////////////////////////////////////////////////////////////////
+// Cross-platform attribute macros
+////////////////////////////////////////////////////////////////////////
+#if defined(__clang__)
+// AppleClang also defines __GNUC__, so do this check first. These
+// will probably be the same as for __GNUC__, but let's keep them
+// separate just to be safe.
+#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
+#define __VOLK_ATTR_UNUSED __attribute__((unused))
+#define __VOLK_ATTR_INLINE __attribute__((always_inline))
+#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
+#define __VOLK_ASM __asm__
+#define __VOLK_VOLATILE __volatile__
+#define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
+#define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
+#define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
+#elif defined __GNUC__
+#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
+#define __VOLK_ATTR_UNUSED __attribute__((unused))
+#define __VOLK_ATTR_INLINE __attribute__((always_inline))
+#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
+#define __VOLK_ASM __asm__
+#define __VOLK_VOLATILE __volatile__
+#if __GNUC__ >= 4
+#define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
+#define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
+#else
+#define __VOLK_ATTR_EXPORT
+#define __VOLK_ATTR_IMPORT
+#endif
+#define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
+#elif _MSC_VER
+#define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
+#define __VOLK_ATTR_UNUSED
+#define __VOLK_ATTR_INLINE __forceinline
+#define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
+#define __VOLK_ATTR_EXPORT __declspec(dllexport)
+#define __VOLK_ATTR_IMPORT __declspec(dllimport)
+#define __VOLK_PREFETCH(addr)
+#define __VOLK_ASM __asm
+#define __VOLK_VOLATILE
+#else
+#define __VOLK_ATTR_ALIGNED(x)
+#define __VOLK_ATTR_UNUSED
+#define __VOLK_ATTR_INLINE
+#define __VOLK_ATTR_DEPRECATED
+#define __VOLK_ATTR_EXPORT
+#define __VOLK_ATTR_IMPORT
+#define __VOLK_PREFETCH(addr)
+#define __VOLK_ASM __asm__
+#define __VOLK_VOLATILE __volatile__
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// Ignore annoying warnings in MSVC
+////////////////////////////////////////////////////////////////////////
+#if defined(_MSC_VER)
+#pragma warning(disable : 4244) //'conversion' conversion from 'type1' to 'type2',
+ // possible loss of data
+#pragma warning(disable : 4305) //'identifier' : truncation from 'type1' to 'type2'
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// C-linkage declaration macros
+// FIXME: due to the usage of complex.h, require gcc for c-linkage
+////////////////////////////////////////////////////////////////////////
+#if defined(__cplusplus) && (__GNUC__)
+#define __VOLK_DECL_BEGIN extern "C" {
+#define __VOLK_DECL_END }
+#else
+#define __VOLK_DECL_BEGIN
+#define __VOLK_DECL_END
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// Define VOLK_API for library symbols
+// http://gcc.gnu.org/wiki/Visibility
+////////////////////////////////////////////////////////////////////////
+#ifdef volk_EXPORTS
+#define VOLK_API __VOLK_ATTR_EXPORT
+#else
+#define VOLK_API __VOLK_ATTR_IMPORT
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// The bit128 union used by some
+////////////////////////////////////////////////////////////////////////
+#include <stdint.h>
+
+#ifdef LV_HAVE_SSE
+#ifdef _WIN32
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+#endif
+
+union bit128 {
+ uint8_t i8[16];
+ uint16_t i16[8];
+ uint32_t i[4];
+ float f[4];
+ double d[2];
+
+#ifdef LV_HAVE_SSE
+ __m128 float_vec;
+#endif
+
+#ifdef LV_HAVE_SSE2
+ __m128i int_vec;
+ __m128d double_vec;
+#endif
+};
+
+union bit256 {
+ uint8_t i8[32];
+ uint16_t i16[16];
+ uint32_t i[8];
+ float f[8];
+ double d[4];
+
+#ifdef LV_HAVE_AVX
+ __m256 float_vec;
+ __m256i int_vec;
+ __m256d double_vec;
+#endif
+};
+
+#define bit128_p(x) ((union bit128*)(x))
+#define bit256_p(x) ((union bit256*)(x))
+
+////////////////////////////////////////////////////////////////////////
+// log2f
+////////////////////////////////////////////////////////////////////////
+#include <math.h>
+// +-Inf -> +-127.0f in order to match the behaviour of the SIMD kernels
+static inline float log2f_non_ieee(float f)
+{
+ float const result = log2f(f);
+ return isinf(result) ? copysignf(127.0f, result) : result;
+}
+
+////////////////////////////////////////////////////////////////////////
+// Constant used to do log10 calculations as faster log2
+////////////////////////////////////////////////////////////////////////
+// precalculated 10.0 / log2f_non_ieee(10.0) to allow for constexpr
+#define volk_log2to10factor 3.01029995663981209120
+
+#endif /*INCLUDED_LIBVOLK_COMMON_H*/
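
A brief usage sketch for the helpers defined above; the function names below are invented for illustration and are not part of VOLK:

```c
#include <volk/volk_common.h>

/* 10*log10(x) expressed through the faster log2:
 * 10*log10(x) == volk_log2to10factor * log2(x). */
static inline float power_db_approx(float linear_power)
{
    return volk_log2to10factor * log2f_non_ieee(linear_power);
}

#ifdef LV_HAVE_SSE
/* The bit128 union reinterprets a SIMD register without pointer casts. */
static inline float first_lane(__m128 v)
{
    union bit128 u;
    u.float_vec = v; /* same 16 bytes ...      */
    return u.f[0];   /* ... viewed as 4 floats */
}
#endif
```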
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2010, 2011, 2015, 2018, 2020, 2021 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_VOLK_COMPLEX_H
+#define INCLUDED_VOLK_COMPLEX_H
+
+/*!
+ * \brief Provide typedefs and operators for all complex types in C and C++.
+ *
+ * The typedefs encompass all signed integer and floating point types.
+ * Each operator function is intended to work across all data types.
+ * Under C++, these operators are defined as inline templates.
+ * Under C, these operators are defined as preprocessor macros.
+ * The use of macros makes the operators agnostic to the type.
+ *
+ * The following operator functions are defined:
+ * - lv_cmake - make a complex type from components
+ * - lv_creal - get the real part of the complex number
+ * - lv_cimag - get the imaginary part of the complex number
+ * - lv_conj - take the conjugate of the complex number
+ */
+
+#ifdef __cplusplus
+
+#include <stdint.h>
+#include <complex>
+
+typedef std::complex<int8_t> lv_8sc_t;
+typedef std::complex<int16_t> lv_16sc_t;
+typedef std::complex<int32_t> lv_32sc_t;
+typedef std::complex<int64_t> lv_64sc_t;
+typedef std::complex<float> lv_32fc_t;
+typedef std::complex<double> lv_64fc_t;
+
+template <typename T>
+inline std::complex<T> lv_cmake(const T& r, const T& i)
+{
+ return std::complex<T>(r, i);
+}
+
+template <typename T>
+inline typename T::value_type lv_creal(const T& x)
+{
+ return x.real();
+}
+
+template <typename T>
+inline typename T::value_type lv_cimag(const T& x)
+{
+ return x.imag();
+}
+
+template <typename T>
+inline T lv_conj(const T& x)
+{
+ return std::conj(x);
+}
+
+#else /* __cplusplus */
+
+#include <complex.h>
+#include <tgmath.h>
+
+typedef char complex lv_8sc_t;
+typedef short complex lv_16sc_t;
+typedef long complex lv_32sc_t;
+typedef long long complex lv_64sc_t;
+typedef float complex lv_32fc_t;
+typedef double complex lv_64fc_t;
+
+#define lv_cmake(r, i) ((r) + _Complex_I * (i))
+
+// When GNUC is available, use the complex extensions.
+// The extensions always return the correct value type.
+// http://gcc.gnu.org/onlinedocs/gcc/Complex.html
+#ifdef __GNUC__
+
+#define lv_creal(x) (__real__(x))
+
+#define lv_cimag(x) (__imag__(x))
+
+#define lv_conj(x) (~(x))
+
+// When not available, use the c99 complex function family,
+// which always returns double regardless of the input type,
+// unless we have C99 and thus tgmath.h overriding functions
+// with type-generic versions.
+#else /* __GNUC__ */
+
+#define lv_creal(x) (creal(x))
+
+#define lv_cimag(x) (cimag(x))
+
+#define lv_conj(x) (conj(x))
+
+#endif /* __GNUC__ */
+
+#endif /* __cplusplus */
+
+#endif /* INCLUDED_VOLK_COMPLEX_H */
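
As a quick illustration of the type-agnostic operators, the same code compiles as C99 `_Complex` and as C++ `std::complex`. This is a sketch, not part of the header, and the helper name is invented:

```c
#include <math.h>
#include <volk/volk_complex.h>

static inline float rotated_power(lv_32fc_t sample, float phase_rad)
{
    /* build a unit phasor and rotate the sample */
    const lv_32fc_t phasor = lv_cmake(cosf(phase_rad), sinf(phase_rad));
    const lv_32fc_t rotated = sample * phasor;
    /* |z|^2 via the component accessors; lv_conj(rotated) would give the conjugate */
    return lv_creal(rotated) * lv_creal(rotated) + lv_cimag(rotated) * lv_cimag(rotated);
}
```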
--- /dev/null
+/* -*- c -*- */
+/*
+ * Copyright 2014, 2020 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_VOLK_MALLOC_H
+#define INCLUDED_VOLK_MALLOC_H
+
+#include <stdlib.h>
+#include <volk/volk_common.h>
+
+__VOLK_DECL_BEGIN
+
+/*!
+ * \brief Allocate \p size bytes of data aligned to \p alignment.
+ *
+ * \details
+ * We use C11 and want to rely on C11 library features,
+ * namely we use `aligned_alloc` to allocate aligned memory.
+ * see: https://en.cppreference.com/w/c/memory/aligned_alloc
+ *
+ * Not all platforms support this feature.
+ * For Apple Clang, we fall back to `posix_memalign`.
+ * see: https://linux.die.net/man/3/aligned_alloc
+ * For MSVC, we fall back to `_aligned_malloc`.
+ * see:
+ * https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc?view=vs-2019
+ *
+ * Because of the ways in which volk_malloc may allocate memory, it is
+ * important to always free volk_malloc pointers using volk_free.
+ * This is mainly relevant when MSVC is used; consult the corresponding
+ * documentation if you build with MSVC.
+ *
+ * \param size The number of bytes to allocate.
+ * \param alignment The byte alignment of the allocated memory.
+ * \return pointer to aligned memory.
+ */
+VOLK_API void* volk_malloc(size_t size, size_t alignment);
+
+/*!
+ * \brief Frees memory allocated by volk_malloc.
+ *
+ * \details
+ * We rely on C11 syntax and compilers and just call `free` in case
+ * memory was allocated with `aligned_alloc` or `posix_memalign`.
+ * Thus, in this case `volk_free` inherits the same behavior `free` exhibits.
+ * see: https://en.cppreference.com/w/c/memory/free
+ * In case `_aligned_malloc` was used, we call `_aligned_free`.
+ * see:
+ * https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-free?view=vs-2019
+ *
+ * \param aptr The aligned pointer allocated by volk_malloc.
+ */
+VOLK_API void volk_free(void* aptr);
+
+__VOLK_DECL_END
+
+#endif /* INCLUDED_VOLK_MALLOC_H */
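
A minimal allocation sketch; it assumes `volk_get_alignment()` from `<volk/volk.h>`, which reports the machine's preferred alignment:

```c
#include <volk/volk.h>        /* volk_get_alignment() and the kernels */
#include <volk/volk_malloc.h> /* volk_malloc() / volk_free() */

int main(void)
{
    const unsigned int num_points = 1024;
    const size_t alignment = volk_get_alignment();

    float* in = (float*)volk_malloc(num_points * sizeof(float), alignment);
    float* out = (float*)volk_malloc(num_points * sizeof(float), alignment);

    /* ... fill `in` and run kernels on the aligned buffers ... */

    /* Always pair volk_malloc() with volk_free(); a plain free() is
     * incorrect when the MSVC fallback (_aligned_malloc) was used. */
    volk_free(out);
    volk_free(in);
    return 0;
}
```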
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*
+ * Copyright (c) 2016-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * _vtaylor_polyq_f32
+ * _vlogq_f32
+ *
+ */
+
+/* Copyright (C) 2011 Julien Pommier
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ (this is the zlib license)
+
+ _vsincosq_f32
+
+*/
+
+/*
+ * This file is intended to hold composite NEON intrinsics, i.e. intrinsics
+ * built on top of other intrinsics. They should be used in VOLK kernels to
+ * avoid copy-pasta.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
+#include <arm_neon.h>
+
+
+/* Magnitude squared for float32x4x2_t */
+static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
+{
+ float32x4_t iValue, qValue, result;
+ iValue = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]); // Square the values
+ qValue = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]); // Square the values
+ result = vaddq_f32(iValue, qValue); // Add the I2 and Q2 values
+ return result;
+}
+
+/* Inverse square root for float32x4_t */
+static inline float32x4_t _vinvsqrtq_f32(float32x4_t x)
+{
+ float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
+ sqrt_reciprocal = vmulq_f32(
+ vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+ sqrt_reciprocal = vmulq_f32(
+ vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+
+ return sqrt_reciprocal;
+}
+
+/* Inverse */
+static inline float32x4_t _vinvq_f32(float32x4_t x)
+{
+ // Newton's method
+ float32x4_t recip = vrecpeq_f32(x);
+ recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
+ recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
+ return recip;
+}
+
+/* Complex multiplication for float32x4x2_t */
+static inline float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val,
+ float32x4x2_t b_val)
+{
+ float32x4x2_t tmp_real;
+ float32x4x2_t tmp_imag;
+ float32x4x2_t c_val;
+
+ // multiply the real*real and imag*imag to get real result
+ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+ tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
+ // Multiply cross terms to get the imaginary result
+ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
+ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+ // combine the products
+ c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+ c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+ return c_val;
+}
+
+/* From ARM Compute Library, MIT license */
+static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
+{
+ float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x);
+ float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x);
+ float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x);
+ float32x4_t cD = vmlaq_f32(coeffs[3], coeffs[7], x);
+ float32x4_t x2 = vmulq_f32(x, x);
+ float32x4_t x4 = vmulq_f32(x2, x2);
+ float32x4_t res = vmlaq_f32(vmlaq_f32(cA, cB, x2), vmlaq_f32(cC, cD, x2), x4);
+ return res;
+}
+
+/* Natural logarithm.
+ * From ARM Compute Library, MIT license */
+static inline float32x4_t _vlogq_f32(float32x4_t x)
+{
+ const float32x4_t log_tab[8] = {
+ vdupq_n_f32(-2.29561495781f), vdupq_n_f32(-2.47071170807f),
+ vdupq_n_f32(-5.68692588806f), vdupq_n_f32(-0.165253549814f),
+ vdupq_n_f32(5.17591238022f), vdupq_n_f32(0.844007015228f),
+ vdupq_n_f32(4.58445882797f), vdupq_n_f32(0.0141278216615f),
+ };
+
+ const int32x4_t CONST_127 = vdupq_n_s32(127); // 127
+ const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
+
+ // Extract exponent
+ int32x4_t m = vsubq_s32(
+ vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
+ float32x4_t val =
+ vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
+
+ // Polynomial Approximation
+ float32x4_t poly = _vtaylor_polyq_f32(val, log_tab);
+
+ // Reconstruct
+ poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
+
+ return poly;
+}
+
+/* Evaluation of 4 sines & cosines at once.
+ * Optimized from here (zlib license)
+ * http://gruntthepeon.free.fr/ssemath/ */
+static inline float32x4x2_t _vsincosq_f32(float32x4_t x)
+{
+ const float32x4_t c_minus_cephes_DP1 = vdupq_n_f32(-0.78515625);
+ const float32x4_t c_minus_cephes_DP2 = vdupq_n_f32(-2.4187564849853515625e-4);
+ const float32x4_t c_minus_cephes_DP3 = vdupq_n_f32(-3.77489497744594108e-8);
+ const float32x4_t c_sincof_p0 = vdupq_n_f32(-1.9515295891e-4);
+ const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3);
+ const float32x4_t c_sincof_p2 = vdupq_n_f32(-1.6666654611e-1);
+ const float32x4_t c_coscof_p0 = vdupq_n_f32(2.443315711809948e-005);
+ const float32x4_t c_coscof_p1 = vdupq_n_f32(-1.388731625493765e-003);
+ const float32x4_t c_coscof_p2 = vdupq_n_f32(4.166664568298827e-002);
+ const float32x4_t c_cephes_FOPI = vdupq_n_f32(1.27323954473516); // 4 / M_PI
+
+ const float32x4_t CONST_1 = vdupq_n_f32(1.f);
+ const float32x4_t CONST_1_2 = vdupq_n_f32(0.5f);
+ const float32x4_t CONST_0 = vdupq_n_f32(0.f);
+ const uint32x4_t CONST_2 = vdupq_n_u32(2);
+ const uint32x4_t CONST_4 = vdupq_n_u32(4);
+
+ uint32x4_t emm2;
+
+ uint32x4_t sign_mask_sin, sign_mask_cos;
+ sign_mask_sin = vcltq_f32(x, CONST_0);
+ x = vabsq_f32(x);
+ // scale by 4/pi
+ float32x4_t y = vmulq_f32(x, c_cephes_FOPI);
+
+ // store the integer part of y in mm0
+ emm2 = vcvtq_u32_f32(y);
+ /* j=(j+1) & (~1) (see the cephes sources) */
+ emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
+ emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
+ y = vcvtq_f32_u32(emm2);
+
+ /* get the polynomial selection mask:
+ there is one polynomial for 0 <= x <= Pi/4
+ and another one for Pi/4 < x <= Pi/2.
+ Both branches will be computed. */
+ const uint32x4_t poly_mask = vtstq_u32(emm2, CONST_2);
+
+ // The magic pass: "Extended precision modular arithmetic"
+ x = vmlaq_f32(x, y, c_minus_cephes_DP1);
+ x = vmlaq_f32(x, y, c_minus_cephes_DP2);
+ x = vmlaq_f32(x, y, c_minus_cephes_DP3);
+
+ sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, CONST_4));
+ sign_mask_cos = vtstq_u32(vsubq_u32(emm2, CONST_2), CONST_4);
+
+ /* Evaluate the first polynomial (0 <= x <= Pi/4) in y1,
+ and the second polynomial (Pi/4 < x <= Pi/2) in y2 */
+ float32x4_t y1, y2;
+ float32x4_t z = vmulq_f32(x, x);
+
+ y1 = vmlaq_f32(c_coscof_p1, z, c_coscof_p0);
+ y1 = vmlaq_f32(c_coscof_p2, z, y1);
+ y1 = vmulq_f32(y1, z);
+ y1 = vmulq_f32(y1, z);
+ y1 = vmlsq_f32(y1, z, CONST_1_2);
+ y1 = vaddq_f32(y1, CONST_1);
+
+ y2 = vmlaq_f32(c_sincof_p1, z, c_sincof_p0);
+ y2 = vmlaq_f32(c_sincof_p2, z, y2);
+ y2 = vmulq_f32(y2, z);
+ y2 = vmlaq_f32(x, x, y2);
+
+ /* select the correct result from the two polynomials */
+ const float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
+ const float32x4_t yc = vbslq_f32(poly_mask, y2, y1);
+
+ float32x4x2_t sincos;
+ sincos.val[0] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
+ sincos.val[1] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
+
+ return sincos;
+}
+
+static inline float32x4_t _vsinq_f32(float32x4_t x)
+{
+ const float32x4x2_t sincos = _vsincosq_f32(x);
+ return sincos.val[0];
+}
+
+static inline float32x4_t _vcosq_f32(float32x4_t x)
+{
+ const float32x4x2_t sincos = _vsincosq_f32(x);
+ return sincos.val[1];
+}
+
+static inline float32x4_t _vtanq_f32(float32x4_t x)
+{
+ const float32x4x2_t sincos = _vsincosq_f32(x);
+ return vmulq_f32(sincos.val[0], _vinvq_f32(sincos.val[1]));
+}
+
+static inline float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc,
+ float32x4_t acc,
+ float32x4_t val,
+ float32x4_t rec,
+ float32x4_t aux)
+{
+ aux = vmulq_f32(aux, val);
+ aux = vsubq_f32(aux, acc);
+ aux = vmulq_f32(aux, aux);
+#ifdef LV_HAVE_NEONV8
+ return vfmaq_f32(sq_acc, aux, rec);
+#else
+ aux = vmulq_f32(aux, rec);
+ return vaddq_f32(sq_acc, aux);
+#endif
+}
+
+#endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2011, 2012, 2015, 2019, 2020 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_VOLK_PREFS_H
+#define INCLUDED_VOLK_PREFS_H
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <volk/volk_common.h>
+
+__VOLK_DECL_BEGIN
+
+typedef struct volk_arch_pref {
+ char name[128]; // name of the kernel
+ char impl_a[128]; // best aligned impl
+ char impl_u[128]; // best unaligned impl
+} volk_arch_pref_t;
+
+////////////////////////////////////////////////////////////////////////
+// get path to volk_config profiling info; the second argument specifies
+// whether the config file should be checked for existence / readability.
+// returns \0 in the argument on failure.
+////////////////////////////////////////////////////////////////////////
+VOLK_API void volk_get_config_path(char*, bool);
+
+////////////////////////////////////////////////////////////////////////
+// load prefs into global prefs struct
+////////////////////////////////////////////////////////////////////////
+VOLK_API size_t volk_load_preferences(volk_arch_pref_t**);
+
+__VOLK_DECL_END
+
+#endif // INCLUDED_VOLK_PREFS_H
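
A short sketch of walking the loaded preference table (illustrative only; whether the caller frees the table is an assumption, flagged in the comment):

```c
#include <stdio.h>
#include <stdlib.h>
#include <volk/volk_prefs.h>

static void print_volk_prefs(void)
{
    volk_arch_pref_t* prefs = NULL;
    const size_t count = volk_load_preferences(&prefs);

    for (size_t i = 0; i < count; i++) {
        printf("%s: aligned=%s unaligned=%s\n",
               prefs[i].name, prefs[i].impl_a, prefs[i].impl_u);
    }
    free(prefs); /* assumption: the loader heap-allocates the table for the caller */
}
```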
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*
+ * This file is intended to hold composite SSE3 intrinsics, i.e. intrinsics
+ * built on top of other intrinsics. They should be used in VOLK kernels to
+ * avoid copy-pasta.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_
+#include <pmmintrin.h>
+
+static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y)
+{
+ __m128 yl, yh, tmp1, tmp2;
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+ return _mm_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+}
+
+static inline __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
+{
+ const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+ y = _mm_xor_ps(y, conjugator); // conjugate y
+ return _mm_complexmul_ps(x, y);
+}
+
+static inline __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
+{
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+ return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+}
+
+static inline __m128 _mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
+{
+ return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2));
+}
+
+static inline __m128 _mm_scaled_norm_dist_ps_sse3(const __m128 symbols0,
+ const __m128 symbols1,
+ const __m128 points0,
+ const __m128 points1,
+ const __m128 scalar)
+{
+ /*
+ * Calculate: |y - x|^2 * SNR_lin
+ * Consider 'symbolsX' and 'pointsX' to be complex float
+ * 'symbolsX' are 'y' and 'pointsX' are 'x'
+ */
+ const __m128 diff0 = _mm_sub_ps(symbols0, points0);
+ const __m128 diff1 = _mm_sub_ps(symbols1, points1);
+ const __m128 norms = _mm_magnitudesquared_ps_sse3(diff0, diff1);
+ return _mm_mul_ps(norms, scalar);
+}
+
+#endif /* INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*
+ * This file is intended to hold composite SSE intrinsics, i.e. intrinsics
+ * built on top of other intrinsics. They should be used in VOLK kernels to
+ * avoid copy-pasta.
+ */
+
+#ifndef INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
+#define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
+#include <xmmintrin.h>
+
+static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
+{
+ __m128 iValue, qValue;
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+ iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
+ return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+}
+
+static inline __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
+{
+ return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
+}
+
+static inline __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0,
+ const __m128 symbols1,
+ const __m128 points0,
+ const __m128 points1,
+ const __m128 scalar)
+{
+ // calculate scalar * |x - y|^2
+ const __m128 diff0 = _mm_sub_ps(symbols0, points0);
+ const __m128 diff1 = _mm_sub_ps(symbols1, points1);
+ const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1);
+ return _mm_mul_ps(norms, scalar);
+}
+
+static inline __m128 _mm_accumulate_square_sum_ps(
+ __m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
+{
+ aux = _mm_mul_ps(aux, val);
+ aux = _mm_sub_ps(aux, acc);
+ aux = _mm_mul_ps(aux, aux);
+ aux = _mm_mul_ps(aux, rec);
+ return _mm_add_ps(sq_acc, aux);
+}
+
+#endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */
--- /dev/null
+/* -*- C -*- */
+/*
+ * Copyright 2020 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: GPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_VOLK_VERSION_H
+#define INCLUDED_VOLK_VERSION_H
+
+#include <volk/volk_common.h>
+
+__VOLK_DECL_BEGIN
+
+/*
+ * define macros for the Volk version, which can then be used by any
+ * project that #include's this header, e.g., to determine whether
+ * some specific API is present and functional.
+ */
+
+#define VOLK_VERSION_MAJOR @VOLK_VERSION_MAJOR@
+#define VOLK_VERSION_MINOR @VOLK_VERSION_MINOR@
+#define VOLK_VERSION_MAINT @VOLK_VERSION_MAINT@
+
+/*
+ * VOLK_VERSION % 100 is the MAINT version
+ * (VOLK_VERSION / 100) % 100 is the MINOR version
+ * (VOLK_VERSION / 100) / 100 is the MAJOR version
+ */
+
+#define VOLK_VERSION @VOLK_VERSION_MAJOR@@VOLK_VERSION_MINOR@@VOLK_VERSION_MAINT@
+
+__VOLK_DECL_END
+
+#endif /* INCLUDED_VOLK_VERSION_H */
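
For example, assuming the minor and maintenance fields are zero-padded to two digits (which the layout comment above implies), version 3.1.2 expands to a `VOLK_VERSION` of 30102, so a downstream project can gate code on it:

```c
#include <volk/volk_version.h>

#if VOLK_VERSION >= 30102
/* rely on an API that appeared in 3.1.2 or later */
#else
/* fall back to the older behaviour */
#endif
```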
--- /dev/null
+# How to create custom kernel dispatchers
+
+A kernel dispatcher is a kernel implementation that calls other kernel implementations.
+By default, a dispatcher is generated by the build system for every kernel such that:
+ * the best aligned implementation is called when all pointer arguments are aligned,
+ * and otherwise the best unaligned implementation is called.
+
+The author of a VOLK kernel may create a custom dispatcher,
+to be called in place of the automatically generated one.
+A custom dispatcher may be useful to handle head and tail cases,
+or to implement different alignment and bounds checking logic.
+
+## Code for an example dispatcher w/ tail case
+
+```cpp
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_DISPATCHER
+
+static inline void volk_32f_x2_add_32f_dispatcher(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
+{
+ const unsigned int num_points_r = num_points%4;
+ const unsigned int num_points_x = num_points - num_points_r;
+
+ if (volk_is_aligned(VOLK_OR_PTR(cVector, VOLK_OR_PTR(aVector, bVector))))
+ {
+ volk_32f_x2_add_32f_a(cVector, aVector, bVector, num_points_x);
+ }
+ else
+ {
+ volk_32f_x2_add_32f_u(cVector, aVector, bVector, num_points_x);
+ }
+
+ volk_32f_x2_add_32f_g(cVector+num_points_x, aVector+num_points_x, bVector+num_points_x, num_points_r);
+}
+
+#endif //LV_HAVE_DISPATCHER
+```
+
+## Code for an example dispatcher w/ tail case and accumulator
+
+```cpp
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_DISPATCHER
+
+static inline void volk_32f_x2_dot_prod_32f_dispatcher(float * result, const float * input, const float * taps, unsigned int num_points)
+{
+ const unsigned int num_points_r = num_points%16;
+ const unsigned int num_points_x = num_points - num_points_r;
+
+ if (volk_is_aligned(VOLK_OR_PTR(input, taps)))
+ {
+ volk_32f_x2_dot_prod_32f_a(result, input, taps, num_points_x);
+ }
+ else
+ {
+ volk_32f_x2_dot_prod_32f_u(result, input, taps, num_points_x);
+ }
+
+ float result_tail = 0;
+ volk_32f_x2_dot_prod_32f_g(&result_tail, input+num_points_x, taps+num_points_x, num_points_r);
+
+ *result += result_tail;
+}
+
+#endif //LV_HAVE_DISPATCHER
+```
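
Whether the dispatcher is generated or hand-written, callers reach it through the kernel's public entry point. A minimal caller could look like this (a sketch; it assumes aligned buffers from `volk_malloc` and the alignment reported by `volk_get_alignment`):

```c
#include <volk/volk.h>

void add_example(unsigned int num_points)
{
    const size_t alignment = volk_get_alignment();
    float* a = (float*)volk_malloc(num_points * sizeof(float), alignment);
    float* b = (float*)volk_malloc(num_points * sizeof(float), alignment);
    float* c = (float*)volk_malloc(num_points * sizeof(float), alignment);

    /* ... fill a and b ... */

    /* The public entry point invokes the dispatcher (custom or generated),
       which then selects an aligned or unaligned implementation. */
    volk_32f_x2_add_32f(c, a, b, num_points);

    volk_free(c);
    volk_free(b);
    volk_free(a);
}
```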
--- /dev/null
+@ static inline void volk_16i_max_star_horizontal_16i_a_neonasm(short* cVector, const short* aVector, unsigned int num_points);
+ .global volk_16i_max_star_horizontal_16i_a_neonasm
+volk_16i_max_star_horizontal_16i_a_neonasm:
+ @ r0 - cVector: pointer to output array
+ @ r1 - aVector: pointer to input array 1
+ @ r2 - num_points: number of items to process
+
+ pld [r1, #128]
+ push {r4, r5, r6} @ preserve register states
+ lsrs r5, r2, #4 @ 1/16th points = num_points/16
+ vmov.i32 q12, #0 @ q12 = [0,0,0,0]
+ beq .smallvector @ less than 16 elements in vector
+ mov r4, r1 @ r4 = aVector
+ mov r12, r0 @ gcc calls this ip
+ mov r3, #0 @ number = 0
+
+.loop1:
+ vld2.16 {d16-d19}, [r4]! @ aVector, interleaved load
+ pld [r4, #128]
+ add r3, r3, #1 @ number += 1
+ cmp r3, r5 @ number < 1/16th points
+ vsub.i16 q10, q8, q9 @ subtraction
+ vcge.s16 q11, q10, #0 @ result > 0?
+ vcgt.s16 q10, q12, q10 @ result < 0?
+ vand.i16 q11, q8, q11 @ multiply by comparisons
+ vand.i16 q10, q9, q10 @ multiply by other comparison
+ vadd.i16 q10, q11, q10 @ add results to get max
+ vst1.16 {d20-d21}, [r12]! @ store the results
+ bne .loop1 @ at least 16 items left
+ add r1, r1, r3, lsl #5
+ add r0, r0, r3, lsl #4
+.smallvector:
+ ands r2, r2, #15
+ beq .end
+ mov r3, #0
+.loop3:
+ ldrh r4, [r1]
+ bic r5, r3, #1
+ ldrh ip, [r1, #2]
+ add r3, r3, #2
+ add r1, r1, #4
+ rsb r6, ip, r4
+ sxth r6, r6
+ cmp r6, #0
+ movgt ip, r4
+ cmp r3, r2
+ strh ip, [r0, r5]
+ bcc .loop3
+.end:
+ pop {r4, r5, r6}
+ bx lr
--- /dev/null
+@ static inline void volk_32f_s32f_multiply_32f_a_neonasm(float* cVector, const float* aVector, const float scalar, unsigned int num_points);
+ .global volk_32f_s32f_multiply_32f_a_neonasm
+volk_32f_s32f_multiply_32f_a_neonasm:
+ @ r0 - cVector: pointer to output array
+ @ r1 - aVector: pointer to input array 1
+ @ r2 - scalar: the scalar multiplier (its location depends on the float ABI; see the note below)
+ @ r3 - num_points: number of items to process
+
+ stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12} @ prologue - save register states
+
+
+ @ quarter_points = num_points / 4
+ movs r11, r3, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ @ number = quarter_points
+ mov r10, r3
+ @ copy address of input vector
+ mov r4, r1
+ @ copy address of output vector
+ mov r5, r0
+
+ @ load the scalar to a quad register
+ @ vmov.32 d2[0], r2
+ @ The scalar might be in s0, not totally sure
+ vdup.32 q2, d0[0]
+
+ @ this is giving fits. Current theory is hf has something to do with it
+ .loop1:
+ @ vld1.32 {q1}, [r4:128]! @ aVal
+ @ vmul.f32 q3, q1, q2
+ @ vst1.32 {q3}, [r5:128]! @ cVal
+ @
+ @ subs r10, r10, #1
+ @ bne .loop1 @ first loop
+
+ @ number = quarter_points * 4
+ mov r10, r11, asl #2
+
+ .loop2:
+ @ cmp num_points, number
+ @ bls .done
+ @
+ @ vld1.32 {d0[0]}, [aVector]!
+ @ vmul.f32 s2, s0, s4
+ @ vst1.32 {d1[0]}, [cVector]!
+ @ add number, number, #1
+ @ b .loop2
+
+.done:
+ ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12} @ epilogue - restore register states
+ bx lr
--- /dev/null
+@ static inline void volk_32f_x2_add_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+ .global volk_32f_x2_add_32f_a_neonasm
+volk_32f_x2_add_32f_a_neonasm:
+ @ r0 - cVector: pointer to output array
+ @ r1 - aVector: pointer to input array 1
+ @ r2 - bVector: pointer to input array 2
+ @ r3 - num_points: number of items to process
+ cVector .req r0
+ aVector .req r1
+ bVector .req r2
+ num_points .req r3
+ quarterPoints .req r7
+ number .req r8
+ aVal .req q0 @ d0-d1
+ bVal .req q1 @ d2-d3
+ cVal .req q2 @ d4-d5
+
+ @ AAPCS Section 5.1.1
+ @ A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP
+ stmfd sp!, {r7, r8, sl} @ prologue - save register states
+
+ movs quarterPoints, num_points, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ mov number, #0 @ number, 0
+.loop1:
+ pld [aVector, #128] @ pre-load hint - this is implementation specific!
+ pld [bVector, #128] @ pre-load hint - this is implementation specific!
+
+ vld1.32 {d0-d1}, [aVector:128]! @ aVal
+ add number, number, #1
+ vld1.32 {d2-d3}, [bVector:128]! @ bVal
+ vadd.f32 cVal, bVal, aVal
+ cmp number, quarterPoints
+ vst1.32 {d4-d5}, [cVector:128]! @ cVal
+
+ blt .loop1 @ first loop
+
+ mov number, quarterPoints, asl #2
+
+.loop2:
+ cmp num_points, number
+ bls .done
+
+ vld1.32 {d0[0]}, [aVector]!
+ vld1.32 {d0[1]}, [bVector]!
+ vadd.f32 s2, s1, s0
+ vst1.32 {d1[0]}, [cVector]!
+ add number, number, #1
+ b .loop2
+
+.done:
+ ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states
+ bx lr
--- /dev/null
+@ static inline void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+ .global volk_32f_x2_add_32f_a_neonpipeline
+volk_32f_x2_add_32f_a_neonpipeline:
+ @ r0 - cVector: pointer to output array
+ @ r1 - aVector: pointer to input array 1
+ @ r2 - bVector: pointer to input array 2
+ @ r3 - num_points: number of items to process
+ cVector .req r0
+ aVector .req r1
+ bVector .req r2
+ num_points .req r3
+ quarterPoints .req r7
+ number .req r8
+ aVal .req q0 @ d0-d1
+ bVal .req q1 @ d2-d3
+ cVal .req q2 @ d4-d5
+
+ stmfd sp!, {r7, r8, sl} @ prologue - save register states
+
+ pld [aVector, #128] @ pre-load hint - this is implementation specific!
+ pld [bVector, #128] @ pre-load hint - this is implementation specific!
+
+ movs quarterPoints, num_points, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ mov number, quarterPoints
+
+ @ Optimizing for pipeline
+ vld1.32 {d0-d1}, [aVector:128]! @ aVal
+ vld1.32 {d2-d3}, [bVector:128]! @ bVal
+ subs number, number, #1
+ beq .flushpipe
+
+.loop1:
+ pld [aVector, #128] @ pre-load hint - this is implementation specific!
+ pld [bVector, #128] @ pre-load hint - this is implementation specific!
+ vadd.f32 cVal, bVal, aVal
+ vld1.32 {d0-d1}, [aVector:128]! @ aVal
+ vld1.32 {d2-d3}, [bVector:128]! @ bVal
+ vst1.32 {d4-d5}, [cVector:128]! @ cVal
+
+ subs number, number, #1
+ bne .loop1 @ first loop
+
+.flushpipe:
+ @ One more time
+ vadd.f32 cVal, bVal, aVal
+ vst1.32 {d4-d5}, [cVector:128]! @ cVal
+
+ mov number, quarterPoints, asl #2
+
+.loop2:
+ cmp num_points, number
+ bls .done
+
+ vld1.32 {d0[0]}, [aVector]!
+ vld1.32 {d0[1]}, [bVector]!
+ vadd.f32 s2, s1, s0
+ vst1.32 {d1[0]}, [cVector]!
+ add number, number, #1
+ b .loop2
+
+.done:
+ ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states
+ bx lr
--- /dev/null
+@ static inline void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+ .global volk_32f_x2_dot_prod_32f_a_neonasm
+volk_32f_x2_dot_prod_32f_a_neonasm:
+ @ r0 - cVector: pointer to output array
+ @ r1 - aVector: pointer to input array 1
+ @ r2 - bVector: pointer to input array 2
+ @ r3 - num_points: number of items to process
+ cVector .req r0
+ aVector .req r1
+ bVector .req r2
+ num_points .req r3
+ quarterPoints .req r7
+ number .req r8
+ aVal .req q0 @ d0-d1
+ bVal .req q1 @ d2-d3
+ cVal .req q2 @ d4-d5
+
+ @ AAPCS Section 5.1.1
+ @ A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP
+ stmfd sp!, {r7, r8, sl} @ prologue - save register states
+
+ veor.32 q0, q0, q0
+ movs quarterPoints, num_points, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ mov number, #0 @ number, 0
+.loop1:
+ pld [aVector, #128] @ pre-load hint - this is implementation specific!
+ pld [bVector, #128] @ pre-load hint - this is implementation specific!
+
+ vld1.32 {q1}, [aVector:128]! @ aVal
+ vld1.32 {q2}, [bVector:128]! @ bVal
+ vmla.f32 q0, q1, q2
+
+ add number, number, #1
+ cmp number, quarterPoints
+ blt .loop1 @ first loop
+
+ @ strange order comes from trying to schedule instructions
+ vadd.f32 s0, s0, s1
+ vadd.f32 s2, s2, s3
+ mov number, quarterPoints, asl #2
+ vadd.f32 s0, s0, s2
+
+.loop2:
+ cmp num_points, number
+ bls .done
+
+ vld1.32 {d1[0]}, [aVector]!
+ vld1.32 {d1[1]}, [bVector]!
+ vmla.f32 s0, s2, s3
+ add number, number, #1
+ b .loop2
+
+.done:
+ vstr s0, [cVector]
+ ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states
+ bx lr
--- /dev/null
+@ static inline void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
+ @ r0 = cVector
+ @ r1 = aVector
+ @ r2 = bVector
+ @ r3 = num_points
+ .global volk_32f_x2_dot_prod_32f_a_neonasm_opts
+volk_32f_x2_dot_prod_32f_a_neonasm_opts:
+ push {r4, r5, r6, r7, r8, r9, r10, r11}
+ @ sixteenth_points = num_points / 16
+ lsrs r8, r3, #4
+ sub r13, r13, #16 @ reserve 16 bytes of stack space (r13 is sp)
+ @ 0 out neon accumulators
+ veor q0, q3, q3
+ veor q1, q3, q3
+ veor q2, q3, q3
+ veor q3, q3, q3
+ beq .smallvector @ if less than 16 points skip main loop
+ mov r7, r2 @ copy input ptrs
+ mov r6, r1 @ copy input ptrs
+ mov r5, #0 @ loop counter
+.mainloop:
+ vld4.32 {d16,d18,d20,d22}, [r6]!
+ add r5, r5, #1 @ inc loop counter
+ cmp r5, r8 @ loop counter < sixteenth_points?
+ vld4.32 {d24,d26,d28,d30}, [r7]!
+ vld4.32 {d17,d19,d21,d23}, [r6]!
+ vld4.32 {d25,d27,d29,d31}, [r7]!
+ vmla.f32 q3, q8, q12
+ vmla.f32 q0, q13, q9
+ vmla.f32 q1, q14, q10
+ vmla.f32 q2, q15, q11
+ bne .mainloop
+ lsl r12, r8, #6 @ r12 = sixteenth_points * 64 = bytes consumed per input
+ add r1, r1, r12
+ add r2, r2, r12
+.smallvector: @ actually this can be skipped for small vectors
+ vadd.f32 q3, q3, q0
+ lsl r8, r8, #4 @ sixteenth_points * 16
+ cmp r3, r8 @ num_points < sixteenth_points*16?
+ vadd.f32 q2, q1, q2
+ vadd.f32 q3, q2, q3 @ sum of 4 accumulators in to q3
+ vadd.f32 s15, s12, s15 @ q3 is s12-s15, so reduce to a single float
+ vadd.f32 s15, s15, s13
+ vadd.f32 s15, s15, s14
+ bls .done @ if vector is multiple of 16 then finish
+ sbfx r11, r1, #2, #1 @ check alignment
+ rsb r9, r8, r3
+ and r11, r11, #3
+ mov r6, r1
+ cmp r11, r9
+ movcs r11, r9
+ cmp r9, #3
+ movls r11, r9
+ cmp r11, #0
+ beq .nothingtodo
+ mov r5, r2
+ mov r12, r8
+.dlabel5:
+ add r12, r12, #1
+ vldmia r6!, {s14}
+ rsb r4, r8, r12
+ vldmia r5!, {s13}
+ cmp r4, r11
+ vmla.f32 s15, s13, s14
+ mov r7, r6
+ mov r4, r5
+ bcc .dlabel5
+ cmp r9, r11
+ beq .done
+.dlabel8:
+ rsb r9, r11, r9
+ lsr r8, r9, #2
+ lsls r10, r8, #2
+ beq .dlabel6
+ lsl r6, r11, #2
+ veor q8, q8, q8
+ add r1, r1, r6
+ add r6, r2, r6
+ mov r5, #0
+.dlabel9:
+ add r5, r5, #1
+ vld1.32 {d20-d21}, [r6]!
+ cmp r5, r8
+ vld1.64 {d18-d19}, [r1 :64]!
+ vmla.f32 q8, q10, q9
+ bcc .dlabel9
+ vadd.f32 d16, d16, d17
+ lsl r2, r10, #2
+ veor q9, q9, q9
+ add r7, r7, r2
+ vpadd.f32 d6, d16, d16
+ add r4, r4, r2
+ cmp r9, r10
+ add r12, r12, r10
+ vadd.f32 s15, s15, s12
+ beq .done
+.dlabel6:
+ mov r2, r7
+.dlabel7:
+ add r12, r12, #1
+ vldmia r2!, {s13}
+ cmp r3, r12
+ vldmia r4!, {s14}
+ vmla.f32 s15, s13, s14
+ bhi .dlabel7
+.done:
+ vstr s15, [r0]
+ add r13, r13, #16
+ pop {r4, r5, r6, r7, r8, r9, r10, r11}
+ bx lr @ lr is the return address
+.nothingtodo:
+ mov r12, r8
+ mov r4, r2
+ mov r7, r1
+ b .dlabel8
+
--- /dev/null
+@ static inline void volk_32fc_32f_dot_prod_32fc_a_neonasm(lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
+ .global volk_32fc_32f_dot_prod_32fc_a_neonasm
+ volk_32fc_32f_dot_prod_32fc_a_neonasm:
+ @ r0 - result: pointer to output array (32fc)
+ @ r1 - input: pointer to input array 1 (32fc)
+ @ r2 - taps: pointer to input array 2 (32f)
+ @ r3 - num_points: number of items to process
+
+ result .req r0
+ input .req r1
+ taps .req r2
+ num_points .req r3
+ quarterPoints .req r7
+ number .req r8
+ @ Note that according to the ARM EABI (AAPCS) Section 5.1.1:
+ @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls;
+ @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved
+ @ registers d16-d31 (q8-q15), if present, do not need to be preserved.
+ realAccQ .req q0 @ d0-d1/s0-s3
+ compAccQ .req q1 @ d2-d3/s4-s7
+ realAccS .req s0 @ d0[0]
+ compAccS .req s4 @ d2[0]
+ tapsVal .req q2 @ d4-d5
+ outVal .req q3 @ d6-d7
+ realMul .req q8 @ d16-d17
+ compMul .req q9 @ d18-d19
+ inRealVal .req q10 @ d20-d21
+ inCompVal .req q11 @ d22-d23
+
+ stmfd sp!, {r7, r8, sl} @ prologue - save register states
+
+ veor realAccQ, realAccQ @ zero out accumulators
+ veor compAccQ, compAccQ @ zero out accumulators
+ movs quarterPoints, num_points, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ mov number, quarterPoints
+
+.loop1:
+ @ do work here
+ @pld [taps, #128] @ pre-load hint - this is implementation specific!
+ @pld [input, #128] @ pre-load hint - this is implementation specific!
+ vld1.32 {d4-d5}, [taps:128]! @ tapsVal
+ vld2.32 {d20-d23}, [input:128]! @ inRealVal, inCompVal
+ vmul.f32 realMul, tapsVal, inRealVal
+ vmul.f32 compMul, tapsVal, inCompVal
+ vadd.f32 realAccQ, realAccQ, realMul
+ vadd.f32 compAccQ, compAccQ, compMul
+ subs number, number, #1
+ bne .loop1 @ first loop
+
+ @ Sum up across realAccQ and compAccQ
+ vpadd.f32 d0, d0, d1 @ realAccQ +-> d0
+ vpadd.f32 d2, d2, d3 @ compAccQ +-> d2
+ vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ)
+ vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ)
+ @ critical values are now in s0 (realAccS), s4 (compAccS)
+ mov number, quarterPoints, asl #2
+
+.loop2:
+ cmp num_points, number
+ bls .done
+
+ vld1.32 {d4[0]}, [taps]! @ s8
+ vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12
+ vmul.f32 s5, s8, s10
+ vmul.f32 s6, s8, s12
+ vadd.f32 realAccS, realAccS, s5
+ vadd.f32 compAccS, compAccS, s6
+
+ add number, number, #1
+ b .loop2
+
+.done:
+ vst1.32 {d0[0]}, [result]! @ realAccS
+ vst1.32 {d2[0]}, [result] @ compAccS
+
+ ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states
+ bx lr
--- /dev/null
+@ static inline void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
+ .global volk_32fc_32f_dot_prod_32fc_a_neonasmvmla
+volk_32fc_32f_dot_prod_32fc_a_neonasmvmla:
+ @ r0 - result: pointer to output array (32fc)
+ @ r1 - input: pointer to input array 1 (32fc)
+ @ r2 - taps: pointer to input array 2 (32f)
+ @ r3 - num_points: number of items to process
+
+ result .req r0
+ input .req r1
+ taps .req r2
+ num_points .req r3
+ quarterPoints .req r7
+ number .req r8
+ @ Note that according to the ARM EABI (AAPCS) Section 5.1.1:
+ @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls;
+ @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved
+ @ registers d16-d31 (q8-q15), if present, do not need to be preserved.
+ realAccQ .req q0 @ d0-d1/s0-s3
+ compAccQ .req q1 @ d2-d3/s4-s7
+ realAccS .req s0 @ d0[0]
+ compAccS .req s4 @ d2[0]
+ tapsVal .req q2 @ d4-d5
+ outVal .req q3 @ d6-d7
+ realMul .req q8 @ d16-d17
+ compMul .req q9 @ d18-d19
+ inRealVal .req q10 @ d20-d21
+ inCompVal .req q11 @ d22-d23
+
+ stmfd sp!, {r7, r8, sl} @ prologue - save register states
+
+ veor realAccQ, realAccQ @ zero out accumulators
+ veor compAccQ, compAccQ @ zero out accumulators
+ movs quarterPoints, num_points, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ mov number, quarterPoints
+
+.loop1:
+ @ do work here
+ pld [taps, #128] @ pre-load hint - this is implementation specific!
+ pld [input, #128] @ pre-load hint - this is implementation specific!
+ vld1.32 {tapsVal}, [taps:128]! @ tapsVal
+ vld2.32 {inRealVal-inCompVal}, [input:128]! @ inRealVal, inCompVal
+ vmla.f32 realAccQ, tapsVal, inRealVal
+ vmla.f32 compAccQ, tapsVal, inCompVal
+ subs number, number, #1
+ bne .loop1 @ first loop
+
+ @ Sum up across realAccQ and compAccQ
+ vadd.f32 d0, d0, d1 @ realAccQ +-> d0
+ vadd.f32 d2, d2, d3 @ compAccQ +-> d2
+ vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ)
+ vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ)
+ @ critical values are now in s0 (realAccS), s4 (compAccS)
+ mov number, quarterPoints, asl #2
+.loop2:
+ cmp num_points, number
+ bls .done
+
+ vld1.32 {d4[0]}, [taps]! @ s8
+ vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12
+ vmla.f32 realAccS, s8, s10 @ d0[0]
+ vmla.f32 compAccS, s8, s12 @ d2[0]
+
+ add number, number, #1
+ b .loop2
+
+.done:
+ vst1.32 {d0[0]}, [result]! @ realAccS
+ vst1.32 {d2[0]}, [result] @ compAccS
+
+ ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states
+ bx lr
--- /dev/null
+@ static inline void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
+ .global volk_32fc_32f_dot_prod_32fc_a_neonpipeline
+volk_32fc_32f_dot_prod_32fc_a_neonpipeline:
+ @ r0 - result: pointer to output array (32fc)
+ @ r1 - input: pointer to input array 1 (32fc)
+ @ r2 - taps: pointer to input array 2 (32f)
+ @ r3 - num_points: number of items to process
+
+ result .req r0
+ input .req r1
+ taps .req r2
+ num_points .req r3
+ quarterPoints .req r7
+ number .req r8
+ @ Note that according to the ARM EABI (AAPCS) Section 5.1.1:
+ @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls;
+ @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved
+ @ registers d16-d31 (q8-q15), if present, do not need to be preserved.
+ realAccQ .req q0 @ d0-d1/s0-s3
+ compAccQ .req q1 @ d2-d3/s4-s7
+ realAccS .req s0 @ d0[0]
+ compAccS .req s4 @ d2[0]
+ tapsVal .req q2 @ d4-d5
+ outVal .req q3 @ d6-d7
+ realMul .req q8 @ d16-d17
+ compMul .req q9 @ d18-d19
+ inRealVal .req q10 @ d20-d21
+ inCompVal .req q11 @ d22-d23
+
+ stmfd sp!, {r7, r8, sl} @ prologue - save register states
+
+ pld [taps, #128] @ pre-load hint - this is implementation specific!
+ pld [input, #128] @ pre-load hint - this is implementation specific!
+
+ veor realAccQ, realAccQ @ zero out accumulators
+ veor compAccQ, compAccQ @ zero out accumulators
+ movs quarterPoints, num_points, lsr #2
+ beq .loop2 @ if zero into quarterPoints
+
+ @mov number, quarterPoints
+ mov number, #0
+ @ Optimizing for pipeline
+ vld1.32 {tapsVal}, [taps:128]! @ tapsVal
+ vld2.32 {inRealVal-inCompVal}, [input:128]! @ inRealVal, inCompVal
+ add number, number, #1
+
+.loop1:
+ @ do work here
+ pld [taps, #128] @ pre-load hint - this is implementation specific!
+ pld [input, #128] @ pre-load hint - this is implementation specific!
+ vmul.f32 realMul, tapsVal, inRealVal
+ vmul.f32 compMul, tapsVal, inCompVal
+ vadd.f32 realAccQ, realAccQ, realMul
+ vadd.f32 compAccQ, compAccQ, compMul
+ vld1.32 {tapsVal}, [taps:128]! @ tapsVal
+ vld2.32 {inRealVal-inCompVal}, [input:128]! @ inRealVal, inCompVal
+
+ @subs number, number, #1
+ @bls .loop1 @ first loop
+ add number, number, #1
+ cmp number, quarterPoints
+ blt .loop1
+
+ vmul.f32 realMul, tapsVal, inRealVal
+ vmul.f32 compMul, tapsVal, inCompVal
+ vadd.f32 realAccQ, realAccQ, realMul
+ vadd.f32 compAccQ, compAccQ, compMul
+
+ @ Sum up across realAccQ and compAccQ
+ vadd.f32 d0, d0, d1 @ realAccQ +-> d0
+ vadd.f32 d2, d2, d3 @ compAccQ +-> d2
+ vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ)
+ vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ)
+
+ @ critical values are now in s0 (realAccS), s4 (compAccS)
+ mov number, quarterPoints, asl #2
+ cmp num_points, number
+ beq .done
+
+.loop2:
+ vld1.32 {d4[0]}, [taps]! @ s8
+ vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12
+ vmul.f32 s5, s8, s10
+ vmul.f32 s6, s8, s12
+ vadd.f32 realAccS, realAccS, s5
+ vadd.f32 compAccS, compAccS, s6
+
+ add number, number, #1
+ cmp number, num_points
+ blt .loop2
+
+.done:
+ vst1.32 {d0[0]}, [result]! @ realAccS
+ vst1.32 {d2[0]}, [result] @ compAccS
+
+ ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states
+ bx lr
--- /dev/null
+@ static inline void volk_32fc_32f_dot_prod_32fc_a_unrollasm(lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points);
+.global volk_32fc_32f_dot_prod_32fc_a_unrollasm
+volk_32fc_32f_dot_prod_32fc_a_unrollasm:
+ @ r0 - result: pointer to output array (32fc)
+ @ r1 - input: pointer to input array 1 (32fc)
+ @ r2 - taps: pointer to input array 2 (32f)
+ @ r3 - num_points: number of items to process
+
+ push {r4, r5, r6, r7, r8, r9}
+ vpush {q4-q7}
+ sub r13, r13, #56 @ 0x38
+ add r12, r13, #8
+ lsrs r8, r3, #3
+ veor.32 q2, q5, q5
+ veor.32 q3, q5, q5
+ veor.32 q4, q5, q5
+ veor.32 q5, q5, q5
+ beq .smallvector
+ vld2.32 {d20-d23}, [r1]!
+ vld1.32 {d24-d25}, [r2]!
+ mov r5, #1
+
+
+
+.mainloop:
+ vld2.32 {d14-d17}, [r1]! @ q7,q8
+ vld1.32 {d18-d19}, [r2]! @ q9
+
+ vmul.f32 q0, q12, q10 @ real mult
+ vmul.f32 q1, q12, q11 @ imag mult
+
+ add r5, r5, #1
+ cmp r5, r8
+
+ vadd.f32 q4, q4, q0@ q4 accumulates real
+ vadd.f32 q5, q5, q1@ q5 accumulates imag
+
+ vld2.32 {d20-d23}, [r1]! @ q10-q11
+ vld1.32 {d24-d25}, [r2]! @ q12
+
+ vmul.f32 q13, q9, q7
+ vmul.f32 q14, q9, q8
+ vadd.f32 q2, q2, q13 @ q2 accumulates real
+ vadd.f32 q3, q3, q14 @ q3 accumulates imag
+
+
+
+ bne .mainloop
+
+ vmul.f32 q0, q12, q10 @ real mult
+ vmul.f32 q1, q12, q11 @ imag mult
+
+ vadd.f32 q4, q4, q0@ q4 accumulates real
+ vadd.f32 q5, q5, q1@ q5 accumulates imag
+
+
+.smallvector:
+ vadd.f32 q0, q2, q4
+ add r12, r13, #24
+ lsl r8, r8, #3
+ vadd.f32 q1, q3, q5
+ cmp r3, r8
+
+ vadd.f32 d0, d0, d1
+ vadd.f32 d1, d2, d3
+ vadd.f32 s14, s0, s1
+ vadd.f32 s15, s2, s3
+
+ vstr s14, [r13]
+ vstr s15, [r13, #4]
+ bls .D1
+ rsb r12, r8, r3
+ lsr r4, r12, #2
+ cmp r4, #0
+ cmpne r12, #3
+ lsl r5, r4, #2
+ movhi r6, #0
+ movls r6, #1
+ bls .L1
+ vmov.i32 q10, #0 @ 0x00000000
+ mov r9, r1
+ mov r7, r2
+ vorr q11, q10, q10
+
+.smallloop:
+ add r6, r6, #1
+ vld2.32 {d16-d19}, [r9]!
+ cmp r4, r6
+ vld1.32 {d24-d25}, [r7]!
+ vmla.f32 q11, q12, q8
+ vmla.f32 q10, q12, q9
+ bhi .smallloop
+ vmov.i32 q9, #0 @ 0x00000000
+ cmp r12, r5
+ vadd.f32 d20, d20, d21
+ add r8, r8, r5
+ vorr q8, q9, q9
+ add r1, r1, r5, lsl #3
+ vadd.f32 d22, d22, d23
+ add r2, r2, r5, lsl #2
+ vpadd.f32 d18, d20, d20
+ vpadd.f32 d16, d22, d22
+ vmov.32 r4, d18[0]
+ vmov.32 r12, d16[0]
+ vmov s13, r4
+ vadd.f32 s15, s13, s15
+ vmov s13, r12
+ vadd.f32 s14, s13, s14
+ beq .finishreduction
+ .L1:
+ add r12, r8, #1
+ vldr s13, [r2]
+ cmp r3, r12
+ vldr s11, [r1]
+ vldr s12, [r1, #4]
+ vmla.f32 s14, s13, s11
+ vmla.f32 s15, s13, s12
+ bls .finishreduction
+ add r8, r8, #2
+ vldr s13, [r2, #4]
+ cmp r3, r8
+ vldr s11, [r1, #8]
+ vldr s12, [r1, #12]
+ vmla.f32 s14, s13, s11
+ vmla.f32 s15, s13, s12
+ bls .finishreduction
+ vldr s13, [r2, #8]
+ vldr s11, [r1, #16]
+ vldr s12, [r1, #20]
+ vmla.f32 s14, s13, s11
+ vmla.f32 s15, s13, s12
+
+.finishreduction:
+ vstr s14, [r13]
+ vstr s15, [r13, #4]
+ .D1:
+ ldr r3, [r13]
+ str r3, [r0]
+ ldr r3, [r13, #4]
+ str r3, [r0, #4]
+ add r13, r13, #56 @ 0x38
+ vpop {q4-q7}
+ pop {r4, r5, r6, r7, r8, r9}
+ bx r14
+
+
--- /dev/null
+@ static inline void volk_32fc_x2_dot_prod_32fc_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
+ .global volk_32fc_x2_dot_prod_32fc_neonasm
+volk_32fc_x2_dot_prod_32fc_neonasm:
+ push {r4, r5, r6, r7, r8, lr}
+ vpush {q0-q7}
+ vpush {q8-q15}
+ mov r8, r3 @ hold on to num_points (r8)
+ @ zero out accumulators -- leave 1 reg in alu
+ veor q8, q15, q15
+ mov r7, r0 @ (r7) is cVec
+ veor q9, q15, q15
+ mov r5, r1 @ (r5) is aVec
+ veor q10, q15, q15
+ mov r6, r2 @ (r6) is bVec
+ veor q11, q15, q15
+ lsrs r3, r3, #3 @ eighth_points (r3) = num_points/8
+ veor q12, q15, q15
+ mov r12, r2 @ (r12) is bVec
+ veor q13, q15, q15
+ mov r4, r1 @ (r4) is aVec
+ veor q14, q15, q15
+ veor q15, q15, q15
+ beq .smallvector @ nathan optimized this file based on an objdump
+ @ but I don't understand this jump. Seems like it should go to loop2
+ @ and smallvector (really vector reduction) shouldn't need to be a label
+ mov r2, #0 @ 0 out r2 (now number)
+.loop1:
+ add r2, r2, #1 @ increment number
+ vld4.32 {d0,d2,d4,d6}, [r12]! @ q0-q3
+ cmp r2, r3 @ is number < eighth_points
+ @pld [r12, #64]
+ vld4.32 {d8,d10,d12,d14}, [r4]! @ q4-q7
+ @pld [r4, #64]
+ vmla.f32 q12, q4, q0 @ real (re*re)
+ vmla.f32 q14, q4, q1 @ imag (re*im)
+ vmls.f32 q15, q5, q1 @ real (im*im)
+ vmla.f32 q13, q5, q0 @ imag (im*re)
+
+ vmla.f32 q8, q2, q6 @ real (re*re)
+ vmla.f32 q9, q2, q7 @ imag (re*im)
+ vmls.f32 q10, q3, q7 @ real (im*im)
+ vmla.f32 q11, q3, q6 @ imag (im*re)
+ bne .loop1
+ lsl r2, r3, #3 @ r2 = eighth_points * 8
+ add r6, r6, r2 @ bVec = bVec + eighth_points -- whyyyyy gcc?!?
+ add r5, r5, r2 @ aVec = aVec + eighth_points
+ @ q12-q13 were original real accumulators
+ @ q14-q15 were original imag accumulators
+ @ reduce 8 accumulators down to 2 (1 real, 1 imag)
+ vadd.f32 q8, q10, q8 @ real + real
+ vadd.f32 q11, q11, q9 @ imag + imag
+ vadd.f32 q12, q12, q15 @ real + real
+ vadd.f32 q14, q14, q13 @ imag + imag
+ vadd.f32 q8, q8, q12
+ vadd.f32 q9, q9, q14
+.smallvector:
+ lsl r4, r3, #3
+ cmp r8, r4
+ vst2.32 {d16-d19}, [sp :64] @ whaaaaat? no way this is necessary!
+ vldr s15, [sp, #8]
+ vldr s17, [sp]
+ vldr s16, [sp, #4]
+ vadd.f32 s17, s17, s15
+ vldr s11, [sp, #12]
+ vldr s12, [sp, #24]
+ vldr s13, [sp, #28]
+ vldr s14, [sp, #16]
+ vldr s15, [sp, #20]
+ vadd.f32 s16, s16, s11
+ vadd.f32 s17, s17, s12
+ vadd.f32 s16, s16, s13
+ vadd.f32 s17, s17, s14
+ vadd.f32 s16, s16, s15
+ vstr s17, [r7]
+ vstr s16, [r7, #4]
+ bls .done
+.loop2:
+ mov r3, r6
+ add r6, r6, #8
+ vldr s0, [r3]
+ vldr s1, [r6, #-4]
+ mov r3, r5
+ add r5, r5, #8
+ vldr s2, [r3]
+ vldr s3, [r5, #-4]
+ bl __mulsc3 @ GCC/Clang built-in. Portability?
+ add r4, r4, #1
+ cmp r4, r8
+ vadd.f32 s17, s17, s0
+ vadd.f32 s16, s16, s1
+ vstr s17, [r7]
+ vstr s16, [r7, #4]
+ bne .loop2
+.done:
+ vpop {q8-q15}
+ vpop {q0-q7}
+ pop {r4, r5, r6, r7, r8, pc}
+
--- /dev/null
+@ static inline void volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
+.global volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests
+volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests:
+ push {r4, r5, r6, r7, r8, r9, sl, fp, lr}
+ vpush {d8-d15}
+ lsrs fp, r3, #3
+ sub sp, sp, #52 @ 0x34
+ mov r9, r3
+ mov sl, r0
+ mov r7, r1
+ mov r8, r2
+ vorr q0, q7, q7
+ vorr q1, q7, q7
+ vorr q2, q7, q7
+ vorr q3, q7, q7
+ vorr q4, q7, q7
+ vorr q5, q7, q7
+ veor q6, q7, q7
+ vorr q7, q7, q7
+ beq .smallvector
+ mov r4, r1
+ mov ip, r2
+ mov r3, #0
+.mainloop:
+ @mov r6, ip
+ @mov r5, r4
+ vld4.32 {d24,d26,d28,d30}, [r6]!
+ @add ip, ip, #64 @ 0x40
+ @add r4, r4, #64 @ 0x40
+ vld4.32 {d16,d18,d20,d22}, [r5]!
+ add r3, r3, #1
+ vld4.32 {d25,d27,d29,d31}, [r6]!
+ vld4.32 {d17,d19,d21,d23}, [r5]!
+ vmla.f32 q6, q8, q12
+ vmla.f32 q0, q9, q12
+ cmp r3, fp
+ vmls.f32 q5, q13, q9
+ vmla.f32 q2, q13, q8
+ vmla.f32 q7, q10, q14
+ vmla.f32 q1, q11, q14
+ vmls.f32 q4, q15, q11
+ vmla.f32 q3, q15, q10
+ bne .mainloop
+ lsl r3, fp, #6
+ add r8, r8, r3
+ add r7, r7, r3
+.smallvector:
+ vadd.f32 q3, q2, q3
+ add r3, sp, #16
+ lsl r4, fp, #3
+ vadd.f32 q4, q5, q4
+ cmp r9, r4
+ vadd.f32 q6, q6, q7
+ vadd.f32 q1, q0, q1
+ vadd.f32 q8, q6, q4
+ vadd.f32 q9, q1, q3
+ vst2.32 {d16-d19}, [r3 :64]
+ vldr s15, [sp, #24]
+ vldr s16, [sp, #16]
+ vldr s17, [sp, #20]
+ vadd.f32 s16, s16, s15
+ vldr s11, [sp, #28]
+ vldr s12, [sp, #40] @ 0x28
+ vldr s13, [sp, #44] @ 0x2c
+ vldr s14, [sp, #32]
+ vldr s15, [sp, #36] @ 0x24
+ vadd.f32 s17, s17, s11
+ vadd.f32 s16, s16, s12
+ vadd.f32 s17, s17, s13
+ vadd.f32 s16, s16, s14
+ vadd.f32 s17, s17, s15
+ vstr s16, [sl]
+ vstr s17, [sl, #4]
+ bls .epilog
+ add r5, sp, #8
+.tailcase:
+ ldr r3, [r7], #8
+ mov r0, r5
+ ldr r1, [r8], #8
+ add r4, r4, #1
+ ldr ip, [r7, #-4]
+ ldr r2, [r8, #-4]
+ str ip, [sp]
+ bl __mulsc3
+ vldr s14, [sp, #8]
+ vldr s15, [sp, #12]
+ vadd.f32 s16, s16, s14
+ cmp r4, r9
+ vadd.f32 s17, s17, s15
+ vstr s16, [sl]
+ vstr s17, [sl, #4]
+ bne .tailcase
+.epilog:
+ add sp, sp, #52 @ 0x34
+ vpop {d8-d15}
+ pop {r4, r5, r6, r7, r8, r9, sl, fp, pc}
--- /dev/null
+@ static inline void volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
+ .global volk_32fc_x2_multiply_32fc_a_neonasm
+volk_32fc_x2_multiply_32fc_a_neonasm:
+ push {r4, r5, r6, r7, r8, r9, r14}
+ lsrs r7, r3, #2
+ @ r0 is c vector
+ @ r1 is a vector
+ @ r2 is b vector
+ @ r3 is num_points
+ @ r7 is quarter_points
+ beq .smallvector
+ mov r5, #0
+.mainloop:
+ vld2.32 {d24-d27}, [r1]! @ ar=q12, ai=q13
+ add r5, r5, #1
+ cmp r5, r7
+ vld2.32 {d20-d23}, [r2]! @ br=q10, bi=q11
+ pld [r1]
+ pld [r2]
+ vmul.f32 q0, q12, q10 @ q0 = ar*br
+ vmul.f32 q1, q13, q11 @ q1 = ai*bi
+ vmul.f32 q2, q12, q11 @ q2 = ar*bi
+ vmul.f32 q3, q13, q10 @ q3 = ai*br
+ vsub.f32 q9, q0, q1 @ real
+ vadd.f32 q10, q2, q3 @ imag
+ vst2.32 {q9-q10}, [r0]!
+ bne .mainloop
+
+.smallvector:
+ lsl r5, r7, #2 @ r5 = quarter_points * 4
+ cmp r3, r5 @ num_points == quarter_points?
+ bls .done
+.tailcase:
+ add r5, r5, #1 @ r5 +=1 <- number++
+ vld1.32 d1, [r1]! @ s2, s3 = ar, ai
+ vld1.32 d0, [r2]! @ s0, s1 = br, bi
+ vmul.f32 s4, s0, s2 @ s4 = ar*br
+ vmul.f32 s5, s0, s3 @ s5 = ar*bi
+ vmls.f32 s4, s1, s3 @ s4 = s4 - ai*bi
+ vmla.f32 s5, s1, s2 @ s5 = s5 + ai*br
+ @vst2.32 d2[0], [r0]!
+ vst1.32 {d2}, [r0]!
+ cmp r3, r5 @ r3 == r5? num_points == number?
+ bne .tailcase
+.done:
+        pop {r4, r5, r6, r7, r8, r9, pc} @ popping into pc returns directly
--- /dev/null
+.function volk_16ic_deinterleave_16i_x2_a_orc_impl
+.dest 2 idst
+.dest 2 qdst
+.source 4 src
+splitlw qdst, idst, src
--- /dev/null
+.function volk_16ic_deinterleave_real_8i_a_orc_impl
+.dest 1 dst
+.source 4 src
+.temp 2 iw
+select0lw iw, src
+convhwb dst, iw
--- /dev/null
+.function volk_16ic_magnitude_16i_a_orc_impl
+.source 4 src
+.dest 2 dst
+.floatparam 4 scalar
+.temp 8 iql
+.temp 8 iqf
+.temp 8 prodiqf
+.temp 4 qf
+.temp 4 if
+.temp 4 sumf
+.temp 4 rootf
+.temp 4 rootl
+
+x2 convswl iql, src
+x2 convlf iqf, iql
+x2 divf iqf, iqf, scalar
+x2 mulf prodiqf, iqf, iqf
+splitql qf, if, prodiqf
+addf sumf, if, qf
+sqrtf rootf, sumf
+mulf rootf, rootf, scalar
+convfl rootl, rootf
+convlw dst, rootl
--- /dev/null
+.function volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl
+.dest 4 idst
+.dest 4 qdst
+.source 4 src
+.floatparam 4 scalar
+.temp 8 iql
+.temp 8 iqf
+
+x2 convswl iql, src
+x2 convlf iqf, iql
+x2 divf iqf, iqf, scalar
+splitql qdst, idst, iqf
--- /dev/null
+.function volk_16ic_magnitude_32f_a_orc_impl
+.source 4 src
+.dest 4 dst
+.floatparam 4 scalar
+.temp 4 reall
+.temp 4 imagl
+.temp 2 reals
+.temp 2 imags
+.temp 4 realf
+.temp 4 imagf
+.temp 4 sumf
+
+splitlw reals, imags, src
+convswl reall, reals
+convswl imagl, imags
+convlf realf, reall
+convlf imagf, imagl
+divf realf, realf, scalar
+divf imagf, imagf, scalar
+mulf realf, realf, realf
+mulf imagf, imagf, imagf
+addf sumf, realf, imagf
+sqrtf dst, sumf
--- /dev/null
+.function volk_16u_byteswap_a_orc_impl
+.dest 2 dst
+swapw dst, dst
--- /dev/null
+.function volk_32f_s32f_add_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.floatparam 4 scalar
+addf dst, src1, scalar
--- /dev/null
+.function volk_32f_s32f_multiply_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.floatparam 4 scalar
+mulf dst, src1, scalar
--- /dev/null
+.function volk_32f_s32f_normalize_a_orc_impl
+.source 4 src1
+.floatparam 4 invscalar
+.dest 4 dst
+mulf dst, src1, invscalar
--- /dev/null
+.function volk_32f_sqrt_32f_a_orc_impl
+.source 4 src
+.dest 4 dst
+sqrtf dst, src
--- /dev/null
+.function volk_32f_x2_add_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+addf dst, src1, src2
--- /dev/null
+.function volk_32f_x2_divide_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+divf dst, src1, src2
--- /dev/null
+.function volk_32f_x2_dot_prod_32f_a_orc_impl
+.source 4 src1
+.source 4 src2
+.dest 4 dst
+.accumulator 4 accum
+addf dst, src1, src2
--- /dev/null
+.function volk_32f_x2_max_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+maxf dst, src1, src2
--- /dev/null
+.function volk_32f_x2_min_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+minf dst, src1, src2
--- /dev/null
+.function volk_32f_x2_multiply_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+mulf dst, src1, src2
--- /dev/null
+.function volk_32f_x2_subtract_32f_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+subf dst, src1, src2
--- /dev/null
+.function volk_32fc_32f_multiply_32fc_a_orc_impl
+.source 8 src1
+.source 4 src2
+.dest 8 dst
+.temp 8 tmp
+mergelq tmp, src2, src2
+x2 mulf dst, src1, tmp
--- /dev/null
+.function volk_32fc_magnitude_32f_a_orc_impl
+.source 8 src
+.dest 4 dst
+.temp 8 iqf
+.temp 8 prodiqf
+.temp 4 qf
+.temp 4 if
+.temp 4 sumf
+
+x2 mulf prodiqf, src, src
+splitql qf, if, prodiqf
+addf sumf, if, qf
+sqrtf dst, sumf
--- /dev/null
+.function volk_32fc_s32fc_multiply_32fc_a_orc_impl
+.source 8 src1
+.floatparam 8 scalar
+.dest 8 dst
+.temp 8 iqprod
+.temp 4 real
+.temp 4 imag
+.temp 4 ac
+.temp 4 bd
+.temp 8 swapped
+x2 mulf iqprod, src1, scalar
+splitql bd, ac, iqprod
+subf real, ac, bd
+swaplq swapped, src1
+x2 mulf iqprod, swapped, scalar
+splitql bd, ac, iqprod
+addf imag, ac, bd
+mergelq dst, real, imag
--- /dev/null
+.function volk_32fc_x2_multiply_32fc_a_orc_impl
+.source 8 src1
+.source 8 src2
+.dest 8 dst
+.temp 8 iqprod
+.temp 4 real
+.temp 4 imag
+.temp 4 ac
+.temp 4 bd
+.temp 8 swapped
+x2 mulf iqprod, src1, src2
+splitql bd, ac, iqprod
+subf real, ac, bd
+swaplq swapped, src1
+x2 mulf iqprod, swapped, src2
+splitql bd, ac, iqprod
+addf imag, ac, bd
+mergelq dst, real, imag
--- /dev/null
+.function volk_32i_x2_and_32i_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+andl dst, src1, src2
--- /dev/null
+.function volk_32i_x2_or_32i_a_orc_impl
+.dest 4 dst
+.source 4 src1
+.source 4 src2
+orl dst, src1, src2
--- /dev/null
+.function volk_8i_convert_16i_a_orc_impl
+.source 1 src
+.dest 2 dst
+.temp 2 tmp
+convsbw tmp, src
+shlw dst, tmp, 8
--- /dev/null
+.function volk_8i_s32f_convert_32f_a_orc_impl
+.source 1 src
+.dest 4 dst
+.floatparam 4 scalar
+.temp 4 flsrc
+.temp 4 lsrc
+.temp 2 ssrc
+convsbw ssrc, src
+convswl lsrc, ssrc
+convlf flsrc, lsrc
+mulf dst, flsrc, scalar
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16i_32fc_dot_prod_32fc
+ *
+ * \b Overview
+ *
+ * This block computes the dot product (or inner product) between two
+ * vectors, the \p input and \p taps vectors. Given a set of \p
+ * num_points taps, the result is the sum of products between the two
+ * vectors. The result is a single value stored in the \p result
+ * address and will be complex.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16i_32fc_dot_prod_32fc(lv_32fc_t* result, const short* input, const lv_32fc_t
+ * * taps, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li input: vector of shorts.
+ * \li taps: complex taps.
+ * \li num_points: number of samples in both \p input and \p taps.
+ *
+ * \b Outputs
+ * \li result: pointer to a complex value to hold the dot product result.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ *
+ * <FIXME>
+ *
+ * volk_16i_32fc_dot_prod_32fc();
+ *
+ * \endcode
+ */
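+
+/* A minimal usage sketch (illustrative only; the buffer setup assumes the
+ * volk_malloc()/volk_free() helpers and arbitrary fill values):
+ *
+ *   unsigned int N = 10000;
+ *   size_t alignment = volk_get_alignment();
+ *   short* input = (short*)volk_malloc(sizeof(short) * N, alignment);
+ *   lv_32fc_t* taps = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
+ *   lv_32fc_t result;
+ *   for (unsigned int i = 0; i < N; i++) {
+ *       input[i] = (short)(i % 128);       // arbitrary test data
+ *       taps[i] = lv_cmake(1.0f, 0.0f);    // unit taps
+ *   }
+ *   volk_16i_32fc_dot_prod_32fc(&result, input, taps, N);
+ *   volk_free(input);
+ *   volk_free(taps);
+ */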
+
+#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
+#define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
+
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result,
+ const short* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ static const int N_UNROLL = 4;
+
+ lv_32fc_t acc0 = 0;
+ lv_32fc_t acc1 = 0;
+ lv_32fc_t acc2 = 0;
+ lv_32fc_t acc3 = 0;
+
+ unsigned i = 0;
+ unsigned n = (num_points / N_UNROLL) * N_UNROLL;
+
+ for (i = 0; i < n; i += N_UNROLL) {
+ acc0 += taps[i + 0] * (float)input[i + 0];
+ acc1 += taps[i + 1] * (float)input[i + 1];
+ acc2 += taps[i + 2] * (float)input[i + 2];
+ acc3 += taps[i + 3] * (float)input[i + 3];
+ }
+
+ for (; i < num_points; i++) {
+ acc0 += taps[i] * (float)input[i];
+ }
+
+ *result = acc0 + acc1 + acc2 + acc3;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result,
+ const short* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned ii;
+ unsigned quarter_points = num_points / 4;
+ lv_32fc_t* tapsPtr = (lv_32fc_t*)taps;
+ short* inputPtr = (short*)input;
+ lv_32fc_t accumulator_vec[4];
+
+ float32x4x2_t tapsVal, accumulator_val;
+ int16x4_t input16;
+ int32x4_t input32;
+ float32x4_t input_float, prod_re, prod_im;
+
+ accumulator_val.val[0] = vdupq_n_f32(0.0);
+ accumulator_val.val[1] = vdupq_n_f32(0.0);
+
+ for (ii = 0; ii < quarter_points; ++ii) {
+ tapsVal = vld2q_f32((float*)tapsPtr);
+ input16 = vld1_s16(inputPtr);
+ // widen 16-bit int to 32-bit int
+ input32 = vmovl_s16(input16);
+        // convert 32-bit int to float
+ input_float = vcvtq_f32_s32(input32);
+
+ prod_re = vmulq_f32(input_float, tapsVal.val[0]);
+ prod_im = vmulq_f32(input_float, tapsVal.val[1]);
+
+ accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
+ accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
+
+ tapsPtr += 4;
+ inputPtr += 4;
+ }
+ vst2q_f32((float*)accumulator_vec, accumulator_val);
+ accumulator_vec[0] += accumulator_vec[1];
+ accumulator_vec[2] += accumulator_vec[3];
+ accumulator_vec[0] += accumulator_vec[2];
+
+ for (ii = quarter_points * 4; ii < num_points; ++ii) {
+ accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
+ }
+
+ *result = accumulator_vec[0];
+}
+
+#endif /*LV_HAVE_NEON*/
+
+#if LV_HAVE_SSE && LV_HAVE_MMX
+
+static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result,
+ const short* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 8;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const short* aPtr = input;
+ const float* bPtr = (float*)taps;
+
+ __m64 m0, m1;
+ __m128 f0, f1, f2, f3;
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
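+        // Convert the 8 real-valued shorts to floats; the duplicated
+        // conversions plus the unpacks below repeat every sample so it lines
+        // up with both the real and imaginary parts of its complex tap.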
+ m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
+ m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
+ f0 = _mm_cvtpi16_ps(m0);
+ f1 = _mm_cvtpi16_ps(m0);
+ f2 = _mm_cvtpi16_ps(m1);
+ f3 = _mm_cvtpi16_ps(m1);
+
+ a0Val = _mm_unpacklo_ps(f0, f1);
+ a1Val = _mm_unpackhi_ps(f0, f1);
+ a2Val = _mm_unpacklo_ps(f2, f3);
+ a3Val = _mm_unpackhi_ps(f2, f3);
+
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr + 4);
+ b2Val = _mm_loadu_ps(bPtr + 8);
+ b3Val = _mm_loadu_ps(bPtr + 12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 8;
+ bPtr += 16;
+ }
+
+ _mm_empty(); // clear the mmx technology state
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+
+ number = sixteenthPoints * 8;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr) * (*bPtr++));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
+
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+
+static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
+ const short* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const short* aPtr = input;
+ const float* bPtr = (float*)taps;
+
+ __m128i m0, m1;
+ __m256i f0, f1;
+ __m256 g0, g1, h0, h1, h2, h3;
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ m0 = _mm_loadu_si128((__m128i const*)aPtr);
+ m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
+
+ f0 = _mm256_cvtepi16_epi32(m0);
+ g0 = _mm256_cvtepi32_ps(f0);
+ f1 = _mm256_cvtepi16_epi32(m1);
+ g1 = _mm256_cvtepi32_ps(f1);
+
+ h0 = _mm256_unpacklo_ps(g0, g0);
+ h1 = _mm256_unpackhi_ps(g0, g0);
+ h2 = _mm256_unpacklo_ps(g1, g1);
+ h3 = _mm256_unpackhi_ps(g1, g1);
+
+ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
+ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
+ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
+ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
+
+ b0Val = _mm256_loadu_ps(bPtr);
+ b1Val = _mm256_loadu_ps(bPtr + 8);
+ b2Val = _mm256_loadu_ps(bPtr + 16);
+ b3Val = _mm256_loadu_ps(bPtr + 24);
+
+ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 32;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr) * (*bPtr++));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
+
+
+#ifdef LV_HAVE_AVX2
+
+static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result,
+ const short* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const short* aPtr = input;
+ const float* bPtr = (float*)taps;
+
+ __m128i m0, m1;
+ __m256i f0, f1;
+ __m256 g0, g1, h0, h1, h2, h3;
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 c0Val, c1Val, c2Val, c3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ m0 = _mm_loadu_si128((__m128i const*)aPtr);
+ m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
+
+ f0 = _mm256_cvtepi16_epi32(m0);
+ g0 = _mm256_cvtepi32_ps(f0);
+ f1 = _mm256_cvtepi16_epi32(m1);
+ g1 = _mm256_cvtepi32_ps(f1);
+
+ h0 = _mm256_unpacklo_ps(g0, g0);
+ h1 = _mm256_unpackhi_ps(g0, g0);
+ h2 = _mm256_unpacklo_ps(g1, g1);
+ h3 = _mm256_unpackhi_ps(g1, g1);
+
+ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
+ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
+ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
+ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
+
+ b0Val = _mm256_loadu_ps(bPtr);
+ b1Val = _mm256_loadu_ps(bPtr + 8);
+ b2Val = _mm256_loadu_ps(bPtr + 16);
+ b3Val = _mm256_loadu_ps(bPtr + 24);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ c2Val = _mm256_mul_ps(a2Val, b2Val);
+ c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 32;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr) * (*bPtr++));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_MMX
+
+
+static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result,
+ const short* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 8;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const short* aPtr = input;
+ const float* bPtr = (float*)taps;
+
+ __m64 m0, m1;
+ __m128 f0, f1, f2, f3;
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
+ m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
+ f0 = _mm_cvtpi16_ps(m0);
+ f1 = _mm_cvtpi16_ps(m0);
+ f2 = _mm_cvtpi16_ps(m1);
+ f3 = _mm_cvtpi16_ps(m1);
+
+ a0Val = _mm_unpacklo_ps(f0, f1);
+ a1Val = _mm_unpackhi_ps(f0, f1);
+ a2Val = _mm_unpacklo_ps(f2, f3);
+ a3Val = _mm_unpackhi_ps(f2, f3);
+
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr + 4);
+ b2Val = _mm_load_ps(bPtr + 8);
+ b3Val = _mm_load_ps(bPtr + 12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 8;
+ bPtr += 16;
+ }
+
+ _mm_empty(); // clear the mmx technology state
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+
+ number = sixteenthPoints * 8;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr) * (*bPtr++));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
+
+#ifdef LV_HAVE_AVX2
+
+static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result,
+ const short* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const short* aPtr = input;
+ const float* bPtr = (float*)taps;
+
+ __m128i m0, m1;
+ __m256i f0, f1;
+ __m256 g0, g1, h0, h1, h2, h3;
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 c0Val, c1Val, c2Val, c3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ m0 = _mm_load_si128((__m128i const*)aPtr);
+ m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
+
+ f0 = _mm256_cvtepi16_epi32(m0);
+ g0 = _mm256_cvtepi32_ps(f0);
+ f1 = _mm256_cvtepi16_epi32(m1);
+ g1 = _mm256_cvtepi32_ps(f1);
+
+ h0 = _mm256_unpacklo_ps(g0, g0);
+ h1 = _mm256_unpackhi_ps(g0, g0);
+ h2 = _mm256_unpacklo_ps(g1, g1);
+ h3 = _mm256_unpackhi_ps(g1, g1);
+
+ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
+ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
+ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
+ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
+
+ b0Val = _mm256_load_ps(bPtr);
+ b1Val = _mm256_load_ps(bPtr + 8);
+ b2Val = _mm256_load_ps(bPtr + 16);
+ b3Val = _mm256_load_ps(bPtr + 24);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ c2Val = _mm256_mul_ps(a2Val, b2Val);
+ c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 32;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr) * (*bPtr++));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+
+#endif /*LV_HAVE_AVX2*/
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+
+static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
+ const short* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const short* aPtr = input;
+ const float* bPtr = (float*)taps;
+
+ __m128i m0, m1;
+ __m256i f0, f1;
+ __m256 g0, g1, h0, h1, h2, h3;
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ m0 = _mm_load_si128((__m128i const*)aPtr);
+ m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
+
+ f0 = _mm256_cvtepi16_epi32(m0);
+ g0 = _mm256_cvtepi32_ps(f0);
+ f1 = _mm256_cvtepi16_epi32(m1);
+ g1 = _mm256_cvtepi32_ps(f1);
+
+ h0 = _mm256_unpacklo_ps(g0, g0);
+ h1 = _mm256_unpackhi_ps(g0, g0);
+ h2 = _mm256_unpacklo_ps(g1, g1);
+ h3 = _mm256_unpackhi_ps(g1, g1);
+
+ a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
+ a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
+ a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
+ a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
+
+ b0Val = _mm256_load_ps(bPtr);
+ b1Val = _mm256_load_ps(bPtr + 8);
+ b2Val = _mm256_load_ps(bPtr + 16);
+ b3Val = _mm256_load_ps(bPtr + 24);
+
+ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 32;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr) * (*bPtr++));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+
+#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
+
+
+#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16i_branch_4_state_8
+ *
+ * \b Overview
+ *
+ * <FIXME>
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16i_branch_4_state_8(short* target, short* src0, char** permuters, short*
+ * cntl2, short* cntl3, short* scalars) \endcode
+ *
+ * \b Inputs
+ * \li src0: <FIXME>
+ * \li permuters: <FIXME>
+ * \li cntl2: <FIXME>
+ * \li cntl3: <FIXME>
+ * \li scalars: <FIXME>
+ *
+ * \b Outputs
+ * \li target: <FIXME>
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ *
+ * volk_16i_branch_4_state_8();
+ *
+ * volk_free(x);
+ * volk_free(t);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16i_branch_4_state_8_a_H
+#define INCLUDED_volk_16i_branch_4_state_8_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSSE3
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
+
+static inline void volk_16i_branch_4_state_8_a_ssse3(short* target,
+ short* src0,
+ char** permuters,
+ short* cntl2,
+ short* cntl3,
+ short* scalars)
+{
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11;
+ __m128i *p_target, *p_src0, *p_cntl2, *p_cntl3, *p_scalars;
+
+ p_target = (__m128i*)target;
+ p_src0 = (__m128i*)src0;
+ p_cntl2 = (__m128i*)cntl2;
+ p_cntl3 = (__m128i*)cntl3;
+ p_scalars = (__m128i*)scalars;
+
+ xmm0 = _mm_load_si128(p_scalars);
+
+ xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+ xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+ xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+ xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+
+ xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+ xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+ xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+ xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+
+ xmm0 = _mm_load_si128((__m128i*)permuters[0]);
+ xmm6 = _mm_load_si128((__m128i*)permuters[1]);
+ xmm8 = _mm_load_si128((__m128i*)permuters[2]);
+ xmm10 = _mm_load_si128((__m128i*)permuters[3]);
+
+ xmm5 = _mm_load_si128(p_src0);
+ xmm0 = _mm_shuffle_epi8(xmm5, xmm0);
+ xmm6 = _mm_shuffle_epi8(xmm5, xmm6);
+ xmm8 = _mm_shuffle_epi8(xmm5, xmm8);
+ xmm10 = _mm_shuffle_epi8(xmm5, xmm10);
+
+ xmm5 = _mm_add_epi16(xmm1, xmm2);
+
+ xmm6 = _mm_add_epi16(xmm2, xmm6);
+ xmm8 = _mm_add_epi16(xmm1, xmm8);
+
+ xmm7 = _mm_load_si128(p_cntl2);
+ xmm9 = _mm_load_si128(p_cntl3);
+
+ xmm0 = _mm_add_epi16(xmm5, xmm0);
+
+ xmm7 = _mm_and_si128(xmm7, xmm3);
+ xmm9 = _mm_and_si128(xmm9, xmm4);
+
+ xmm5 = _mm_load_si128(&p_cntl2[1]);
+ xmm11 = _mm_load_si128(&p_cntl3[1]);
+
+ xmm7 = _mm_add_epi16(xmm7, xmm9);
+
+ xmm5 = _mm_and_si128(xmm5, xmm3);
+ xmm11 = _mm_and_si128(xmm11, xmm4);
+
+ xmm0 = _mm_add_epi16(xmm0, xmm7);
+
+
+ xmm7 = _mm_load_si128(&p_cntl2[2]);
+ xmm9 = _mm_load_si128(&p_cntl3[2]);
+
+ xmm5 = _mm_add_epi16(xmm5, xmm11);
+
+ xmm7 = _mm_and_si128(xmm7, xmm3);
+ xmm9 = _mm_and_si128(xmm9, xmm4);
+
+ xmm6 = _mm_add_epi16(xmm6, xmm5);
+
+
+ xmm5 = _mm_load_si128(&p_cntl2[3]);
+ xmm11 = _mm_load_si128(&p_cntl3[3]);
+
+ xmm7 = _mm_add_epi16(xmm7, xmm9);
+
+ xmm5 = _mm_and_si128(xmm5, xmm3);
+ xmm11 = _mm_and_si128(xmm11, xmm4);
+
+ xmm8 = _mm_add_epi16(xmm8, xmm7);
+
+ xmm5 = _mm_add_epi16(xmm5, xmm11);
+
+ _mm_store_si128(p_target, xmm0);
+ _mm_store_si128(&p_target[1], xmm6);
+
+ xmm10 = _mm_add_epi16(xmm5, xmm10);
+
+ _mm_store_si128(&p_target[2], xmm8);
+
+ _mm_store_si128(&p_target[3], xmm10);
+}
+
+
+#endif /*LV_HAVE_SSSE3*/
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_16i_branch_4_state_8_generic(short* target,
+ short* src0,
+ char** permuters,
+ short* cntl2,
+ short* cntl3,
+ short* scalars)
+{
+ int i = 0;
+
+ int bound = 4;
+
+ for (; i < bound; ++i) {
+ target[i * 8] = src0[((char)permuters[i][0]) / 2] + ((i + 1) % 2 * scalars[0]) +
+ (((i >> 1) ^ 1) * scalars[1]) + (cntl2[i * 8] & scalars[2]) +
+ (cntl3[i * 8] & scalars[3]);
+ target[i * 8 + 1] = src0[((char)permuters[i][1 * 2]) / 2] +
+ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+ (cntl2[i * 8 + 1] & scalars[2]) +
+ (cntl3[i * 8 + 1] & scalars[3]);
+ target[i * 8 + 2] = src0[((char)permuters[i][2 * 2]) / 2] +
+ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+ (cntl2[i * 8 + 2] & scalars[2]) +
+ (cntl3[i * 8 + 2] & scalars[3]);
+ target[i * 8 + 3] = src0[((char)permuters[i][3 * 2]) / 2] +
+ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+ (cntl2[i * 8 + 3] & scalars[2]) +
+ (cntl3[i * 8 + 3] & scalars[3]);
+ target[i * 8 + 4] = src0[((char)permuters[i][4 * 2]) / 2] +
+ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+ (cntl2[i * 8 + 4] & scalars[2]) +
+ (cntl3[i * 8 + 4] & scalars[3]);
+ target[i * 8 + 5] = src0[((char)permuters[i][5 * 2]) / 2] +
+ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+ (cntl2[i * 8 + 5] & scalars[2]) +
+ (cntl3[i * 8 + 5] & scalars[3]);
+ target[i * 8 + 6] = src0[((char)permuters[i][6 * 2]) / 2] +
+ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+ (cntl2[i * 8 + 6] & scalars[2]) +
+ (cntl3[i * 8 + 6] & scalars[3]);
+ target[i * 8 + 7] = src0[((char)permuters[i][7 * 2]) / 2] +
+ ((i + 1) % 2 * scalars[0]) + (((i >> 1) ^ 1) * scalars[1]) +
+ (cntl2[i * 8 + 7] & scalars[2]) +
+ (cntl3[i * 8 + 7] & scalars[3]);
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16i_branch_4_state_8_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16i_convert_8i
+ *
+ * \b Overview
+ *
+ * Converts 16-bit shorts to 8-bit chars, keeping the most significant byte of
+ * each input (an arithmetic shift right by 8).
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16i_convert_8i(int8_t* outputVector, const int16_t* inputVector, unsigned int
+ * num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The input vector of 16-bit shorts.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li outputVector: The output vector of 8-bit chars.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ *
+ * volk_16i_convert_8i();
+ *
+ * volk_free(x);
+ * volk_free(t);
+ * \endcode
+ */
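+
+/* A minimal usage sketch (illustrative only; assumes volk_malloc()/volk_free()
+ * for aligned buffers and arbitrary fill values):
+ *
+ *   unsigned int N = 10000;
+ *   size_t alignment = volk_get_alignment();
+ *   int16_t* in = (int16_t*)volk_malloc(sizeof(int16_t) * N, alignment);
+ *   int8_t* out = (int8_t*)volk_malloc(sizeof(int8_t) * N, alignment);
+ *   for (unsigned int i = 0; i < N; i++) {
+ *       in[i] = (int16_t)(i * 7);          // arbitrary test data
+ *   }
+ *   volk_16i_convert_8i(out, in, N);       // out[i] == in[i] >> 8
+ *   volk_free(in);
+ *   volk_free(out);
+ */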
+
+#ifndef INCLUDED_volk_16i_convert_8i_u_H
+#define INCLUDED_volk_16i_convert_8i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16i_convert_8i_u_avx2(int8_t* outputVector,
+ const int16_t* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int thirtysecondPoints = num_points / 32;
+
+ int8_t* outputVectorPtr = outputVector;
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m256i inputVal1;
+ __m256i inputVal2;
+ __m256i ret;
+
+ for (; number < thirtysecondPoints; number++) {
+
+        // Load 32 16-bit values with two 256-bit loads
+ inputVal1 = _mm256_loadu_si256((__m256i*)inputPtr);
+ inputPtr += 16;
+ inputVal2 = _mm256_loadu_si256((__m256i*)inputPtr);
+ inputPtr += 16;
+
+ inputVal1 = _mm256_srai_epi16(inputVal1, 8);
+ inputVal2 = _mm256_srai_epi16(inputVal2, 8);
+
+ ret = _mm256_packs_epi16(inputVal1, inputVal2);
+ ret = _mm256_permute4x64_epi64(ret, 0b11011000);
+
+ _mm256_storeu_si256((__m256i*)outputVectorPtr, ret);
+
+ outputVectorPtr += 32;
+ }
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ outputVector[number] = (int8_t)(inputVector[number] >> 8);
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16i_convert_8i_u_sse2(int8_t* outputVector,
+ const int16_t* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ int8_t* outputVectorPtr = outputVector;
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal1;
+ __m128i inputVal2;
+ __m128i ret;
+
+ for (; number < sixteenthPoints; number++) {
+
+ // Load the 16 values
+ inputVal1 = _mm_loadu_si128((__m128i*)inputPtr);
+ inputPtr += 8;
+ inputVal2 = _mm_loadu_si128((__m128i*)inputPtr);
+ inputPtr += 8;
+
+ inputVal1 = _mm_srai_epi16(inputVal1, 8);
+ inputVal2 = _mm_srai_epi16(inputVal2, 8);
+
+ ret = _mm_packs_epi16(inputVal1, inputVal2);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, ret);
+
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (int8_t)(inputVector[number] >> 8);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16i_convert_8i_generic(int8_t* outputVector,
+ const int16_t* inputVector,
+ unsigned int num_points)
+{
+ int8_t* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_16i_convert_8i_u_H */
+#ifndef INCLUDED_volk_16i_convert_8i_a_H
+#define INCLUDED_volk_16i_convert_8i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16i_convert_8i_a_avx2(int8_t* outputVector,
+ const int16_t* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int thirtysecondPoints = num_points / 32;
+
+ int8_t* outputVectorPtr = outputVector;
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m256i inputVal1;
+ __m256i inputVal2;
+ __m256i ret;
+
+ for (; number < thirtysecondPoints; number++) {
+
+        // Load 32 16-bit values with two 256-bit loads
+ inputVal1 = _mm256_load_si256((__m256i*)inputPtr);
+ inputPtr += 16;
+ inputVal2 = _mm256_load_si256((__m256i*)inputPtr);
+ inputPtr += 16;
+
+ inputVal1 = _mm256_srai_epi16(inputVal1, 8);
+ inputVal2 = _mm256_srai_epi16(inputVal2, 8);
+
+ ret = _mm256_packs_epi16(inputVal1, inputVal2);
+ ret = _mm256_permute4x64_epi64(ret, 0b11011000);
+
+ _mm256_store_si256((__m256i*)outputVectorPtr, ret);
+
+ outputVectorPtr += 32;
+ }
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ outputVector[number] = (int8_t)(inputVector[number] >> 8);
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16i_convert_8i_a_sse2(int8_t* outputVector,
+ const int16_t* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ int8_t* outputVectorPtr = outputVector;
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal1;
+ __m128i inputVal2;
+ __m128i ret;
+
+ for (; number < sixteenthPoints; number++) {
+
+ // Load the 16 values
+ inputVal1 = _mm_load_si128((__m128i*)inputPtr);
+ inputPtr += 8;
+ inputVal2 = _mm_load_si128((__m128i*)inputPtr);
+ inputPtr += 8;
+
+ inputVal1 = _mm_srai_epi16(inputVal1, 8);
+ inputVal2 = _mm_srai_epi16(inputVal2, 8);
+
+ ret = _mm_packs_epi16(inputVal1, inputVal2);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, ret);
+
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (int8_t)(inputVector[number] >> 8);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_16i_convert_8i_neon(int8_t* outputVector,
+ const int16_t* inputVector,
+ unsigned int num_points)
+{
+ int8_t* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ unsigned int sixteenth_points = num_points / 16;
+
+ int16x8_t inputVal0;
+ int16x8_t inputVal1;
+ int8x8_t outputVal0;
+ int8x8_t outputVal1;
+ int8x16_t outputVal;
+
+ for (number = 0; number < sixteenth_points; number++) {
+ // load two input vectors
+ inputVal0 = vld1q_s16(inputVectorPtr);
+ inputVal1 = vld1q_s16(inputVectorPtr + 8);
+ // shift right
+ outputVal0 = vshrn_n_s16(inputVal0, 8);
+ outputVal1 = vshrn_n_s16(inputVal1, 8);
+ // squash two vectors and write output
+ outputVal = vcombine_s8(outputVal0, outputVal1);
+ vst1q_s8(outputVectorPtr, outputVal);
+ inputVectorPtr += 16;
+ outputVectorPtr += 16;
+ }
+
+ for (number = sixteenth_points * 16; number < num_points; number++) {
+ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16i_convert_8i_a_generic(int8_t* outputVector,
+ const int16_t* inputVector,
+ unsigned int num_points)
+{
+ int8_t* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((int8_t)(*inputVectorPtr++ >> 8));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_16i_convert_8i_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16i_max_star_16i
+ *
+ * \b Overview
+ *
+ * <FIXME>
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16i_max_star_16i(short* target, short* src0, unsigned int num_points);
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target: The output value of the max* operation.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ *
+ * volk_16i_max_star_16i();
+ *
+ * volk_free(x);
+ * volk_free(t);
+ * \endcode
+ */
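+
+/* A minimal usage sketch (illustrative only; assumes volk_malloc()/volk_free()):
+ *
+ *   unsigned int N = 10000;
+ *   short* src = (short*)volk_malloc(sizeof(short) * N, volk_get_alignment());
+ *   short target;
+ *   for (unsigned int i = 0; i < N; i++) {
+ *       src[i] = (short)(i - 5000);        // arbitrary test data
+ *   }
+ *   volk_16i_max_star_16i(&target, src, N);
+ *   volk_free(src);
+ */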
+
+#ifndef INCLUDED_volk_16i_max_star_16i_a_H
+#define INCLUDED_volk_16i_max_star_16i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSSE3
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
+
+static inline void
+volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points * 2;
+
+ short candidate = src0[0];
+ short cands[8];
+ __m128i xmm0, xmm1, xmm3, xmm4, xmm5, xmm6;
+
+ __m128i* p_src0;
+
+ p_src0 = (__m128i*)src0;
+
+ int bound = num_bytes >> 4;
+ int leftovers = (num_bytes >> 1) & 7;
+
+ int i = 0;
+
+    // Seed every lane of the running maximum with the first input value so
+    // that an all-negative input still reduces to the correct result.
+    xmm0 = _mm_set1_epi16(candidate);
+
+ for (i = 0; i < bound; ++i) {
+ xmm1 = _mm_load_si128(p_src0);
+ p_src0 += 1;
+ // xmm2 = _mm_sub_epi16(xmm1, xmm0);
+
+ xmm3 = _mm_cmpgt_epi16(xmm0, xmm1);
+ xmm4 = _mm_cmpeq_epi16(xmm0, xmm1);
+ xmm5 = _mm_cmpgt_epi16(xmm1, xmm0);
+
+ xmm6 = _mm_xor_si128(xmm4, xmm5);
+
+ xmm3 = _mm_and_si128(xmm3, xmm0);
+ xmm4 = _mm_and_si128(xmm6, xmm1);
+
+ xmm0 = _mm_add_epi16(xmm3, xmm4);
+ }
+
+ _mm_store_si128((__m128i*)cands, xmm0);
+
+ for (i = 0; i < 8; ++i) {
+ candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
+ }
+
+ for (i = 0; i < leftovers; ++i) {
+ candidate = ((short)(candidate - src0[(bound << 3) + i]) > 0)
+ ? candidate
+ : src0[(bound << 3) + i];
+ }
+
+ target[0] = candidate;
+}
+
+#endif /*LV_HAVE_SSSE3*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_16i_max_star_16i_neon(short* target, short* src0, unsigned int num_points)
+{
+ const unsigned int eighth_points = num_points / 8;
+ unsigned number;
+ int16x8_t input_vec;
+ int16x8_t diff, zeros;
+ uint16x8_t comp1, comp2;
+ zeros = vdupq_n_s16(0);
+
+ int16x8x2_t tmpvec;
+
+ int16x8_t candidate_vec = vld1q_dup_s16(src0);
+ short candidate;
+ ++src0;
+
+ for (number = 0; number < eighth_points; ++number) {
+ input_vec = vld1q_s16(src0);
+ __VOLK_PREFETCH(src0 + 16);
+ diff = vsubq_s16(candidate_vec, input_vec);
+ comp1 = vcgeq_s16(diff, zeros);
+ comp2 = vcltq_s16(diff, zeros);
+
+ tmpvec.val[0] = vandq_s16(candidate_vec, (int16x8_t)comp1);
+ tmpvec.val[1] = vandq_s16(input_vec, (int16x8_t)comp2);
+
+ candidate_vec = vaddq_s16(tmpvec.val[0], tmpvec.val[1]);
+ src0 += 8;
+ }
+    // Store the eight lane-wise candidates to a scratch buffer (a q register
+    // cannot be stored into a single short) and reduce them to one value.
+    short candidate_buf[8];
+    vst1q_s16(candidate_buf, candidate_vec);
+    candidate = candidate_buf[0];
+    for (number = 1; number < 8; ++number) {
+        candidate = ((int16_t)(candidate - candidate_buf[number]) > 0)
+                        ? candidate
+                        : candidate_buf[number];
+    }
+
+ for (number = 0; number < num_points % 8; number++) {
+ candidate = ((int16_t)(candidate - src0[number]) > 0) ? candidate : src0[number];
+ }
+ target[0] = candidate;
+}
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_16i_max_star_16i_generic(short* target, short* src0, unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points * 2;
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+ short candidate = src0[0];
+ for (i = 1; i < bound; ++i) {
+ candidate = ((short)(candidate - src0[i]) > 0) ? candidate : src0[i];
+ }
+ target[0] = candidate;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_16i_max_star_16i_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16i_max_star_horizontal_16i
+ *
+ * \b Overview
+ *
+ * <FIXME>
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16i_max_star_horizontal_16i(short* target, short* src0, unsigned int
+ * num_points); \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target: The output value of the max* operation.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ *
+ * volk_16i_max_star_horizontal_16i();
+ *
+ * volk_free(x);
+ * volk_free(t);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H
+#define INCLUDED_volk_16i_max_star_horizontal_16i_a_H
+
+#include <volk/volk_common.h>
+
+#include <inttypes.h>
+#include <stdio.h>
+
+
+#ifdef LV_HAVE_SSSE3
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
+
+static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target,
+ int16_t* src0,
+ unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points * 2;
+
+ static const uint8_t shufmask0[16] = {
+ 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ };
+ static const uint8_t shufmask1[16] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d
+ };
+ static const uint8_t andmask0[16] = {
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ };
+ static const uint8_t andmask1[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02
+ };
+
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+ __m128i xmm5, xmm6, xmm7, xmm8;
+
+ xmm4 = _mm_load_si128((__m128i*)shufmask0);
+ xmm5 = _mm_load_si128((__m128i*)shufmask1);
+ xmm6 = _mm_load_si128((__m128i*)andmask0);
+ xmm7 = _mm_load_si128((__m128i*)andmask1);
+
+ __m128i *p_target, *p_src0;
+
+ p_target = (__m128i*)target;
+ p_src0 = (__m128i*)src0;
+
+ int bound = num_bytes >> 5;
+ int intermediate = (num_bytes >> 4) & 1;
+ int leftovers = (num_bytes >> 1) & 7;
+
+ int i = 0;
+
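+    // Each iteration consumes 16 shorts (8 adjacent pairs). The horizontal
+    // subtract computes src0[2k] - src0[2k+1] for every pair; the sign of the
+    // difference is turned into a +2 byte offset on the shuffle masks so that
+    // each shuffle gathers the larger element of its pairs, and the two half
+    // vectors are then combined and stored as 8 results.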
+ for (i = 0; i < bound; ++i) {
+ xmm0 = _mm_load_si128(p_src0);
+ xmm1 = _mm_load_si128(&p_src0[1]);
+
+ xmm2 = _mm_xor_si128(xmm2, xmm2);
+ p_src0 += 2;
+
+ xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+
+ xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
+
+ xmm8 = _mm_and_si128(xmm2, xmm6);
+ xmm3 = _mm_and_si128(xmm2, xmm7);
+
+
+ xmm8 = _mm_add_epi8(xmm8, xmm4);
+ xmm3 = _mm_add_epi8(xmm3, xmm5);
+
+ xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
+ xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
+
+
+ xmm3 = _mm_add_epi16(xmm0, xmm1);
+
+
+ _mm_store_si128(p_target, xmm3);
+
+ p_target += 1;
+ }
+
+ if (intermediate) {
+ xmm0 = _mm_load_si128(p_src0);
+
+ xmm2 = _mm_xor_si128(xmm2, xmm2);
+ p_src0 += 1;
+
+ xmm3 = _mm_hsub_epi16(xmm0, xmm1);
+ xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
+
+ xmm8 = _mm_and_si128(xmm2, xmm6);
+
+ xmm3 = _mm_add_epi8(xmm8, xmm4);
+
+ xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
+
+ _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec);
+
+ p_target = (__m128i*)((int8_t*)p_target + 8);
+ }
+
+ for (i = (bound << 4) + (intermediate << 3);
+ i < (bound << 4) + (intermediate << 3) + leftovers;
+ i += 2) {
+ target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
+ }
+}
+
+#endif /*LV_HAVE_SSSE3*/
+
+#ifdef LV_HAVE_NEON
+
+#include <arm_neon.h>
+static inline void volk_16i_max_star_horizontal_16i_neon(int16_t* target,
+ int16_t* src0,
+ unsigned int num_points)
+{
+ const unsigned int eighth_points = num_points / 16;
+ unsigned number;
+ int16x8x2_t input_vec;
+ int16x8_t diff, max_vec, zeros;
+ uint16x8_t comp1, comp2;
+ zeros = vdupq_n_s16(0);
+ for (number = 0; number < eighth_points; ++number) {
+ input_vec = vld2q_s16(src0);
+ //__VOLK_PREFETCH(src0+16);
+ diff = vsubq_s16(input_vec.val[0], input_vec.val[1]);
+ comp1 = vcgeq_s16(diff, zeros);
+ comp2 = vcltq_s16(diff, zeros);
+
+ input_vec.val[0] = vandq_s16(input_vec.val[0], (int16x8_t)comp1);
+ input_vec.val[1] = vandq_s16(input_vec.val[1], (int16x8_t)comp2);
+
+ max_vec = vaddq_s16(input_vec.val[0], input_vec.val[1]);
+ vst1q_s16(target, max_vec);
+ src0 += 16;
+ target += 8;
+ }
+ for (number = 0; number < num_points % 16; number += 2) {
+ target[number >> 1] = ((int16_t)(src0[number] - src0[number + 1]) > 0)
+ ? src0[number]
+ : src0[number + 1];
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_NEONV7
+extern void volk_16i_max_star_horizontal_16i_a_neonasm(int16_t* target,
+ int16_t* src0,
+ unsigned int num_points);
+#endif /* LV_HAVE_NEONV7 */
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target,
+ int16_t* src0,
+ unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points * 2;
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+ for (i = 0; i < bound; i += 2) {
+ target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16i_permute_and_scalar_add
+ *
+ * \b Overview
+ *
+ * <FIXME>
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16i_permute_and_scalar_add(short* target, short* src0, short*
+ * permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short*
+ * scalars, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector.
+ * \li permute_indexes: <FIXME>
+ * \li cntl0: <FIXME>
+ * \li cntl1: <FIXME>
+ * \li cntl2: <FIXME>
+ * \li cntl3: <FIXME>
+ * \li scalars: <FIXME>
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target: The output value.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ *
+ * volk_16i_permute_and_scalar_add();
+ *
+ * volk_free(x);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H
+#define INCLUDED_volk_16i_permute_and_scalar_add_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target,
+ short* src0,
+ short* permute_indexes,
+ short* cntl0,
+ short* cntl1,
+ short* cntl2,
+ short* cntl3,
+ short* scalars,
+ unsigned int num_points)
+{
+
+ const unsigned int num_bytes = num_points * 2;
+
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+ __m128i *p_target, *p_cntl0, *p_cntl1, *p_cntl2, *p_cntl3, *p_scalars;
+
+ short* p_permute_indexes = permute_indexes;
+
+ p_target = (__m128i*)target;
+ p_cntl0 = (__m128i*)cntl0;
+ p_cntl1 = (__m128i*)cntl1;
+ p_cntl2 = (__m128i*)cntl2;
+ p_cntl3 = (__m128i*)cntl3;
+ p_scalars = (__m128i*)scalars;
+
+ int i = 0;
+
+ int bound = (num_bytes >> 4);
+ int leftovers = (num_bytes >> 1) & 7;
+
+ xmm0 = _mm_load_si128(p_scalars);
+
+ xmm1 = _mm_shufflelo_epi16(xmm0, 0);
+ xmm2 = _mm_shufflelo_epi16(xmm0, 0x55);
+ xmm3 = _mm_shufflelo_epi16(xmm0, 0xaa);
+ xmm4 = _mm_shufflelo_epi16(xmm0, 0xff);
+
+ xmm1 = _mm_shuffle_epi32(xmm1, 0x00);
+ xmm2 = _mm_shuffle_epi32(xmm2, 0x00);
+ xmm3 = _mm_shuffle_epi32(xmm3, 0x00);
+ xmm4 = _mm_shuffle_epi32(xmm4, 0x00);
+
+
+ for (; i < bound; ++i) {
+ xmm0 = _mm_setzero_si128();
+ xmm5 = _mm_setzero_si128();
+ xmm6 = _mm_setzero_si128();
+ xmm7 = _mm_setzero_si128();
+
+ xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[0]], 0);
+ xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[1]], 1);
+ xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[2]], 2);
+ xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[3]], 3);
+ xmm0 = _mm_insert_epi16(xmm0, src0[p_permute_indexes[4]], 4);
+ xmm5 = _mm_insert_epi16(xmm5, src0[p_permute_indexes[5]], 5);
+ xmm6 = _mm_insert_epi16(xmm6, src0[p_permute_indexes[6]], 6);
+ xmm7 = _mm_insert_epi16(xmm7, src0[p_permute_indexes[7]], 7);
+
+ xmm0 = _mm_add_epi16(xmm0, xmm5);
+ xmm6 = _mm_add_epi16(xmm6, xmm7);
+
+ p_permute_indexes += 8;
+
+ xmm0 = _mm_add_epi16(xmm0, xmm6);
+
+ xmm5 = _mm_load_si128(p_cntl0);
+ xmm6 = _mm_load_si128(p_cntl1);
+ xmm7 = _mm_load_si128(p_cntl2);
+
+ xmm5 = _mm_and_si128(xmm5, xmm1);
+ xmm6 = _mm_and_si128(xmm6, xmm2);
+ xmm7 = _mm_and_si128(xmm7, xmm3);
+
+ xmm0 = _mm_add_epi16(xmm0, xmm5);
+
+ xmm5 = _mm_load_si128(p_cntl3);
+
+ xmm6 = _mm_add_epi16(xmm6, xmm7);
+
+ p_cntl0 += 1;
+
+ xmm5 = _mm_and_si128(xmm5, xmm4);
+
+ xmm0 = _mm_add_epi16(xmm0, xmm6);
+
+ p_cntl1 += 1;
+ p_cntl2 += 1;
+
+ xmm0 = _mm_add_epi16(xmm0, xmm5);
+
+ p_cntl3 += 1;
+
+ _mm_store_si128(p_target, xmm0);
+
+ p_target += 1;
+ }
+
+ for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+ target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) +
+ (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) +
+ (cntl3[i] & scalars[3]);
+ }
+}
+#endif /*LV_HAVE_SSE2*/
+
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_16i_permute_and_scalar_add_generic(short* target,
+ short* src0,
+ short* permute_indexes,
+ short* cntl0,
+ short* cntl1,
+ short* cntl2,
+ short* cntl3,
+ short* scalars,
+ unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points * 2;
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+ for (i = 0; i < bound; ++i) {
+ target[i] = src0[permute_indexes[i]] + (cntl0[i] & scalars[0]) +
+ (cntl1[i] & scalars[1]) + (cntl2[i] & scalars[2]) +
+ (cntl3[i] & scalars[3]);
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16i_s32f_convert_32f
+ *
+ * \b Overview
+ *
+ * Converts 16-bit shorts to 32-bit floats, dividing each result by \p scalar.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16i_s32f_convert_32f(float* outputVector, const int16_t* inputVector, const
+ * float scalar, unsigned int num_points); \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The input vector of 16-bit shorts.
+ * \li scalar: Each converted value is divided by this scalar.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li outputVector: The output vector of 32-bit floats.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ *
+ * volk_16i_s32f_convert_32f();
+ *
+ * volk_free(x);
+ * volk_free(t);
+ * \endcode
+ */
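+
+/* A minimal usage sketch (illustrative only; assumes volk_malloc()/volk_free()):
+ *
+ *   unsigned int N = 10000;
+ *   float scale = 32768.f;                 // map full-scale shorts to +/- 1.0
+ *   int16_t* in = (int16_t*)volk_malloc(sizeof(int16_t) * N, volk_get_alignment());
+ *   float* out = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
+ *   for (unsigned int i = 0; i < N; i++) {
+ *       in[i] = (int16_t)(i % 1000);       // arbitrary test data
+ *   }
+ *   volk_16i_s32f_convert_32f(out, in, scale, N);   // out[i] == in[i] / scale
+ *   volk_free(in);
+ *   volk_free(out);
+ */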
+
+#ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H
+#define INCLUDED_volk_16i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16i_s32f_convert_32f_u_avx2(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal;
+ __m256i inputVal2;
+ __m256 ret;
+
+ for (; number < eighthPoints; number++) {
+
+ // Load the 8 values
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+ // Convert
+ inputVal2 = _mm256_cvtepi16_epi32(inputVal);
+
+ ret = _mm256_cvtepi32_ps(inputVal2);
+ ret = _mm256_mul_ps(ret, invScalar);
+
+ _mm256_storeu_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 8;
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) / scalar;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_16i_s32f_convert_32f_u_avx(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal, inputVal2;
+ __m128 ret;
+ __m256 output;
+ __m256 dummy = _mm256_setzero_ps();
+
+ for (; number < eighthPoints; number++) {
+
+ // Load the 8 values
+ // inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+ // Shift the input data to the right by 64 bits ( 8 bytes )
+ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+ // Convert the lower 4 values into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ output = _mm256_insertf128_ps(dummy, ret, 0);
+
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ output = _mm256_insertf128_ps(output, ret, 1);
+
+ _mm256_storeu_ps(outputVectorPtr, output);
+
+ outputVectorPtr += 8;
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) / scalar;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal;
+ __m128i inputVal2;
+ __m128 ret;
+
+ for (; number < eighthPoints; number++) {
+
+ // Load the 8 values
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+ // Shift the input data to the right by 64 bits ( 8 bytes )
+ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+ // Convert the lower 4 values into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128 ret;
+
+ for (; number < quarterPoints; number++) {
+ ret = _mm_set_ps((float)(inputPtr[3]),
+ (float)(inputPtr[2]),
+ (float)(inputPtr[1]),
+ (float)(inputPtr[0]));
+
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ inputPtr += 4;
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16i_s32f_convert_32f_generic(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_16i_s32f_convert_32f_neon(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* outputPtr = outputVector;
+ const int16_t* inputPtr = inputVector;
+ unsigned int number = 0;
+ unsigned int eighth_points = num_points / 8;
+
+ int16x4x2_t input16;
+ int32x4_t input32_0, input32_1;
+ float32x4_t input_float_0, input_float_1;
+ float32x4x2_t output_float;
+ float32x4_t inv_scale;
+
+ inv_scale = vdupq_n_f32(1.0 / scalar);
+
+ // the generic disassembles to a 128-bit load
+ // and duplicates every instruction to operate on 64-bits
+ // at a time. This is only possible with lanes, which is faster
+ // than just doing a vld1_s16, but still slower.
+ for (number = 0; number < eighth_points; number++) {
+ input16 = vld2_s16(inputPtr);
+ // widen 16-bit int to 32-bit int
+ input32_0 = vmovl_s16(input16.val[0]);
+ input32_1 = vmovl_s16(input16.val[1]);
+ // convert 32-bit int to float with scale
+ input_float_0 = vcvtq_f32_s32(input32_0);
+ input_float_1 = vcvtq_f32_s32(input32_1);
+ output_float.val[0] = vmulq_f32(input_float_0, inv_scale);
+ output_float.val[1] = vmulq_f32(input_float_1, inv_scale);
+ vst2q_f32(outputPtr, output_float);
+ inputPtr += 8;
+ outputPtr += 8;
+ }
+
+ for (number = eighth_points * 8; number < num_points; number++) {
+ *outputPtr++ = ((float)(*inputPtr++)) / scalar;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */
+#ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
+#define INCLUDED_volk_16i_s32f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16i_s32f_convert_32f_a_avx2(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal;
+ __m256i inputVal2;
+ __m256 ret;
+
+ for (; number < eighthPoints; number++) {
+
+ // Load the 8 values
+ inputVal = _mm_load_si128((__m128i*)inputPtr);
+
+ // Convert
+ inputVal2 = _mm256_cvtepi16_epi32(inputVal);
+
+ ret = _mm256_cvtepi32_ps(inputVal2);
+ ret = _mm256_mul_ps(ret, invScalar);
+
+ _mm256_store_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 8;
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) / scalar;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_16i_s32f_convert_32f_a_avx(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal, inputVal2;
+ __m128 ret;
+ __m256 output;
+ __m256 dummy = _mm256_setzero_ps();
+
+ for (; number < eighthPoints; number++) {
+
+ // Load the 8 values
+ inputVal = _mm_load_si128((__m128i*)inputPtr);
+
+ // Shift the input data to the right by 64 bits ( 8 bytes )
+ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+ // Convert the lower 4 values into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ output = _mm256_insertf128_ps(dummy, ret, 0);
+
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ output = _mm256_insertf128_ps(output, ret, 1);
+
+ _mm256_store_ps(outputVectorPtr, output);
+
+ outputVectorPtr += 8;
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) / scalar;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal;
+ __m128i inputVal2;
+ __m128 ret;
+
+ for (; number < eighthPoints; number++) {
+
+ // Load the 8 values
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+ // Shift the input data to the right by 64 bits ( 8 bytes )
+ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+ // Convert the lower 4 values into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128 ret;
+
+ for (; number < quarterPoints; number++) {
+ ret = _mm_set_ps((float)(inputPtr[3]),
+ (float)(inputPtr[2]),
+ (float)(inputPtr[1]),
+ (float)(inputPtr[0]));
+
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ inputPtr += 4;
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector,
+ const int16_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* outputVectorPtr = outputVector;
+ const int16_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16i_x4_quad_max_star_16i
+ *
+ * \b Overview
+ *
+ * Returns, for every index, the maximum of the four input vectors: the larger of
+ * src0 and src1 is compared against the larger of src2 and src3, and the overall
+ * maximum is written to target.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16i_x4_quad_max_star_16i(short* target, short* src0, short* src1, short*
+ * src2, short* src3, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector 0.
+ * \li src1: The input vector 1.
+ * \li src2: The input vector 2.
+ * \li src3: The input vector 3.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target: The output value.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * short* target = (short*)volk_malloc(sizeof(short)*N, alignment);
+ * short* src0 = (short*)volk_malloc(sizeof(short)*N, alignment);
+ * short* src1 = (short*)volk_malloc(sizeof(short)*N, alignment);
+ * short* src2 = (short*)volk_malloc(sizeof(short)*N, alignment);
+ * short* src3 = (short*)volk_malloc(sizeof(short)*N, alignment);
+ * // ... fill src0..src3 with data ...
+ *
+ * volk_16i_x4_quad_max_star_16i(target, src0, src1, src2, src3, N);
+ *
+ * volk_free(target);
+ * volk_free(src0); volk_free(src1); volk_free(src2); volk_free(src3);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
+#define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+
+#include <emmintrin.h>
+
+static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target,
+ short* src0,
+ short* src1,
+ short* src2,
+ short* src3,
+ unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points * 2;
+
+ int i = 0;
+
+ int bound = (num_bytes >> 4);
+ int bound_copy = bound;
+ int leftovers = (num_bytes >> 1) & 7;
+
+ __m128i *p_target, *p_src0, *p_src1, *p_src2, *p_src3;
+ p_target = (__m128i*)target;
+ p_src0 = (__m128i*)src0;
+ p_src1 = (__m128i*)src1;
+ p_src2 = (__m128i*)src2;
+ p_src3 = (__m128i*)src3;
+
+ __m128i xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+
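+    // Branchless element-wise max: subtract each pair, build select masks with
+    // _mm_cmpgt_epi16, and blend with and/andnot/add. The loop first forms
+    // max(src0, src1) and max(src2, src3), then selects the larger of the two.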
+ while (bound_copy > 0) {
+ xmm1 = _mm_load_si128(p_src0);
+ xmm2 = _mm_load_si128(p_src1);
+ xmm3 = _mm_load_si128(p_src2);
+ xmm4 = _mm_load_si128(p_src3);
+
+ xmm5 = _mm_setzero_si128();
+ xmm6 = _mm_setzero_si128();
+ xmm7 = xmm1;
+ xmm8 = xmm3;
+
+ xmm1 = _mm_sub_epi16(xmm2, xmm1);
+
+ xmm3 = _mm_sub_epi16(xmm4, xmm3);
+
+ xmm5 = _mm_cmpgt_epi16(xmm1, xmm5);
+ xmm6 = _mm_cmpgt_epi16(xmm3, xmm6);
+
+ xmm2 = _mm_and_si128(xmm5, xmm2);
+ xmm4 = _mm_and_si128(xmm6, xmm4);
+ xmm5 = _mm_andnot_si128(xmm5, xmm7);
+ xmm6 = _mm_andnot_si128(xmm6, xmm8);
+
+ xmm5 = _mm_add_epi16(xmm2, xmm5);
+ xmm6 = _mm_add_epi16(xmm4, xmm6);
+
+ xmm1 = _mm_xor_si128(xmm1, xmm1);
+ xmm2 = xmm5;
+ xmm5 = _mm_sub_epi16(xmm6, xmm5);
+ p_src0 += 1;
+ bound_copy -= 1;
+
+ xmm1 = _mm_cmpgt_epi16(xmm5, xmm1);
+ p_src1 += 1;
+
+ xmm6 = _mm_and_si128(xmm1, xmm6);
+
+ xmm1 = _mm_andnot_si128(xmm1, xmm2);
+ p_src2 += 1;
+
+ xmm1 = _mm_add_epi16(xmm6, xmm1);
+ p_src3 += 1;
+
+ _mm_store_si128(p_target, xmm1);
+ p_target += 1;
+ }
+
+
+ /*__VOLK_ASM __VOLK_VOLATILE
+ (
+ "volk_16i_x4_quad_max_star_16i_a_sse2_L1:\n\t"
+ "cmp $0, %[bound]\n\t"
+ "je volk_16i_x4_quad_max_star_16i_a_sse2_END\n\t"
+
+ "movaps (%[src0]), %%xmm1\n\t"
+ "movaps (%[src1]), %%xmm2\n\t"
+ "movaps (%[src2]), %%xmm3\n\t"
+ "movaps (%[src3]), %%xmm4\n\t"
+
+ "pxor %%xmm5, %%xmm5\n\t"
+ "pxor %%xmm6, %%xmm6\n\t"
+ "movaps %%xmm1, %%xmm7\n\t"
+ "movaps %%xmm3, %%xmm8\n\t"
+ "psubw %%xmm2, %%xmm1\n\t"
+ "psubw %%xmm4, %%xmm3\n\t"
+
+ "pcmpgtw %%xmm1, %%xmm5\n\t"
+ "pcmpgtw %%xmm3, %%xmm6\n\t"
+
+ "pand %%xmm5, %%xmm2\n\t"
+ "pand %%xmm6, %%xmm4\n\t"
+ "pandn %%xmm7, %%xmm5\n\t"
+ "pandn %%xmm8, %%xmm6\n\t"
+
+ "paddw %%xmm2, %%xmm5\n\t"
+ "paddw %%xmm4, %%xmm6\n\t"
+
+ "pxor %%xmm1, %%xmm1\n\t"
+ "movaps %%xmm5, %%xmm2\n\t"
+
+ "psubw %%xmm6, %%xmm5\n\t"
+ "add $16, %[src0]\n\t"
+ "add $-1, %[bound]\n\t"
+
+ "pcmpgtw %%xmm5, %%xmm1\n\t"
+ "add $16, %[src1]\n\t"
+
+ "pand %%xmm1, %%xmm6\n\t"
+
+ "pandn %%xmm2, %%xmm1\n\t"
+ "add $16, %[src2]\n\t"
+
+ "paddw %%xmm6, %%xmm1\n\t"
+ "add $16, %[src3]\n\t"
+
+ "movaps %%xmm1, (%[target])\n\t"
+ "addw $16, %[target]\n\t"
+ "jmp volk_16i_x4_quad_max_star_16i_a_sse2_L1\n\t"
+
+ "volk_16i_x4_quad_max_star_16i_a_sse2_END:\n\t"
+ :
+ :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2),
+ [src3]"r"(src3), [target]"r"(target)
+ :
+ );
+ */
+
+ short temp0 = 0;
+ short temp1 = 0;
+ for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+ temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+ temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
+ target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
+ }
+ return;
+}
+
+#endif /*LV_HAVE_SSE2*/
+
+#ifdef LV_HAVE_NEON
+
+#include <arm_neon.h>
+
+static inline void volk_16i_x4_quad_max_star_16i_neon(short* target,
+ short* src0,
+ short* src1,
+ short* src2,
+ short* src3,
+ unsigned int num_points)
+{
+ const unsigned int eighth_points = num_points / 8;
+ unsigned i;
+
+ int16x8_t src0_vec, src1_vec, src2_vec, src3_vec;
+ int16x8_t diff12, diff34;
+ int16x8_t comp0, comp1, comp2, comp3;
+ int16x8_t result1_vec, result2_vec;
+ int16x8_t zeros;
+ zeros = vdupq_n_s16(0);
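+    // Select the larger element of (src0, src1) and of (src2, src3) with compare
+    // masks and AND/ADD blends, then repeat the selection on the two partial maxima.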
+ for (i = 0; i < eighth_points; ++i) {
+ src0_vec = vld1q_s16(src0);
+ src1_vec = vld1q_s16(src1);
+ src2_vec = vld1q_s16(src2);
+ src3_vec = vld1q_s16(src3);
+ diff12 = vsubq_s16(src0_vec, src1_vec);
+ diff34 = vsubq_s16(src2_vec, src3_vec);
+ comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
+ comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
+ comp2 = (int16x8_t)vcgeq_s16(diff34, zeros);
+ comp3 = (int16x8_t)vcltq_s16(diff34, zeros);
+ comp0 = vandq_s16(src0_vec, comp0);
+ comp1 = vandq_s16(src1_vec, comp1);
+ comp2 = vandq_s16(src2_vec, comp2);
+ comp3 = vandq_s16(src3_vec, comp3);
+
+ result1_vec = vaddq_s16(comp0, comp1);
+ result2_vec = vaddq_s16(comp2, comp3);
+
+ diff12 = vsubq_s16(result1_vec, result2_vec);
+ comp0 = (int16x8_t)vcgeq_s16(diff12, zeros);
+ comp1 = (int16x8_t)vcltq_s16(diff12, zeros);
+ comp0 = vandq_s16(result1_vec, comp0);
+ comp1 = vandq_s16(result2_vec, comp1);
+ result1_vec = vaddq_s16(comp0, comp1);
+ vst1q_s16(target, result1_vec);
+ src0 += 8;
+ src1 += 8;
+ src2 += 8;
+ src3 += 8;
+ target += 8;
+ }
+
+ short temp0 = 0;
+ short temp1 = 0;
+ for (i = eighth_points * 8; i < num_points; ++i) {
+ temp0 = ((short)(*src0 - *src1) > 0) ? *src0 : *src1;
+ temp1 = ((short)(*src2 - *src3) > 0) ? *src2 : *src3;
+ *target++ = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
+ src0++;
+ src1++;
+ src2++;
+ src3++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_16i_x4_quad_max_star_16i_generic(short* target,
+ short* src0,
+ short* src1,
+ short* src2,
+ short* src3,
+ unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points * 2;
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+ short temp0 = 0;
+ short temp1 = 0;
+ for (i = 0; i < bound; ++i) {
+ temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
+ temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
+ target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16i_x5_add_quad_16i_x4
+ *
+ * \b Overview
+ *
+ * Adds the common input vector src0 to each of the four vectors src1 through src4
+ * and writes the four sums to target0 through target3, respectively.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16i_x5_add_quad_16i_x4(short* target0, short* target1, short* target2, short*
+ * target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int
+ * num_points); \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector 0.
+ * \li src1: The input vector 1.
+ * \li src2: The input vector 2.
+ * \li src3: The input vector 3.
+ * \li src4: The input vector 4.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target0: The output value 0.
+ * \li target1: The output value 1.
+ * \li target2: The output value 2.
+ * \li target3: The output value 3.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * // allocate each buffer with volk_malloc(sizeof(short)*N, alignment)
+ * // and fill src0..src4 with data
+ * short *target0, *target1, *target2, *target3;
+ * short *src0, *src1, *src2, *src3, *src4;
+ *
+ * volk_16i_x5_add_quad_16i_x4(target0, target1, target2, target3,
+ *                             src0, src1, src2, src3, src4, N);
+ *
+ * // release every buffer with volk_free()
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
+#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0,
+ short* target1,
+ short* target2,
+ short* target3,
+ short* src0,
+ short* src1,
+ short* src2,
+ short* src3,
+ short* src4,
+ unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points * 2;
+
+ __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+ __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2,
+ *p_src3, *p_src4;
+ p_target0 = (__m128i*)target0;
+ p_target1 = (__m128i*)target1;
+ p_target2 = (__m128i*)target2;
+ p_target3 = (__m128i*)target3;
+
+ p_src0 = (__m128i*)src0;
+ p_src1 = (__m128i*)src1;
+ p_src2 = (__m128i*)src2;
+ p_src3 = (__m128i*)src3;
+ p_src4 = (__m128i*)src4;
+
+ int i = 0;
+
+ int bound = (num_bytes >> 4);
+ int leftovers = (num_bytes >> 1) & 7;
+
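+    // Each iteration adds the common vector src0 to src1..src4 and stores the
+    // four sums to target0..target3.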
+ for (; i < bound; ++i) {
+ xmm0 = _mm_load_si128(p_src0);
+ xmm1 = _mm_load_si128(p_src1);
+ xmm2 = _mm_load_si128(p_src2);
+ xmm3 = _mm_load_si128(p_src3);
+ xmm4 = _mm_load_si128(p_src4);
+
+ p_src0 += 1;
+ p_src1 += 1;
+
+ xmm1 = _mm_add_epi16(xmm0, xmm1);
+ xmm2 = _mm_add_epi16(xmm0, xmm2);
+ xmm3 = _mm_add_epi16(xmm0, xmm3);
+ xmm4 = _mm_add_epi16(xmm0, xmm4);
+
+
+ p_src2 += 1;
+ p_src3 += 1;
+ p_src4 += 1;
+
+ _mm_store_si128(p_target0, xmm1);
+ _mm_store_si128(p_target1, xmm2);
+ _mm_store_si128(p_target2, xmm3);
+ _mm_store_si128(p_target3, xmm4);
+
+ p_target0 += 1;
+ p_target1 += 1;
+ p_target2 += 1;
+ p_target3 += 1;
+ }
+ /*__VOLK_ASM __VOLK_VOLATILE
+ (
+ ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1:\n\t"
+ "cmp $0, %[bound]\n\t"
+ "je .%=volk_16i_x5_add_quad_16i_x4_a_sse2_END\n\t"
+ "movaps (%[src0]), %%xmm1\n\t"
+ "movaps (%[src1]), %%xmm2\n\t"
+ "movaps (%[src2]), %%xmm3\n\t"
+ "movaps (%[src3]), %%xmm4\n\t"
+ "movaps (%[src4]), %%xmm5\n\t"
+ "add $16, %[src0]\n\t"
+ "add $16, %[src1]\n\t"
+ "add $16, %[src2]\n\t"
+ "add $16, %[src3]\n\t"
+ "add $16, %[src4]\n\t"
+ "paddw %%xmm1, %%xmm2\n\t"
+ "paddw %%xmm1, %%xmm3\n\t"
+ "paddw %%xmm1, %%xmm4\n\t"
+ "paddw %%xmm1, %%xmm5\n\t"
+ "add $-1, %[bound]\n\t"
+ "movaps %%xmm2, (%[target0])\n\t"
+ "movaps %%xmm3, (%[target1])\n\t"
+ "movaps %%xmm4, (%[target2])\n\t"
+ "movaps %%xmm5, (%[target3])\n\t"
+ "add $16, %[target0]\n\t"
+ "add $16, %[target1]\n\t"
+ "add $16, %[target2]\n\t"
+ "add $16, %[target3]\n\t"
+ "jmp .%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1\n\t"
+ ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_END:\n\t"
+ :
+ :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2),
+ [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1),
+ [target2]"r"(target2), [target3]"r"(target3)
+ :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+ */
+
+ for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
+ target0[i] = src0[i] + src1[i];
+ target1[i] = src0[i] + src2[i];
+ target2[i] = src0[i] + src3[i];
+ target3[i] = src0[i] + src4[i];
+ }
+}
+#endif /*LV_HAVE_SSE2*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0,
+ short* target1,
+ short* target2,
+ short* target3,
+ short* src0,
+ short* src1,
+ short* src2,
+ short* src3,
+ short* src4,
+ unsigned int num_points)
+{
+ const unsigned int eighth_points = num_points / 8;
+ unsigned int number = 0;
+
+ int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec;
+ int16x8_t target0_vec, target1_vec, target2_vec, target3_vec;
+ for (number = 0; number < eighth_points; ++number) {
+ src0_vec = vld1q_s16(src0);
+ src1_vec = vld1q_s16(src1);
+ src2_vec = vld1q_s16(src2);
+ src3_vec = vld1q_s16(src3);
+ src4_vec = vld1q_s16(src4);
+
+ target0_vec = vaddq_s16(src0_vec, src1_vec);
+ target1_vec = vaddq_s16(src0_vec, src2_vec);
+ target2_vec = vaddq_s16(src0_vec, src3_vec);
+ target3_vec = vaddq_s16(src0_vec, src4_vec);
+
+ vst1q_s16(target0, target0_vec);
+ vst1q_s16(target1, target1_vec);
+ vst1q_s16(target2, target2_vec);
+ vst1q_s16(target3, target3_vec);
+ src0 += 8;
+ src1 += 8;
+ src2 += 8;
+ src3 += 8;
+ src4 += 8;
+ target0 += 8;
+ target1 += 8;
+ target2 += 8;
+ target3 += 8;
+ }
+
+ for (number = eighth_points * 8; number < num_points; ++number) {
+ *target0++ = *src0 + *src1++;
+ *target1++ = *src0 + *src2++;
+ *target2++ = *src0 + *src3++;
+ *target3++ = *src0++ + *src4++;
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0,
+ short* target1,
+ short* target2,
+ short* target3,
+ short* src0,
+ short* src1,
+ short* src2,
+ short* src3,
+ short* src4,
+ unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points * 2;
+
+ int i = 0;
+
+ int bound = num_bytes >> 1;
+
+ for (i = 0; i < bound; ++i) {
+ target0[i] = src0[i] + src1[i];
+ target1[i] = src0[i] + src2[i];
+ target2[i] = src0[i] + src3[i];
+ target3[i] = src0[i] + src4[i];
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2016 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16ic_convert_32fc
+ *
+ * \b Overview
+ *
+ * Converts a complex vector with 16-bit integer components into a complex vector
+ * with 32-bit floating point components.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_convert_32fc(lv_32fc_t* outputVector, const lv_16sc_t* inputVector,
+ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The complex 16-bit integer input data buffer.
+ * \li num_points: The number of data values to be converted.
+ *
+ * \b Outputs
+ * \li outputVector: pointer to a vector holding the converted vector.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ *
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* input = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t)*N, alignment);
+ * lv_32fc_t* output = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * volk_16ic_convert_32fc(output, input, N);
+ *
+ * volk_free(input);
+ * volk_free(output);
+ * \endcode
+ */
+
+
+#ifndef INCLUDED_volk_16ic_convert_32fc_a_H
+#define INCLUDED_volk_16ic_convert_32fc_a_H
+
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_convert_32fc_a_avx2(lv_32fc_t* outputVector,
+ const lv_16sc_t* inputVector,
+ unsigned int num_points)
+{
+ const unsigned int avx_iters = num_points / 8;
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)inputVector;
+ float* outputVectorPtr = (float*)outputVector;
+ __m256 outVal;
+ __m256i outValInt;
+ __m128i cplxValue;
+
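+    // Sign-extend eight int16 components (four complex samples) to int32,
+    // convert them to float, and store eight floats per iteration.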
+ for (number = 0; number < avx_iters; number++) {
+ cplxValue = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 8;
+
+ outValInt = _mm256_cvtepi16_epi32(cplxValue);
+ outVal = _mm256_cvtepi32_ps(outValInt);
+ _mm256_store_ps((float*)outputVectorPtr, outVal);
+
+ outputVectorPtr += 8;
+ }
+
+ number = avx_iters * 8;
+ for (; number < num_points * 2; number++) {
+ *outputVectorPtr++ = (float)*complexVectorPtr++;
+ }
+}
+
+#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16ic_convert_32fc_generic(lv_32fc_t* outputVector,
+ const lv_16sc_t* inputVector,
+ unsigned int num_points)
+{
+ unsigned int i;
+ for (i = 0; i < num_points; i++) {
+ outputVector[i] =
+ lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector,
+ const lv_16sc_t* inputVector,
+ unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 2;
+
+ const lv_16sc_t* _in = inputVector;
+ lv_32fc_t* _out = outputVector;
+ __m128 a;
+ unsigned int number;
+
+ for (number = 0; number < sse_iters; number++) {
+ a = _mm_set_ps(
+ (float)(lv_cimag(_in[1])),
+ (float)(lv_creal(_in[1])),
+ (float)(lv_cimag(_in[0])),
+ (float)(lv_creal(
+ _in[0]))); // load (2 byte imag, 2 byte real) x 2 into the 128-bit register
+ _mm_store_ps((float*)_out, a);
+ _in += 2;
+ _out += 2;
+ }
+ if (num_points & 1) {
+ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+ _in++;
+ }
+}
+
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_16ic_convert_32fc_a_avx(lv_32fc_t* outputVector,
+ const lv_16sc_t* inputVector,
+ unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 4;
+
+ const lv_16sc_t* _in = inputVector;
+ lv_32fc_t* _out = outputVector;
+ __m256 a;
+ unsigned int i, number;
+
+ for (number = 0; number < sse_iters; number++) {
+ a = _mm256_set_ps(
+ (float)(lv_cimag(_in[3])),
+ (float)(lv_creal(_in[3])),
+ (float)(lv_cimag(_in[2])),
+ (float)(lv_creal(_in[2])),
+ (float)(lv_cimag(_in[1])),
+ (float)(lv_creal(_in[1])),
+ (float)(lv_cimag(_in[0])),
+ (float)(lv_creal(
+ _in[0]))); // load (2 byte imag, 2 byte real) x 4 into the 256-bit register
+ _mm256_store_ps((float*)_out, a);
+ _in += 4;
+ _out += 4;
+ }
+
+ for (i = 0; i < (num_points % 4); ++i) {
+ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+ _in++;
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_16ic_convert_32fc_neon(lv_32fc_t* outputVector,
+ const lv_16sc_t* inputVector,
+ unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 2;
+
+ const lv_16sc_t* _in = inputVector;
+ lv_32fc_t* _out = outputVector;
+
+ int16x4_t a16x4;
+ int32x4_t a32x4;
+ float32x4_t f32x4;
+ unsigned int i, number;
+
+ for (number = 0; number < sse_iters; number++) {
+ a16x4 = vld1_s16((const int16_t*)_in);
+ __VOLK_PREFETCH(_in + 4);
+ a32x4 = vmovl_s16(a16x4);
+ f32x4 = vcvtq_f32_s32(a32x4);
+ vst1q_f32((float32_t*)_out, f32x4);
+ _in += 2;
+ _out += 2;
+ }
+ for (i = 0; i < (num_points % 2); ++i) {
+ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+ _in++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#endif /* INCLUDED_volk_16ic_convert_32fc_a_H */
+
+#ifndef INCLUDED_volk_16ic_convert_32fc_u_H
+#define INCLUDED_volk_16ic_convert_32fc_u_H
+
+#include <volk/volk_complex.h>
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector,
+ const lv_16sc_t* inputVector,
+ unsigned int num_points)
+{
+ const unsigned int avx_iters = num_points / 8;
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)inputVector;
+ float* outputVectorPtr = (float*)outputVector;
+ __m256 outVal;
+ __m256i outValInt;
+ __m128i cplxValue;
+
+ for (number = 0; number < avx_iters; number++) {
+ cplxValue = _mm_loadu_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 8;
+
+ outValInt = _mm256_cvtepi16_epi32(cplxValue);
+ outVal = _mm256_cvtepi32_ps(outValInt);
+ _mm256_storeu_ps((float*)outputVectorPtr, outVal);
+
+ outputVectorPtr += 8;
+ }
+
+ number = avx_iters * 8;
+ for (; number < num_points * 2; number++) {
+ *outputVectorPtr++ = (float)*complexVectorPtr++;
+ }
+}
+
+#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector,
+ const lv_16sc_t* inputVector,
+ unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 2;
+
+ const lv_16sc_t* _in = inputVector;
+ lv_32fc_t* _out = outputVector;
+ __m128 a;
+ unsigned int number;
+
+ for (number = 0; number < sse_iters; number++) {
+ a = _mm_set_ps(
+ (float)(lv_cimag(_in[1])),
+ (float)(lv_creal(_in[1])),
+ (float)(lv_cimag(_in[0])),
+ (float)(lv_creal(
+ _in[0]))); // load (2 byte imag, 2 byte real) x 2 into the 128-bit register
+ _mm_storeu_ps((float*)_out, a);
+ _in += 2;
+ _out += 2;
+ }
+ if (num_points & 1) {
+ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+ _in++;
+ }
+}
+
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_16ic_convert_32fc_u_avx(lv_32fc_t* outputVector,
+ const lv_16sc_t* inputVector,
+ unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 4;
+
+ const lv_16sc_t* _in = inputVector;
+ lv_32fc_t* _out = outputVector;
+ __m256 a;
+ unsigned int i, number;
+
+ for (number = 0; number < sse_iters; number++) {
+ a = _mm256_set_ps(
+ (float)(lv_cimag(_in[3])),
+ (float)(lv_creal(_in[3])),
+ (float)(lv_cimag(_in[2])),
+ (float)(lv_creal(_in[2])),
+ (float)(lv_cimag(_in[1])),
+ (float)(lv_creal(_in[1])),
+ (float)(lv_cimag(_in[0])),
+ (float)(lv_creal(
+ _in[0]))); // load (2 byte imag, 2 byte real) x 4 into the 256-bit register
+ _mm256_storeu_ps((float*)_out, a);
+ _in += 4;
+ _out += 4;
+ }
+
+ for (i = 0; i < (num_points % 4); ++i) {
+ *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
+ _in++;
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+#endif /* INCLUDED_volk_16ic_convert_32fc_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16ic_deinterleave_16i_x2
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 16 bit vector into I & Q vector data.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_16sc_t*
+ * complexVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ * \li qBuffer: The Q buffer output data.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* cpx = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t)*N, alignment);
+ * int16_t* i_out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
+ * int16_t* q_out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
+ * // ... fill cpx with interleaved samples ...
+ *
+ * volk_16ic_deinterleave_16i_x2(i_out, q_out, cpx, N);
+ *
+ * volk_free(cpx);
+ * volk_free(i_out);
+ * volk_free(q_out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
+#define INCLUDED_volk_16ic_deinterleave_16i_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+
+ __m256i MoveMask = _mm256_set_epi8(15,
+ 14,
+ 11,
+ 10,
+ 7,
+ 6,
+ 3,
+ 2,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 15,
+ 14,
+ 11,
+ 10,
+ 7,
+ 6,
+ 3,
+ 2,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0);
+
+ __m256i iMove2, iMove1;
+ __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
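+    // The byte shuffle groups the I words into the low half and the Q words into
+    // the high half of each 128-bit lane; the permute4x64/permute2x128 steps then
+    // merge both registers into one vector of I samples and one of Q samples.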
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
+ iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
+
+ iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
+ _mm256_permute4x64_epi64(iMove2, 0x80),
+ 0x30);
+ qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
+ _mm256_permute4x64_epi64(iMove2, 0xd0),
+ 0x30);
+
+ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
+ _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 16;
+ qBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *int16ComplexVectorPtr++;
+ *qBufferPtr++ = *int16ComplexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+
+static inline void volk_16ic_deinterleave_16i_x2_a_ssse3(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+
+ __m128i iMoveMask1 = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+ __m128i iMoveMask2 = _mm_set_epi8(
+ 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+ __m128i qMoveMask1 = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 14, 11, 10, 7, 6, 3, 2);
+ __m128i qMoveMask2 = _mm_set_epi8(
+ 15, 14, 11, 10, 7, 6, 3, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+ __m128i complexVal1, complexVal2, iOutputVal, qOutputVal;
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for (number = 0; number < eighthPoints; number++) {
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+
+ iOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, iMoveMask1),
+ _mm_shuffle_epi8(complexVal2, iMoveMask2));
+ qOutputVal = _mm_or_si128(_mm_shuffle_epi8(complexVal1, qMoveMask1),
+ _mm_shuffle_epi8(complexVal2, qMoveMask2));
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *int16ComplexVectorPtr++;
+ *qBufferPtr++ = *int16ComplexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16ic_deinterleave_16i_x2_a_sse2(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ __m128i complexVal1, complexVal2, iComplexVal1, iComplexVal2, qComplexVal1,
+ qComplexVal2, iOutputVal, qOutputVal;
+ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+ unsigned int eighthPoints = num_points / 8;
+
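+    // The shufflelo/shufflehi/shuffle_epi32 chains gather the I samples of each
+    // register into one 64-bit half and the Q samples into the other; masking and
+    // OR-ing the halves yields eight contiguous I and eight contiguous Q values.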
+ for (number = 0; number < eighthPoints; number++) {
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 8;
+
+ iComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
+
+ iComplexVal1 = _mm_shufflehi_epi16(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
+
+ iComplexVal1 = _mm_shuffle_epi32(iComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
+
+ iComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
+
+ iComplexVal2 = _mm_shufflehi_epi16(iComplexVal2, _MM_SHUFFLE(3, 1, 2, 0));
+
+ iComplexVal2 = _mm_shuffle_epi32(iComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
+
+ iOutputVal = _mm_or_si128(_mm_and_si128(iComplexVal1, lowMask),
+ _mm_and_si128(iComplexVal2, highMask));
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ qComplexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(2, 0, 3, 1));
+
+ qComplexVal1 = _mm_shufflehi_epi16(qComplexVal1, _MM_SHUFFLE(2, 0, 3, 1));
+
+ qComplexVal1 = _mm_shuffle_epi32(qComplexVal1, _MM_SHUFFLE(3, 1, 2, 0));
+
+ qComplexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
+
+ qComplexVal2 = _mm_shufflehi_epi16(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
+
+ qComplexVal2 = _mm_shuffle_epi32(qComplexVal2, _MM_SHUFFLE(2, 0, 3, 1));
+
+ qOutputVal = _mm_or_si128(_mm_and_si128(qComplexVal1, lowMask),
+ _mm_and_si128(qComplexVal2, highMask));
+
+ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ unsigned int number;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+
+extern void volk_16ic_deinterleave_16i_x2_a_orc_impl(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points);
+static inline void volk_16ic_deinterleave_16i_x2_u_orc(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ volk_16ic_deinterleave_16i_x2_a_orc_impl(iBuffer, qBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_a_H */
+
+
+#ifndef INCLUDED_volk_16ic_deinterleave_16i_x2_u_H
+#define INCLUDED_volk_16ic_deinterleave_16i_x2_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+
+ __m256i MoveMask = _mm256_set_epi8(15,
+ 14,
+ 11,
+ 10,
+ 7,
+ 6,
+ 3,
+ 2,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 15,
+ 14,
+ 11,
+ 10,
+ 7,
+ 6,
+ 3,
+ 2,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0);
+
+ __m256i iMove2, iMove1;
+ __m256i complexVal1, complexVal2, iOutputVal, qOutputVal;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ iMove2 = _mm256_shuffle_epi8(complexVal2, MoveMask);
+ iMove1 = _mm256_shuffle_epi8(complexVal1, MoveMask);
+
+ iOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x08),
+ _mm256_permute4x64_epi64(iMove2, 0x80),
+ 0x30);
+ qOutputVal = _mm256_permute2x128_si256(_mm256_permute4x64_epi64(iMove1, 0x0d),
+ _mm256_permute4x64_epi64(iMove2, 0xd0),
+ 0x30);
+
+ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
+ _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 16;
+ qBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *int16ComplexVectorPtr++;
+ *qBufferPtr++ = *int16ComplexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#endif /* INCLUDED_volk_16ic_deinterleave_16i_x2_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16ic_deinterleave_real_16i
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 16 bit vector and returns the real (inphase) part of the
+ * signal.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_deinterleave_real_16i(int16_t* iBuffer, const lv_16sc_t* complexVector,
+ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* cpx = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t)*N, alignment);
+ * int16_t* i_out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
+ * // ... fill cpx with interleaved samples ...
+ *
+ * volk_16ic_deinterleave_real_16i(i_out, cpx, N);
+ *
+ * volk_free(cpx);
+ * volk_free(i_out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a_H
+#define INCLUDED_volk_16ic_deinterleave_real_16i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+
+ __m256i iMoveMask1 = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0);
+ __m256i iMoveMask2 = _mm256_set_epi8(13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80);
+
+ __m256i complexVal1, complexVal2, iOutputVal;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+
+ complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
+
+ iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
+ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
+
+ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+
+static inline void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+
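+    // iMoveMask1 gathers the real (I) bytes of the first register into its low
+    // 64 bits and iMoveMask2 gathers those of the second register into its high
+    // 64 bits, so OR-ing the shuffled registers yields eight consecutive I samples.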
+ __m128i iMoveMask1 = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+ __m128i iMoveMask2 = _mm_set_epi8(
+ 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+ __m128i complexVal1, complexVal2, iOutputVal;
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for (number = 0; number < eighthPoints; number++) {
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 8;
+
+ complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+ iOutputVal = _mm_or_si128(complexVal1, complexVal2);
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ __m128i complexVal1, complexVal2, iOutputVal;
+ __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
+ __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for (number = 0; number < eighthPoints; number++) {
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 8;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 8;
+
+ complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
+
+ complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
+
+ complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
+
+ complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
+
+ complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
+
+ complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
+
+ iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask),
+ _mm_and_si128(complexVal2, highMask));
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a_H */
+
+
+#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_u_H
+#define INCLUDED_volk_16ic_deinterleave_real_16i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+
+ __m256i iMoveMask1 = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0);
+ __m256i iMoveMask2 = _mm256_set_epi8(13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80);
+
+ __m256i complexVal1, complexVal2, iOutputVal;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+
+ complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
+
+ iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
+ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
+
+ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16ic_deinterleave_real_8i
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 16 bit vector and returns the real
+ * (inphase) part of the signal as an 8-bit value.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_deinterleave_real_8i(int8_t* iBuffer, const lv_16sc_t* complexVector,
+ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data with 8-bit precision.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* cpx = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t)*N, alignment);
+ * int8_t* i_out = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment);
+ * // ... fill cpx with interleaved samples ...
+ *
+ * volk_16ic_deinterleave_real_8i(i_out, cpx, N);
+ *
+ * volk_free(cpx);
+ * volk_free(i_out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a_H
+#define INCLUDED_volk_16ic_deinterleave_real_8i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m256i iMoveMask1 = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0);
+ __m256i iMoveMask2 = _mm256_set_epi8(13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80);
+ __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+
+ unsigned int thirtysecondPoints = num_points / 32;
+
+ for (number = 0; number < thirtysecondPoints; number++) {
+ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
+
+ complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
+ complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
+
+ complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
+ complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
+
+ complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
+ complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
+
+ complexVal1 = _mm256_srai_epi16(complexVal1, 8);
+ complexVal3 = _mm256_srai_epi16(complexVal3, 8);
+
+ iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
+ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
+
+ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 32;
+ }
+
+ number = thirtysecondPoints * 32;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
+ int16ComplexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+
+static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m128i iMoveMask1 = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+ __m128i iMoveMask2 = _mm_set_epi8(
+ 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+ __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+
+ complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+
+ complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
+
+ complexVal1 = _mm_or_si128(complexVal1, complexVal2);
+
+ complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
+ complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
+
+ complexVal3 = _mm_or_si128(complexVal3, complexVal4);
+
+
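+        // The arithmetic shift keeps the high byte of each 16-bit real sample
+        // (a divide by 256); packs then saturates the results down to int8.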
+ complexVal1 = _mm_srai_epi16(complexVal1, 8);
+ complexVal3 = _mm_srai_epi16(complexVal3, 8);
+
+ iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
+ int16ComplexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ unsigned int eighth_points = num_points / 8;
+ unsigned int number;
+
+ int16x8x2_t complexInput;
+ int8x8_t realOutput;
+ for (number = 0; number < eighth_points; number++) {
+ complexInput = vld2q_s16(complexVectorPtr);
+ realOutput = vshrn_n_s16(complexInput.val[0], 8);
+ vst1_s8(iBufferPtr, realOutput);
+ complexVectorPtr += 16;
+ iBufferPtr += 8;
+ }
+
+ for (number = eighth_points * 8; number < num_points; number++) {
+ *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
+ complexVectorPtr++;
+ }
+}
+#endif
+
+#ifdef LV_HAVE_ORC
+
+extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points);
+
+static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a_H */
+
+#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_u_H
+#define INCLUDED_volk_16ic_deinterleave_real_8i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m256i iMoveMask1 = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0);
+ __m256i iMoveMask2 = _mm256_set_epi8(13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80);
+ __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
+
+ unsigned int thirtysecondPoints = num_points / 32;
+
+ for (number = 0; number < thirtysecondPoints; number++) {
+ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
+ complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
+
+ complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
+ complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
+
+ complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
+ complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
+
+ complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
+ complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
+
+ complexVal1 = _mm256_srai_epi16(complexVal1, 8);
+ complexVal3 = _mm256_srai_epi16(complexVal3, 8);
+
+ iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
+ iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
+
+ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
+
+ iBufferPtr += 32;
+ }
+
+ number = thirtysecondPoints * 32;
+ int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
+ int16ComplexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16ic_magnitude_16i
+ *
+ * \b Overview
+ *
+ * Computes the magnitude of the complexVector and stores the results
+ * in the magnitudeVector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_magnitude_16i(int16_t* magnitudeVector, const lv_16sc_t* complexVector,
+ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li magnitudeVector: The magnitude of the complex values.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* cpx = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t)*N, alignment);
+ * int16_t* mag = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
+ * // ... fill cpx with complex samples ...
+ *
+ * volk_16ic_magnitude_16i(mag, cpx, N);
+ *
+ * volk_free(cpx);
+ * volk_free(mag);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16ic_magnitude_16i_a_H
+#define INCLUDED_volk_16ic_magnitude_16i_a_H
+
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
+ __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX);
+ __m256i int1, int2;
+ __m128i short1, short2;
+ __m256 cplxValue1, cplxValue2, result;
+ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
+
+ for (; number < eighthPoints; number++) {
+
+ int1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ short1 = _mm256_extracti128_si256(int1, 0);
+ short2 = _mm256_extracti128_si256(int1, 1);
+
+ int1 = _mm256_cvtepi16_epi32(short1);
+ int2 = _mm256_cvtepi16_epi32(short2);
+ cplxValue1 = _mm256_cvtepi32_ps(int1);
+ cplxValue2 = _mm256_cvtepi32_ps(int2);
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+        cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the values
+
+ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm256_sqrt_ps(result); // Square root the values
+
+ result = _mm256_mul_ps(result, vScalar); // Scale the results
+
+ int1 = _mm256_cvtps_epi32(result);
+ int1 = _mm256_packs_epi32(int1, int1);
+ int1 = _mm256_permutevar8x32_epi32(
+ int1, idx); // permute to compensate for shuffling in hadd and packs
+ short1 = _mm256_extracti128_si256(int1, 0);
+ _mm_store_si128((__m128i*)magnitudeVectorPtr, short1);
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
+ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
+ const float val1Result =
+ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
+ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+static inline void volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 vScalar = _mm_set_ps1(SHRT_MAX);
+ __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX);
+
+ __m128 cplxValue1, cplxValue2, result;
+
+ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+ inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+ inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+ inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+        cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the values
+
+ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result); // Square root the values
+
+ result = _mm_mul_ps(result, vScalar); // Scale the results
+
+ _mm_store_ps(outputFloatBuffer, result);
+ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
+ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
+ const float val1Result =
+ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
+ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 vScalar = _mm_set_ps1(SHRT_MAX);
+ __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX);
+
+ __m128 cplxValue1, cplxValue2, iValue, qValue, result;
+
+ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ cplxValue1 = _mm_load_ps(inputFloatBuffer);
+ complexVectorPtr += 4;
+
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ cplxValue2 = _mm_load_ps(inputFloatBuffer);
+ complexVectorPtr += 4;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+
+ iValue = _mm_mul_ps(iValue, iValue); // Square the I values
+        qValue = _mm_mul_ps(qValue, qValue); // Square the Q values
+
+ result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result); // Square root the values
+
+ result = _mm_mul_ps(result, vScalar); // Scale the results
+
+ _mm_store_ps(outputFloatBuffer, result);
+ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
+ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
+ const float val1Result =
+ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
+ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ const float scalar = SHRT_MAX;
+ for (number = 0; number < num_points; number++) {
+ float real = ((float)(*complexVectorPtr++)) / scalar;
+ float imag = ((float)(*complexVectorPtr++)) / scalar;
+ *magnitudeVectorPtr++ =
+ (int16_t)rintf(sqrtf((real * real) + (imag * imag)) * scalar);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC_DISABLED
+extern void volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ float scalar,
+ unsigned int num_points);
+
+static inline void volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ volk_16ic_magnitude_16i_a_orc_impl(
+ magnitudeVector, complexVector, SHRT_MAX, num_points);
+}
+#endif /* LV_HAVE_ORC_DISABLED */
+
+
+#endif /* INCLUDED_volk_16ic_magnitude_16i_a_H */
+
+
+#ifndef INCLUDED_volk_16ic_magnitude_16i_u_H
+#define INCLUDED_volk_16ic_magnitude_16i_u_H
+
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
+ __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX);
+ __m256i int1, int2;
+ __m128i short1, short2;
+ __m256 cplxValue1, cplxValue2, result;
+ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
+
+ for (; number < eighthPoints; number++) {
+
+ int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ short1 = _mm256_extracti128_si256(int1, 0);
+ short2 = _mm256_extracti128_si256(int1, 1);
+
+ int1 = _mm256_cvtepi16_epi32(short1);
+ int2 = _mm256_cvtepi16_epi32(short2);
+ cplxValue1 = _mm256_cvtepi32_ps(int1);
+ cplxValue2 = _mm256_cvtepi32_ps(int2);
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+        cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the values
+
+ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm256_sqrt_ps(result); // Square root the values
+
+ result = _mm256_mul_ps(result, vScalar); // Scale the results
+
+ int1 = _mm256_cvtps_epi32(result);
+ int1 = _mm256_packs_epi32(int1, int1);
+ int1 = _mm256_permutevar8x32_epi32(
+ int1, idx); // permute to compensate for shuffling in hadd and packs
+ short1 = _mm256_extracti128_si256(int1, 0);
+ _mm_storeu_si128((__m128i*)magnitudeVectorPtr, short1);
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
+ const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
+ const float val1Result =
+ sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
+ *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_NEONV7
+#include <arm_neon.h>
+#include <volk/volk_neon_intrinsics.h>
+
+static inline void volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ const float scalar = SHRT_MAX;
+ const float inv_scalar = 1.0f / scalar;
+
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+ const lv_16sc_t* complexVectorPtr = complexVector;
+
+ float32x4_t mag_vec;
+ float32x4x2_t c_vec;
+
+ for (number = 0; number < quarter_points; number++) {
+ const int16x4x2_t c16_vec = vld2_s16((int16_t*)complexVectorPtr);
+ __VOLK_PREFETCH(complexVectorPtr + 4);
+ c_vec.val[0] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[0]));
+ c_vec.val[1] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[1]));
+ // Scale to close to 0-1
+ c_vec.val[0] = vmulq_n_f32(c_vec.val[0], inv_scalar);
+ c_vec.val[1] = vmulq_n_f32(c_vec.val[1], inv_scalar);
+ // vsqrtq_f32 is armv8
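+        // so compute sqrt(x) via the identity sqrt(x) = x * (1 / sqrt(x))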
+ const float32x4_t mag_vec_squared = _vmagnitudesquaredq_f32(c_vec);
+ mag_vec = vmulq_f32(mag_vec_squared, _vinvsqrtq_f32(mag_vec_squared));
+ // Reconstruct
+ mag_vec = vmulq_n_f32(mag_vec, scalar);
+ // Add 0.5 for correct rounding because vcvtq_s32_f32 truncates.
+ // This works because the magnitude is always positive.
+ mag_vec = vaddq_f32(mag_vec, vdupq_n_f32(0.5));
+ const int16x4_t mag16_vec = vmovn_s32(vcvtq_s32_f32(mag_vec));
+ vst1_s16(magnitudeVectorPtr, mag16_vec);
+ // Advance pointers
+ magnitudeVectorPtr += 4;
+ complexVectorPtr += 4;
+ }
+
+ // Deal with the rest
+ for (number = quarter_points * 4; number < num_points; number++) {
+ const float real = lv_creal(*complexVectorPtr) * inv_scalar;
+ const float imag = lv_cimag(*complexVectorPtr) * inv_scalar;
+ *magnitudeVectorPtr =
+ (int16_t)rintf(sqrtf((real * real) + (imag * imag)) * scalar);
+ complexVectorPtr++;
+ magnitudeVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_NEONV7 */
+
+#endif /* INCLUDED_volk_16ic_magnitude_16i_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16ic_s32f_deinterleave_32f_x2
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 16-bit vector into I & Q vector data and
+ * returns the result as two vectors of floats that have been scaled.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const
+ * lv_16sc_t* complexVector, const float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector of 16-bit shorts.
+ * \li scalar: The value that each sample of the input complex vector is divided by.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The floating point I buffer output data.
+ * \li qBuffer: The floating point Q buffer output data.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* in = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * N, alignment);
+ * float* i_out = (float*)volk_malloc(sizeof(float) * N, alignment);
+ * float* q_out = (float*)volk_malloc(sizeof(float) * N, alignment);
+ *
+ * // ... fill in with N complex 16-bit samples ...
+ *
+ * // Scale by the full range of a 16-bit short
+ * volk_16ic_s32f_deinterleave_32f_x2(i_out, q_out, in, 32768.f, N);
+ *
+ * volk_free(in);
+ * volk_free(i_out);
+ * volk_free(q_out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
+#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_16ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
+ float* qBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ uint64_t number = 0;
+ const uint64_t eighthPoints = num_points / 8;
+ __m256 cplxValue1, cplxValue2, iValue, qValue;
+ __m256i cplxValueA, cplxValueB;
+ __m128i cplxValue128;
+
+ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ for (; number < eighthPoints; number++) {
+
+ cplxValueA = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+
+ // cvt
+ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
+ cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
+ cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
+ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
+ cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
+ cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ iValue = _mm256_permutevar8x32_ps(iValue, idx);
+ // Arrange in q1q2q3q4 format
+ qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+ qValue = _mm256_permutevar8x32_ps(qValue, idx);
+
+ _mm256_store_ps(iBufferPtr, iValue);
+ _mm256_store_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ complexVectorPtr = (int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_16ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer,
+ float* qBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ uint64_t number = 0;
+ const uint64_t quarterPoints = num_points / 4;
+ __m128 cplxValue1, cplxValue2, iValue, qValue;
+
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
+
+ for (; number < quarterPoints; number++) {
+
+ floatBuffer[0] = (float)(complexVectorPtr[0]);
+ floatBuffer[1] = (float)(complexVectorPtr[1]);
+ floatBuffer[2] = (float)(complexVectorPtr[2]);
+ floatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ floatBuffer[4] = (float)(complexVectorPtr[4]);
+ floatBuffer[5] = (float)(complexVectorPtr[5]);
+ floatBuffer[6] = (float)(complexVectorPtr[6]);
+ floatBuffer[7] = (float)(complexVectorPtr[7]);
+
+ cplxValue1 = _mm_load_ps(&floatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&floatBuffer[4]);
+
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+
+ _mm_store_ps(iBufferPtr, iValue);
+ _mm_store_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_16ic_s32f_deinterleave_32f_x2_generic(float* iBuffer,
+ float* qBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+ unsigned int number;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_16ic_s32f_deinterleave_32f_x2_neon(float* iBuffer,
+ float* qBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+    unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+ float iScalar = 1.f / scalar;
+ float32x4_t invScalar;
+ invScalar = vld1q_dup_f32(&iScalar);
+
+ int16x4x2_t complexInput_s16;
+ int32x4x2_t complexInput_s32;
+ float32x4x2_t complexFloat;
+
+    for (number = 0; number < quarter_points; number++) {
+ complexInput_s16 = vld2_s16(complexVectorPtr);
+ complexInput_s32.val[0] = vmovl_s16(complexInput_s16.val[0]);
+ complexInput_s32.val[1] = vmovl_s16(complexInput_s16.val[1]);
+ complexFloat.val[0] = vcvtq_f32_s32(complexInput_s32.val[0]);
+ complexFloat.val[1] = vcvtq_f32_s32(complexInput_s32.val[1]);
+ complexFloat.val[0] = vmulq_f32(complexFloat.val[0], invScalar);
+ complexFloat.val[1] = vmulq_f32(complexFloat.val[1], invScalar);
+ vst1q_f32(iBufferPtr, complexFloat.val[0]);
+ vst1q_f32(qBufferPtr, complexFloat.val[1]);
+ complexVectorPtr += 8;
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+    for (number = quarter_points * 4; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_ORC
+extern void volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(float* iBuffer,
+ float* qBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points);
+
+static inline void
+volk_16ic_s32f_deinterleave_32f_x2_u_orc(float* iBuffer,
+ float* qBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl(
+ iBuffer, qBuffer, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_a_H */
+
+
+#ifndef INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
+#define INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_16ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
+ float* qBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ uint64_t number = 0;
+ const uint64_t eighthPoints = num_points / 8;
+ __m256 cplxValue1, cplxValue2, iValue, qValue;
+ __m256i cplxValueA, cplxValueB;
+ __m128i cplxValue128;
+
+ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ for (; number < eighthPoints; number++) {
+
+ cplxValueA = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+
+ // cvt
+ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 0);
+ cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
+ cplxValue1 = _mm256_cvtepi32_ps(cplxValueB);
+ cplxValue128 = _mm256_extracti128_si256(cplxValueA, 1);
+ cplxValueB = _mm256_cvtepi16_epi32(cplxValue128);
+ cplxValue2 = _mm256_cvtepi32_ps(cplxValueB);
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ iValue = _mm256_permutevar8x32_ps(iValue, idx);
+ // Arrange in q1q2q3q4 format
+ qValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+ qValue = _mm256_permutevar8x32_ps(qValue, idx);
+
+ _mm256_storeu_ps(iBufferPtr, iValue);
+ _mm256_storeu_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ complexVectorPtr = (int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#endif /* INCLUDED_volk_16ic_s32f_deinterleave_32f_x2_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16ic_s32f_deinterleave_real_32f
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 16-bit vector and returns just the real
+ * (in-phase) part of the data as a vector of floats that have been
+ * scaled.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_16sc_t*
+ * complexVector, const float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector of 16-bit shorts.
+ * \li scalar: The value that each sample of the input complex vector is divided by.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The floating point I buffer output data.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* in = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * N, alignment);
+ * float* i_out = (float*)volk_malloc(sizeof(float) * N, alignment);
+ *
+ * // ... fill in with N complex 16-bit samples ...
+ *
+ * volk_16ic_s32f_deinterleave_real_32f(i_out, in, 32768.f, N);
+ *
+ * volk_free(in);
+ * volk_free(i_out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H
+#define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_16ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 iFloatValue;
+
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ __m256i complexVal, iIntVal;
+ __m128i complexVal128;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
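+    // Shuffle mask: within each 128-bit lane, gather the four 16-bit real parts
+    // (bytes 0-1, 4-5, 8-9, 12-13) into the low 8 bytes; 0x80 zeroes the rest.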
+ __m256i moveMask = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0);
+
+ for (; number < eighthPoints; number++) {
+ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+ complexVal128 = _mm256_extracti128_si256(complexVal, 0);
+
+ iIntVal = _mm256_cvtepi16_epi32(complexVal128);
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+
+ _mm256_store_ps(iBufferPtr, iFloatValue);
+
+ iBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+ sixteenTComplexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_16ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 iFloatValue;
+
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ __m128i complexVal, iIntVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m128i moveMask = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
+
+ for (; number < quarterPoints; number++) {
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+ iIntVal = _mm_cvtepi16_epi32(complexVal);
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+ _mm_store_ps(iBufferPtr, iFloatValue);
+
+ iBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+ sixteenTComplexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_16ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 iValue;
+
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ floatBuffer[0] = (float)(*complexVectorPtr);
+ complexVectorPtr += 2;
+ floatBuffer[1] = (float)(*complexVectorPtr);
+ complexVectorPtr += 2;
+ floatBuffer[2] = (float)(*complexVectorPtr);
+ complexVectorPtr += 2;
+ floatBuffer[3] = (float)(*complexVectorPtr);
+ complexVectorPtr += 2;
+
+ iValue = _mm_load_ps(floatBuffer);
+
+ iValue = _mm_mul_ps(iValue, invScalar);
+
+ _mm_store_ps(iBufferPtr, iValue);
+
+ iBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * iScalar;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_16ic_s32f_deinterleave_real_32f_generic(float* iBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* iBufferPtr = iBuffer;
+ const float invScalar = 1.0 / scalar;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_a_H */
+
+#ifndef INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H
+#define INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_16ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 iFloatValue;
+
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ __m256i complexVal, iIntVal;
+ __m128i complexVal128;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m256i moveMask = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 13,
+ 12,
+ 9,
+ 8,
+ 5,
+ 4,
+ 1,
+ 0);
+
+ for (; number < eighthPoints; number++) {
+ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+ complexVal128 = _mm256_extracti128_si256(complexVal, 0);
+
+ iIntVal = _mm256_cvtepi16_epi32(complexVal128);
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+
+ _mm256_storeu_ps(iBufferPtr, iFloatValue);
+
+ iBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ int16_t* sixteenTComplexVectorPtr = (int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((float)(*sixteenTComplexVectorPtr++)) * iScalar;
+ sixteenTComplexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#endif /* INCLUDED_volk_16ic_s32f_deinterleave_real_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16ic_s32f_magnitude_32f
+ *
+ * \b Overview
+ *
+ * Computes the magnitude of the complexVector and stores the results
+ * in the magnitudeVector as scaled floating point values.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_s32f_magnitude_32f(float* magnitudeVector, const lv_16sc_t*
+ * complexVector, const float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector of 16-bit shorts.
+ * \li scalar: The value that each sample of the input complex vector is divided by.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li magnitudeVector: The magnitude of the complex values.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* in = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float) * N, alignment);
+ *
+ * // ... fill in with N complex 16-bit samples ...
+ *
+ * volk_16ic_s32f_magnitude_32f(out, in, 32768.f, N);
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
+#define INCLUDED_volk_16ic_s32f_magnitude_32f_a_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_s32f_magnitude_32f_a_avx2(float* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+
+ __m256 cplxValue1, cplxValue2, result;
+ __m256i int1, int2;
+ __m128i short1, short2;
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ for (; number < eighthPoints; number++) {
+
+ int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ short1 = _mm256_extracti128_si256(int1, 0);
+ short2 = _mm256_extracti128_si256(int1, 1);
+
+ int1 = _mm256_cvtepi16_epi32(short1);
+ int2 = _mm256_cvtepi16_epi32(short2);
+ cplxValue1 = _mm256_cvtepi32_ps(int1);
+ cplxValue2 = _mm256_cvtepi32_ps(int2);
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+        cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the values
+
+ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+ result = _mm256_permutevar8x32_ps(result, idx);
+
+ result = _mm256_sqrt_ps(result); // Square root the values
+
+ _mm256_store_ps(magnitudeVectorPtr, result);
+
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ float val1Real = (float)(*complexVectorPtr++) / scalar;
+ float val1Imag = (float)(*complexVectorPtr++) / scalar;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+static inline void volk_16ic_s32f_magnitude_32f_a_sse3(float* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+
+ __m128 cplxValue1, cplxValue2, result;
+
+ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
+
+ for (; number < quarterPoints; number++) {
+
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+ inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+ inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+ inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+        cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the values
+
+ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result); // Square root the values
+
+ _mm_store_ps(magnitudeVectorPtr, result);
+
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ float val1Real = (float)(*complexVectorPtr++) / scalar;
+ float val1Imag = (float)(*complexVectorPtr++) / scalar;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_16ic_s32f_magnitude_32f_a_sse(float* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+
+ __m128 cplxValue1, cplxValue2, result, re, im;
+
+ __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
+
+ for (; number < quarterPoints; number++) {
+ inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
+ inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
+ inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
+ inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
+ inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
+ inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
+ inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
+
+ cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
+
+ re = _mm_shuffle_ps(cplxValue1, cplxValue2, 0x88);
+ im = _mm_shuffle_ps(cplxValue1, cplxValue2, 0xdd);
+
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm_mul_ps(re, invScalar);
+ cplxValue2 = _mm_mul_ps(im, invScalar);
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+        cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the values
+
+ result = _mm_add_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result); // Square root the values
+
+ _mm_store_ps(magnitudeVectorPtr, result);
+
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ float val1Real = (float)(*complexVectorPtr++) * iScalar;
+ float val1Imag = (float)(*complexVectorPtr++) * iScalar;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+
+
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16ic_s32f_magnitude_32f_generic(float* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ const float invScalar = 1.0 / scalar;
+ for (number = 0; number < num_points; number++) {
+ float real = ((float)(*complexVectorPtr++)) * invScalar;
+ float imag = ((float)(*complexVectorPtr++)) * invScalar;
+ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC_DISABLED
+
+extern void volk_16ic_s32f_magnitude_32f_a_orc_impl(float* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points);
+
+static inline void volk_16ic_s32f_magnitude_32f_u_orc(float* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ volk_16ic_s32f_magnitude_32f_a_orc_impl(
+ magnitudeVector, complexVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC_DISABLED */
+
+
+#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_a_H */
+
+#ifndef INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
+#define INCLUDED_volk_16ic_s32f_magnitude_32f_u_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_s32f_magnitude_32f_u_avx2(float* magnitudeVector,
+ const lv_16sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const int16_t* complexVectorPtr = (const int16_t*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+
+ __m256 cplxValue1, cplxValue2, result;
+ __m256i int1, int2;
+ __m128i short1, short2;
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ for (; number < eighthPoints; number++) {
+
+ int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ short1 = _mm256_extracti128_si256(int1, 0);
+ short2 = _mm256_extracti128_si256(int1, 1);
+
+ int1 = _mm256_cvtepi16_epi32(short1);
+ int2 = _mm256_cvtepi16_epi32(short2);
+ cplxValue1 = _mm256_cvtepi32_ps(int1);
+ cplxValue2 = _mm256_cvtepi32_ps(int2);
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+        cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the values
+
+ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+ result = _mm256_permutevar8x32_ps(result, idx);
+
+ result = _mm256_sqrt_ps(result); // Square root the values
+
+ _mm256_storeu_ps(magnitudeVectorPtr, result);
+
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ magnitudeVectorPtr = &magnitudeVector[number];
+ complexVectorPtr = (const int16_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ float val1Real = (float)(*complexVectorPtr++) / scalar;
+ float val1Imag = (float)(*complexVectorPtr++) / scalar;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#endif /* INCLUDED_volk_16ic_s32f_magnitude_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2016 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16ic_x2_dot_prod_16ic
+ *
+ * \b Overview
+ *
+ * Multiplies two input complex vectors (16-bit integer in each component) and
+ * accumulates them, storing the result. Results are saturated, so they never go
+ * beyond the limits of the data type.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const
+ * lv_16sc_t* in_b, unsigned int num_points); \endcode
+ *
+ * \b Inputs
+ * \li in_a: One of the vectors to be multiplied and accumulated.
+ * \li in_b: The other vector to be multiplied and accumulated.
+ * \li num_points: Number of complex values to be multiplied together, accumulated and
+ * stored into \p result
+ *
+ * \b Outputs
+ * \li result: Value of the accumulated result.
+ *
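+ * \b Example
+ * A minimal usage sketch; buffer sizes and the fill step are illustrative only:
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* a = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * N, alignment);
+ * lv_16sc_t* b = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * N, alignment);
+ * lv_16sc_t result;
+ *
+ * // ... fill a and b with N complex 16-bit samples ...
+ *
+ * volk_16ic_x2_dot_prod_16ic(&result, a, b, N);
+ *
+ * volk_free(a);
+ * volk_free(b);
+ * \endcode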
+ */
+
+#ifndef INCLUDED_volk_16ic_x2_dot_prod_16ic_H
+#define INCLUDED_volk_16ic_x2_dot_prod_16ic_H
+
+#include <volk/saturation_arithmetic.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
+{
+ result[0] = lv_cmake((int16_t)0, (int16_t)0);
+ unsigned int n;
+ for (n = 0; n < num_points; n++) {
+ lv_16sc_t tmp = in_a[n] * in_b[n];
+ result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)),
+ sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp)));
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
+{
+ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
+
+ const unsigned int sse_iters = num_points / 4;
+ unsigned int number;
+
+ const lv_16sc_t* _in_a = in_a;
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+
+ if (sse_iters > 0) {
+ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
+ realcacc, imagcacc;
+ __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
+
+ realcacc = _mm_setzero_si128();
+ imagcacc = _mm_setzero_si128();
+
+ mask_imag = _mm_set_epi8(
+ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+ mask_real = _mm_set_epi8(
+ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+
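+        // Per complex sample: real = a.r*b.r - a.i*b.i and imag = a.r*b.i + a.i*b.r,
+        // built from three _mm_mullo_epi16 products combined with 2-byte lane shifts
+        // and accumulated with saturating adds.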
+ for (number = 0; number < sse_iters; number++) {
+ // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
+ a = _mm_load_si128(
+ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+ __VOLK_PREFETCH(_in_a + 8);
+ b = _mm_load_si128((__m128i*)_in_b);
+ __VOLK_PREFETCH(_in_b + 8);
+ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
+
+            c_sr = _mm_srli_si128(c, 2); // Shift c right by 2 bytes, shifting in
+                                         // zeros, so a.i*b.i aligns with a.r*b.r.
+ real = _mm_subs_epi16(c, c_sr);
+
+ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+
+ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+ imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!
+
+ realcacc = _mm_adds_epi16(realcacc, real);
+ imagcacc = _mm_adds_epi16(imagcacc, imag);
+
+ _in_a += 4;
+ _in_b += 4;
+ }
+
+ realcacc = _mm_and_si128(realcacc, mask_real);
+ imagcacc = _mm_and_si128(imagcacc, mask_imag);
+
+ a = _mm_or_si128(realcacc, imagcacc);
+
+ _mm_store_si128((__m128i*)dotProductVector,
+ a); // Store the results back into the dot product vector
+
+ for (number = 0; number < 4; ++number) {
+ dotProduct = lv_cmake(
+ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
+ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
+ }
+ }
+
+ for (number = 0; number < (num_points % 4); ++number) {
+ lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
+ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+ }
+
+ *_out = dotProduct;
+}
+
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
+{
+ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
+
+ const unsigned int sse_iters = num_points / 4;
+
+ const lv_16sc_t* _in_a = in_a;
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+ unsigned int number;
+
+ if (sse_iters > 0) {
+ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
+ realcacc, imagcacc, result;
+ __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
+
+ realcacc = _mm_setzero_si128();
+ imagcacc = _mm_setzero_si128();
+
+ mask_imag = _mm_set_epi8(
+ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+ mask_real = _mm_set_epi8(
+ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+
+ for (number = 0; number < sse_iters; number++) {
+ // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
+ a = _mm_loadu_si128(
+ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+ __VOLK_PREFETCH(_in_a + 8);
+ b = _mm_loadu_si128((__m128i*)_in_b);
+ __VOLK_PREFETCH(_in_b + 8);
+ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
+
+            c_sr = _mm_srli_si128(c, 2); // Shift c right by 2 bytes, shifting in
+                                         // zeros, so a.i*b.i aligns with a.r*b.r.
+ real = _mm_subs_epi16(c, c_sr);
+
+ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+
+ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+ imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic!
+
+ realcacc = _mm_adds_epi16(realcacc, real);
+ imagcacc = _mm_adds_epi16(imagcacc, imag);
+
+ _in_a += 4;
+ _in_b += 4;
+ }
+
+ realcacc = _mm_and_si128(realcacc, mask_real);
+ imagcacc = _mm_and_si128(imagcacc, mask_imag);
+
+ result = _mm_or_si128(realcacc, imagcacc);
+
+ _mm_storeu_si128((__m128i*)dotProductVector,
+ result); // Store the results back into the dot product vector
+
+ for (number = 0; number < 4; ++number) {
+ dotProduct = lv_cmake(
+ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
+ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
+ }
+ }
+
+ for (number = 0; number < (num_points % 4); ++number) {
+ lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
+ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+ }
+
+ *_out = dotProduct;
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_x2_dot_prod_16ic_u_avx2(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
+{
+ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
+
+ const unsigned int avx_iters = num_points / 8;
+
+ const lv_16sc_t* _in_a = in_a;
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+ unsigned int number;
+
+ if (avx_iters > 0) {
+ __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
+ realcacc, imagcacc, result;
+ __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
+
+ realcacc = _mm256_setzero_si256();
+ imagcacc = _mm256_setzero_si256();
+
+ mask_imag = _mm256_set_epi8(0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0);
+ mask_real = _mm256_set_epi8(0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF);
+
+ for (number = 0; number < avx_iters; number++) {
+ a = _mm256_loadu_si256((__m256i*)_in_a);
+ __VOLK_PREFETCH(_in_a + 16);
+ b = _mm256_loadu_si256((__m256i*)_in_b);
+ __VOLK_PREFETCH(_in_b + 16);
+ c = _mm256_mullo_epi16(a, b);
+
+            c_sr = _mm256_srli_si256(c, 2); // Shift c right by 2 bytes per 128-bit
+                                            // lane so a.i*b.i aligns with a.r*b.r.
+ real = _mm256_subs_epi16(c, c_sr);
+
+ b_sl = _mm256_slli_si256(b, 2);
+ a_sl = _mm256_slli_si256(a, 2);
+
+ imag1 = _mm256_mullo_epi16(a, b_sl);
+ imag2 = _mm256_mullo_epi16(b, a_sl);
+
+ imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!
+
+ realcacc = _mm256_adds_epi16(realcacc, real);
+ imagcacc = _mm256_adds_epi16(imagcacc, imag);
+
+ _in_a += 8;
+ _in_b += 8;
+ }
+
+ realcacc = _mm256_and_si256(realcacc, mask_real);
+ imagcacc = _mm256_and_si256(imagcacc, mask_imag);
+
+ result = _mm256_or_si256(realcacc, imagcacc);
+
+ _mm256_storeu_si256((__m256i*)dotProductVector,
+ result); // Store the results back into the dot product vector
+
+ for (number = 0; number < 8; ++number) {
+ dotProduct = lv_cmake(
+ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
+ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
+ }
+ }
+
+ for (number = 0; number < (num_points % 8); ++number) {
+ lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
+ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+ }
+
+ *_out = dotProduct;
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_x2_dot_prod_16ic_a_avx2(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
+{
+ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
+
+ const unsigned int avx_iters = num_points / 8;
+
+ const lv_16sc_t* _in_a = in_a;
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+ unsigned int number;
+
+ if (avx_iters > 0) {
+ __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
+ realcacc, imagcacc, result;
+ __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
+
+ realcacc = _mm256_setzero_si256();
+ imagcacc = _mm256_setzero_si256();
+
+ mask_imag = _mm256_set_epi8(0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0);
+ mask_real = _mm256_set_epi8(0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF);
+
+ for (number = 0; number < avx_iters; number++) {
+ a = _mm256_load_si256((__m256i*)_in_a);
+ __VOLK_PREFETCH(_in_a + 16);
+ b = _mm256_load_si256((__m256i*)_in_b);
+ __VOLK_PREFETCH(_in_b + 16);
+ c = _mm256_mullo_epi16(a, b);
+
+            c_sr = _mm256_srli_si256(c, 2); // Shift c right by 2 bytes per 128-bit
+                                            // lane so a.i*b.i aligns with a.r*b.r.
+ real = _mm256_subs_epi16(c, c_sr);
+
+ b_sl = _mm256_slli_si256(b, 2);
+ a_sl = _mm256_slli_si256(a, 2);
+
+ imag1 = _mm256_mullo_epi16(a, b_sl);
+ imag2 = _mm256_mullo_epi16(b, a_sl);
+
+ imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic!
+
+ realcacc = _mm256_adds_epi16(realcacc, real);
+ imagcacc = _mm256_adds_epi16(imagcacc, imag);
+
+ _in_a += 8;
+ _in_b += 8;
+ }
+
+ realcacc = _mm256_and_si256(realcacc, mask_real);
+ imagcacc = _mm256_and_si256(imagcacc, mask_imag);
+
+ result = _mm256_or_si256(realcacc, imagcacc);
+
+ _mm256_store_si256((__m256i*)dotProductVector,
+ result); // Store the results back into the dot product vector
+
+ for (number = 0; number < 8; ++number) {
+ dotProduct = lv_cmake(
+ sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
+ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
+ }
+ }
+
+ for (number = 0; number < (num_points % 8); ++number) {
+ lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
+ dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
+ sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
+ }
+
+ *_out = dotProduct;
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
+{
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
+ lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
+ *out = lv_cmake((int16_t)0, (int16_t)0);
+
+ if (quarter_points > 0) {
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ int16x4x2_t a_val, b_val, c_val, accumulator;
+ int16x4x2_t tmp_real, tmp_imag;
+ __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
+ accumulator.val[0] = vdup_n_s16(0);
+ accumulator.val[1] = vdup_n_s16(0);
+ lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
+
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
+
+ // multiply the real*real and imag*imag to get real result
+ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+ tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
+ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+ tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
+
+ // Multiply cross terms to get the imaginary result
+ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+ tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
+ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+ tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
+
+ c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]);
+ c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
+
+ accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]);
+ accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]);
+
+ a_ptr += 4;
+ b_ptr += 4;
+ }
+
+ vst2_s16((int16_t*)accum_result, accumulator);
+ for (number = 0; number < 4; ++number) {
+ dotProduct = lv_cmake(
+ sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])),
+ sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number])));
+ }
+
+ *out = dotProduct;
+ }
+
+ // tail case
+ for (number = quarter_points * 4; number < num_points; ++number) {
+ *out += (*a_ptr++) * (*b_ptr++);
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
+{
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
+ lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ int16x4x2_t a_val, b_val, accumulator;
+ int16x4x2_t tmp;
+ __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
+ accumulator.val[0] = vdup_n_s16(0);
+ accumulator.val[1] = vdup_n_s16(0);
+
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
+
+ tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
+ tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
+
+ // use multiply accumulate/subtract to get result
+ tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]);
+ tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]);
+
+ accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]);
+ accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]);
+
+ a_ptr += 4;
+ b_ptr += 4;
+ }
+
+ vst2_s16((int16_t*)accum_result, accumulator);
+ *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+ for (number = quarter_points * 4; number < num_points; ++number) {
+ *out += (*a_ptr++) * (*b_ptr++);
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
+{
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
+ lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ int16x4x2_t a_val, b_val, accumulator1, accumulator2;
+
+ __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
+ accumulator1.val[0] = vdup_n_s16(0);
+ accumulator1.val[1] = vdup_n_s16(0);
+ accumulator2.val[0] = vdup_n_s16(0);
+ accumulator2.val[1] = vdup_n_s16(0);
+
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
+
+ // use 2 accumulators to remove inter-instruction data dependencies
+ accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
+ accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]);
+ accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]);
+ accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]);
+
+ a_ptr += 4;
+ b_ptr += 4;
+ }
+
+ accumulator1.val[0] = vqadd_s16(accumulator1.val[0], accumulator2.val[0]);
+ accumulator1.val[1] = vqadd_s16(accumulator1.val[1], accumulator2.val[1]);
+
+ vst2_s16((int16_t*)accum_result, accumulator1);
+ *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+ for (number = quarter_points * 4; number < num_points; ++number) {
+ *out += (*a_ptr++) * (*b_ptr++);
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+#endif /*INCLUDED_volk_16ic_x2_dot_prod_16ic_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2016 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16ic_x2_multiply_16ic
+ *
+ * \b Overview
+ *
+ * Multiplies two input complex vectors, point-by-point, storing the result in the third
+ * vector. WARNING: Saturation is not checked.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16ic_x2_multiply_16ic(lv_16sc_t* result, const lv_16sc_t* in_a, const
+ * lv_16sc_t* in_b, unsigned int num_points); \endcode
+ *
+ * \b Inputs
+ * \li in_a: One of the vectors to be multiplied.
+ * \li in_b: The other vector to be multiplied.
+ * \li num_points: The number of complex data points to be multiplied from both input
+ * vectors.
+ *
+ * \b Outputs
+ * \li result: The vector where the results will be stored.
+ *
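+ * \b Example
+ * A minimal usage sketch (buffer contents are illustrative only):
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_16sc_t* in_a = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t)*N, alignment);
+ * lv_16sc_t* in_b = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t)*N, alignment);
+ * lv_16sc_t* result = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * in_a[ii] = lv_cmake((int16_t)ii, (int16_t)ii);
+ * in_b[ii] = lv_cmake((int16_t)1, (int16_t)0);
+ * }
+ *
+ * volk_16ic_x2_multiply_16ic(result, in_a, in_b, N);
+ *
+ * volk_free(in_a);
+ * volk_free(in_b);
+ * volk_free(result);
+ * \endcode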
+ */
+
+#ifndef INCLUDED_volk_16ic_x2_multiply_16ic_H
+#define INCLUDED_volk_16ic_x2_multiply_16ic_H
+
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
+{
+ unsigned int n;
+ for (n = 0; n < num_points; n++) {
+ result[n] = in_a[n] * in_b[n];
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 4;
+ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
+ result;
+
+ mask_imag = _mm_set_epi8(
+ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+ mask_real = _mm_set_epi8(
+ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+
+ const lv_16sc_t* _in_a = in_a;
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+ unsigned int number;
+
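+ // Each __m128i holds four complex samples stored as interleaved (real, imag)
+ // int16 pairs. The real part of every product is re*re - im*im, formed by
+ // subtracting the odd lanes of the element-wise product (shifted down next to
+ // the even lanes); the imaginary part is re_a*im_b + im_a*re_b, formed from
+ // the two cross products computed with byte-shifted copies of the inputs.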
+ for (number = 0; number < sse_iters; number++) {
+ a = _mm_load_si128(
+ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+ b = _mm_load_si128((__m128i*)_in_b);
+ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
+
+ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
+ // zeros, and store the results in dst.
+ real = _mm_subs_epi16(c, c_sr);
+ real = _mm_and_si128(real,
+ mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
+
+ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+
+ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+ imag = _mm_adds_epi16(imag1, imag2);
+ imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+
+ result = _mm_or_si128(real, imag);
+
+ _mm_store_si128((__m128i*)_out, result);
+
+ _in_a += 4;
+ _in_b += 4;
+ _out += 4;
+ }
+
+ for (number = sse_iters * 4; number < num_points; ++number) {
+ *_out++ = (*_in_a++) * (*_in_b++);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 4;
+ __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
+ result;
+
+ mask_imag = _mm_set_epi8(
+ 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
+ mask_real = _mm_set_epi8(
+ 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
+
+ const lv_16sc_t* _in_a = in_a;
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+ unsigned int number;
+
+ for (number = 0; number < sse_iters; number++) {
+ a = _mm_loadu_si128(
+ (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+ b = _mm_loadu_si128((__m128i*)_in_b);
+ c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
+
+ c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in
+ // zeros, and store the results in dst.
+ real = _mm_subs_epi16(c, c_sr);
+ real = _mm_and_si128(real,
+ mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
+
+ b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+ a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+
+ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+ imag = _mm_adds_epi16(imag1, imag2);
+ imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+
+ result = _mm_or_si128(real, imag);
+
+ _mm_storeu_si128((__m128i*)_out, result);
+
+ _in_a += 4;
+ _in_b += 4;
+ _out += 4;
+ }
+
+ for (number = sse_iters * 4; number < num_points; ++number) {
+ *_out++ = (*_in_a++) * (*_in_b++);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int avx2_points = num_points / 8;
+
+ const lv_16sc_t* _in_a = in_a;
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+
+ __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
+
+ const __m256i mask_imag = _mm256_set_epi8(0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0);
+ const __m256i mask_real = _mm256_set_epi8(0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF);
+
+ for (; number < avx2_points; number++) {
+ a = _mm256_loadu_si256(
+ (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ b = _mm256_loadu_si256(
+ (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ c = _mm256_mullo_epi16(a, b);
+
+ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in
+ // zeros, and store the results in dst.
+ real = _mm256_subs_epi16(c, c_sr);
+ real = _mm256_and_si256(
+ real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
+
+ b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
+ a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
+
+ imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+ imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+ imag = _mm256_adds_epi16(imag1, imag2);
+ imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+
+ result = _mm256_or_si256(real, imag);
+
+ _mm256_storeu_si256((__m256i*)_out, result);
+
+ _in_a += 8;
+ _in_b += 8;
+ _out += 8;
+ }
+
+ number = avx2_points * 8;
+ for (; number < num_points; number++) {
+ *_out++ = (*_in_a++) * (*_in_b++);
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int avx2_points = num_points / 8;
+
+ const lv_16sc_t* _in_a = in_a;
+ const lv_16sc_t* _in_b = in_b;
+ lv_16sc_t* _out = out;
+
+ __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
+
+ const __m256i mask_imag = _mm256_set_epi8(0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0);
+ const __m256i mask_real = _mm256_set_epi8(0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF,
+ 0,
+ 0,
+ 0xFF,
+ 0xFF);
+
+ for (; number < avx2_points; number++) {
+ a = _mm256_load_si256(
+ (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ b = _mm256_load_si256(
+ (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ c = _mm256_mullo_epi16(a, b);
+
+ c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in
+ // zeros, and store the results in dst.
+ real = _mm256_subs_epi16(c, c_sr);
+ real = _mm256_and_si256(
+ real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
+
+ b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
+ a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
+
+ imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+ imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+ imag = _mm256_adds_epi16(imag1, imag2);
+ imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
+
+ result = _mm256_or_si256(real, imag);
+
+ _mm256_store_si256((__m256i*)_out, result);
+
+ _in_a += 8;
+ _in_b += 8;
+ _out += 8;
+ }
+
+ number = avx2_points * 8;
+ for (; number < num_points; number++) {
+ *_out++ = (*_in_a++) * (*_in_b++);
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out,
+ const lv_16sc_t* in_a,
+ const lv_16sc_t* in_b,
+ unsigned int num_points)
+{
+ lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
+ lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
+ unsigned int quarter_points = num_points / 4;
+ int16x4x2_t a_val, b_val, c_val;
+ int16x4x2_t tmp_real, tmp_imag;
+ unsigned int number = 0;
+
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 4);
+ __VOLK_PREFETCH(b_ptr + 4);
+
+ // multiply the real*real and imag*imag to get real result
+ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+ tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
+ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+ tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
+
+ // Multiply cross terms to get the imaginary result
+ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+ tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
+ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+ tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
+
+ // store the results
+ c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);
+ c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
+ vst2_s16((int16_t*)out, c_val);
+
+ a_ptr += 4;
+ b_ptr += 4;
+ out += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *out++ = (*a_ptr++) * (*b_ptr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#endif /*INCLUDED_volk_16ic_x2_multiply_16ic_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_16u_byteswap
+ *
+ * \b Overview
+ *
+ * Byteswaps (in-place) a vector of uint16_t values.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_16u_byteswap(uint16_t* intsToSwap, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li intsToSwap: The vector of data to byte swap.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li intsToSwap: The byte-swapped data, returned in place.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ *
+ * uint16_t* x = (uint16_t*)volk_malloc(sizeof(uint16_t)*N, volk_get_alignment());
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * x[ii] = (uint16_t)ii;
+ * }
+ *
+ * volk_16u_byteswap(x, N);
+ *
+ * volk_free(x);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_16u_byteswap_u_H
+#define INCLUDED_volk_16u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16u_byteswap_generic(uint16_t* intsToSwap,
+ unsigned int num_points)
+{
+ uint16_t* inputPtr = intsToSwap;
+ for (unsigned int point = 0; point < num_points; point++) {
+ uint16_t output = *inputPtr;
+ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+ *inputPtr = output;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#if LV_HAVE_AVX2
+#include <immintrin.h>
+static inline void volk_16u_byteswap_a_avx2(uint16_t* intsToSwap, unsigned int num_points)
+{
+ unsigned int number;
+
+ const unsigned int nPerSet = 16;
+ const uint64_t nSets = num_points / nPerSet;
+
+ uint16_t* inputPtr = (uint16_t*)intsToSwap;
+
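+ // Shuffle control that swaps each pair of adjacent bytes. _mm256_shuffle_epi8
+ // works independently on each 128-bit lane (only the low four index bits are
+ // used), so the upper half of the table is equivalent to the lower half.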
+ const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
+ 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
+ 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
+
+ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
+
+ for (number = 0; number < nSets; number++) {
+ // Load 16 16-bit values; inputPtr is incremented later since the swap is done in-place.
+ const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
+ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
+
+ // Store the results
+ _mm256_store_si256((__m256i*)inputPtr, output);
+ inputPtr += nPerSet;
+ }
+
+ // Byteswap any remaining points:
+ for (number = nPerSet * nSets; number < num_points; number++) {
+ uint16_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#if LV_HAVE_AVX2
+#include <immintrin.h>
+static inline void volk_16u_byteswap_u_avx2(uint16_t* intsToSwap, unsigned int num_points)
+{
+ unsigned int number;
+
+ const unsigned int nPerSet = 16;
+ const uint64_t nSets = num_points / nPerSet;
+
+ uint16_t* inputPtr = (uint16_t*)intsToSwap;
+
+ const uint8_t shuffleVector[32] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11,
+ 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
+ 23, 22, 25, 24, 27, 26, 29, 28, 31, 30 };
+
+ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
+
+ for (number = 0; number < nSets; number++) {
+ // Load 16 16-bit values; inputPtr is incremented later since the swap is done in-place.
+ const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
+ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
+
+ // Store the results
+ _mm256_storeu_si256((__m256i*)inputPtr, output);
+ inputPtr += nPerSet;
+ }
+
+ // Byteswap any remaining points:
+ for (number = nPerSet * nSets; number < num_points; number++) {
+ uint16_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16u_byteswap_u_sse2(uint16_t* intsToSwap, unsigned int num_points)
+{
+ unsigned int number = 0;
+ uint16_t* inputPtr = intsToSwap;
+ __m128i input, left, right, output;
+
+ const unsigned int eighthPoints = num_points / 8;
+ for (; number < eighthPoints; number++) {
+ // Load 8 16-bit values; inputPtr is incremented later since the swap is done in-place.
+ input = _mm_loadu_si128((__m128i*)inputPtr);
+ // Do the two shifts
+ left = _mm_slli_epi16(input, 8);
+ right = _mm_srli_epi16(input, 8);
+ // Or the left and right halves together
+ output = _mm_or_si128(left, right);
+ // Store the results
+ _mm_storeu_si128((__m128i*)inputPtr, output);
+ inputPtr += 8;
+ }
+
+ // Byteswap any remaining points:
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ uint16_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 8) & 0xff) | ((outputVal << 8) & 0xff00));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#endif /* INCLUDED_volk_16u_byteswap_u_H */
+#ifndef INCLUDED_volk_16u_byteswap_a_H
+#define INCLUDED_volk_16u_byteswap_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_16u_byteswap_a_sse2(uint16_t* intsToSwap, unsigned int num_points)
+{
+ uint16_t* inputPtr = intsToSwap;
+ __m128i input, left, right, output;
+
+ const unsigned int eighthPoints = num_points / 8;
+ for (unsigned int number = 0; number < eighthPoints; number++) {
+ // Load 8 16-bit values; inputPtr is incremented later since the swap is done in-place.
+ input = _mm_load_si128((__m128i*)inputPtr);
+ // Do the two shifts
+ left = _mm_slli_epi16(input, 8);
+ right = _mm_srli_epi16(input, 8);
+ // Or the left and right halves together
+ output = _mm_or_si128(left, right);
+ // Store the results
+ _mm_store_si128((__m128i*)inputPtr, output);
+ inputPtr += 8;
+ }
+
+ // Byteswap any remaining points:
+ volk_16u_byteswap_generic(inputPtr, num_points - eighthPoints * 8);
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_16u_byteswap_neon(uint16_t* intsToSwap, unsigned int num_points)
+{
+ unsigned int number;
+ unsigned int eighth_points = num_points / 8;
+ uint16x8_t input, output;
+ uint16_t* inputPtr = intsToSwap;
+
+ for (number = 0; number < eighth_points; number++) {
+ input = vld1q_u16(inputPtr);
+ // seed the destination with the input so no indeterminate register value is read
+ output = vsriq_n_u16(input, input, 8);
+ output = vsliq_n_u16(output, input, 8);
+ vst1q_u16(inputPtr, output);
+ inputPtr += 8;
+ }
+
+ volk_16u_byteswap_generic(inputPtr, num_points - eighth_points * 8);
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_16u_byteswap_neon_table(uint16_t* intsToSwap,
+ unsigned int num_points)
+{
+ uint16_t* inputPtr = intsToSwap;
+ unsigned int number = 0;
+ unsigned int n16points = num_points / 16;
+
+ uint8x8x4_t input_table;
+ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+ /* these magic numbers are used as byte-indices in the LUT.
+ they are pre-computed to save time. A simple C program
+ can calculate them; for example for lookup01:
+ uint8_t chars[8] = {8, 0, 24, 16, 9, 1, 25, 17};
+ uint64_t index = 0;
+ int ii;
+ for(ii=0; ii < 8; ++ii) {
+ index += ((uint64_t)(*(chars+ii))) << (ii*8);
+ }
+ the resulting index is the constant passed to vcreate_u8() below.
+ */
+ int_lookup01 = vcreate_u8(1232017111498883080);
+ int_lookup23 = vcreate_u8(1376697457175036426);
+ int_lookup45 = vcreate_u8(1521377802851189772);
+ int_lookup67 = vcreate_u8(1666058148527343118);
+
+ for (number = 0; number < n16points; ++number) {
+ input_table = vld4_u8((uint8_t*)inputPtr);
+ swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+ swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+ swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+ swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+ vst1_u8((uint8_t*)inputPtr, swapped_int01);
+ vst1_u8((uint8_t*)(inputPtr + 4), swapped_int23);
+ vst1_u8((uint8_t*)(inputPtr + 8), swapped_int45);
+ vst1_u8((uint8_t*)(inputPtr + 12), swapped_int67);
+
+ inputPtr += 16;
+ }
+
+ volk_16u_byteswap_generic(inputPtr, num_points - n16points * 16);
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_16u_byteswap_a_generic(uint16_t* intsToSwap,
+ unsigned int num_points)
+{
+ uint16_t* inputPtr = intsToSwap;
+ for (unsigned int point = 0; point < num_points; point++) {
+ uint16_t output = *inputPtr;
+ output = (((output >> 8) & 0xff) | ((output << 8) & 0xff00));
+ *inputPtr = output;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+
+extern void volk_16u_byteswap_a_orc_impl(uint16_t* intsToSwap, unsigned int num_points);
+static inline void volk_16u_byteswap_u_orc(uint16_t* intsToSwap, unsigned int num_points)
+{
+ volk_16u_byteswap_a_orc_impl(intsToSwap, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_16u_byteswap_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014, 2015, 2020 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_volk_16u_byteswappuppet_16u_H
+#define INCLUDED_volk_16u_byteswappuppet_16u_H
+
+
+#include <stdint.h>
+#include <string.h>
+#include <volk/volk_16u_byteswap.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_16u_byteswappuppet_16u_generic(uint16_t* output,
+ uint16_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_16u_byteswap_generic((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+}
+#endif
+
+#ifdef LV_HAVE_NEON
+static inline void volk_16u_byteswappuppet_16u_neon(uint16_t* output,
+ uint16_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_16u_byteswap_neon((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+}
+#endif
+
+#ifdef LV_HAVE_NEON
+static inline void volk_16u_byteswappuppet_16u_neon_table(uint16_t* output,
+ uint16_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_16u_byteswap_neon_table((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_16u_byteswappuppet_16u_u_sse2(uint16_t* output,
+ uint16_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_16u_byteswap_u_sse2((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_16u_byteswappuppet_16u_a_sse2(uint16_t* output,
+ uint16_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_16u_byteswap_a_sse2((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+}
+#endif
+
+#ifdef LV_HAVE_AVX2
+static inline void volk_16u_byteswappuppet_16u_u_avx2(uint16_t* output,
+ uint16_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_16u_byteswap_u_avx2((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+}
+#endif
+
+#ifdef LV_HAVE_AVX2
+static inline void volk_16u_byteswappuppet_16u_a_avx2(uint16_t* output,
+ uint16_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_16u_byteswap_a_avx2((uint16_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint16_t));
+}
+#endif
+
+#endif
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2018 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_64f_add_64f
+ *
+ * \b Overview
+ *
+ * Adds two input vectors and stores the result as a double-precision vector. One
+ * of the input vectors is single-precision floating point, so it is upcast to
+ * double precision before the addition.
+ *
+ * c[i] = a[i] + b[i]
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_64f_add_64f(double* cVector, const float* aVector, const
+ * double* bVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+ * \li bVector: Second input vector.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * Add the elements of an increasing vector to those of a decreasing vector.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * double* decreasing = (double*)volk_malloc(sizeof(double)*N, alignment);
+ * double* out = (double*)volk_malloc(sizeof(double)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * decreasing[ii] = 10.0 - (double)ii;
+ * }
+ *
+ * volk_32f_64f_add_64f(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2F\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_64f_add_64f_H
+#define INCLUDED_volk_32f_64f_add_64f_H
+
+#include <inttypes.h>
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_64f_add_64f_generic(double* cVector,
+ const float* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ double* cPtr = cVector;
+ const float* aPtr = aVector;
+ const double* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_NEONV8
+#include <arm_neon.h>
+
+static inline void volk_32f_64f_add_64f_neon(double* cVector,
+ const float* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int half_points = num_points / 2;
+
+ double* cPtr = cVector;
+ const float* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ float64x2_t aVal, bVal, cVal;
+ float32x2_t aVal1;
+ for (number = 0; number < half_points; number++) {
+ // Load in to NEON registers
+ aVal1 = vld1_f32(aPtr);
+ bVal = vld1q_f64(bPtr);
+ __VOLK_PREFETCH(aPtr + 2);
+ __VOLK_PREFETCH(bPtr + 2);
+ aPtr += 2; // two floats are consumed per iteration and widened to doubles
+ bPtr += 2;
+
+ // Vector conversion
+ aVal = vcvt_f64_f32(aVal1);
+ // vector add
+ cVal = vaddq_f64(aVal, bVal);
+ // Store the results back into the C container
+ vst1q_f64(cPtr, cVal);
+
+ cPtr += 2;
+ }
+
+ number = half_points * 2; // equals num_points when num_points is even
+ for (; number < num_points; number++) {
+ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_NEONV8 */
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+static inline void volk_32f_64f_add_64f_u_avx(double* cVector,
+ const float* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighth_points = num_points / 8;
+
+ double* cPtr = cVector;
+ const float* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m256 aVal;
+ __m128 aVal1, aVal2;
+ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
+ for (; number < eighth_points; number++) {
+
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal1 = _mm256_loadu_pd(bPtr);
+ bVal2 = _mm256_loadu_pd(bPtr + 4);
+
+ aVal1 = _mm256_extractf128_ps(aVal, 0);
+ aVal2 = _mm256_extractf128_ps(aVal, 1);
+
+ aDbl1 = _mm256_cvtps_pd(aVal1);
+ aDbl2 = _mm256_cvtps_pd(aVal2);
+
+ cVal1 = _mm256_add_pd(aDbl1, bVal1);
+ cVal2 = _mm256_add_pd(aDbl2, bVal2);
+
+ _mm256_storeu_pd(cPtr,
+ cVal1); // Store the results back into the C container
+ _mm256_storeu_pd(cPtr + 4,
+ cVal2); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighth_points * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+static inline void volk_32f_64f_add_64f_a_avx(double* cVector,
+ const float* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighth_points = num_points / 8;
+
+ double* cPtr = cVector;
+ const float* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m256 aVal;
+ __m128 aVal1, aVal2;
+ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
+ for (; number < eighth_points; number++) {
+
+ aVal = _mm256_load_ps(aPtr);
+ bVal1 = _mm256_load_pd(bPtr);
+ bVal2 = _mm256_load_pd(bPtr + 4);
+
+ aVal1 = _mm256_extractf128_ps(aVal, 0);
+ aVal2 = _mm256_extractf128_ps(aVal, 1);
+
+ aDbl1 = _mm256_cvtps_pd(aVal1);
+ aDbl2 = _mm256_cvtps_pd(aVal2);
+
+ cVal1 = _mm256_add_pd(aDbl1, bVal1);
+ cVal2 = _mm256_add_pd(aDbl2, bVal2);
+
+ _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
+ _mm256_store_pd(cPtr + 4,
+ cVal2); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighth_points * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = ((double)(*aPtr++)) + (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_64f_add_64f_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2018 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_64f_multiply_64f
+ *
+ * \b Overview
+ *
+ * Multiplies a single-precision floating point vector by a double-precision
+ * vector, storing the result as a double-precision vector.
+ *
+ * c[i] = a[i] * b[i]
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_64f_multiply_64f(double* cVector, const float* aVector, const double*
+ * bVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+ * \li bVector: Second input vector.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * Multiply elements of an increasing vector by those of a decreasing vector.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * double* decreasing = (double*)volk_malloc(sizeof(double)*N, alignment);
+ * double* out = (double*)volk_malloc(sizeof(double)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * decreasing[ii] = 10.0 - (double)ii;
+ * }
+ *
+ * volk_32f_64f_multiply_64f(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2F\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_64f_multiply_64f_H
+#define INCLUDED_volk_32f_64f_multiply_64f_H
+
+#include <inttypes.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_64f_multiply_64f_generic(double* cVector,
+ const float* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ double* cPtr = cVector;
+ const float* aPtr = aVector;
+ const double* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+/*
+ * Unaligned versions
+ */
+
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+static inline void volk_32f_64f_multiply_64f_u_avx(double* cVector,
+ const float* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighth_points = num_points / 8;
+
+ double* cPtr = cVector;
+ const float* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m256 aVal;
+ __m128 aVal1, aVal2;
+ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
+ for (; number < eighth_points; number++) {
+
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal1 = _mm256_loadu_pd(bPtr);
+ bVal2 = _mm256_loadu_pd(bPtr + 4);
+
+ aVal1 = _mm256_extractf128_ps(aVal, 0);
+ aVal2 = _mm256_extractf128_ps(aVal, 1);
+
+ aDbl1 = _mm256_cvtps_pd(aVal1);
+ aDbl2 = _mm256_cvtps_pd(aVal2);
+
+ cVal1 = _mm256_mul_pd(aDbl1, bVal1);
+ cVal2 = _mm256_mul_pd(aDbl2, bVal2);
+
+ _mm256_storeu_pd(cPtr, cVal1); // Store the results back into the C container
+ _mm256_storeu_pd(cPtr + 4, cVal2); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighth_points * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+#include <xmmintrin.h>
+
+static inline void volk_32f_64f_multiply_64f_a_avx(double* cVector,
+ const float* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighth_points = num_points / 8;
+
+ double* cPtr = cVector;
+ const float* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m256 aVal;
+ __m128 aVal1, aVal2;
+ __m256d aDbl1, aDbl2, bVal1, bVal2, cVal1, cVal2;
+ for (; number < eighth_points; number++) {
+
+ aVal = _mm256_load_ps(aPtr);
+ bVal1 = _mm256_load_pd(bPtr);
+ bVal2 = _mm256_load_pd(bPtr + 4);
+
+ aVal1 = _mm256_extractf128_ps(aVal, 0);
+ aVal2 = _mm256_extractf128_ps(aVal, 1);
+
+ aDbl1 = _mm256_cvtps_pd(aVal1);
+ aDbl2 = _mm256_cvtps_pd(aVal2);
+
+ cVal1 = _mm256_mul_pd(aDbl1, bVal1);
+ cVal2 = _mm256_mul_pd(aDbl2, bVal2);
+
+ _mm256_store_pd(cPtr, cVal1); // Store the results back into the C container
+ _mm256_store_pd(cPtr + 4, cVal2); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighth_points * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = ((double)(*aPtr++)) * (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+#endif /* INCLUDED_volk_32f_64f_multiply_64f_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_8u_polarbutterfly_32f
+ *
+ * \b Overview
+ *
+ * Decodes the butterfly for one bit in the polar decoder graph.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * volk_32f_8u_polarbutterfly_32f(float* llrs, unsigned char* u,
+ * const int frame_size, const int frame_exp,
+ * const int stage, const int u_num, const int row)
+ * \endcode
+ *
+ * \b Inputs
+ * \li llrs: Buffer with LLRs; contains the received LLRs as well as already computed intermediate LLRs.
+ * \li u: Previously decoded bits.
+ * \li frame_size: Equal to 2 ^ frame_exp.
+ * \li frame_exp: Power-of-2 exponent of the frame size.
+ * \li stage: Value in the range [0, frame_exp); the stage at which the algorithm starts before descending deeper.
+ * \li u_num: Number of the bit currently being decoded.
+ * \li row: Row in the graph where decoding starts.
+ *
+ * \b Outputs
+ * \li llrs: The LLRs necessary for bit [u_num] to be decoded.
+ *
+ * \b Example
+ * \code
+ * int frame_exp = 10;
+ * int frame_size = 0x01 << frame_exp;
+ *
+ * float* llrs = (float*) volk_malloc(sizeof(float) * frame_size * (frame_exp + 1),
+ * volk_get_alignment()); unsigned char* u = (unsigned char*) volk_malloc(sizeof(unsigned
+ * char) * frame_size * (frame_exp + 1), volk_get_alignment());
+ *
+ * {some_function_to_write_encoded_bits_to_float_llrs(llrs + frame_size * frame_exp,
+ * data)};
+ *
+ * unsigned int u_num;
+ * for(u_num = 0; u_num < frame_size; u_num++){
+ * volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_size, frame_exp, 0, u_num,
+ * u_num);
+ * // next line could first search for frozen bit value and then do bit decision.
+ * u[u_num] = llrs[u_num] > 0 ? 0 : 1;
+ * }
+ *
+ * volk_free(llrs);
+ * volk_free(u);
+ * \endcode
+ */
+
+#ifndef VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
+#define VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_
+#include <math.h>
+#include <volk/volk_8u_x2_encodeframepolar_8u.h>
+
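+// Min-sum approximation of the polar 'f' (check node) update:
+// f(la, lb) = sign(la) * sign(lb) * min(|la|, |lb|).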
+static inline float llr_odd(const float la, const float lb)
+{
+ const float ala = fabsf(la);
+ const float alb = fabsf(lb);
+ return copysignf(1.0f, la) * copysignf(1.0f, lb) * (ala > alb ? alb : ala);
+}
+
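+// Applies the 'f' update stage by stage from stage (depth - 1) down to min_stage;
+// each pass halves the number of LLRs that feed the next lower stage.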
+static inline void llr_odd_stages(
+ float* llrs, int min_stage, const int depth, const int frame_size, const int row)
+{
+ int loop_stage = depth - 1;
+ float* dst_llr_ptr;
+ float* src_llr_ptr;
+ int stage_size = 0x01 << loop_stage;
+
+ int el;
+ while (min_stage <= loop_stage) {
+ dst_llr_ptr = llrs + loop_stage * frame_size + row;
+ src_llr_ptr = dst_llr_ptr + frame_size;
+ for (el = 0; el < stage_size; el++) {
+ *dst_llr_ptr++ = llr_odd(*src_llr_ptr, *(src_llr_ptr + 1));
+ src_llr_ptr += 2;
+ }
+
+ --loop_stage;
+ stage_size >>= 1;
+ }
+}
+
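+// Polar 'g' (variable node) update: g(la, lb, u) = lb + (1 - 2u) * la,
+// i.e. lb + la if the partial-sum bit u is 0 and lb - la if u is 1.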
+static inline float llr_even(const float la, const float lb, const unsigned char f)
+{
+ switch (f) {
+ case 0:
+ return lb + la;
+ default:
+ return lb - la;
+ }
+}
+
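+// Collects every second previously decoded bit (u[1], u[3], ...); these partial
+// sums feed the recursion for the lower branch of the butterfly.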
+static inline void
+even_u_values(unsigned char* u_even, const unsigned char* u, const int u_num)
+{
+ u++;
+ int i;
+ for (i = 1; i < u_num; i += 2) {
+ *u_even++ = *u;
+ u += 2;
+ }
+}
+
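+// XORs adjacent pairs of decoded bits (u[0]^u[1], u[2]^u[3], ...); these partial
+// sums feed the recursion for the upper branch of the butterfly.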
+static inline void
+odd_xor_even_values(unsigned char* u_xor, const unsigned char* u, const int u_num)
+{
+ int i;
+ for (i = 1; i < u_num; i += 2) {
+ *u_xor++ = *u ^ *(u + 1);
+ u += 2;
+ }
+}
+
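+// Returns the first stage at which this row falls into the lower half of its
+// butterfly block (all stages below it only need 'f' updates for this row),
+// capped at frame_exp - 1 because the last stage holds the received LLRs.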
+static inline int calculate_max_stage_depth_for_row(const int frame_exp, const int row)
+{
+ int max_stage_depth = 0;
+ int half_stage_size = 0x01;
+ int stage_size = half_stage_size << 1;
+ while (max_stage_depth < (frame_exp - 1)) { // last stage holds received values.
+ if (!(row % stage_size < half_stage_size)) {
+ break;
+ }
+ half_stage_size <<= 1;
+ stage_size <<= 1;
+ max_stage_depth++;
+ }
+ return max_stage_depth;
+}
+
+#ifdef LV_HAVE_GENERIC
+
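+// Recursive butterfly in natural bit order: a row in the lower half of its
+// butterfly block applies the 'g' update with the already available partial-sum
+// bit and returns; a row in the upper half first refreshes the two right-hand
+// LLRs recursively and then applies the 'f' update.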
+static inline void volk_32f_8u_polarbutterfly_32f_generic(float* llrs,
+ unsigned char* u,
+ const int frame_exp,
+ const int stage,
+ const int u_num,
+ const int row)
+{
+ const int frame_size = 0x01 << frame_exp;
+ const int next_stage = stage + 1;
+
+ const int half_stage_size = 0x01 << stage;
+ const int stage_size = half_stage_size << 1;
+
+ const bool is_upper_stage_half = row % stage_size < half_stage_size;
+
+ // this is a natural bit order implementation
+ float* next_llrs = llrs + frame_size; // LLRs are stored in a consecutive array.
+ float* call_row_llr = llrs + row;
+
+ const int section = row - (row % stage_size);
+ const int jump_size = ((row % half_stage_size) << 1) % stage_size;
+
+ const int next_upper_row = section + jump_size;
+ const int next_lower_row = next_upper_row + 1;
+
+ const float* upper_right_llr_ptr = next_llrs + next_upper_row;
+ const float* lower_right_llr_ptr = next_llrs + next_lower_row;
+
+ if (!is_upper_stage_half) {
+ const int u_pos = u_num >> stage;
+ const unsigned char f = u[u_pos - 1];
+ *call_row_llr = llr_even(*upper_right_llr_ptr, *lower_right_llr_ptr, f);
+ return;
+ }
+
+ if (frame_exp > next_stage) {
+ unsigned char* u_half = u + frame_size;
+ odd_xor_even_values(u_half, u, u_num);
+ volk_32f_8u_polarbutterfly_32f_generic(
+ next_llrs, u_half, frame_exp, next_stage, u_num, next_upper_row);
+
+ even_u_values(u_half, u, u_num);
+ volk_32f_8u_polarbutterfly_32f_generic(
+ next_llrs, u_half, frame_exp, next_stage, u_num, next_lower_row);
+ }
+
+ *call_row_llr = llr_odd(*upper_right_llr_ptr, *lower_right_llr_ptr);
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void volk_32f_8u_polarbutterfly_32f_u_avx(float* llrs,
+ unsigned char* u,
+ const int frame_exp,
+ const int stage,
+ const int u_num,
+ const int row)
+{
+ const int frame_size = 0x01 << frame_exp;
+ if (row % 2) { // for odd rows just do the only necessary calculation and return.
+ const float* next_llrs = llrs + frame_size + row;
+ *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
+ return;
+ }
+
+ const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
+ if (max_stage_depth < 3) { // vectorized version needs larger vectors.
+ volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
+ return;
+ }
+
+ int loop_stage = max_stage_depth;
+ int stage_size = 0x01 << loop_stage;
+
+ float* src_llr_ptr;
+ float* dst_llr_ptr;
+
+ __m256 src0, src1, dst;
+
+ if (row) { // not necessary for ZERO row. == first bit to be decoded.
+ // first do bit combination for all stages
+ // effectively encode some decoded bits again.
+ unsigned char* u_target = u + frame_size;
+ unsigned char* u_temp = u + 2 * frame_size;
+ memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);
+
+ if (stage_size > 15) {
+ volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
+ } else {
+ volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
+ }
+
+ src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
+ dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
+
+ __m128i fbits;
+
+ int p;
+ for (p = 0; p < stage_size; p += 8) {
+ fbits = _mm_loadu_si128((__m128i*)u_target);
+ u_target += 8;
+
+ src0 = _mm256_loadu_ps(src_llr_ptr);
+ src1 = _mm256_loadu_ps(src_llr_ptr + 8);
+ src_llr_ptr += 16;
+
+ dst = _mm256_polar_fsign_add_llrs(src0, src1, fbits);
+
+ _mm256_storeu_ps(dst_llr_ptr, dst);
+ dst_llr_ptr += 8;
+ }
+
+ --loop_stage;
+ stage_size >>= 1;
+ }
+
+ const int min_stage = stage > 2 ? stage : 2;
+
+ _mm256_zeroall(); // zero all YMM registers (avoids SSE/AVX transition penalties)
+
+ int el;
+ while (min_stage < loop_stage) {
+ dst_llr_ptr = llrs + loop_stage * frame_size + row;
+ src_llr_ptr = dst_llr_ptr + frame_size;
+ for (el = 0; el < stage_size; el += 8) {
+ src0 = _mm256_loadu_ps(src_llr_ptr);
+ src_llr_ptr += 8;
+ src1 = _mm256_loadu_ps(src_llr_ptr);
+ src_llr_ptr += 8;
+
+ dst = _mm256_polar_minsum_llrs(src0, src1);
+
+ _mm256_storeu_ps(dst_llr_ptr, dst);
+ dst_llr_ptr += 8;
+ }
+
+ --loop_stage;
+ stage_size >>= 1;
+ }
+
+ // for stages < 3 the vectors are too small; finish with the scalar routine.
+ llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
+}
+
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32f_8u_polarbutterfly_32f_u_avx2(float* llrs,
+ unsigned char* u,
+ const int frame_exp,
+ const int stage,
+ const int u_num,
+ const int row)
+{
+ const int frame_size = 0x01 << frame_exp;
+ if (row % 2) { // for odd rows just do the only necessary calculation and return.
+ const float* next_llrs = llrs + frame_size + row;
+ *(llrs + row) = llr_even(*(next_llrs - 1), *next_llrs, u[u_num - 1]);
+ return;
+ }
+
+ const int max_stage_depth = calculate_max_stage_depth_for_row(frame_exp, row);
+ if (max_stage_depth < 3) { // vectorized version needs larger vectors.
+ volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, stage, u_num, row);
+ return;
+ }
+
+ int loop_stage = max_stage_depth;
+ int stage_size = 0x01 << loop_stage;
+
+ float* src_llr_ptr;
+ float* dst_llr_ptr;
+
+ __m256 src0, src1, dst;
+
+ if (row) { // not necessary for ZERO row. == first bit to be decoded.
+ // first do bit combination for all stages
+ // effectively encode some decoded bits again.
+ unsigned char* u_target = u + frame_size;
+ unsigned char* u_temp = u + 2 * frame_size;
+ memcpy(u_temp, u + u_num - stage_size, sizeof(unsigned char) * stage_size);
+
+ if (stage_size > 15) {
+ volk_8u_x2_encodeframepolar_8u_u_ssse3(u_target, u_temp, stage_size);
+ } else {
+ volk_8u_x2_encodeframepolar_8u_generic(u_target, u_temp, stage_size);
+ }
+
+ src_llr_ptr = llrs + (max_stage_depth + 1) * frame_size + row - stage_size;
+ dst_llr_ptr = llrs + max_stage_depth * frame_size + row;
+
+ __m128i fbits;
+
+ int p;
+ for (p = 0; p < stage_size; p += 8) {
+ fbits = _mm_loadu_si128((__m128i*)u_target);
+ u_target += 8;
+
+ src0 = _mm256_loadu_ps(src_llr_ptr);
+ src1 = _mm256_loadu_ps(src_llr_ptr + 8);
+ src_llr_ptr += 16;
+
+ dst = _mm256_polar_fsign_add_llrs_avx2(src0, src1, fbits);
+
+ _mm256_storeu_ps(dst_llr_ptr, dst);
+ dst_llr_ptr += 8;
+ }
+
+ --loop_stage;
+ stage_size >>= 1;
+ }
+
+ const int min_stage = stage > 2 ? stage : 2;
+
+ _mm256_zeroall(); // zero all YMM registers (avoids SSE/AVX transition penalties)
+
+ int el;
+ while (min_stage < loop_stage) {
+ dst_llr_ptr = llrs + loop_stage * frame_size + row;
+ src_llr_ptr = dst_llr_ptr + frame_size;
+ for (el = 0; el < stage_size; el += 8) {
+ src0 = _mm256_loadu_ps(src_llr_ptr);
+ src_llr_ptr += 8;
+ src1 = _mm256_loadu_ps(src_llr_ptr);
+ src_llr_ptr += 8;
+
+ dst = _mm256_polar_minsum_llrs(src0, src1);
+
+ _mm256_storeu_ps(dst_llr_ptr, dst);
+ dst_llr_ptr += 8;
+ }
+
+ --loop_stage;
+ stage_size >>= 1;
+ }
+
+ // for stages < 3 the vectors are too small; finish with the scalar routine.
+ llr_odd_stages(llrs, stage, loop_stage + 1, frame_size, row);
+}
+
+#endif /* LV_HAVE_AVX2 */
+
+#endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLY_32F_H_ */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*
+ * This puppet is for VOLK tests only.
+ * For documentation see 'kernels/volk/volk_32f_8u_polarbutterfly_32f.h'
+ */
+
+#ifndef VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLYPUPPET_32F_H_
+#define VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLYPUPPET_32F_H_
+
+#include <volk/volk_32f_8u_polarbutterfly_32f.h>
+#include <volk/volk_8u_x3_encodepolar_8u_x2.h>
+#include <volk/volk_8u_x3_encodepolarpuppet_8u.h>
+
+
+static inline void sanitize_bytes(unsigned char* u, const int elements)
+{
+ int i;
+ unsigned char* u_ptr = u;
+ for (i = 0; i < elements; i++) {
+ *u_ptr = (*u_ptr & 0x01);
+ u_ptr++;
+ }
+}
+
+static inline void clean_up_intermediate_values(float* llrs,
+ unsigned char* u,
+ const int frame_size,
+ const int elements)
+{
+ memset(u + frame_size, 0, sizeof(unsigned char) * (elements - frame_size));
+ memset(llrs + frame_size, 0, sizeof(float) * (elements - frame_size));
+}
+
+static inline void
+generate_error_free_input_vector(float* llrs, unsigned char* u, const int frame_size)
+{
+ memset(u, 0, frame_size);
+ unsigned char* target = u + frame_size;
+ volk_8u_x2_encodeframepolar_8u_generic(target, u + 2 * frame_size, frame_size);
+ float* ft = llrs;
+ int i;
+ for (i = 0; i < frame_size; i++) {
+ *ft = (-2 * ((float)*target++)) + 1.0f;
+ ft++;
+ }
+}
+
+static inline void
+print_llr_tree(const float* llrs, const int frame_size, const int frame_exp)
+{
+ int s, e;
+ for (s = 0; s < frame_size; s++) {
+ for (e = 0; e < frame_exp + 1; e++) {
+ printf("%+4.2f ", llrs[e * frame_size + s]);
+ }
+ printf("\n");
+ if ((s + 1) % 8 == 0) {
+ printf("\n");
+ }
+ }
+}
+
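+// Test helper: shrink the frame to a power of two small enough that the
+// frame_size * (frame_exp + 1) LLR working buffer fits into the 'elements'-sized
+// test vectors (a conservative heuristic rather than an exact bound).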
+static inline int maximum_frame_size(const int elements)
+{
+ unsigned int frame_size = next_lower_power_of_two(elements);
+ unsigned int frame_exp = log2_of_power_of_2(frame_size);
+ return next_lower_power_of_two(frame_size / frame_exp);
+}
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32f_8u_polarbutterflypuppet_32f_generic(float* llrs,
+ const float* input,
+ unsigned char* u,
+ const int elements)
+{
+ unsigned int frame_size = maximum_frame_size(elements);
+ unsigned int frame_exp = log2_of_power_of_2(frame_size);
+
+ sanitize_bytes(u, elements);
+ clean_up_intermediate_values(llrs, u, frame_size, elements);
+ generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
+
+ unsigned int u_num = 0;
+ for (; u_num < frame_size; u_num++) {
+ volk_32f_8u_polarbutterfly_32f_generic(llrs, u, frame_exp, 0, u_num, u_num);
+ u[u_num] = llrs[u_num] > 0 ? 0 : 1;
+ }
+
+ clean_up_intermediate_values(llrs, u, frame_size, elements);
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_AVX
+static inline void volk_32f_8u_polarbutterflypuppet_32f_u_avx(float* llrs,
+ const float* input,
+ unsigned char* u,
+ const int elements)
+{
+ unsigned int frame_size = maximum_frame_size(elements);
+ unsigned int frame_exp = log2_of_power_of_2(frame_size);
+
+ sanitize_bytes(u, elements);
+ clean_up_intermediate_values(llrs, u, frame_size, elements);
+ generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
+
+ unsigned int u_num = 0;
+ for (; u_num < frame_size; u_num++) {
+ volk_32f_8u_polarbutterfly_32f_u_avx(llrs, u, frame_exp, 0, u_num, u_num);
+ u[u_num] = llrs[u_num] > 0 ? 0 : 1;
+ }
+
+ clean_up_intermediate_values(llrs, u, frame_size, elements);
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_AVX2
+static inline void volk_32f_8u_polarbutterflypuppet_32f_u_avx2(float* llrs,
+ const float* input,
+ unsigned char* u,
+ const int elements)
+{
+ unsigned int frame_size = maximum_frame_size(elements);
+ unsigned int frame_exp = log2_of_power_of_2(frame_size);
+
+ sanitize_bytes(u, elements);
+ clean_up_intermediate_values(llrs, u, frame_size, elements);
+ generate_error_free_input_vector(llrs + frame_exp * frame_size, u, frame_size);
+
+ unsigned int u_num = 0;
+ for (; u_num < frame_size; u_num++) {
+ volk_32f_8u_polarbutterfly_32f_u_avx2(llrs, u, frame_exp, 0, u_num, u_num);
+ u[u_num] = llrs[u_num] > 0 ? 0 : 1;
+ }
+
+ clean_up_intermediate_values(llrs, u, frame_size, elements);
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#endif /* VOLK_KERNELS_VOLK_VOLK_32F_8U_POLARBUTTERFLYPUPPET_32F_H_ */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_accumulator_s32f
+ *
+ * \b Overview
+ *
+ * Accumulates the values in the input buffer.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_accumulator_s32f(float* result, const float* inputBuffer, unsigned int
+ * num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputBuffer: The buffer of data to be accumulated.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li result: The accumulated result.
+ *
+ * \b Example
+ * Calculate the sum of numbers 0 through 99
+ * \code
+ * int N = 100;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float), alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * }
+ *
+ * volk_32f_accumulator_s32f(out, increasing, N);
+ *
+ * printf("sum(1..100) = %1.2f\n", out[0]);
+ *
+ * volk_free(increasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_accumulator_s32f_a_H
+#define INCLUDED_volk_32f_accumulator_s32f_a_H
+
+#include <inttypes.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_accumulator_s32f_a_avx(float* result,
+ const float* inputBuffer,
+ unsigned int num_points)
+{
+ float returnValue = 0;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
+
+ __m256 accumulator = _mm256_setzero_ps();
+ __m256 aVal = _mm256_setzero_ps();
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ accumulator = _mm256_add_ps(accumulator, aVal);
+ aPtr += 8;
+ }
+
+ _mm256_store_ps(tempBuffer, accumulator);
+
+ returnValue = tempBuffer[0];
+ returnValue += tempBuffer[1];
+ returnValue += tempBuffer[2];
+ returnValue += tempBuffer[3];
+ returnValue += tempBuffer[4];
+ returnValue += tempBuffer[5];
+ returnValue += tempBuffer[6];
+ returnValue += tempBuffer[7];
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_accumulator_s32f_u_avx(float* result,
+ const float* inputBuffer,
+ unsigned int num_points)
+{
+ float returnValue = 0;
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
+
+ __m256 accumulator = _mm256_setzero_ps();
+ __m256 aVal = _mm256_setzero_ps();
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ accumulator = _mm256_add_ps(accumulator, aVal);
+ aPtr += 8;
+ }
+
+ _mm256_store_ps(tempBuffer, accumulator);
+
+ returnValue = tempBuffer[0];
+ returnValue += tempBuffer[1];
+ returnValue += tempBuffer[2];
+ returnValue += tempBuffer[3];
+ returnValue += tempBuffer[4];
+ returnValue += tempBuffer[5];
+ returnValue += tempBuffer[6];
+ returnValue += tempBuffer[7];
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_accumulator_s32f_a_sse(float* result,
+ const float* inputBuffer,
+ unsigned int num_points)
+{
+ float returnValue = 0;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
+
+ __m128 accumulator = _mm_setzero_ps();
+ __m128 aVal = _mm_setzero_ps();
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ accumulator = _mm_add_ps(accumulator, aVal);
+ aPtr += 4;
+ }
+
+ _mm_store_ps(tempBuffer, accumulator);
+
+ returnValue = tempBuffer[0];
+ returnValue += tempBuffer[1];
+ returnValue += tempBuffer[2];
+ returnValue += tempBuffer[3];
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_accumulator_s32f_u_sse(float* result,
+ const float* inputBuffer,
+ unsigned int num_points)
+{
+ float returnValue = 0;
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
+
+ __m128 accumulator = _mm_setzero_ps();
+ __m128 aVal = _mm_setzero_ps();
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
+ accumulator = _mm_add_ps(accumulator, aVal);
+ aPtr += 4;
+ }
+
+ _mm_store_ps(tempBuffer, accumulator);
+
+ returnValue = tempBuffer[0];
+ returnValue += tempBuffer[1];
+ returnValue += tempBuffer[2];
+ returnValue += tempBuffer[3];
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32f_accumulator_s32f_generic(float* result,
+ const float* inputBuffer,
+ unsigned int num_points)
+{
+ const float* aPtr = inputBuffer;
+ unsigned int number = 0;
+ float returnValue = 0;
+
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_accumulator_s32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_acos_32f
+ *
+ * \b Overview
+ *
+ * Computes arccosine of the input vector and stores results in the output vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_acos_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li bVector: The vector where results will be stored.
+ *
+ * \b Example
+ * Calculate common angles around the top half of the unit circle.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * in[0] = 1;
+ * in[1] = std::sqrt(3.f)/2.f;
+ * in[2] = std::sqrt(2.f)/2.f;
+ * in[3] = 0.5;
+ * in[4] = in[5] = 0;
+ * for(unsigned int ii = 6; ii < N; ++ii){
+ * in[ii] = - in[N-ii-1];
+ * }
+ *
+ * volk_32f_acos_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("acos(%1.3f) = %1.3f\n", in[ii], out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+
+/* This is the number of terms of the Taylor series to evaluate; increase this for
+ * more accuracy. */
+#define ACOS_TERMS 2
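+
+/*
+ * Implementation note (an illustrative scalar sketch, not part of the VOLK API):
+ * the SIMD kernels below evaluate acos through the identity
+ * acos(a) = atan(sqrt(1 - a*a) / a), adding pi when a < 0, and approximate the
+ * arctangent with two half-angle reductions plus an ACOS_TERMS-term Taylor series.
+ * Roughly, for one input value a ("atan_core" is a stand-in for the
+ * reduction-and-series steps spelled out inside each kernel):
+ *
+ *   float t = sqrtf((1.f + a) * (1.f - a)) / a;  // tan(acos(a)), sign follows a
+ *   float r = atan_core(fabsf(t));               // see volk_32f_atan_32f for the core
+ *   if (t < 0.f) r = -r;
+ *   if (a < 0.f) r += 3.14159265358979323846f;   // acos(a) lies in (pi/2, pi]
+ */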
+
+#ifndef INCLUDED_volk_32f_acos_32f_a_H
+#define INCLUDED_volk_32f_acos_32f_a_H
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, d, pi, pio2, x, y, z, arccosine;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pi = _mm256_set1_ps(3.14159265358979323846);
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ d = aVal;
+ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+ _mm256_sub_ps(fones, aVal))),
+ aVal);
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++)
+ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ACOS_TERMS - 1; j >= 0; j--)
+ y = _mm256_fmadd_ps(
+ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+ arccosine = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arccosine = _mm256_sub_ps(
+ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
+ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
+ arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
+
+ _mm256_store_ps(bPtr, arccosine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = acos(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, d, pi, pio2, x, y, z, arccosine;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pi = _mm256_set1_ps(3.14159265358979323846);
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ d = aVal;
+ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+ _mm256_sub_ps(fones, aVal))),
+ aVal);
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++)
+ x = _mm256_add_ps(x,
+ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ACOS_TERMS - 1; j >= 0; j--)
+ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
+ _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(
+ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+ arccosine = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arccosine = _mm256_sub_ps(
+ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
+ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
+ arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
+
+ _mm256_store_ps(bPtr, arccosine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = acos(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX for aligned */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, d, pi, pio2, x, y, z, arccosine;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pi = _mm_set1_ps(3.14159265358979323846);
+ pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ d = aVal;
+ aVal = _mm_div_ps(
+ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
+ aVal);
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++)
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ACOS_TERMS - 1; j >= 0; j--)
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
+ _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+ arccosine = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arccosine =
+ _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
+ condition = _mm_cmplt_ps(d, fzeroes);
+ arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
+
+ _mm_store_ps(bPtr, arccosine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = acosf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#endif /* INCLUDED_volk_32f_acos_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_acos_32f_u_H
+#define INCLUDED_volk_32f_acos_32f_u_H
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, d, pi, pio2, x, y, z, arccosine;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pi = _mm256_set1_ps(3.14159265358979323846);
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ d = aVal;
+ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+ _mm256_sub_ps(fones, aVal))),
+ aVal);
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++)
+ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ACOS_TERMS - 1; j >= 0; j--)
+ y = _mm256_fmadd_ps(
+ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+ arccosine = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arccosine = _mm256_sub_ps(
+ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
+ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
+ arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
+
+ _mm256_storeu_ps(bPtr, arccosine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = acos(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, d, pi, pio2, x, y, z, arccosine;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pi = _mm256_set1_ps(3.14159265358979323846);
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ d = aVal;
+ aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+ _mm256_sub_ps(fones, aVal))),
+ aVal);
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++)
+ x = _mm256_add_ps(x,
+ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ACOS_TERMS - 1; j >= 0; j--)
+ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
+ _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(
+ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+ arccosine = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arccosine = _mm256_sub_ps(
+ arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
+ condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
+ arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
+
+ _mm256_storeu_ps(bPtr, arccosine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = acos(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX for unaligned */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, d, pi, pio2, x, y, z, arccosine;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pi = _mm_set1_ps(3.14159265358979323846);
+ pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
+ d = aVal;
+ aVal = _mm_div_ps(
+ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
+ aVal);
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++)
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+
+ for (j = ACOS_TERMS - 1; j >= 0; j--)
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
+ _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+ arccosine = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arccosine =
+ _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
+ condition = _mm_cmplt_ps(d, fzeroes);
+ arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
+
+ _mm_storeu_ps(bPtr, arccosine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = acosf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *bPtr++ = acosf(*aPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_acos_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_asin_32f
+ *
+ * \b Overview
+ *
+ * Computes arcsine of the input vector and stores results in the output vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_asin_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li bVector: The vector where results will be stored.
+ *
+ * \b Example
+ * Calculate common angles in the right half of the unit circle.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * in[0] = 0;
+ * in[1] = 0.5;
+ * in[2] = std::sqrt(2.f)/2.f;
+ * in[3] = std::sqrt(3.f)/2.f;
+ * in[4] = in[5] = 1;
+ * for(unsigned int ii = 6; ii < N; ++ii){
+ * in[ii] = - in[N-ii-1];
+ * }
+ *
+ * volk_32f_asin_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("asin(%1.3f) = %1.3f\n", in[ii], out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+
+/* This is the number of terms of the Taylor series to evaluate; increase this for
+ * more accuracy. */
+#define ASIN_TERMS 2
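+
+/*
+ * Implementation note (an illustrative scalar sketch, not part of the VOLK API):
+ * the SIMD kernels below evaluate asin through the identity
+ * asin(a) = atan(a / sqrt(1 - a*a)) and approximate the arctangent the same way
+ * the acos/atan kernels do, with an ASIN_TERMS-term Taylor series. Since asin
+ * already lies in [-pi/2, pi/2], only a sign fix-up is needed and, unlike acos,
+ * no pi correction. Roughly ("atan_core" is a stand-in for the in-kernel steps):
+ *
+ *   float t = a / sqrtf((1.f + a) * (1.f - a));  // tan(asin(a))
+ *   float r = atan_core(fabsf(t));
+ *   if (t < 0.f) r = -r;
+ */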
+
+#ifndef INCLUDED_volk_32f_asin_32f_a_H
+#define INCLUDED_volk_32f_asin_32f_a_H
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32f_asin_32f_a_avx2_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, pio2, x, y, z, arcsine;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ aVal = _mm256_div_ps(aVal,
+ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+ _mm256_sub_ps(fones, aVal))));
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+ }
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ASIN_TERMS - 1; j >= 0; j--) {
+ y = _mm256_fmadd_ps(
+ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+ arcsine = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arcsine = _mm256_sub_ps(arcsine,
+ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
+
+ _mm256_store_ps(bPtr, arcsine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = asin(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_asin_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, pio2, x, y, z, arcsine;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ aVal = _mm256_div_ps(aVal,
+ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+ _mm256_sub_ps(fones, aVal))));
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm256_add_ps(x,
+ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+ }
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ASIN_TERMS - 1; j >= 0; j--) {
+ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
+ _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(
+ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+ arcsine = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arcsine = _mm256_sub_ps(arcsine,
+ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
+
+ _mm256_store_ps(bPtr, arcsine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = asin(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX for aligned */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_asin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, pio2, x, y, z, arcsine;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ aVal = _mm_div_ps(
+ aVal,
+ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ }
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ASIN_TERMS - 1; j >= 0; j--) {
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
+ _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+ arcsine = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
+
+ _mm_store_ps(bPtr, arcsine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = asinf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#endif /* INCLUDED_volk_32f_asin_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_asin_32f_u_H
+#define INCLUDED_volk_32f_asin_32f_u_H
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32f_asin_32f_u_avx2_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, pio2, x, y, z, arcsine;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ aVal = _mm256_div_ps(aVal,
+ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+ _mm256_sub_ps(fones, aVal))));
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+ }
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ASIN_TERMS - 1; j >= 0; j--) {
+ y = _mm256_fmadd_ps(
+ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+ arcsine = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arcsine = _mm256_sub_ps(arcsine,
+ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
+
+ _mm256_storeu_ps(bPtr, arcsine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = asin(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_asin_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, pio2, x, y, z, arcsine;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ aVal = _mm256_div_ps(aVal,
+ _mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
+ _mm256_sub_ps(fones, aVal))));
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm256_add_ps(x,
+ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+ }
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ASIN_TERMS - 1; j >= 0; j--) {
+ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
+ _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(
+ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+ arcsine = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arcsine = _mm256_sub_ps(arcsine,
+ _mm256_and_ps(_mm256_mul_ps(arcsine, ftwos), condition));
+
+ _mm256_storeu_ps(bPtr, arcsine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = asin(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX for unaligned */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_asin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, pio2, x, y, z, arcsine;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
+ aVal = _mm_div_ps(
+ aVal,
+ _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ }
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for (j = ASIN_TERMS - 1; j >= 0; j--) {
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
+ _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+ arcsine = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos), condition));
+
+ _mm_storeu_ps(bPtr, arcsine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = asinf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_asin_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *bPtr++ = asinf(*aPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_asin_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_atan_32f
+ *
+ * \b Overview
+ *
+ * Computes arctangent of the input vector and stores results in the output vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_atan_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li bVector: The vector where results will be stored.
+ *
+ * \b Example
+ * Calculate common angles in the right half of the unit circle.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * in[0] = 0.f;
+ * in[1] = 1.f/std::sqrt(3.f);
+ * in[2] = 1.f;
+ * in[3] = std::sqrt(3.f);
+ * in[4] = in[5] = 1e99;
+ * for(unsigned int ii = 6; ii < N; ++ii){
+ * in[ii] = - in[N-ii-1];
+ * }
+ *
+ * volk_32f_atan_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("atan(%1.3f) = %1.3f\n", in[ii], out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+
+/* This is the number of terms of the Taylor series to evaluate; increase this for
+ * more accuracy. */
+#define TERMS 2
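+
+/*
+ * Implementation note (a scalar model of the approximation used by the SIMD
+ * kernels below; illustrative only, not part of the VOLK API). The argument is
+ * folded into [1, inf), reduced twice with x -> x + sqrt(1 + x*x), and the
+ * remaining small angle is evaluated with a TERMS-term Taylor series:
+ *
+ *   float z = fabsf(a);
+ *   float x = (z < 1.f) ? 1.f / z : z;        // fold so that x >= 1
+ *   x = x + sqrtf(1.f + x * x);               // after two of these steps and a
+ *   x = x + sqrtf(1.f + x * x);               // reciprocal, 4*atan(x) == pi/2 - atan(fold)
+ *   x = 1.f / x;
+ *   float y = 0.f;
+ *   for (int j = TERMS - 1; j >= 0; j--)      // truncated Taylor series of atan
+ *       y = y * (x * x) + powf(-1.f, j) / (2 * j + 1);
+ *   y *= x * 4.f;                             // y ~= atan(1 / fold)
+ *   if (z > 1.f)
+ *       y = 3.14159265358979323846f / 2 - y;  // atan(z) = pi/2 - atan(1/z)
+ *   if (a < 0.f)
+ *       y = -y;                               // atan is odd
+ */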
+
+#ifndef INCLUDED_volk_32f_atan_32f_a_H
+#define INCLUDED_volk_32f_atan_32f_a_H
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32f_atan_32f_a_avx2_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, pio2, x, y, z, arctangent;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+ }
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = TERMS - 1; j >= 0; j--) {
+ y = _mm256_fmadd_ps(
+ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+ arctangent = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arctangent = _mm256_sub_ps(
+ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
+
+ _mm256_store_ps(bPtr, arctangent);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = atan(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, pio2, x, y, z, arctangent;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm256_add_ps(x,
+ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+ }
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = TERMS - 1; j >= 0; j--) {
+ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
+ _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(
+ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+ arctangent = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arctangent = _mm256_sub_ps(
+ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
+
+ _mm256_store_ps(bPtr, arctangent);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = atan(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX for aligned */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, pio2, x, y, z, arctangent;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ }
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for (j = TERMS - 1; j >= 0; j--) {
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
+ _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+ arctangent = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arctangent =
+ _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
+
+ _mm_store_ps(bPtr, arctangent);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = atanf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#endif /* INCLUDED_volk_32f_atan_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_atan_32f_u_H
+#define INCLUDED_volk_32f_atan_32f_u_H
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32f_atan_32f_u_avx2_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, pio2, x, y, z, arctangent;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
+ }
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = TERMS - 1; j >= 0; j--) {
+ y = _mm256_fmadd_ps(
+ y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
+ arctangent = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arctangent = _mm256_sub_ps(
+ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
+
+ _mm256_storeu_ps(bPtr, arctangent);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = atan(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ int i, j;
+
+ __m256 aVal, pio2, x, y, z, arctangent;
+ __m256 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm256_setzero_ps();
+ fones = _mm256_set1_ps(1.0);
+ ftwos = _mm256_set1_ps(2.0);
+ ffours = _mm256_set1_ps(4.0);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ z = aVal;
+ condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
+ z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
+ condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
+ x = _mm256_add_ps(
+ z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++) {
+ x = _mm256_add_ps(x,
+ _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
+ }
+ x = _mm256_div_ps(fones, x);
+ y = fzeroes;
+ for (j = TERMS - 1; j >= 0; j--) {
+ y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
+ _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
+ }
+
+ y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
+ condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
+
+ y = _mm256_add_ps(
+ y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
+ arctangent = y;
+ condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
+ arctangent = _mm256_sub_ps(
+ arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
+
+ _mm256_storeu_ps(bPtr, arctangent);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = atan(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX for unaligned */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, pio2, x, y, z, arctangent;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
+
+ for (i = 0; i < 2; i++)
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for (j = TERMS - 1; j >= 0; j--)
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
+ _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
+ arctangent = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arctangent =
+ _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
+
+ _mm_storeu_ps(bPtr, arctangent);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = atanf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_atan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *bPtr++ = atanf(*aPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_atan_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_binary_slicer_32i
+ *
+ * \b Overview
+ *
+ * Slices the input floats and returns 1 when the input is >= 0 and 0
+ * when it is < 0. Results are returned as 32-bit ints.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_binary_slicer_32i(int* cVector, const float* aVector, unsigned int
+ * num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector of 32-bit ints.
+ *
+ * \b Example
+ * Generate ints of a 7-bit Barker code from floats.
+ * \code
+ * int N = 7;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * int32_t* out = (int32_t*)volk_malloc(sizeof(int32_t)*N, alignment);
+ *
+ * in[0] = 0.9f;
+ * in[1] = 1.1f;
+ * in[2] = 0.4f;
+ * in[3] = -0.7f;
+ * in[4] = -1.2f;
+ * in[5] = 0.2f;
+ * in[6] = -0.8f;
+ *
+ * volk_32f_binary_slicer_32i(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %i\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_binary_slicer_32i_H
+#define INCLUDED_volk_32f_binary_slicer_32i_H
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_binary_slicer_32i_generic(int* cVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ int* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_binary_slicer_32i_generic_branchless(int* cVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ int* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++ >= 0);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32f_binary_slicer_32i_a_sse2(int* cVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ int* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ unsigned int quarter_points = num_points / 4;
+ __m128 a_val, res_f;
+ __m128i res_i, binary_i;
+ __m128 zero_val;
+ zero_val = _mm_set1_ps(0.0f);
+
+ for (number = 0; number < quarter_points; number++) {
+ a_val = _mm_load_ps(aPtr);
+
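+ // The >= comparison yields an all-ones (0xFFFFFFFF) mask per lane; cvtps_epi32
+ // turns that NaN bit pattern into 0x80000000, so a logical right shift by 31
+ // leaves exactly 1 for true lanes and 0 for false ones.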
+ res_f = _mm_cmpge_ps(a_val, zero_val);
+ res_i = _mm_cvtps_epi32(res_f);
+ binary_i = _mm_srli_epi32(res_i, 31);
+
+ _mm_store_si128((__m128i*)cPtr, binary_i);
+
+ cPtr += 4;
+ aPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_binary_slicer_32i_a_avx(int* cVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ int* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ unsigned int quarter_points = num_points / 8;
+ __m256 a_val, res_f, binary_f;
+ __m256i binary_i;
+ __m256 zero_val, one_val;
+ zero_val = _mm256_set1_ps(0.0f);
+ one_val = _mm256_set1_ps(1.0f);
+
+ for (number = 0; number < quarter_points; number++) {
+ a_val = _mm256_load_ps(aPtr);
+
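+ // Here the GE mask is ANDed with 1.0f so every lane holds 1.0f or 0.0f,
+ // which cvtps_epi32 then converts to the final 1/0 integers.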
+ res_f = _mm256_cmp_ps(a_val, zero_val, _CMP_GE_OS);
+ binary_f = _mm256_and_ps(res_f, one_val);
+ binary_i = _mm256_cvtps_epi32(binary_f);
+
+ _mm256_store_si256((__m256i*)cPtr, binary_i);
+
+ cPtr += 8;
+ aPtr += 8;
+ }
+
+ for (number = quarter_points * 8; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32f_binary_slicer_32i_u_sse2(int* cVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ int* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ unsigned int quarter_points = num_points / 4;
+ __m128 a_val, res_f;
+ __m128i res_i, binary_i;
+ __m128 zero_val;
+ zero_val = _mm_set1_ps(0.0f);
+
+ for (number = 0; number < quarter_points; number++) {
+ a_val = _mm_loadu_ps(aPtr);
+
+ res_f = _mm_cmpge_ps(a_val, zero_val);
+ res_i = _mm_cvtps_epi32(res_f);
+ binary_i = _mm_srli_epi32(res_i, 31);
+
+ _mm_storeu_si128((__m128i*)cPtr, binary_i);
+
+ cPtr += 4;
+ aPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_binary_slicer_32i_u_avx(int* cVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ int* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ unsigned int quarter_points = num_points / 8;
+ __m256 a_val, res_f, binary_f;
+ __m256i binary_i;
+ __m256 zero_val, one_val;
+ zero_val = _mm256_set1_ps(0.0f);
+ one_val = _mm256_set1_ps(1.0f);
+
+ for (number = 0; number < quarter_points; number++) {
+ a_val = _mm256_loadu_ps(aPtr);
+
+ res_f = _mm256_cmp_ps(a_val, zero_val, _CMP_GE_OS);
+ binary_f = _mm256_and_ps(res_f, one_val);
+ binary_i = _mm256_cvtps_epi32(binary_f);
+
+ _mm256_storeu_si256((__m256i*)cPtr, binary_i);
+
+ cPtr += 8;
+ aPtr += 8;
+ }
+
+ for (number = quarter_points * 8; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#endif /* INCLUDED_volk_32f_binary_slicer_32i_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_binary_slicer_8i
+ *
+ * \b Overview
+ *
+ * Slices the input floats and returns 1 when the input is >= 0 and 0
+ * when it is < 0. Results are converted to 8-bit chars.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_binary_slicer_8i(int8_t* cVector, const float* aVector, unsigned int
+ * num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector of 8-bit chars.
+ *
+ * \b Example
+ * Generate bytes of a 7-bit Barker code from floats.
+ * \code
+ * int N = 7;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * int8_t* out = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment);
+ *
+ * in[0] = 0.9f;
+ * in[1] = 1.1f;
+ * in[2] = 0.4f;
+ * in[3] = -0.7f;
+ * in[4] = -1.2f;
+ * in[5] = 0.2f;
+ * in[6] = -0.8f;
+ *
+ * volk_32f_binary_slicer_8i(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %i\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_binary_slicer_8i_H
+#define INCLUDED_volk_32f_binary_slicer_8i_H
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_binary_slicer_8i_generic(int8_t* cVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_binary_slicer_8i_generic_branchless(int8_t* cVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++ >= 0);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32f_binary_slicer_8i_a_avx2(int8_t* cVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+ unsigned int n32points = num_points / 32;
+
+ const __m256 zero_val = _mm256_set1_ps(0.0f);
+ __m256 a0_val, a1_val, a2_val, a3_val;
+ __m256 res0_f, res1_f, res2_f, res3_f;
+ __m256i res0_i, res1_i, res2_i, res3_i;
+ __m256i byte_shuffle = _mm256_set_epi8(15,
+ 14,
+ 13,
+ 12,
+ 7,
+ 6,
+ 5,
+ 4,
+ 11,
+ 10,
+ 9,
+ 8,
+ 3,
+ 2,
+ 1,
+ 0,
+ 15,
+ 14,
+ 13,
+ 12,
+ 7,
+ 6,
+ 5,
+ 4,
+ 11,
+ 10,
+ 9,
+ 8,
+ 3,
+ 2,
+ 1,
+ 0);
+
+ for (number = 0; number < n32points; number++) {
+ a0_val = _mm256_load_ps(aPtr);
+ a1_val = _mm256_load_ps(aPtr + 8);
+ a2_val = _mm256_load_ps(aPtr + 16);
+ a3_val = _mm256_load_ps(aPtr + 24);
+
+ // compare >= 0; return float
+ res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
+ res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
+ res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
+ res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);
+
+ // convert to 32i and >> 31
+ res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
+ res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
+ res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
+ res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
+
+ // pack into 16-bit results
+ res0_i = _mm256_packs_epi32(res0_i, res1_i);
+ res2_i = _mm256_packs_epi32(res2_i, res3_i);
+ // pack into 8-bit results
+ // res0: (after packs_epi32)
+ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+ // res2:
+ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+ res0_i = _mm256_packs_epi16(res0_i, res2_i);
+ // shuffle the lanes
+ // res0: (after packs_epi16)
+ // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
+ // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
+ // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
+ res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
+
+ // shuffle bytes within lanes
+ // res0: (after shuffle_epi8)
+ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+ res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
+
+ _mm256_store_si256((__m256i*)cPtr, res0_i);
+ aPtr += 32;
+ cPtr += 32;
+ }
+
+ for (number = n32points * 32; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32f_binary_slicer_8i_u_avx2(int8_t* cVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+ unsigned int n32points = num_points / 32;
+
+ const __m256 zero_val = _mm256_set1_ps(0.0f);
+ __m256 a0_val, a1_val, a2_val, a3_val;
+ __m256 res0_f, res1_f, res2_f, res3_f;
+ __m256i res0_i, res1_i, res2_i, res3_i;
+ __m256i byte_shuffle = _mm256_set_epi8(15,
+ 14,
+ 13,
+ 12,
+ 7,
+ 6,
+ 5,
+ 4,
+ 11,
+ 10,
+ 9,
+ 8,
+ 3,
+ 2,
+ 1,
+ 0,
+ 15,
+ 14,
+ 13,
+ 12,
+ 7,
+ 6,
+ 5,
+ 4,
+ 11,
+ 10,
+ 9,
+ 8,
+ 3,
+ 2,
+ 1,
+ 0);
+
+ for (number = 0; number < n32points; number++) {
+ a0_val = _mm256_loadu_ps(aPtr);
+ a1_val = _mm256_loadu_ps(aPtr + 8);
+ a2_val = _mm256_loadu_ps(aPtr + 16);
+ a3_val = _mm256_loadu_ps(aPtr + 24);
+
+ // compare >= 0; return float
+ res0_f = _mm256_cmp_ps(a0_val, zero_val, _CMP_GE_OS);
+ res1_f = _mm256_cmp_ps(a1_val, zero_val, _CMP_GE_OS);
+ res2_f = _mm256_cmp_ps(a2_val, zero_val, _CMP_GE_OS);
+ res3_f = _mm256_cmp_ps(a3_val, zero_val, _CMP_GE_OS);
+
+ // convert to 32i and >> 31
+ res0_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res0_f), 31);
+ res1_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res1_f), 31);
+ res2_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res2_f), 31);
+ res3_i = _mm256_srli_epi32(_mm256_cvtps_epi32(res3_f), 31);
+
+ // pack into 16-bit results
+ res0_i = _mm256_packs_epi32(res0_i, res1_i);
+ res2_i = _mm256_packs_epi32(res2_i, res3_i);
+ // pack into 8-bit results
+ // res0: (after packs_epi32)
+ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+ // res2:
+ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+ res0_i = _mm256_packs_epi16(res0_i, res2_i);
+ // shuffle the lanes
+ // res0: (after packs_epi16)
+ // a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3
+ // a4, a5, a6, a7, b4, b5, b6, b7, c4, c5, c6, c7, d4, d5, d6, d7
+ // 0, 2, 1, 3 -> 11 01 10 00 (0xd8)
+ res0_i = _mm256_permute4x64_epi64(res0_i, 0xd8);
+
+ // shuffle bytes within lanes
+ // res0: (after shuffle_epi8)
+ // a0, a1, a2, a3, b0, b1, b2, b3, a4, a5, a6, a7, b4, b5, b6, b7
+ // c0, c1, c2, c3, d0, d1, d2, d3, c4, c5, c6, c7, d4, d5, d6, d7
+ res0_i = _mm256_shuffle_epi8(res0_i, byte_shuffle);
+
+ _mm256_storeu_si256((__m256i*)cPtr, res0_i);
+ aPtr += 32;
+ cPtr += 32;
+ }
+
+ for (number = n32points * 32; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif
+
+
+#ifdef LV_HAVE_SSE2
+
+#include <emmintrin.h>
+
+static inline void volk_32f_binary_slicer_8i_a_sse2(int8_t* cVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ unsigned int n16points = num_points / 16;
+ __m128 a0_val, a1_val, a2_val, a3_val;
+ __m128 res0_f, res1_f, res2_f, res3_f;
+ __m128i res0_i, res1_i, res2_i, res3_i;
+ __m128 zero_val;
+ zero_val = _mm_set1_ps(0.0f);
+
+ for (number = 0; number < n16points; number++) {
+ a0_val = _mm_load_ps(aPtr);
+ a1_val = _mm_load_ps(aPtr + 4);
+ a2_val = _mm_load_ps(aPtr + 8);
+ a3_val = _mm_load_ps(aPtr + 12);
+
+ // compare >= 0; return float
+ res0_f = _mm_cmpge_ps(a0_val, zero_val);
+ res1_f = _mm_cmpge_ps(a1_val, zero_val);
+ res2_f = _mm_cmpge_ps(a2_val, zero_val);
+ res3_f = _mm_cmpge_ps(a3_val, zero_val);
+
+ // convert to 32i and >> 31
+ res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
+ res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
+ res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
+ res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
+
+ // pack into 16-bit results
+ res0_i = _mm_packs_epi32(res0_i, res1_i);
+ res2_i = _mm_packs_epi32(res2_i, res3_i);
+
+ // pack into 8-bit results
+ res0_i = _mm_packs_epi16(res0_i, res2_i);
+
+ _mm_store_si128((__m128i*)cPtr, res0_i);
+
+ cPtr += 16;
+ aPtr += 16;
+ }
+
+ for (number = n16points * 16; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ unsigned int n16points = num_points / 16;
+ __m128 a0_val, a1_val, a2_val, a3_val;
+ __m128 res0_f, res1_f, res2_f, res3_f;
+ __m128i res0_i, res1_i, res2_i, res3_i;
+ __m128 zero_val;
+ zero_val = _mm_set1_ps(0.0f);
+
+ for (number = 0; number < n16points; number++) {
+ a0_val = _mm_loadu_ps(aPtr);
+ a1_val = _mm_loadu_ps(aPtr + 4);
+ a2_val = _mm_loadu_ps(aPtr + 8);
+ a3_val = _mm_loadu_ps(aPtr + 12);
+
+ // compare >= 0; return float
+ res0_f = _mm_cmpge_ps(a0_val, zero_val);
+ res1_f = _mm_cmpge_ps(a1_val, zero_val);
+ res2_f = _mm_cmpge_ps(a2_val, zero_val);
+ res3_f = _mm_cmpge_ps(a3_val, zero_val);
+
+ // convert to 32i and >> 31
+ res0_i = _mm_srli_epi32(_mm_cvtps_epi32(res0_f), 31);
+ res1_i = _mm_srli_epi32(_mm_cvtps_epi32(res1_f), 31);
+ res2_i = _mm_srli_epi32(_mm_cvtps_epi32(res2_f), 31);
+ res3_i = _mm_srli_epi32(_mm_cvtps_epi32(res3_f), 31);
+
+ // pack into 16-bit results
+ res0_i = _mm_packs_epi32(res0_i, res1_i);
+ res2_i = _mm_packs_epi32(res2_i, res3_i);
+
+ // pack into 8-bit results
+ res0_i = _mm_packs_epi16(res0_i, res2_i);
+
+ _mm_storeu_si128((__m128i*)cPtr, res0_i);
+
+ cPtr += 16;
+ aPtr += 16;
+ }
+
+ for (number = n16points * 16; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32f_binary_slicer_8i_neon(int8_t* cVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+ unsigned int n16points = num_points / 16;
+
+ float32x4x2_t input_val0, input_val1;
+ float32x4_t zero_val;
+ uint32x4x2_t res0_u32, res1_u32;
+ uint16x4x2_t res0_u16x4, res1_u16x4;
+ uint16x8x2_t res_u16x8;
+ uint8x8x2_t res_u8;
+ uint8x8_t one;
+
+ zero_val = vdupq_n_f32(0.0);
+ one = vdup_n_u8(0x01);
+
+ // TODO: this is a good candidate for asm because the vcombines
+ // can be eliminated simply by picking dst registers that are
+ // adjacent.
+ for (number = 0; number < n16points; number++) {
+ input_val0 = vld2q_f32(aPtr);
+ input_val1 = vld2q_f32(aPtr + 8);
+
+ // test against 0; return uint32
+ res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
+ res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
+ res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
+ res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);
+
+ // narrow uint32 -> uint16 followed by combine to 8-element vectors
+ res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
+ res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
+ res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
+ res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);
+
+ res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
+ res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);
+
+ // narrow uint16x8 -> uint8x8
+ res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
+ res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
+        // We *could* load twice as much data and do another vcombine here to
+        // get a uint8x16x2 vector, still doing only 2 vandqs and a single
+        // store, but that turns out to be ~16% slower than this version on a
+        // zc702. Register contention in the GCC scheduler possibly slows it
+        // down; hand-written asm with quad-word u8 registers is much faster.
+
+ res_u8.val[0] = vand_u8(one, res_u8.val[0]);
+ res_u8.val[1] = vand_u8(one, res_u8.val[1]);
+
+ vst2_u8((unsigned char*)cPtr, res_u8);
+ cPtr += 16;
+ aPtr += 16;
+ }
+
+ for (number = n16points * 16; number < num_points; number++) {
+ if (*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ } else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#endif /* INCLUDED_volk_32f_binary_slicer_8i_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_convert_64f
+ *
+ * \b Overview
+ *
+ * Converts float values into doubles.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_convert_64f(double* outputVector, const float* inputVector,
+ *                           unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The vector of floats to convert to doubles.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li outputVector: returns the converted doubles.
+ *
+ * \b Example
+ * Generate floats and convert them to doubles.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * double* out = (double*)volk_malloc(sizeof(double)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * in[ii] = (float)ii;
+ * }
+ *
+ * volk_32f_convert_64f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %g\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+
+#ifndef INCLUDED_volk_32f_convert_64f_u_H
+#define INCLUDED_volk_32f_convert_64f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_convert_64f_u_avx(double* outputVector,
+ const float* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m256d ret;
+ __m128 inputVal;
+
+ for (; number < quarterPoints; number++) {
+ inputVal = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm256_cvtps_pd(inputVal);
+ _mm256_storeu_pd(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = (double)(inputVector[number]);
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32f_convert_64f_u_sse2(double* outputVector,
+ const float* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m128d ret;
+ __m128 inputVal;
+
+ for (; number < quarterPoints; number++) {
+ inputVal = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_cvtps_pd(inputVal);
+
+ _mm_storeu_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+
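+        // Move the upper two floats to the lower half so the next
+        // _mm_cvtps_pd converts input elements 2 and 3.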
+ inputVal = _mm_movehl_ps(inputVal, inputVal);
+
+ ret = _mm_cvtps_pd(inputVal);
+
+ _mm_storeu_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = (double)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_convert_64f_generic(double* outputVector,
+ const float* inputVector,
+ unsigned int num_points)
+{
+ double* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((double)(*inputVectorPtr++));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_convert_64f_u_H */
+
+
+#ifndef INCLUDED_volk_32f_convert_64f_a_H
+#define INCLUDED_volk_32f_convert_64f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_convert_64f_a_avx(double* outputVector,
+ const float* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m256d ret;
+ __m128 inputVal;
+
+ for (; number < quarterPoints; number++) {
+ inputVal = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm256_cvtps_pd(inputVal);
+ _mm256_store_pd(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = (double)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32f_convert_64f_a_sse2(double* outputVector,
+ const float* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m128d ret;
+ __m128 inputVal;
+
+ for (; number < quarterPoints; number++) {
+ inputVal = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_cvtps_pd(inputVal);
+
+ _mm_store_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+
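+        // Move the upper two floats to the lower half so the next
+        // _mm_cvtps_pd converts input elements 2 and 3.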
+ inputVal = _mm_movehl_ps(inputVal, inputVal);
+
+ ret = _mm_cvtps_pd(inputVal);
+
+ _mm_store_pd(outputVectorPtr, ret);
+ outputVectorPtr += 2;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = (double)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_convert_64f_a_generic(double* outputVector,
+ const float* inputVector,
+ unsigned int num_points)
+{
+ double* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((double)(*inputVectorPtr++));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_convert_64f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_cos_32f
+ *
+ * \b Overview
+ *
+ * Computes the cosine of each entry of the input vector and stores the
+ * results in the output vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_cos_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li bVector: The vector where results will be stored.
+ *
+ * \b Example
+ * Calculate cos(theta) for common angles.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * in[0] = 0.000;
+ * in[1] = 0.524;
+ * in[2] = 0.785;
+ * in[3] = 1.047;
+ * in[4] = 1.571;
+ * in[5] = 1.571;
+ * in[6] = 2.094;
+ * in[7] = 2.356;
+ * in[8] = 2.618;
+ * in[9] = 3.142;
+ *
+ * volk_32f_cos_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("cos(%1.3f) = %1.3f\n", in[ii], out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+
+#ifndef INCLUDED_volk_32f_cos_32f_a_H
+#define INCLUDED_volk_32f_cos_32f_a_H
+
+#ifdef LV_HAVE_AVX512F
+
+#include <immintrin.h>
+static inline void volk_32f_cos_32f_a_avx512f(float* cosVector,
+ const float* inVector,
+ unsigned int num_points)
+{
+ float* cosPtr = cosVector;
+ const float* inPtr = inVector;
+
+ unsigned int number = 0;
+ unsigned int sixteenPoints = num_points / 16;
+ unsigned int i = 0;
+
+ __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
+ fones, sine, cosine;
+ __m512i q, zeros, ones, twos, fours;
+
+ m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
+ pio4A = _mm512_set1_ps(0.7853981554508209228515625);
+ pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
+ pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+ ffours = _mm512_set1_ps(4.0);
+ ftwos = _mm512_set1_ps(2.0);
+ fones = _mm512_set1_ps(1.0);
+ zeros = _mm512_setzero_epi32();
+ ones = _mm512_set1_epi32(1);
+ twos = _mm512_set1_epi32(2);
+ fours = _mm512_set1_epi32(4);
+
+ cp1 = _mm512_set1_ps(1.0);
+ cp2 = _mm512_set1_ps(0.08333333333333333);
+ cp3 = _mm512_set1_ps(0.002777777777777778);
+ cp4 = _mm512_set1_ps(4.96031746031746e-05);
+ cp5 = _mm512_set1_ps(5.511463844797178e-07);
+ __mmask16 condition1, condition2;
+
+ for (; number < sixteenPoints; number++) {
+ aVal = _mm512_load_ps(inPtr);
+ // s = fabs(aVal)
+        s = _mm512_castsi512_ps(
+            _mm512_and_si512(_mm512_castps_si512(aVal), _mm512_set1_epi32(0x7fffffff)));
+
+ // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+ q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
+        // r = q + q&1; q indicates the quadrant, r rounds q up to an even value
+ r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
+
+ s = _mm512_fnmadd_ps(r, pio4A, s);
+ s = _mm512_fnmadd_ps(r, pio4B, s);
+ s = _mm512_fnmadd_ps(r, pio4C, s);
+
+ s = _mm512_div_ps(
+ s,
+ _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm512_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm512_mul_ps(
+ _mm512_fmadd_ps(
+ _mm512_fmsub_ps(
+ _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+ s,
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++)
+ s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
+ s = _mm512_div_ps(s, ftwos);
+
+ sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
+ cosine = _mm512_sub_ps(fones, s);
+
+ // if(((q+1)&2) != 0) { cosine=sine;}
+ condition1 = _mm512_cmpneq_epi32_mask(
+ _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
+
+ // if(((q+2)&4) != 0) { cosine = -cosine;}
+ condition2 = _mm512_cmpneq_epi32_mask(
+ _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
+ cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
+ cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
+ _mm512_store_ps(cosPtr, cosine);
+ inPtr += 16;
+ cosPtr += 16;
+ }
+
+ number = sixteenPoints * 16;
+ for (; number < num_points; number++) {
+ *cosPtr++ = cosf(*inPtr++);
+ }
+}
+#endif
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void
+volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
+ fones, fzeroes;
+ __m256 sine, cosine;
+ __m256i q, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
+ pio4A = _mm256_set1_ps(0.7853981554508209228515625);
+ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
+ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ __m256i zeroes = _mm256_set1_epi32(0);
+ ones = _mm256_set1_epi32(1);
+ __m256i allones = _mm256_set1_epi32(0xffffffff);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.08333333333333333);
+ cp3 = _mm256_set1_ps(0.002777777777777778);
+ cp4 = _mm256_set1_ps(4.96031746031746e-05);
+ cp5 = _mm256_set1_ps(5.511463844797178e-07);
+ union bit256 condition1;
+ union bit256 condition3;
+
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_load_ps(aPtr);
+ // s = fabs(aVal)
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+        // r = q + q&1; q indicates the quadrant, r rounds q up to an even value
+ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
+
+ s = _mm256_fnmadd_ps(r, pio4A, s);
+ s = _mm256_fnmadd_ps(r, pio4B, s);
+ s = _mm256_fnmadd_ps(r, pio4C, s);
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_fmadd_ps(
+ _mm256_fmsub_ps(
+ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+ s,
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++)
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ // if(((q+1)&2) != 0) { cosine=sine;}
+ condition1.int_vec =
+ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
+ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
+
+ // if(((q+2)&4) != 0) { cosine = -cosine;}
+ condition3.int_vec = _mm256_cmpeq_epi32(
+ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
+ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
+
+ cosine = _mm256_add_ps(
+ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
+ cosine = _mm256_sub_ps(cosine,
+ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
+ condition3.float_vec));
+ _mm256_store_ps(bPtr, cosine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+        *bPtr++ = cosf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
+ fones, fzeroes;
+ __m256 sine, cosine;
+ __m256i q, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
+ pio4A = _mm256_set1_ps(0.7853981554508209228515625);
+ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
+ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ __m256i zeroes = _mm256_set1_epi32(0);
+ ones = _mm256_set1_epi32(1);
+ __m256i allones = _mm256_set1_epi32(0xffffffff);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.08333333333333333);
+ cp3 = _mm256_set1_ps(0.002777777777777778);
+ cp4 = _mm256_set1_ps(4.96031746031746e-05);
+ cp5 = _mm256_set1_ps(5.511463844797178e-07);
+ union bit256 condition1;
+ union bit256 condition3;
+
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_load_ps(aPtr);
+ // s = fabs(aVal)
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+        // r = q + q&1; q indicates the quadrant, r rounds q up to an even value
+ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
+
+ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
+ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
+ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(
+ _mm256_sub_ps(
+ _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
+ s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++)
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ // if(((q+1)&2) != 0) { cosine=sine;}
+ condition1.int_vec =
+ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
+ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
+
+ // if(((q+2)&4) != 0) { cosine = -cosine;}
+ condition3.int_vec = _mm256_cmpeq_epi32(
+ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
+ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
+
+ cosine = _mm256_add_ps(
+ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
+ cosine = _mm256_sub_ps(cosine,
+ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
+ condition3.float_vec));
+ _mm256_store_ps(bPtr, cosine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+        *bPtr++ = cosf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 for aligned */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
+ fones, fzeroes;
+ __m128 sine, cosine;
+ __m128i q, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
+ pio4A = _mm_set1_ps(0.7853981554508209228515625);
+ pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
+ pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ __m128i zeroes = _mm_set1_epi32(0);
+ ones = _mm_set1_epi32(1);
+ __m128i allones = _mm_set1_epi32(0xffffffff);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.08333333333333333);
+ cp3 = _mm_set1_ps(0.002777777777777778);
+ cp4 = _mm_set1_ps(4.96031746031746e-05);
+ cp5 = _mm_set1_ps(5.511463844797178e-07);
+ union bit128 condition1;
+ union bit128 condition3;
+
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm_load_ps(aPtr);
+ // s = fabs(aVal)
+ s = _mm_sub_ps(aVal,
+ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+ // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+        // r = q + q&1; q indicates the quadrant, r rounds q up to an even value
+ r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
+ s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
+
+ s = _mm_div_ps(
+ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm_mul_ps(
+ _mm_add_ps(
+ _mm_mul_ps(
+ _mm_sub_ps(
+ _mm_mul_ps(
+ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++)
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ // if(((q+1)&2) != 0) { cosine=sine;}
+ condition1.int_vec =
+ _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
+ condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
+
+ // if(((q+2)&4) != 0) { cosine = -cosine;}
+ condition3.int_vec =
+ _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
+ condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
+
+ cosine = _mm_add_ps(cosine,
+ _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
+ cosine = _mm_sub_ps(
+ cosine,
+ _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
+ _mm_store_ps(bPtr, cosine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = cosf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#endif /* INCLUDED_volk_32f_cos_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_cos_32f_u_H
+#define INCLUDED_volk_32f_cos_32f_u_H
+
+#ifdef LV_HAVE_AVX512F
+
+#include <immintrin.h>
+static inline void volk_32f_cos_32f_u_avx512f(float* cosVector,
+ const float* inVector,
+ unsigned int num_points)
+{
+ float* cosPtr = cosVector;
+ const float* inPtr = inVector;
+
+ unsigned int number = 0;
+ unsigned int sixteenPoints = num_points / 16;
+ unsigned int i = 0;
+
+ __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
+ fones, sine, cosine;
+ __m512i q, zeros, ones, twos, fours;
+
+ m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
+ pio4A = _mm512_set1_ps(0.7853981554508209228515625);
+ pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
+ pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+ ffours = _mm512_set1_ps(4.0);
+ ftwos = _mm512_set1_ps(2.0);
+ fones = _mm512_set1_ps(1.0);
+ zeros = _mm512_setzero_epi32();
+ ones = _mm512_set1_epi32(1);
+ twos = _mm512_set1_epi32(2);
+ fours = _mm512_set1_epi32(4);
+
+ cp1 = _mm512_set1_ps(1.0);
+ cp2 = _mm512_set1_ps(0.08333333333333333);
+ cp3 = _mm512_set1_ps(0.002777777777777778);
+ cp4 = _mm512_set1_ps(4.96031746031746e-05);
+ cp5 = _mm512_set1_ps(5.511463844797178e-07);
+ __mmask16 condition1, condition2;
+ for (; number < sixteenPoints; number++) {
+ aVal = _mm512_loadu_ps(inPtr);
+ // s = fabs(aVal)
+        s = _mm512_castsi512_ps(
+            _mm512_and_si512(_mm512_castps_si512(aVal), _mm512_set1_epi32(0x7fffffff)));
+
+ // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+ q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
+        // r = q + q&1; q indicates the quadrant, r rounds q up to an even value
+ r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
+
+ s = _mm512_fnmadd_ps(r, pio4A, s);
+ s = _mm512_fnmadd_ps(r, pio4B, s);
+ s = _mm512_fnmadd_ps(r, pio4C, s);
+
+ s = _mm512_div_ps(
+ s,
+ _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm512_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm512_mul_ps(
+ _mm512_fmadd_ps(
+ _mm512_fmsub_ps(
+ _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+ s,
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++)
+ s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
+ s = _mm512_div_ps(s, ftwos);
+
+ sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
+ cosine = _mm512_sub_ps(fones, s);
+
+ // if(((q+1)&2) != 0) { cosine=sine;}
+ condition1 = _mm512_cmpneq_epi32_mask(
+ _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
+
+ // if(((q+2)&4) != 0) { cosine = -cosine;}
+ condition2 = _mm512_cmpneq_epi32_mask(
+ _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
+
+ cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
+ cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
+ _mm512_storeu_ps(cosPtr, cosine);
+ inPtr += 16;
+ cosPtr += 16;
+ }
+
+ number = sixteenPoints * 16;
+ for (; number < num_points; number++) {
+ *cosPtr++ = cosf(*inPtr++);
+ }
+}
+#endif
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void
+volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
+ fones, fzeroes;
+ __m256 sine, cosine;
+ __m256i q, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
+ pio4A = _mm256_set1_ps(0.7853981554508209228515625);
+ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
+ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ __m256i zeroes = _mm256_set1_epi32(0);
+ ones = _mm256_set1_epi32(1);
+ __m256i allones = _mm256_set1_epi32(0xffffffff);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.08333333333333333);
+ cp3 = _mm256_set1_ps(0.002777777777777778);
+ cp4 = _mm256_set1_ps(4.96031746031746e-05);
+ cp5 = _mm256_set1_ps(5.511463844797178e-07);
+ union bit256 condition1;
+ union bit256 condition3;
+
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_loadu_ps(aPtr);
+ // s = fabs(aVal)
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+        // r = q + q&1; q indicates the quadrant, r rounds q up to an even value
+ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
+
+ s = _mm256_fnmadd_ps(r, pio4A, s);
+ s = _mm256_fnmadd_ps(r, pio4B, s);
+ s = _mm256_fnmadd_ps(r, pio4C, s);
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_fmadd_ps(
+ _mm256_fmsub_ps(
+ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+ s,
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++)
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ // if(((q+1)&2) != 0) { cosine=sine;}
+ condition1.int_vec =
+ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
+ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
+
+ // if(((q+2)&4) != 0) { cosine = -cosine;}
+ condition3.int_vec = _mm256_cmpeq_epi32(
+ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
+ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
+
+ cosine = _mm256_add_ps(
+ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
+ cosine = _mm256_sub_ps(cosine,
+ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
+ condition3.float_vec));
+ _mm256_storeu_ps(bPtr, cosine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+        *bPtr++ = cosf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
+ fones, fzeroes;
+ __m256 sine, cosine;
+ __m256i q, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
+ pio4A = _mm256_set1_ps(0.7853981554508209228515625);
+ pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
+ pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ __m256i zeroes = _mm256_set1_epi32(0);
+ ones = _mm256_set1_epi32(1);
+ __m256i allones = _mm256_set1_epi32(0xffffffff);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.08333333333333333);
+ cp3 = _mm256_set1_ps(0.002777777777777778);
+ cp4 = _mm256_set1_ps(4.96031746031746e-05);
+ cp5 = _mm256_set1_ps(5.511463844797178e-07);
+ union bit256 condition1;
+ union bit256 condition3;
+
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_loadu_ps(aPtr);
+ // s = fabs(aVal)
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+        // r = q + q&1; q indicates the quadrant, r rounds q up to an even value
+ r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
+
+ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
+ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
+ s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(
+ _mm256_sub_ps(
+ _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
+ s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++)
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ // if(((q+1)&2) != 0) { cosine=sine;}
+ condition1.int_vec =
+ _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
+ condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
+
+ // if(((q+2)&4) != 0) { cosine = -cosine;}
+ condition3.int_vec = _mm256_cmpeq_epi32(
+ _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
+ condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
+
+ cosine = _mm256_add_ps(
+ cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
+ cosine = _mm256_sub_ps(cosine,
+ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
+ condition3.float_vec));
+ _mm256_storeu_ps(bPtr, cosine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+        *bPtr++ = cosf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 for unaligned */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m128 sine, cosine, condition1, condition3;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
+ s = _mm_sub_ps(aVal,
+ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(
+ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm_mul_ps(
+ _mm_add_ps(
+ _mm_mul_ps(
+ _mm_sub_ps(
+ _mm_mul_ps(
+ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 = _mm_cmpneq_ps(
+ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+
+ condition3 = _mm_cmpneq_ps(
+ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+
+ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
+ cosine = _mm_sub_ps(
+ cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
+ _mm_storeu_ps(bPtr, cosine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = cosf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+
+#ifdef LV_HAVE_GENERIC
+
+/*
+ * For derivation see
+ * Shibata, Naoki, "Efficient evaluation methods of elementary functions
+ * suitable for SIMD computation," in Springer-Verlag 2010
+ */
+static inline void volk_32f_cos_32f_generic_fast(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ float m4pi = 1.273239544735162542821171882678754627704620361328125;
+ float pio4A = 0.7853981554508209228515625;
+ float pio4B = 0.794662735614792836713604629039764404296875e-8;
+ float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
+ int N = 3; // order of argument reduction
+
+ unsigned int number;
+ for (number = 0; number < num_points; number++) {
+ float s = fabs(*aPtr);
+ int q = (int)(s * m4pi);
+ int r = q + (q & 1);
+ s -= r * pio4A;
+ s -= r * pio4B;
+ s -= r * pio4C;
+
+        s = s * 0.125; // multiply by 2^-N with N = 3
+ s = s * s;
+ s = ((((s / 1814400. - 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s +
+ 1.0) *
+ s;
+
+ int i;
+ for (i = 0; i < N; ++i) {
+ s = (4.0 - s) * s;
+ }
+ s = s / 2.0;
+
+ float sine = sqrt((2.0 - s) * s);
+ float cosine = 1 - s;
+
+ if (((q + 1) & 2) != 0) {
+ s = cosine;
+ cosine = sine;
+ sine = s;
+ }
+ if (((q + 2) & 4) != 0) {
+ cosine = -cosine;
+ }
+ *bPtr = cosine;
+ bPtr++;
+ aPtr++;
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
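+
+/*
+ * Sketch of the identities used by the fast kernels above (as read from the
+ * code): after range reduction to t = |x| - r*pi/4, the Taylor series
+ * evaluates s = 2 * (1 - cos(t / 2^N)) with N = 3 (hence the division by 8).
+ * The double-angle identity
+ *
+ *     2 * (1 - cos(2u)) = (4 - v) * v,   where v = 2 * (1 - cos(u)),
+ *
+ * applied N times via s = (4 - s) * s undoes the scaling.  After the final
+ * halving, s = 1 - cos(t), so cosine = 1 - s and
+ * sine = sqrt((2 - s) * s) = |sin(t)|.  The quadrant index q then selects
+ * between sine and cosine and fixes the sign.
+ */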
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (; number < num_points; number++) {
+ *bPtr++ = cosf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+#include <volk/volk_neon_intrinsics.h>
+
+static inline void
+volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ float* bVectorPtr = bVector;
+ const float* aVectorPtr = aVector;
+
+ float32x4_t b_vec;
+ float32x4_t a_vec;
+
+ for (number = 0; number < quarter_points; number++) {
+ a_vec = vld1q_f32(aVectorPtr);
+ // Prefetch next one, speeds things up
+ __VOLK_PREFETCH(aVectorPtr + 4);
+ b_vec = _vcosq_f32(a_vec);
+ vst1q_f32(bVectorPtr, b_vec);
+ // move pointers ahead
+ bVectorPtr += 4;
+ aVectorPtr += 4;
+ }
+
+ // Deal with the rest
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *bVectorPtr++ = cosf(*aVectorPtr++);
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+
+#endif /* INCLUDED_volk_32f_cos_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015-2020 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/* SIMD (SSE4) implementation of exp
+ Inspired by Intel Approximate Math library, and based on the
+ corresponding algorithms of the cephes math library
+*/
+
+/* Copyright (C) 2007 Julien Pommier
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+
+ (this is the zlib license)
+*/
+
+/*!
+ * \page volk_32f_exp_32f
+ *
+ * \b Overview
+ *
+ * Computes the exponential of the input vector and stores the results in the
+ * output vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_exp_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li bVector: The vector where results will be stored.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * in[0] = 0;
+ * in[1] = 0.5;
+ * in[2] = std::sqrt(2.f)/2.f;
+ * in[3] = std::sqrt(3.f)/2.f;
+ * in[4] = in[5] = 1;
+ * for(unsigned int ii = 6; ii < N; ++ii){
+ * in[ii] = - in[N-ii-1];
+ * }
+ *
+ * volk_32f_exp_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("exp(%1.3f) = %1.3f\n", in[ii], out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+
+#ifndef INCLUDED_volk_32f_exp_32f_a_H
+#define INCLUDED_volk_32f_exp_32f_a_H
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_32f_exp_32f_a_sse2(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+
+ // Declare variables and constants
+ __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
+ __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
+ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m128i emm0, pi32_0x7f;
+
+ one = _mm_set1_ps(1.0);
+ exp_hi = _mm_set1_ps(88.3762626647949);
+ exp_lo = _mm_set1_ps(-88.3762626647949);
+ log2EF = _mm_set1_ps(1.44269504088896341);
+ half = _mm_set1_ps(0.5);
+ exp_C1 = _mm_set1_ps(0.693359375);
+ exp_C2 = _mm_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm_set1_epi32(0x7f);
+
+ exp_p0 = _mm_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm_set1_ps(5.0000001201e-1);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ tmp = _mm_setzero_ps();
+
+ aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
+
+ /* express exp(x) as exp(g + n*log(2)) */
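+        // n = floor(x * log2(e) + 0.5), i.e. x*log2(e) rounded to nearest;
+        // n*ln(2) is later subtracted in two pieces (exp_C1 + exp_C2) so the
+        // remainder g handed to the polynomial stays accurate.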
+ fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
+
+ emm0 = _mm_cvttps_epi32(fx);
+ tmp = _mm_cvtepi32_ps(emm0);
+
+ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+ fx = _mm_sub_ps(tmp, mask);
+
+ tmp = _mm_mul_ps(fx, exp_C1);
+ z = _mm_mul_ps(fx, exp_C2);
+ aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
+ z = _mm_mul_ps(aVal, aVal);
+
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
+ y = _mm_add_ps(y, one);
+
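+        // Build 2^n by writing the biased exponent (n + 127) into the
+        // exponent field of an IEEE-754 float (shift left by 23 bits).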
+ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
+
+ pow2n = _mm_castsi128_ps(emm0);
+ bVal = _mm_mul_ps(y, pow2n);
+
+ _mm_store_ps(bPtr, bVal);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = expf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE2 for aligned */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_exp_32f_a_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *bPtr++ = expf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_exp_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_exp_32f_u_H
+#define INCLUDED_volk_32f_exp_32f_u_H
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void
+volk_32f_exp_32f_u_sse2(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+
+ // Declare variables and constants
+ __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
+ __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
+ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m128i emm0, pi32_0x7f;
+
+ one = _mm_set1_ps(1.0);
+ exp_hi = _mm_set1_ps(88.3762626647949);
+ exp_lo = _mm_set1_ps(-88.3762626647949);
+ log2EF = _mm_set1_ps(1.44269504088896341);
+ half = _mm_set1_ps(0.5);
+ exp_C1 = _mm_set1_ps(0.693359375);
+ exp_C2 = _mm_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm_set1_epi32(0x7f);
+
+ exp_p0 = _mm_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm_set1_ps(5.0000001201e-1);
+
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
+ tmp = _mm_setzero_ps();
+
+ aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
+
+ /* express exp(x) as exp(g + n*log(2)) */
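+        // n = floor(x * log2(e) + 0.5), i.e. x*log2(e) rounded to nearest;
+        // n*ln(2) is later subtracted in two pieces (exp_C1 + exp_C2) so the
+        // remainder g handed to the polynomial stays accurate.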
+ fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
+
+ emm0 = _mm_cvttps_epi32(fx);
+ tmp = _mm_cvtepi32_ps(emm0);
+
+ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+ fx = _mm_sub_ps(tmp, mask);
+
+ tmp = _mm_mul_ps(fx, exp_C1);
+ z = _mm_mul_ps(fx, exp_C2);
+ aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
+ z = _mm_mul_ps(aVal, aVal);
+
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
+ y = _mm_add_ps(y, one);
+
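+        // Build 2^n by writing the biased exponent (n + 127) into the
+        // exponent field of an IEEE-754 float (shift left by 23 bits).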
+ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
+
+ pow2n = _mm_castsi128_ps(emm0);
+ bVal = _mm_mul_ps(y, pow2n);
+
+ _mm_storeu_ps(bPtr, bVal);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = expf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE2 for unaligned */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_exp_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *bPtr++ = expf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_exp_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_expfast_32f
+ *
+ * \b Overview
+ *
+ * Computes the exponential of the input vector and stores the results in the
+ * output vector. This uses a fast exp approximation with a maximum error of
+ * about 7%.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_expfast_32f(float* bVector, const float* aVector,
+ *                           unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: Input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li bVector: The output vector.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * in[ii] = std::log((float)ii);
+ * }
+ *
+ * volk_32f_expfast_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+
+#define Mln2 0.6931471805f
+#define A 8388608.0f
+#define B 1065353216.0f
+#define C 60801.0f
+
+
+#ifndef INCLUDED_volk_32f_expfast_32f_a_H
+#define INCLUDED_volk_32f_expfast_32f_a_H
+
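+/*
+ * The constants above implement a Schraudolph-style exponential
+ * approximation: exp(x) = 2^(x / ln 2), and the integer bit pattern of a
+ * positive IEEE-754 float v is approximately (log2(v) + 127) * 2^23, so
+ * evaluating A / Mln2 * x + (B - C) and reinterpreting the rounded integer
+ * result as a float approximates exp(x); C is an empirical correction term.
+ * A scalar sketch of what the SIMD kernels below compute (illustrative only;
+ * the name expfast_scalar_sketch is not part of VOLK):
+ */
+static inline float expfast_scalar_sketch(float x)
+{
+    union {
+        int32_t i;
+        float f;
+    } u;
+    // Round to nearest to mirror _mm_cvtps_epi32, then reinterpret the bits.
+    u.i = (int32_t)lrintf((A / Mln2) * x + (B - C));
+    return u.f;
+}
+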
+#if LV_HAVE_AVX && LV_HAVE_FMA
+
+#include <immintrin.h>
+
+static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, a, b;
+ __m256i exp;
+ a = _mm256_set1_ps(A / Mln2);
+ b = _mm256_set1_ps(B - C);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
+ bVal = _mm256_castsi256_ps(exp);
+
+ _mm256_store_ps(bPtr, bVal);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = expf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void
+volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, a, b;
+ __m256i exp;
+ a = _mm256_set1_ps(A / Mln2);
+ b = _mm256_set1_ps(B - C);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
+ bVal = _mm256_castsi256_ps(exp);
+
+ _mm256_store_ps(bPtr, bVal);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = expf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX for aligned */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, a, b;
+ __m128i exp;
+ a = _mm_set1_ps(A / Mln2);
+ b = _mm_set1_ps(B - C);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
+ bVal = _mm_castsi128_ps(exp);
+
+ _mm_store_ps(bPtr, bVal);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = expf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#endif /* INCLUDED_volk_32f_expfast_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_expfast_32f_u_H
+#define INCLUDED_volk_32f_expfast_32f_u_H
+
+#if LV_HAVE_AVX && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, a, b;
+ __m256i exp;
+ a = _mm256_set1_ps(A / Mln2);
+ b = _mm256_set1_ps(B - C);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
+ bVal = _mm256_castsi256_ps(exp);
+
+ _mm256_storeu_ps(bPtr, bVal);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = expf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, a, b;
+ __m256i exp;
+ a = _mm256_set1_ps(A / Mln2);
+ b = _mm256_set1_ps(B - C);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
+ bVal = _mm256_castsi256_ps(exp);
+
+ _mm256_storeu_ps(bPtr, bVal);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *bPtr++ = expf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX for unaligned */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, a, b;
+ __m128i exp;
+ a = _mm_set1_ps(A / Mln2);
+ b = _mm_set1_ps(B - C);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
+ exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
+ bVal = _mm_castsi128_ps(exp);
+
+ _mm_storeu_ps(bPtr, bVal);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = expf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_expfast_32f_generic(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *bPtr++ = expf(*aPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_expfast_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_index_max_16u
+ *
+ * \b Overview
+ *
+ * Returns Argmax_i x[i]. Finds and returns the index of the first
+ * maximum value in the given vector.
+ *
+ * Note that num_points is a uint32_t, but the return value is
+ * uint16_t. Providing a vector with more than USHRT_MAX (65535)
+ * elements would miss anything beyond that boundary. The kernel
+ * therefore caps num_points to this maximum value.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_index_max_16u(uint16_t* target, const float* src0, uint32_t num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target: The index of the first maximum value in the input buffer.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * uint32_t alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * uint16_t* out = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
+ *
+ * for(uint32_t ii = 0; ii < N; ++ii){
+ * float x = (float)ii;
+ * // a parabola with a maximum at x=4
+ * in[ii] = -(x-4) * (x-4) + 5;
+ * }
+ *
+ * volk_32f_index_max_16u(out, in, N);
+ *
+ * printf("maximum is %1.2f at index %u\n", in[*out], *out);
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_index_max_16u_a_H
+#define INCLUDED_volk_32f_index_max_16u_a_H
+
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ uint32_t number = 0;
+ const uint32_t eighthPoints = num_points / 8;
+
+ float* inputPtr = (float*)src0;
+
+ __m256 indexIncrementValues = _mm256_set1_ps(8);
+ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
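+    // The indexes start at -8..-1 so that the first in-loop add of
+    // indexIncrementValues labels the lanes of the first load as 0..7.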
+
+ float max = src0[0];
+ float index = 0;
+ __m256 maxValues = _mm256_set1_ps(max);
+ __m256 maxValuesIndex = _mm256_setzero_ps();
+ __m256 compareResults;
+ __m256 currentValues;
+
+ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
+
+ for (; number < eighthPoints; number++) {
+
+ currentValues = _mm256_load_ps(inputPtr);
+ inputPtr += 8;
+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
+
+ maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+ maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
+ }
+
+    // Calculate the largest value from the remaining 8 points
+ _mm256_store_ps(maxValuesBuffer, maxValues);
+ _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 8; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint16_t)index;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+
+ currentValues = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+
+ maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+ maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
+ }
+
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 4; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint16_t)index;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+
+#ifdef LV_HAVE_SSE
+
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+
+ currentValues = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+
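+ // Bitwise select, equivalent to a blendv: where compareResults is all-ones the
+ // new index/value is taken, otherwise the running maximum is kept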
+ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
+ _mm_andnot_ps(compareResults, maxValuesIndex));
+ maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
+ _mm_andnot_ps(compareResults, maxValues));
+ }
+
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 4; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint16_t)index;
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ float max = src0[0];
+ uint16_t index = 0;
+
+ uint32_t i = 1;
+
+ for (; i < num_points; ++i) {
+ if (src0[i] > max) {
+ index = i;
+ max = src0[i];
+ }
+ }
+ target[0] = index;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_index_max_16u_a_H*/
+
+
+#ifndef INCLUDED_volk_32f_index_max_16u_u_H
+#define INCLUDED_volk_32f_index_max_16u_u_H
+
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ uint32_t number = 0;
+ const uint32_t eighthPoints = num_points / 8;
+
+ float* inputPtr = (float*)src0;
+
+ __m256 indexIncrementValues = _mm256_set1_ps(8);
+ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+ float max = src0[0];
+ float index = 0;
+ __m256 maxValues = _mm256_set1_ps(max);
+ __m256 maxValuesIndex = _mm256_setzero_ps();
+ __m256 compareResults;
+ __m256 currentValues;
+
+ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
+
+ for (; number < eighthPoints; number++) {
+
+ currentValues = _mm256_loadu_ps(inputPtr);
+ inputPtr += 8;
+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
+
+ maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+ maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
+ }
+
+ // Calculate the largest value from the remaining 8 points
+ _mm256_storeu_ps(maxValuesBuffer, maxValues);
+ _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 8; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint16_t)index;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#endif /*INCLUDED_volk_32f_index_max_16u_u_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2016 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_index_max_32u
+ *
+ * \b Overview
+ *
+ * Returns Argmax_i x[i]: finds and returns the index of the first occurrence of the
+ * maximum value in the given vector (ties across SIMD lanes are broken toward the
+ * smaller index; see the note after this comment block).
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_index_max_32u(uint32_t* target, const float* src0, uint32_t num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li src0: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target: The index of the first maximum value in the input buffer.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * uint32_t alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * uint32_t* out = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
+ *
+ * for(uint32_t ii = 0; ii < N; ++ii){
+ * float x = (float)ii;
+ * // a parabola with a maximum at x=4
+ * in[ii] = -(x-4) * (x-4) + 5;
+ * }
+ *
+ * volk_32f_index_max_32u(out, in, N);
+ *
+ * printf("maximum is %1.2f at index %u\n", in[*out], *out);
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
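+
+/*
+ * Implementation note: the SIMD kernels below keep a per-lane running maximum
+ * together with a per-lane index and reduce the lanes at the end. Ties between
+ * lanes are broken toward the smaller index so that the first occurrence of the
+ * maximum is reported, matching the generic kernel. A minimal scalar sketch of
+ * that reduction (assuming an 8-lane register already spilled to
+ * maxValuesBuffer / maxIndexesBuffer) is:
+ *
+ * \code
+ * for (unsigned int lane = 0; lane < 8; lane++) {
+ *     if (maxValuesBuffer[lane] > max) {             // strictly larger: take it
+ *         max = maxValuesBuffer[lane];
+ *         index = maxIndexesBuffer[lane];
+ *     } else if (maxValuesBuffer[lane] == max &&     // tie: keep the earliest index
+ *                maxIndexesBuffer[lane] < index) {
+ *         index = maxIndexesBuffer[lane];
+ *     }
+ * }
+ * \endcode
+ */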
+
+#ifndef INCLUDED_volk_32f_index_max_32u_a_H
+#define INCLUDED_volk_32f_index_max_32u_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+
+ currentValues = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+
+ maxValuesIndex =
+ _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+ maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
+ }
+
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 4; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+
+#ifdef LV_HAVE_SSE
+
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+
+ currentValues = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+
+ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
+ _mm_andnot_ps(compareResults, maxValuesIndex));
+
+ maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
+ _mm_andnot_ps(compareResults, maxValues));
+ }
+
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 4; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t eighthPoints = num_points / 8;
+
+ float* inputPtr = (float*)src0;
+
+ __m256 indexIncrementValues = _mm256_set1_ps(8);
+ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+ float max = src0[0];
+ float index = 0;
+ __m256 maxValues = _mm256_set1_ps(max);
+ __m256 maxValuesIndex = _mm256_setzero_ps();
+ __m256 compareResults;
+ __m256 currentValues;
+
+ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
+
+ for (; number < eighthPoints; number++) {
+ currentValues = _mm256_load_ps(inputPtr);
+ inputPtr += 8;
+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
+ maxValuesIndex =
+ _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+ maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
+ }
+
+ // Calculate the largest value from the remaining 8 points
+ _mm256_store_ps(maxValuesBuffer, maxValues);
+ _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 8; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_AVX*/
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+ float32x4_t indexIncrementValues = vdupq_n_f32(4);
+ __VOLK_ATTR_ALIGNED(16)
+ float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
+ float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
+
+ float max = src0[0];
+ float index = 0;
+ float32x4_t maxValues = vdupq_n_f32(max);
+ uint32x4_t maxValuesIndex = vmovq_n_u32(0);
+ uint32x4_t compareResults;
+ uint32x4_t currentIndexes_u;
+ float32x4_t currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ currentValues = vld1q_f32(inputPtr);
+ inputPtr += 4;
+ currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
+ currentIndexes_u = vcvtq_u32_f32(currentIndexes);
+ compareResults = vcleq_f32(currentValues, maxValues);
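+ // compareResults flags lanes where the current value does not exceed the
+ // running maximum; those lanes keep their old index, the rest take the new one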
+ maxValuesIndex = vorrq_u32(vandq_u32(compareResults, maxValuesIndex),
+ vbicq_u32(currentIndexes_u, compareResults));
+ maxValues = vmaxq_f32(currentValues, maxValues);
+ }
+
+ // Calculate the largest value from the remaining 4 points
+ vst1q_f32(maxValuesBuffer, maxValues);
+ vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex));
+ for (number = 0; number < 4; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_NEON*/
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if (num_points > 0) {
+ float max = src0[0];
+ uint32_t index = 0;
+
+ uint32_t i = 1;
+
+ for (; i < num_points; ++i) {
+ if (src0[i] > max) {
+ index = i;
+ max = src0[i];
+ }
+ }
+ target[0] = index;
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_index_max_32u_a_H*/
+
+
+#ifndef INCLUDED_volk_32f_index_max_32u_u_H
+#define INCLUDED_volk_32f_index_max_32u_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t eighthPoints = num_points / 8;
+
+ float* inputPtr = (float*)src0;
+
+ __m256 indexIncrementValues = _mm256_set1_ps(8);
+ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+ float max = src0[0];
+ float index = 0;
+ __m256 maxValues = _mm256_set1_ps(max);
+ __m256 maxValuesIndex = _mm256_setzero_ps();
+ __m256 compareResults;
+ __m256 currentValues;
+
+ __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
+
+ for (; number < eighthPoints; number++) {
+ currentValues = _mm256_loadu_ps(inputPtr);
+ inputPtr += 8;
+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+ compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
+ maxValuesIndex =
+ _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+ maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
+ }
+
+ // Calculate the largest value from the remaining 8 points
+ _mm256_store_ps(maxValuesBuffer, maxValues);
+ _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 8; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_AVX*/
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ currentValues = _mm_loadu_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+ compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+ maxValuesIndex =
+ _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
+ maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
+ }
+
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 4; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
+{
+ if (num_points > 0) {
+ uint32_t number = 0;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)src0;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float max = src0[0];
+ float index = 0;
+ __m128 maxValues = _mm_set1_ps(max);
+ __m128 maxValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ currentValues = _mm_loadu_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+ compareResults = _mm_cmpgt_ps(currentValues, maxValues);
+ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
+ _mm_andnot_ps(compareResults, maxValuesIndex));
+ maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
+ _mm_andnot_ps(compareResults, maxValues));
+ }
+
+ // Calculate the largest value from the remaining 4 points
+ _mm_store_ps(maxValuesBuffer, maxValues);
+ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+ for (number = 0; number < 4; number++) {
+ if (maxValuesBuffer[number] > max) {
+ index = maxIndexesBuffer[number];
+ max = maxValuesBuffer[number];
+ } else if (maxValuesBuffer[number] == max) {
+ if (index > maxIndexesBuffer[number])
+ index = maxIndexesBuffer[number];
+ }
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (src0[number] > max) {
+ index = number;
+ max = src0[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+ }
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#endif /*INCLUDED_volk_32f_index_max_32u_u_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2021 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_index_min_16u
+ *
+ * \b Overview
+ *
+ * Returns Argmin_i x[i]: finds and returns the index of the first occurrence of the
+ * minimum value in the given vector.
+ *
+ * Note that num_points is a uint32_t, but the returned index is a
+ * uint16_t. For vectors longer than the maximum value of a uint16_t
+ * (65535), any minimum located beyond that boundary cannot be
+ * reported; the kernel therefore caps num_points to this maximum
+ * value (see the illustrative sketch after this comment block).
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_index_min_16u(uint16_t* target, const float* source, uint32_t num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li source: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target: The index of the first minimum value in the input buffer.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * uint32_t alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * uint16_t* out = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
+ *
+ * for(uint32_t ii = 0; ii < N; ++ii){
+ * float x = (float)ii;
+ * // a parabola with a minimum at x=4
+ * in[ii] = (x-4) * (x-4) - 5;
+ * }
+ *
+ * volk_32f_index_min_16u(out, in, N);
+ *
+ * printf("minimum is %1.2f at index %u\n", in[*out], *out);
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
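+
+/*
+ * Illustrative sketch (the buffer size and values here are assumptions, not part of
+ * the kernel): because the result type is uint16_t, only the first 65535 points are
+ * searched, so a minimum placed beyond that boundary is not found.
+ *
+ * \code
+ * const uint32_t N = 70000;           // longer than USHRT_MAX
+ * uint32_t alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float) * N, alignment);
+ * uint16_t* out = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
+ *
+ * for (uint32_t ii = 0; ii < N; ++ii)
+ *     in[ii] = 1.0f;
+ * in[N - 1] = -1.0f;                  // true minimum, beyond the uint16_t boundary
+ *
+ * volk_32f_index_min_16u(out, in, N); // num_points is capped to 65535 internally
+ * // *out is 0: the minimum over in[0..65534], not the true minimum at N - 1
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */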
+
+#ifndef INCLUDED_volk_32f_index_min_16u_a_H
+#define INCLUDED_volk_32f_index_min_16u_a_H
+
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_index_min_16u_a_avx(uint16_t* target, const float* source, uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+ const uint32_t eighthPoints = num_points / 8;
+
+ float* inputPtr = (float*)source;
+
+ __m256 indexIncrementValues = _mm256_set1_ps(8);
+ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+ float min = source[0];
+ float index = 0;
+ __m256 minValues = _mm256_set1_ps(min);
+ __m256 minValuesIndex = _mm256_setzero_ps();
+ __m256 compareResults;
+ __m256 currentValues;
+
+ __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
+
+ for (uint32_t number = 0; number < eighthPoints; number++) {
+
+ currentValues = _mm256_load_ps(inputPtr);
+ inputPtr += 8;
+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
+
+ minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
+ minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
+ }
+
+ // Calculate the smallest value from the remaining 8 points
+ _mm256_store_ps(minValuesBuffer, minValues);
+ _mm256_store_ps(minIndexesBuffer, minValuesIndex);
+
+ for (uint32_t number = 0; number < 8; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ for (uint32_t number = eighthPoints * 8; number < num_points; number++) {
+ if (source[number] < min) {
+ index = number;
+ min = source[number];
+ }
+ }
+ target[0] = (uint16_t)index;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_32f_index_min_16u_a_sse4_1(uint16_t* target,
+ const float* source,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)source;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float min = source[0];
+ float index = 0;
+ __m128 minValues = _mm_set1_ps(min);
+ __m128 minValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
+
+ for (uint32_t number = 0; number < quarterPoints; number++) {
+
+ currentValues = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmplt_ps(currentValues, minValues);
+
+ minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
+ minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
+ }
+
+ // Calculate the smallest value from the remaining 4 points
+ _mm_store_ps(minValuesBuffer, minValues);
+ _mm_store_ps(minIndexesBuffer, minValuesIndex);
+
+ for (uint32_t number = 0; number < 4; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
+ if (source[number] < min) {
+ index = number;
+ min = source[number];
+ }
+ }
+ target[0] = (uint16_t)index;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+
+#ifdef LV_HAVE_SSE
+
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_index_min_16u_a_sse(uint16_t* target, const float* source, uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)source;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float min = source[0];
+ float index = 0;
+ __m128 minValues = _mm_set1_ps(min);
+ __m128 minValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
+
+ for (uint32_t number = 0; number < quarterPoints; number++) {
+
+ currentValues = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmplt_ps(currentValues, minValues);
+
+ minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
+ _mm_andnot_ps(compareResults, minValuesIndex));
+ minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
+ _mm_andnot_ps(compareResults, minValues));
+ }
+
+ // Calculate the smallest value from the remaining 4 points
+ _mm_store_ps(minValuesBuffer, minValues);
+ _mm_store_ps(minIndexesBuffer, minValuesIndex);
+
+ for (uint32_t number = 0; number < 4; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
+ if (source[number] < min) {
+ index = number;
+ min = source[number];
+ }
+ }
+ target[0] = (uint16_t)index;
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_index_min_16u_generic(uint16_t* target, const float* source, uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ float min = source[0];
+ uint16_t index = 0;
+
+ for (uint32_t i = 1; i < num_points; ++i) {
+ if (source[i] < min) {
+ index = i;
+ min = source[i];
+ }
+ }
+ target[0] = index;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_index_min_16u_a_H*/
+
+
+#ifndef INCLUDED_volk_32f_index_min_16u_u_H
+#define INCLUDED_volk_32f_index_min_16u_u_H
+
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_index_min_16u_u_avx(uint16_t* target, const float* source, uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+ const uint32_t eighthPoints = num_points / 8;
+
+ float* inputPtr = (float*)source;
+
+ __m256 indexIncrementValues = _mm256_set1_ps(8);
+ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+ float min = source[0];
+ float index = 0;
+ __m256 minValues = _mm256_set1_ps(min);
+ __m256 minValuesIndex = _mm256_setzero_ps();
+ __m256 compareResults;
+ __m256 currentValues;
+
+ __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
+
+ for (uint32_t number = 0; number < eighthPoints; number++) {
+
+ currentValues = _mm256_loadu_ps(inputPtr);
+ inputPtr += 8;
+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
+
+ minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
+ minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
+ }
+
+ // Calculate the smallest value from the remaining 8 points
+ _mm256_storeu_ps(minValuesBuffer, minValues);
+ _mm256_storeu_ps(minIndexesBuffer, minValuesIndex);
+
+ for (uint32_t number = 0; number < 8; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ for (uint32_t number = eighthPoints * 8; number < num_points; number++) {
+ if (source[number] < min) {
+ index = number;
+ min = source[number];
+ }
+ }
+ target[0] = (uint16_t)index;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#endif /*INCLUDED_volk_32f_index_min_16u_u_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2021 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_index_min_32u
+ *
+ * \b Overview
+ *
+ * Returns Argmin_i x[i]. Finds and returns the index which contains the first minimum
+ * value in the given vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_index_min_32u(uint32_t* target, const float* source, uint32_t num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li source: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target: The index of the first minimum value in the input buffer.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * uint32_t alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * uint32_t* out = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
+ *
+ * for(uint32_t ii = 0; ii < N; ++ii){
+ * float x = (float)ii;
+ * // a parabola with a minimum at x=4
+ * in[ii] = (x-4) * (x-4) - 5;
+ * }
+ *
+ * volk_32f_index_min_32u(out, in, N);
+ *
+ * printf("minimum is %1.2f at index %u\n", in[*out], *out);
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_index_min_32u_a_H
+#define INCLUDED_volk_32f_index_min_32u_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_32f_index_min_32u_a_sse4_1(uint32_t* target,
+ const float* source,
+ uint32_t num_points)
+{
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)source;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float min = source[0];
+ float index = 0;
+ __m128 minValues = _mm_set1_ps(min);
+ __m128 minValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
+
+ for (uint32_t number = 0; number < quarterPoints; number++) {
+
+ currentValues = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmplt_ps(currentValues, minValues);
+
+ minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
+ minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
+ }
+
+ // Calculate the smallest value from the remaining 4 points
+ _mm_store_ps(minValuesBuffer, minValues);
+ _mm_store_ps(minIndexesBuffer, minValuesIndex);
+
+ for (uint32_t number = 0; number < 4; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
+ if (source[number] < min) {
+ index = number;
+ min = source[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+
+#ifdef LV_HAVE_SSE
+
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_index_min_32u_a_sse(uint32_t* target, const float* source, uint32_t num_points)
+{
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)source;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float min = source[0];
+ float index = 0;
+ __m128 minValues = _mm_set1_ps(min);
+ __m128 minValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
+
+ for (uint32_t number = 0; number < quarterPoints; number++) {
+
+ currentValues = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+ compareResults = _mm_cmplt_ps(currentValues, minValues);
+
+ minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
+ _mm_andnot_ps(compareResults, minValuesIndex));
+
+ minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
+ _mm_andnot_ps(compareResults, minValues));
+ }
+
+ // Calculate the smallest value from the remaining 4 points
+ _mm_store_ps(minValuesBuffer, minValues);
+ _mm_store_ps(minIndexesBuffer, minValuesIndex);
+
+ for (uint32_t number = 0; number < 4; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
+ if (source[number] < min) {
+ index = number;
+ min = source[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_index_min_32u_a_avx(uint32_t* target, const float* source, uint32_t num_points)
+{
+ const uint32_t eighthPoints = num_points / 8;
+
+ float* inputPtr = (float*)source;
+
+ __m256 indexIncrementValues = _mm256_set1_ps(8);
+ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+ float min = source[0];
+ float index = 0;
+ __m256 minValues = _mm256_set1_ps(min);
+ __m256 minValuesIndex = _mm256_setzero_ps();
+ __m256 compareResults;
+ __m256 currentValues;
+
+ __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
+
+ for (uint32_t number = 0; number < eighthPoints; number++) {
+ currentValues = _mm256_load_ps(inputPtr);
+ inputPtr += 8;
+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+ compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
+ minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
+ minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
+ }
+
+ // Calculate the smallest value from the remaining 8 points
+ _mm256_store_ps(minValuesBuffer, minValues);
+ _mm256_store_ps(minIndexesBuffer, minValuesIndex);
+
+ for (uint32_t number = 0; number < 8; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ for (uint32_t number = eighthPoints * 8; number < num_points; number++) {
+ if (source[number] < min) {
+ index = number;
+ min = source[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32f_index_min_32u_neon(uint32_t* target, const float* source, uint32_t num_points)
+{
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)source;
+ float32x4_t indexIncrementValues = vdupq_n_f32(4);
+ __VOLK_ATTR_ALIGNED(16)
+ float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
+ float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
+
+ float min = source[0];
+ float index = 0;
+ float32x4_t minValues = vdupq_n_f32(min);
+ uint32x4_t minValuesIndex = vmovq_n_u32(0);
+ uint32x4_t compareResults;
+ uint32x4_t currentIndexes_u;
+ float32x4_t currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
+
+ for (uint32_t number = 0; number < quarterPoints; number++) {
+ currentValues = vld1q_f32(inputPtr);
+ inputPtr += 4;
+ currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
+ currentIndexes_u = vcvtq_u32_f32(currentIndexes);
+ compareResults = vcgeq_f32(currentValues, minValues);
+ minValuesIndex = vorrq_u32(vandq_u32(compareResults, minValuesIndex),
+ vbicq_u32(currentIndexes_u, compareResults));
+ minValues = vminq_f32(currentValues, minValues);
+ }
+
+ // Calculate the smallest value from the remaining 4 points
+ vst1q_f32(minValuesBuffer, minValues);
+ vst1q_f32(minIndexesBuffer, vcvtq_f32_u32(minValuesIndex));
+ for (uint32_t number = 0; number < 4; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
+ if (source[number] < min) {
+ index = number;
+ min = source[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+}
+
+#endif /*LV_HAVE_NEON*/
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_index_min_32u_generic(uint32_t* target, const float* source, uint32_t num_points)
+{
+ float min = source[0];
+ uint32_t index = 0;
+
+ for (uint32_t i = 1; i < num_points; ++i) {
+ if (source[i] < min) {
+ index = i;
+ min = source[i];
+ }
+ }
+ target[0] = index;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32f_index_min_32u_a_H*/
+
+
+#ifndef INCLUDED_volk_32f_index_min_32u_u_H
+#define INCLUDED_volk_32f_index_min_32u_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_index_min_32u_u_avx(uint32_t* target, const float* source, uint32_t num_points)
+{
+ const uint32_t eighthPoints = num_points / 8;
+
+ float* inputPtr = (float*)source;
+
+ __m256 indexIncrementValues = _mm256_set1_ps(8);
+ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
+
+ float min = source[0];
+ float index = 0;
+ __m256 minValues = _mm256_set1_ps(min);
+ __m256 minValuesIndex = _mm256_setzero_ps();
+ __m256 compareResults;
+ __m256 currentValues;
+
+ __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
+ __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
+
+ for (uint32_t number = 0; number < eighthPoints; number++) {
+ currentValues = _mm256_loadu_ps(inputPtr);
+ inputPtr += 8;
+ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
+ compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
+ minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
+ minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
+ }
+
+ // Calculate the smallest value from the remaining 8 points
+ _mm256_store_ps(minValuesBuffer, minValues);
+ _mm256_store_ps(minIndexesBuffer, minValuesIndex);
+
+ for (uint32_t number = 0; number < 8; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ for (uint32_t number = eighthPoints * 8; number < num_points; number++) {
+ if (source[number] < min) {
+ index = number;
+ min = source[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_32f_index_min_32u_u_sse4_1(uint32_t* target,
+ const float* source,
+ uint32_t num_points)
+{
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)source;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float min = source[0];
+ float index = 0;
+ __m128 minValues = _mm_set1_ps(min);
+ __m128 minValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
+
+ for (uint32_t number = 0; number < quarterPoints; number++) {
+ currentValues = _mm_loadu_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+ compareResults = _mm_cmplt_ps(currentValues, minValues);
+ minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
+ minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
+ }
+
+ // Calculate the smallest value from the remaining 4 points
+ _mm_store_ps(minValuesBuffer, minValues);
+ _mm_store_ps(minIndexesBuffer, minValuesIndex);
+
+ for (uint32_t number = 0; number < 4; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
+ if (source[number] < min) {
+ index = number;
+ min = source[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_index_min_32u_u_sse(uint32_t* target, const float* source, uint32_t num_points)
+{
+ const uint32_t quarterPoints = num_points / 4;
+
+ float* inputPtr = (float*)source;
+
+ __m128 indexIncrementValues = _mm_set1_ps(4);
+ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
+
+ float min = source[0];
+ float index = 0;
+ __m128 minValues = _mm_set1_ps(min);
+ __m128 minValuesIndex = _mm_setzero_ps();
+ __m128 compareResults;
+ __m128 currentValues;
+
+ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
+ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
+
+ for (uint32_t number = 0; number < quarterPoints; number++) {
+ currentValues = _mm_loadu_ps(inputPtr);
+ inputPtr += 4;
+ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+ compareResults = _mm_cmplt_ps(currentValues, minValues);
+ minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
+ _mm_andnot_ps(compareResults, minValuesIndex));
+ minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
+ _mm_andnot_ps(compareResults, minValues));
+ }
+
+ // Calculate the smallest value from the remaining 4 points
+ _mm_store_ps(minValuesBuffer, minValues);
+ _mm_store_ps(minIndexesBuffer, minValuesIndex);
+
+ for (uint32_t number = 0; number < 4; number++) {
+ if (minValuesBuffer[number] < min) {
+ index = minIndexesBuffer[number];
+ min = minValuesBuffer[number];
+ } else if (minValuesBuffer[number] == min) {
+ if (index > minIndexesBuffer[number])
+ index = minIndexesBuffer[number];
+ }
+ }
+
+ for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
+ if (source[number] < min) {
+ index = number;
+ min = source[number];
+ }
+ }
+ target[0] = (uint32_t)index;
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#endif /*INCLUDED_volk_32f_index_min_32u_u_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2013, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_invsqrt_32f
+ *
+ * \b Overview
+ *
+ * Computes the inverse square root of each entry of the input vector and stores
+ * the results in the output vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_invsqrt_32f(float* cVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: the input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * in[ii] = 1.f / (float)((ii + 1) * (ii + 1)); // avoid dividing by zero at ii = 0
+ * }
+ *
+ * volk_32f_invsqrt_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_invsqrt_32f_a_H
+#define INCLUDED_volk_32f_invsqrt_32f_a_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+static inline float Q_rsqrt(float number)
+{
+ float x2;
+ const float threehalfs = 1.5F;
+ union f32_to_i32 {
+ int32_t i;
+ float f;
+ } u;
+
+ x2 = number * 0.5F;
+ u.f = number;
+ u.i = 0x5f3759df - (u.i >> 1); // magic-constant initial guess for Newton's method
+ u.f = u.f * (threehalfs - (x2 * u.f * u.f)); // 1st iteration
+ // u.f = u.f * ( threehalfs - ( x2 * u.f * u.f ) ); // 2nd iteration, this can be
+ // removed
+
+ return u.f;
+}
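+
+/*
+ * Note on Q_rsqrt above: this is the well-known "fast inverse square root" bit
+ * trick. For a positive normal float, the raw integer representation approximates
+ * (log2(x) + 127) * 2^23, so the shift-and-subtract with the magic constant yields
+ * a first guess at x^(-1/2); the next line is one Newton-Raphson refinement,
+ * y <- y * (1.5 - 0.5 * x * y * y). A minimal usage sketch (values are
+ * illustrative):
+ *
+ * \code
+ * float y = Q_rsqrt(4.0f); // close to 0.5; one Newton step leaves well under 1% error
+ * \endcode
+ */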
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ __m256 aVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ cVal = _mm256_rsqrt_ps(aVal);
+ _mm256_store_ps(cPtr, cVal);
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++)
+ *cPtr++ = Q_rsqrt(*aPtr++);
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m128 aVal, cVal;
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm_load_ps(aPtr);
+
+ cVal = _mm_rsqrt_ps(aVal);
+
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = Q_rsqrt(*aPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number;
+ const unsigned int quarter_points = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ float32x4_t a_val, c_val;
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld1q_f32(aPtr);
+ c_val = vrsqrteq_f32(a_val);
+ vst1q_f32(cPtr, c_val);
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++)
+ *cPtr++ = Q_rsqrt(*aPtr++);
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_invsqrt_32f_generic(float* cVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = Q_rsqrt(*aPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ __m256 aVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ cVal = _mm256_rsqrt_ps(aVal);
+ _mm256_storeu_ps(cPtr, cVal);
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++)
+ *cPtr++ = Q_rsqrt(*aPtr++);
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_invsqrt_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_log2_32f
+ *
+ * \b Overview
+ *
+ * Computes the base 2 logarithm of each entry of the input vector and stores the
+ * results in the output vector. (A scalar sketch of the exponent/mantissa
+ * decomposition shared by the kernels in this file follows this comment block.)
+ *
+ * Note that this implementation is not conforming to the IEEE FP standard, i.e.,
+ * +-Inf outputs are mapped to +-127.0f and +-NaN input values are not supported.
+ *
+ * This kernel was adapted from Jose Fonseca's Fast SSE2 log implementation
+ * http://jrfonseca.blogspot.in/2008/09/fast-sse2-pow-tables-or-polynomials.htm
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * This is the MIT License (MIT)
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_log2_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: the input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li bVector: The output vector.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * in[ii] = std::pow(2.f,((float)ii));
+ * }
+ *
+ * volk_32f_log2_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
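+
+/*
+ * Scalar sketch of the decomposition shared by the kernels in this file (the helper
+ * name log2_decompose and the call to log2f() are assumptions for illustration only;
+ * the kernels evaluate a polynomial instead of calling log2f()): for a positive
+ * normal float x = m * 2^e with m in [1, 2), log2(x) = e + log2(m). The exponent e
+ * comes from the exponent bits and log2(m) from the mantissa.
+ *
+ * \code
+ * #include <math.h>
+ * #include <stdint.h>
+ * #include <string.h>
+ *
+ * static inline float log2_decompose(float x)
+ * {
+ *     uint32_t bits;
+ *     memcpy(&bits, &x, sizeof(bits));                 // reinterpret the float bits
+ *     int32_t e = (int32_t)((bits & 0x7f800000u) >> 23) - 127;
+ *     bits = (bits & 0x007fffffu) | 0x3f800000u;       // force the exponent to zero
+ *     float m;                                         // mantissa, in [1, 2)
+ *     memcpy(&m, &bits, sizeof(m));
+ *     // the SIMD kernels replace log2f(m) with a degree-LOG_POLY_DEGREE polynomial
+ *     return (float)e + log2f(m);
+ * }
+ * \endcode
+ */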
+
+#ifndef INCLUDED_volk_32f_log2_32f_a_H
+#define INCLUDED_volk_32f_log2_32f_a_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <volk/volk_common.h>
+
+#define LOG_POLY_DEGREE 6
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_log2_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++)
+ *bPtr++ = log2f_non_ieee(*aPtr++);
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+
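+// POLYn_FMAAVX2(x, c0, ..., cn) expands to a fused Horner evaluation of the
+// degree-n polynomial c0 + c1*x + ... + cn*x^n across all eight lanes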
+#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
+#define POLY1_FMAAVX2(x, c0, c1) \
+ _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
+#define POLY2_FMAAVX2(x, c0, c1, c2) \
+ _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
+#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
+ _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
+#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
+ _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
+#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
+ _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
+
+static inline void volk_32f_log2_32f_a_avx2_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, mantissa, frac, leadingOne;
+ __m256i bias, exp;
+
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_load_ps(aPtr);
+ bias = _mm256_set1_epi32(127);
+ leadingOne = _mm256_set1_ps(1.0f);
+ exp = _mm256_sub_epi32(
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+ _mm256_set1_epi32(0x7f800000)),
+ 23),
+ bias);
+ bVal = _mm256_cvtepi32_ps(exp);
+
+ // Now to extract mantissa
+ frac = _mm256_or_ps(
+ leadingOne,
+ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+
+#if LOG_POLY_DEGREE == 6
+ mantissa = POLY5_FMAAVX2(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
+#elif LOG_POLY_DEGREE == 5
+ mantissa = POLY4_FMAAVX2(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+ mantissa = POLY3_FMAAVX2(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+ mantissa = POLY2_FMAAVX2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
+#else
+#error
+#endif
+
+ bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
+ _mm256_store_ps(bPtr, bVal);
+
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
+}
+
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
+#define POLY1_AVX2(x, c0, c1) \
+ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
+#define POLY2_AVX2(x, c0, c1, c2) \
+ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
+#define POLY3_AVX2(x, c0, c1, c2, c3) \
+ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
+#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
+ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
+#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
+ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
+
+static inline void
+volk_32f_log2_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, mantissa, frac, leadingOne;
+ __m256i bias, exp;
+
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_load_ps(aPtr);
+ bias = _mm256_set1_epi32(127);
+ leadingOne = _mm256_set1_ps(1.0f);
+ exp = _mm256_sub_epi32(
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+ _mm256_set1_epi32(0x7f800000)),
+ 23),
+ bias);
+ bVal = _mm256_cvtepi32_ps(exp);
+
+ // Now to extract mantissa
+ frac = _mm256_or_ps(
+ leadingOne,
+ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+
+#if LOG_POLY_DEGREE == 6
+ mantissa = POLY5_AVX2(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
+#elif LOG_POLY_DEGREE == 5
+ mantissa = POLY4_AVX2(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+ mantissa = POLY3_AVX2(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+ mantissa = POLY2_AVX2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
+#else
+#error
+#endif
+
+ bVal =
+ _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
+ _mm256_store_ps(bPtr, bVal);
+
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
+}
+
+#endif /* LV_HAVE_AVX2 for aligned */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) \
+ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) \
+ _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) \
+ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+static inline void
+volk_32f_log2_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, mantissa, frac, leadingOne;
+ __m128i bias, exp;
+
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm_load_ps(aPtr);
+ bias = _mm_set1_epi32(127);
+ leadingOne = _mm_set1_ps(1.0f);
+ exp = _mm_sub_epi32(
+ _mm_srli_epi32(
+ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
+ bias);
+ bVal = _mm_cvtepi32_ps(exp);
+
+ // Now to extract mantissa
+ frac = _mm_or_ps(leadingOne,
+ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+
+#if LOG_POLY_DEGREE == 6
+ mantissa = POLY5(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
+#elif LOG_POLY_DEGREE == 5
+ mantissa = POLY4(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+ mantissa = POLY3(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+ mantissa = POLY2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
+#else
+#error "LOG_POLY_DEGREE must be 3, 4, 5, or 6"
+#endif
+
+ bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+ _mm_store_ps(bPtr, bVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+/* these macros allow us to embed logs in other kernels */
+#define VLOG2Q_NEON_PREAMBLE() \
+ int32x4_t one = vdupq_n_s32(0x000800000); \
+ /* minimax polynomial */ \
+ float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
+ float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
+ float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
+ float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
+ float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
+ float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
+ float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
+ int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
+ int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
+ int32x4_t exp_bias = vdupq_n_s32(127);
+
+
+#define VLOG2Q_NEON_F32(log2_approx, aval) \
+ int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
+ int32x4_t significand_i = vandq_s32(aval, sig_mask); \
+ exponent_i = vshrq_n_s32(exponent_i, 23); \
+ \
+ /* extract the exponent and significand \
+ we can treat this as fixed point to save ~9% on the \
+ conversion + float add */ \
+ significand_i = vorrq_s32(one, significand_i); \
+ float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23); \
+ /* debias the exponent and convert to float */ \
+ exponent_i = vsubq_s32(exponent_i, exp_bias); \
+ float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
+ \
+ /* put the significand through a polynomial fit of log2(x) [1,2] \
+ add the result to the exponent */ \
+ log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \
+ float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */ \
+ tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ \
+ float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \
+ tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \
+ tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \
+ tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \
+ tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1);
+
+static inline void
+volk_32f_log2_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number;
+ const unsigned int quarterPoints = num_points / 4;
+
+ int32x4_t aval;
+ float32x4_t log2_approx;
+
+ VLOG2Q_NEON_PREAMBLE()
+ // lms
+ // p0 = vdupq_n_f32(-1.649132280361871);
+ // p1 = vdupq_n_f32(1.995047138579499);
+ // p2 = vdupq_n_f32(-0.336914839219728);
+
+ // keep in mind a single precision float is represented as
+ // (-1)^sign * 2^exp * 1.significand, so the log2 is
+ // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23)
+ for (number = 0; number < quarterPoints; ++number) {
+ // load float in to an int register without conversion
+ aval = vld1q_s32((int*)aPtr);
+
+ VLOG2Q_NEON_F32(log2_approx, aval)
+
+ vst1q_f32(bPtr, log2_approx);
+
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ volk_32f_log2_32f_generic(bPtr, aPtr, num_points - number);
+}
+
+#endif /* LV_HAVE_NEON */
+
+
+#endif /* INCLUDED_volk_32f_log2_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_log2_32f_u_H
+#define INCLUDED_volk_32f_log2_32f_u_H
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_log2_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
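+    // log2f(0.0f) is -inf; clamp that to -127, which is what the SIMD kernels below
+    // yield for a zero input (biased exponent field 0 minus the bias of 127)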
+ for (number = 0; number < num_points; number++) {
+ float const result = log2f(*aPtr++);
+ *bPtr++ = isinf(result) ? -127.0f : result;
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) \
+ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) \
+ _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) \
+ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+static inline void
+volk_32f_log2_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, mantissa, frac, leadingOne;
+ __m128i bias, exp;
+
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm_loadu_ps(aPtr);
+ bias = _mm_set1_epi32(127);
+ leadingOne = _mm_set1_ps(1.0f);
+ exp = _mm_sub_epi32(
+ _mm_srli_epi32(
+ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
+ bias);
+ bVal = _mm_cvtepi32_ps(exp);
+
+ // Now to extract mantissa
+ frac = _mm_or_ps(leadingOne,
+ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+
+#if LOG_POLY_DEGREE == 6
+ mantissa = POLY5(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
+#elif LOG_POLY_DEGREE == 5
+ mantissa = POLY4(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+ mantissa = POLY3(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+ mantissa = POLY2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
+#else
+#error "LOG_POLY_DEGREE must be 3, 4, 5, or 6"
+#endif
+
+ bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+ _mm_storeu_ps(bPtr, bVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+
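+// The POLYn_FMAAVX2 macros are the same Horner-style polynomial evaluation as
+// POLYn_AVX2, but expressed with fused multiply-add intrinsics.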
+#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
+#define POLY1_FMAAVX2(x, c0, c1) \
+ _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
+#define POLY2_FMAAVX2(x, c0, c1, c2) \
+ _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
+#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
+ _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
+#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
+ _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
+#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
+ _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
+
+static inline void volk_32f_log2_32f_u_avx2_fma(float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, mantissa, frac, leadingOne;
+ __m256i bias, exp;
+
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_loadu_ps(aPtr);
+ bias = _mm256_set1_epi32(127);
+ leadingOne = _mm256_set1_ps(1.0f);
+ exp = _mm256_sub_epi32(
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+ _mm256_set1_epi32(0x7f800000)),
+ 23),
+ bias);
+ bVal = _mm256_cvtepi32_ps(exp);
+
+ // Now to extract mantissa
+ frac = _mm256_or_ps(
+ leadingOne,
+ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+
+#if LOG_POLY_DEGREE == 6
+ mantissa = POLY5_FMAAVX2(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
+#elif LOG_POLY_DEGREE == 5
+ mantissa = POLY4_FMAAVX2(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+ mantissa = POLY3_FMAAVX2(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+ mantissa = POLY2_FMAAVX2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
+#else
+#error "LOG_POLY_DEGREE must be 3, 4, 5, or 6"
+#endif
+
+ bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
+ _mm256_storeu_ps(bPtr, bVal);
+
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
+}
+
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
+#define POLY1_AVX2(x, c0, c1) \
+ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
+#define POLY2_AVX2(x, c0, c1, c2) \
+ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
+#define POLY3_AVX2(x, c0, c1, c2, c3) \
+ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
+#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
+ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
+#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
+ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
+
+static inline void
+volk_32f_log2_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, mantissa, frac, leadingOne;
+ __m256i bias, exp;
+
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_loadu_ps(aPtr);
+ bias = _mm256_set1_epi32(127);
+ leadingOne = _mm256_set1_ps(1.0f);
+ exp = _mm256_sub_epi32(
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+ _mm256_set1_epi32(0x7f800000)),
+ 23),
+ bias);
+ bVal = _mm256_cvtepi32_ps(exp);
+
+ // Now to extract mantissa
+ frac = _mm256_or_ps(
+ leadingOne,
+ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+
+#if LOG_POLY_DEGREE == 6
+ mantissa = POLY5_AVX2(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
+#elif LOG_POLY_DEGREE == 5
+ mantissa = POLY4_AVX2(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+ mantissa = POLY3_AVX2(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+ mantissa = POLY2_AVX2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
+#else
+#error "LOG_POLY_DEGREE must be 3, 4, 5, or 6"
+#endif
+
+ bVal =
+ _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
+ _mm256_storeu_ps(bPtr, bVal);
+
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ volk_32f_log2_32f_u_generic(bPtr, aPtr, num_points - number);
+}
+
+#endif /* LV_HAVE_AVX2 for unaligned */
+
+
+#endif /* INCLUDED_volk_32f_log2_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+
+#ifndef INCLUDED_volk_32f_null_32f_a_H
+#define INCLUDED_volk_32f_null_32f_a_H
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_null_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number;
+
+ for (number = 0; number < num_points; number++) {
+ *bPtr++ = *aPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_null_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_s32f_32f_fm_detect_32f
+ *
+ * \b Overview
+ *
+ * Performs FM-detect differentiation on the input vector and stores
+ * the results in the output vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_32f_fm_detect_32f(float* outputVector, const float* inputVector,
+ * const float bound, float* saveValue, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The input vector containing phase data (must be on the interval
+ * (-bound, bound]).
+ * \li bound: The interval that the input phase data is in, which is used to modulo the
+ * differentiation.
+ * \li saveValue: A pointer to a float which contains the phase value of the sample
+ * before the first input sample.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li outputVector: The vector where the results will be stored.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ *
+ * <FIXME>
+ *
+ * volk_32f_s32f_32f_fm_detect_32f();
+ *
+ * \endcode
+ */
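+
+/* A minimal usage sketch, since the \b Example block above is still a placeholder.
+ * Buffer names, N, and the choice of bound are illustrative assumptions; the call
+ * matches the dispatcher prototype documented above, and saveValue carries the last
+ * phase sample across successive calls:
+ *
+ *   unsigned int N = 10000;
+ *   unsigned int alignment = volk_get_alignment();
+ *   float* phase_in = (float*)volk_malloc(sizeof(float) * N, alignment);
+ *   float* freq_out = (float*)volk_malloc(sizeof(float) * N, alignment);
+ *   float last_phase = 0.0f; // phase of the sample preceding the first input
+ *   const float bound = 3.141592653589793f;
+ *
+ *   // ... fill phase_in with wrapped phase values in (-bound, bound] ...
+ *
+ *   volk_32f_s32f_32f_fm_detect_32f(freq_out, phase_in, bound, &last_phase, N);
+ *
+ *   volk_free(phase_in);
+ *   volk_free(freq_out);
+ */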
+
+#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H
+#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_32f_fm_detect_32f_a_avx(float* outputVector,
+ const float* inputVector,
+ const float bound,
+ float* saveValue,
+ unsigned int num_points)
+{
+ if (num_points < 1) {
+ return;
+ }
+ unsigned int number = 1;
+ unsigned int j = 0;
+ // num_points-1 keeps Fedora 7's gcc from crashing...
+ // num_points won't work. :(
+ const unsigned int eighthPoints = (num_points - 1) / 8;
+
+ float* outPtr = outputVector;
+ const float* inPtr = inputVector;
+ __m256 upperBound = _mm256_set1_ps(bound);
+ __m256 lowerBound = _mm256_set1_ps(-bound);
+ __m256 next3old1;
+ __m256 next4;
+ __m256 boundAdjust;
+ __m256 posBoundAdjust = _mm256_set1_ps(-2 * bound); // Subtract when we're above.
+ __m256 negBoundAdjust = _mm256_set1_ps(2 * bound); // Add when we're below.
+ // Do the first 8 by hand since we're going in from the saveValue:
+ *outPtr = *inPtr - *saveValue;
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+ for (j = 1; j < ((8 < num_points) ? 8 : num_points); j++) {
+ *outPtr = *(inPtr) - *(inPtr - 1);
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ for (; number < eighthPoints; number++) {
+ // Load data
+ next3old1 = _mm256_loadu_ps((float*)(inPtr - 1));
+ next4 = _mm256_load_ps(inPtr);
+ inPtr += 8;
+ // Subtract and store:
+ next3old1 = _mm256_sub_ps(next4, next3old1);
+ // Bound:
+ boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS);
+ boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
+ next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS);
+ next4 = _mm256_and_ps(next4, negBoundAdjust);
+ boundAdjust = _mm256_or_ps(next4, boundAdjust);
+ // Make sure we're in the bounding interval:
+ next3old1 = _mm256_add_ps(next3old1, boundAdjust);
+ _mm256_store_ps(outPtr, next3old1); // Store the results back into the output
+ outPtr += 8;
+ }
+
+ for (number = (8 > (eighthPoints * 8) ? 8 : (8 * eighthPoints)); number < num_points;
+ number++) {
+ *outPtr = *(inPtr) - *(inPtr - 1);
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ *saveValue = inputVector[num_points - 1];
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_s32f_32f_fm_detect_32f_a_sse(float* outputVector,
+ const float* inputVector,
+ const float bound,
+ float* saveValue,
+ unsigned int num_points)
+{
+ if (num_points < 1) {
+ return;
+ }
+ unsigned int number = 1;
+ unsigned int j = 0;
+ // num_points-1 keeps Fedora 7's gcc from crashing...
+ // num_points won't work. :(
+ const unsigned int quarterPoints = (num_points - 1) / 4;
+
+ float* outPtr = outputVector;
+ const float* inPtr = inputVector;
+ __m128 upperBound = _mm_set_ps1(bound);
+ __m128 lowerBound = _mm_set_ps1(-bound);
+ __m128 next3old1;
+ __m128 next4;
+ __m128 boundAdjust;
+ __m128 posBoundAdjust = _mm_set_ps1(-2 * bound); // Subtract when we're above.
+ __m128 negBoundAdjust = _mm_set_ps1(2 * bound); // Add when we're below.
+ // Do the first 4 by hand since we're going in from the saveValue:
+ *outPtr = *inPtr - *saveValue;
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+ for (j = 1; j < ((4 < num_points) ? 4 : num_points); j++) {
+ *outPtr = *(inPtr) - *(inPtr - 1);
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ for (; number < quarterPoints; number++) {
+ // Load data
+ next3old1 = _mm_loadu_ps((float*)(inPtr - 1));
+ next4 = _mm_load_ps(inPtr);
+ inPtr += 4;
+ // Subtract and store:
+ next3old1 = _mm_sub_ps(next4, next3old1);
+ // Bound:
+ boundAdjust = _mm_cmpgt_ps(next3old1, upperBound);
+ boundAdjust = _mm_and_ps(boundAdjust, posBoundAdjust);
+ next4 = _mm_cmplt_ps(next3old1, lowerBound);
+ next4 = _mm_and_ps(next4, negBoundAdjust);
+ boundAdjust = _mm_or_ps(next4, boundAdjust);
+ // Make sure we're in the bounding interval:
+ next3old1 = _mm_add_ps(next3old1, boundAdjust);
+ _mm_store_ps(outPtr, next3old1); // Store the results back into the output
+ outPtr += 4;
+ }
+
+ for (number = (4 > (quarterPoints * 4) ? 4 : (4 * quarterPoints));
+ number < num_points;
+ number++) {
+ *outPtr = *(inPtr) - *(inPtr - 1);
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ *saveValue = inputVector[num_points - 1];
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_s32f_32f_fm_detect_32f_generic(float* outputVector,
+ const float* inputVector,
+ const float bound,
+ float* saveValue,
+ unsigned int num_points)
+{
+ if (num_points < 1) {
+ return;
+ }
+ unsigned int number = 0;
+ float* outPtr = outputVector;
+ const float* inPtr = inputVector;
+
+ // Do the first 1 by hand since we're going in from the saveValue:
+ *outPtr = *inPtr - *saveValue;
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+
+ for (number = 1; number < num_points; number++) {
+ *outPtr = *(inPtr) - *(inPtr - 1);
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ *saveValue = inputVector[num_points - 1];
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H
+#define INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_32f_fm_detect_32f_u_avx(float* outputVector,
+ const float* inputVector,
+ const float bound,
+ float* saveValue,
+ unsigned int num_points)
+{
+ if (num_points < 1) {
+ return;
+ }
+ unsigned int number = 1;
+ unsigned int j = 0;
+ // num_points-1 keeps Fedora 7's gcc from crashing...
+ // num_points won't work. :(
+ const unsigned int eighthPoints = (num_points - 1) / 8;
+
+ float* outPtr = outputVector;
+ const float* inPtr = inputVector;
+ __m256 upperBound = _mm256_set1_ps(bound);
+ __m256 lowerBound = _mm256_set1_ps(-bound);
+ __m256 next3old1;
+ __m256 next4;
+ __m256 boundAdjust;
+ __m256 posBoundAdjust = _mm256_set1_ps(-2 * bound); // Subtract when we're above.
+ __m256 negBoundAdjust = _mm256_set1_ps(2 * bound); // Add when we're below.
+ // Do the first 8 by hand since we're going in from the saveValue:
+ *outPtr = *inPtr - *saveValue;
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+ for (j = 1; j < ((8 < num_points) ? 8 : num_points); j++) {
+ *outPtr = *(inPtr) - *(inPtr - 1);
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ for (; number < eighthPoints; number++) {
+ // Load data
+ next3old1 = _mm256_loadu_ps((float*)(inPtr - 1));
+ next4 = _mm256_loadu_ps(inPtr);
+ inPtr += 8;
+ // Subtract and store:
+ next3old1 = _mm256_sub_ps(next4, next3old1);
+ // Bound:
+ boundAdjust = _mm256_cmp_ps(next3old1, upperBound, _CMP_GT_OS);
+ boundAdjust = _mm256_and_ps(boundAdjust, posBoundAdjust);
+ next4 = _mm256_cmp_ps(next3old1, lowerBound, _CMP_LT_OS);
+ next4 = _mm256_and_ps(next4, negBoundAdjust);
+ boundAdjust = _mm256_or_ps(next4, boundAdjust);
+ // Make sure we're in the bounding interval:
+ next3old1 = _mm256_add_ps(next3old1, boundAdjust);
+ _mm256_storeu_ps(outPtr, next3old1); // Store the results back into the output
+ outPtr += 8;
+ }
+
+ for (number = (8 > (eighthPoints * 8) ? 8 : (8 * eighthPoints)); number < num_points;
+ number++) {
+ *outPtr = *(inPtr) - *(inPtr - 1);
+ if (*outPtr > bound)
+ *outPtr -= 2 * bound;
+ if (*outPtr < -bound)
+ *outPtr += 2 * bound;
+ inPtr++;
+ outPtr++;
+ }
+
+ *saveValue = inputVector[num_points - 1];
+}
+#endif /* LV_HAVE_AVX */
+
+
+#endif /* INCLUDED_volk_32f_s32f_32f_fm_detect_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2020 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_s32f_add_32f
+ *
+ * \b Overview
+ *
+ * Adds a floating point scalar to a floating point vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_add_32f(float* cVector, const float* aVector, const float scalar,
+ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li scalar: the scalar value to add against \p aVector.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector of floats.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
+ * }
+ *
+ * // Add addshift to each entry.
+ * float addshift = 5.0f;
+ *
+ * volk_32f_s32f_add_32f(out, increasing, addshift, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_s32f_add_32f_generic(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* inputPtr = aVector;
+ float* outputPtr = cVector;
+ for (number = 0; number < num_points; number++) {
+ *outputPtr = (*inputPtr) + scalar;
+ inputPtr++;
+ outputPtr++;
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+#ifndef INCLUDED_volk_32f_s32f_add_32f_u_H
+#define INCLUDED_volk_32f_s32f_add_32f_u_H
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_s32f_add_32f_u_sse(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m128 aVal, bVal, cVal;
+ bVal = _mm_set_ps1(scalar);
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
+
+ cVal = _mm_add_ps(aVal, bVal);
+
+ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number);
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_add_32f_u_avx(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, bVal, cVal;
+ bVal = _mm256_set1_ps(scalar);
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_loadu_ps(aPtr);
+
+ cVal = _mm256_add_ps(aVal, bVal);
+
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number);
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32f_s32f_add_32f_u_neon(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* inputPtr = aVector;
+ float* outputPtr = cVector;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float32x4_t aVal, cVal, scalarvec;
+
+ scalarvec = vdupq_n_f32(scalar);
+
+ for (number = 0; number < quarterPoints; number++) {
+ aVal = vld1q_f32(inputPtr); // Load into NEON regs
+ cVal = vaddq_f32(aVal, scalarvec); // Do the add
+ vst1q_f32(outputPtr, cVal); // Store results back to output
+ inputPtr += 4;
+ outputPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ volk_32f_s32f_add_32f_generic(outputPtr, inputPtr, scalar, num_points - number);
+}
+#endif /* LV_HAVE_NEON */
+
+
+#endif /* INCLUDED_volk_32f_s32f_add_32f_u_H */
+
+
+#ifndef INCLUDED_volk_32f_s32f_add_32f_a_H
+#define INCLUDED_volk_32f_s32f_add_32f_a_H
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_s32f_add_32f_a_sse(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m128 aVal, bVal, cVal;
+ bVal = _mm_set_ps1(scalar);
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+
+ cVal = _mm_add_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number);
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_add_32f_a_avx(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, bVal, cVal;
+ bVal = _mm256_set1_ps(scalar);
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+
+ cVal = _mm256_add_ps(aVal, bVal);
+
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points - number);
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_ORC
+
+extern void volk_32f_s32f_add_32f_a_orc_impl(float* dst,
+ const float* src,
+ const float scalar,
+ unsigned int num_points);
+
+static inline void volk_32f_s32f_add_32f_u_orc(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ volk_32f_s32f_add_32f_a_orc_impl(cVector, aVector, scalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_32f_s32f_add_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_s32f_calc_spectral_noise_floor_32f
+ *
+ * \b Overview
+ *
+ * Computes the spectral noise floor of an input power spectrum.
+ *
+ * Calculates the spectral noise floor of an input power spectrum by
+ * determining the mean of the input power spectrum, then
+ * recalculating the mean excluding any power spectrum values that
+ * exceed the mean by the spectralExclusionValue (in dB). Provides a
+ * rough estimation of the signal noise floor.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_calc_spectral_noise_floor_32f(float* noiseFloorAmplitude, const
+ * float* realDataPoints, const float spectralExclusionValue, const unsigned int
+ * num_points) \endcode
+ *
+ * \b Inputs
+ * \li realDataPoints: The input power spectrum.
+ * \li spectralExclusionValue: The number of dB above the noise floor that a data point
+ * must be to be excluded from the noise floor calculation - default value is 20.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li noiseFloorAmplitude: The noise floor of the input spectrum, in dB.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ *
+ * volk_32f_s32f_calc_spectral_noise_floor_32f
+ *
+ * volk_free(x);
+ * \endcode
+ */
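+
+/* A minimal usage sketch (buffer name, N, and the filling step are illustrative
+ * assumptions; the call matches the dispatcher prototype documented above, with the
+ * default 20 dB exclusion value):
+ *
+ *   unsigned int N = 10000;
+ *   float* psd = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
+ *   float noise_floor = 0.0f;
+ *
+ *   // ... fill psd with power-spectrum values in dB ...
+ *
+ *   volk_32f_s32f_calc_spectral_noise_floor_32f(&noise_floor, psd, 20.f, N);
+ *
+ *   volk_free(psd);
+ */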
+
+#ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H
+#define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_s32f_calc_spectral_noise_floor_32f_a_avx(float* noiseFloorAmplitude,
+ const float* realDataPoints,
+    // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23))
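+    // e.g. 8.0f has a debiased exponent of 3 and zero stored significand bits, so
+    // log2(8.0f) = 3 + log2(1.0) = 3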
+ const unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* dataPointsPtr = realDataPoints;
+ __VOLK_ATTR_ALIGNED(32) float avgPointsVector[8];
+
+ __m256 dataPointsVal;
+ __m256 avgPointsVal = _mm256_setzero_ps();
+ // Calculate the sum (for mean) for all points
+ for (; number < eighthPoints; number++) {
+
+ dataPointsVal = _mm256_load_ps(dataPointsPtr);
+
+ dataPointsPtr += 8;
+
+ avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal);
+ }
+
+ _mm256_store_ps(avgPointsVector, avgPointsVal);
+
+ float sumMean = 0.0;
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+ sumMean += avgPointsVector[4];
+ sumMean += avgPointsVector[5];
+ sumMean += avgPointsVector[6];
+ sumMean += avgPointsVector[7];
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ sumMean += realDataPoints[number];
+ }
+
+ // calculate the spectral mean
+ // +20 because for the comparison below we only want to throw out bins
+    // that are significantly higher (and would, thus, affect the mean more)
+ const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+
+ dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
+ __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude);
+ __m256 vOnesVector = _mm256_set1_ps(1.0);
+ __m256 vValidBinCount = _mm256_setzero_ps();
+ avgPointsVal = _mm256_setzero_ps();
+ __m256 compareMask;
+ number = 0;
+ // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
+ for (; number < eighthPoints; number++) {
+
+ dataPointsVal = _mm256_load_ps(dataPointsPtr);
+
+ dataPointsPtr += 8;
+
+ // Identify which items do not exceed the mean amplitude
+ compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ);
+
+ // Mask off the items that exceed the mean amplitude and add the avg Points that
+ // do not exceed the mean amplitude
+ avgPointsVal =
+ _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal));
+
+ // Count the number of bins which do not exceed the mean amplitude
+ vValidBinCount =
+ _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector));
+ }
+
+ // Calculate the mean from the remaining data points
+ _mm256_store_ps(avgPointsVector, avgPointsVal);
+
+ sumMean = 0.0;
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+ sumMean += avgPointsVector[4];
+ sumMean += avgPointsVector[5];
+ sumMean += avgPointsVector[6];
+ sumMean += avgPointsVector[7];
+
+ // Calculate the number of valid bins from the remaining count
+ __VOLK_ATTR_ALIGNED(32) float validBinCountVector[8];
+ _mm256_store_ps(validBinCountVector, vValidBinCount);
+
+ float validBinCount = 0;
+ validBinCount += validBinCountVector[0];
+ validBinCount += validBinCountVector[1];
+ validBinCount += validBinCountVector[2];
+ validBinCount += validBinCountVector[3];
+ validBinCount += validBinCountVector[4];
+ validBinCount += validBinCountVector[5];
+ validBinCount += validBinCountVector[6];
+ validBinCount += validBinCountVector[7];
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ if (realDataPoints[number] <= meanAmplitude) {
+ sumMean += realDataPoints[number];
+ validBinCount += 1.0;
+ }
+ }
+
+ float localNoiseFloorAmplitude = 0;
+ if (validBinCount > 0.0) {
+ localNoiseFloorAmplitude = sumMean / validBinCount;
+ } else {
+ localNoiseFloorAmplitude =
+ meanAmplitude; // For the odd case that all the amplitudes are equal...
+ }
+
+ *noiseFloorAmplitude = localNoiseFloorAmplitude;
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_s32f_calc_spectral_noise_floor_32f_a_sse(float* noiseFloorAmplitude,
+ const float* realDataPoints,
+ const float spectralExclusionValue,
+ const unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* dataPointsPtr = realDataPoints;
+ __VOLK_ATTR_ALIGNED(16) float avgPointsVector[4];
+
+ __m128 dataPointsVal;
+ __m128 avgPointsVal = _mm_setzero_ps();
+ // Calculate the sum (for mean) for all points
+ for (; number < quarterPoints; number++) {
+
+ dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+ dataPointsPtr += 4;
+
+ avgPointsVal = _mm_add_ps(avgPointsVal, dataPointsVal);
+ }
+
+ _mm_store_ps(avgPointsVector, avgPointsVal);
+
+ float sumMean = 0.0;
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ sumMean += realDataPoints[number];
+ }
+
+ // calculate the spectral mean
+ // +20 because for the comparison below we only want to throw out bins
+    // that are significantly higher (and would, thus, affect the mean more)
+ const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+
+ dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
+ __m128 vMeanAmplitudeVector = _mm_set_ps1(meanAmplitude);
+ __m128 vOnesVector = _mm_set_ps1(1.0);
+ __m128 vValidBinCount = _mm_setzero_ps();
+ avgPointsVal = _mm_setzero_ps();
+ __m128 compareMask;
+ number = 0;
+ // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
+ for (; number < quarterPoints; number++) {
+
+ dataPointsVal = _mm_load_ps(dataPointsPtr);
+
+ dataPointsPtr += 4;
+
+ // Identify which items do not exceed the mean amplitude
+ compareMask = _mm_cmple_ps(dataPointsVal, vMeanAmplitudeVector);
+
+ // Mask off the items that exceed the mean amplitude and add the avg Points that
+ // do not exceed the mean amplitude
+ avgPointsVal = _mm_add_ps(avgPointsVal, _mm_and_ps(compareMask, dataPointsVal));
+
+ // Count the number of bins which do not exceed the mean amplitude
+ vValidBinCount = _mm_add_ps(vValidBinCount, _mm_and_ps(compareMask, vOnesVector));
+ }
+
+ // Calculate the mean from the remaining data points
+ _mm_store_ps(avgPointsVector, avgPointsVal);
+
+ sumMean = 0.0;
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+
+ // Calculate the number of valid bins from the remaining count
+ __VOLK_ATTR_ALIGNED(16) float validBinCountVector[4];
+ _mm_store_ps(validBinCountVector, vValidBinCount);
+
+ float validBinCount = 0;
+ validBinCount += validBinCountVector[0];
+ validBinCount += validBinCountVector[1];
+ validBinCount += validBinCountVector[2];
+ validBinCount += validBinCountVector[3];
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ if (realDataPoints[number] <= meanAmplitude) {
+ sumMean += realDataPoints[number];
+ validBinCount += 1.0;
+ }
+ }
+
+ float localNoiseFloorAmplitude = 0;
+ if (validBinCount > 0.0) {
+ localNoiseFloorAmplitude = sumMean / validBinCount;
+ } else {
+ localNoiseFloorAmplitude =
+ meanAmplitude; // For the odd case that all the amplitudes are equal...
+ }
+
+ *noiseFloorAmplitude = localNoiseFloorAmplitude;
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_s32f_calc_spectral_noise_floor_32f_generic(float* noiseFloorAmplitude,
+ const float* realDataPoints,
+ const float spectralExclusionValue,
+ const unsigned int num_points)
+{
+ float sumMean = 0.0;
+ unsigned int number;
+ // find the sum (for mean), etc
+ for (number = 0; number < num_points; number++) {
+ // sum (for mean)
+ sumMean += realDataPoints[number];
+ }
+
+ // calculate the spectral mean
+ // +20 because for the comparison below we only want to throw out bins
+ // that are significantly higher (and would, thus, affect the mean more)
+ const float meanAmplitude = (sumMean / num_points) + spectralExclusionValue;
+
+ // now throw out any bins higher than the mean
+ sumMean = 0.0;
+ unsigned int newNumDataPoints = num_points;
+ for (number = 0; number < num_points; number++) {
+ if (realDataPoints[number] <= meanAmplitude)
+ sumMean += realDataPoints[number];
+ else
+ newNumDataPoints--;
+ }
+
+ float localNoiseFloorAmplitude = 0.0;
+ if (newNumDataPoints == 0) // in the odd case that all
+ localNoiseFloorAmplitude = meanAmplitude; // amplitudes are equal!
+ else
+ localNoiseFloorAmplitude = sumMean / ((float)newNumDataPoints);
+
+ *noiseFloorAmplitude = localNoiseFloorAmplitude;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H
+#define INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_s32f_calc_spectral_noise_floor_32f_u_avx(float* noiseFloorAmplitude,
+ const float* realDataPoints,
+ const float spectralExclusionValue,
+ const unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* dataPointsPtr = realDataPoints;
+ __VOLK_ATTR_ALIGNED(16) float avgPointsVector[8];
+
+ __m256 dataPointsVal;
+ __m256 avgPointsVal = _mm256_setzero_ps();
+ // Calculate the sum (for mean) for all points
+ for (; number < eighthPoints; number++) {
+
+ dataPointsVal = _mm256_loadu_ps(dataPointsPtr);
+
+ dataPointsPtr += 8;
+
+ avgPointsVal = _mm256_add_ps(avgPointsVal, dataPointsVal);
+ }
+
+ _mm256_storeu_ps(avgPointsVector, avgPointsVal);
+
+ float sumMean = 0.0;
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+ sumMean += avgPointsVector[4];
+ sumMean += avgPointsVector[5];
+ sumMean += avgPointsVector[6];
+ sumMean += avgPointsVector[7];
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ sumMean += realDataPoints[number];
+ }
+
+ // calculate the spectral mean
+ // +20 because for the comparison below we only want to throw out bins
+    // that are significantly higher (and would, thus, affect the mean more)
+ const float meanAmplitude = (sumMean / ((float)num_points)) + spectralExclusionValue;
+
+ dataPointsPtr = realDataPoints; // Reset the dataPointsPtr
+ __m256 vMeanAmplitudeVector = _mm256_set1_ps(meanAmplitude);
+ __m256 vOnesVector = _mm256_set1_ps(1.0);
+ __m256 vValidBinCount = _mm256_setzero_ps();
+ avgPointsVal = _mm256_setzero_ps();
+ __m256 compareMask;
+ number = 0;
+ // Calculate the sum (for mean) for any points which do NOT exceed the mean amplitude
+ for (; number < eighthPoints; number++) {
+
+ dataPointsVal = _mm256_loadu_ps(dataPointsPtr);
+
+ dataPointsPtr += 8;
+
+ // Identify which items do not exceed the mean amplitude
+ compareMask = _mm256_cmp_ps(dataPointsVal, vMeanAmplitudeVector, _CMP_LE_OQ);
+
+ // Mask off the items that exceed the mean amplitude and add the avg Points that
+ // do not exceed the mean amplitude
+ avgPointsVal =
+ _mm256_add_ps(avgPointsVal, _mm256_and_ps(compareMask, dataPointsVal));
+
+ // Count the number of bins which do not exceed the mean amplitude
+ vValidBinCount =
+ _mm256_add_ps(vValidBinCount, _mm256_and_ps(compareMask, vOnesVector));
+ }
+
+ // Calculate the mean from the remaining data points
+ _mm256_storeu_ps(avgPointsVector, avgPointsVal);
+
+ sumMean = 0.0;
+ sumMean += avgPointsVector[0];
+ sumMean += avgPointsVector[1];
+ sumMean += avgPointsVector[2];
+ sumMean += avgPointsVector[3];
+ sumMean += avgPointsVector[4];
+ sumMean += avgPointsVector[5];
+ sumMean += avgPointsVector[6];
+ sumMean += avgPointsVector[7];
+
+ // Calculate the number of valid bins from the remaining count
+ __VOLK_ATTR_ALIGNED(16) float validBinCountVector[8];
+ _mm256_storeu_ps(validBinCountVector, vValidBinCount);
+
+ float validBinCount = 0;
+ validBinCount += validBinCountVector[0];
+ validBinCount += validBinCountVector[1];
+ validBinCount += validBinCountVector[2];
+ validBinCount += validBinCountVector[3];
+ validBinCount += validBinCountVector[4];
+ validBinCount += validBinCountVector[5];
+ validBinCount += validBinCountVector[6];
+ validBinCount += validBinCountVector[7];
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ if (realDataPoints[number] <= meanAmplitude) {
+ sumMean += realDataPoints[number];
+ validBinCount += 1.0;
+ }
+ }
+
+ float localNoiseFloorAmplitude = 0;
+ if (validBinCount > 0.0) {
+ localNoiseFloorAmplitude = sumMean / validBinCount;
+ } else {
+ localNoiseFloorAmplitude =
+ meanAmplitude; // For the odd case that all the amplitudes are equal...
+ }
+
+ *noiseFloorAmplitude = localNoiseFloorAmplitude;
+}
+#endif /* LV_HAVE_AVX */
+#endif /* INCLUDED_volk_32f_s32f_calc_spectral_noise_floor_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_s32f_convert_16i
+ *
+ * \b Overview
+ *
+ * Converts a floating point number to a 16-bit short after applying a
+ * scaling factor.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_convert_16i(int16_t* outputVector, const float* inputVector, const
+ * float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputVector: the input vector of floats.
+ * \li scalar: The value multiplied against each point in the input buffer.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li outputVector: The output vector.
+ *
+ * \b Example
+ * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest
+ * delta.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
+ * }
+ *
+ * // Normalize by the smallest delta (0.2 in this example)
+ * float scale = 5.f;
+ *
+ * volk_32f_s32f_convert_16i(out, increasing, scale, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %i\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H
+#define INCLUDED_volk_32f_s32f_convert_16i_u_H
+
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal1, inputVal2;
+ __m256i intInputVal1, intInputVal2;
+ __m256 ret1, ret2;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal1 = _mm256_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal2 = _mm256_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+
+ // Scale and clip
+ ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
+ vmin_val);
+ ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
+ vmin_val);
+
+ intInputVal1 = _mm256_cvtps_epi32(ret1);
+ intInputVal2 = _mm256_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
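+        // _mm256_packs_epi32 packs within each 128-bit lane, so permute the 64-bit
+        // blocks into (0, 2, 1, 3) order to restore the original element ordering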
+ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+
+ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal, ret;
+ __m256i intInputVal;
+ __m128i intInputVal1, intInputVal2;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
+
+ for (; number < eighthPoints; number++) {
+ inputVal = _mm256_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+
+ // Scale and clip
+ ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
+ vmin_val);
+
+ intInputVal = _mm256_cvtps_epi32(ret);
+
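+        // split the 8 int32 results into 128-bit halves and pack them down to int16
+        // with signed saturation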
+ intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
+ intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1, inputVal2;
+ __m128i intInputVal1, intInputVal2;
+ __m128 ret1, ret2;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for (; number < eighthPoints; number++) {
+ inputVal1 = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal2 = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ // Scale and clip
+ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(ret1);
+ intInputVal2 = _mm_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ ret = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ // Scale and clip
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ int16_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
+
+ for (number = 0; number < num_points; number++) {
+ r = *inputVectorPtr++ * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ *outputVectorPtr++ = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */
+#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
+#define INCLUDED_volk_32f_s32f_convert_16i_a_H
+
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal1, inputVal2;
+ __m256i intInputVal1, intInputVal2;
+ __m256 ret1, ret2;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal1 = _mm256_load_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal2 = _mm256_load_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+
+ // Scale and clip
+ ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
+ vmin_val);
+ ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
+ vmin_val);
+
+ intInputVal1 = _mm256_cvtps_epi32(ret1);
+ intInputVal2 = _mm256_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
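+        // permute the 64-bit blocks into (0, 2, 1, 3) order to undo the per-lane pack
+        // (see the unaligned AVX2 kernel above)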
+ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+
+ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal, ret;
+ __m256i intInputVal;
+ __m128i intInputVal1, intInputVal2;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
+
+ for (; number < eighthPoints; number++) {
+ inputVal = _mm256_load_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+
+ // Scale and clip
+ ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
+ vmin_val);
+
+ intInputVal = _mm256_cvtps_epi32(ret);
+
+ intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
+ intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1, inputVal2;
+ __m128i intInputVal1, intInputVal2;
+ __m128 ret1, ret2;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for (; number < eighthPoints; number++) {
+ inputVal1 = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal2 = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ // Scale and clip
+ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(ret1);
+ intInputVal2 = _mm_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
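+    // SSE1 lacks a packed float-to-int32 conversion (that arrives with SSE2), so the
+    // clipped values are staged in an aligned buffer and rounded one element at a time.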
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ ret = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ // Scale and clip
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ int16_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ float min_val = SHRT_MIN;
+ float max_val = SHRT_MAX;
+ float r;
+
+ for (number = 0; number < num_points; number++) {
+ r = *inputVectorPtr++ * scalar;
+ if (r < min_val)
+ r = min_val;
+ else if (r > max_val)
+ r = max_val;
+ *outputVectorPtr++ = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_s32f_convert_32i
+ *
+ * \b Overview
+ *
+ * Converts a floating point number to a 32-bit integer after applying a
+ * scaling factor.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_convert_32i(int32_t* outputVector, const float* inputVector, const
+ * float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputVector: the input vector of floats.
+ * \li scalar: The value multiplied against each point in the input buffer.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li outputVector: The output vector.
+ *
+ * \b Example
+ * Convert floats from [-1,1] to integers with a scale of 5 to maintain the smallest delta
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * int32_t* out = (int32_t*)volk_malloc(sizeof(int32_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
+ * }
+ *
+ * // Normalize by the smallest delta (0.2 in this example)
+ * float scale = 5.f;
+ *
+ * volk_32f_s32f_convert_32i(out, increasing, scale, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %i\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_s32f_convert_32i_u_H
+#define INCLUDED_volk_32f_s32f_convert_32i_u_H
+
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_convert_32i_u_avx(int32_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+
+ float min_val = INT_MIN;
+ float max_val = INT_MAX;
+ float r;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal1;
+ __m256i intInputVal1;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
+
+ for (; number < eighthPoints; number++) {
+ inputVal1 = _mm256_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+
+ inputVal1 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+
+ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int32_t)rintf(r);
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32f_s32f_convert_32i_u_sse2(int32_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+
+ float min_val = INT_MIN;
+ float max_val = INT_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1;
+ __m128i intInputVal1;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for (; number < quarterPoints; number++) {
+ inputVal1 = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ inputVal1 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ intInputVal1 = _mm_cvtps_epi32(inputVal1);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int32_t)rintf(r);
+ }
+}
+
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_s32f_convert_32i_u_sse(int32_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+
+ float min_val = INT_MIN;
+ float max_val = INT_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ ret = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int32_t)rintf(r);
+ }
+}
+
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_s32f_convert_32i_generic(int32_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ int32_t* outputVectorPtr = outputVector;
+ const float* inputVectorPtr = inputVector;
+ const float min_val = (float)INT_MIN;
+ const float max_val = (float)INT_MAX;
+
+ for (unsigned int number = 0; number < num_points; number++) {
+ const float r = *inputVectorPtr++ * scalar;
+ int s;
+ if (r >= max_val)
+ s = INT_MAX;
+ else if (r < min_val)
+ s = INT_MIN;
+ else
+ s = (int32_t)rintf(r);
+ *outputVectorPtr++ = s;
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_32i_u_H */
+#ifndef INCLUDED_volk_32f_s32f_convert_32i_a_H
+#define INCLUDED_volk_32f_s32f_convert_32i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_convert_32i_a_avx(int32_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+
+ float min_val = INT_MIN;
+ float max_val = INT_MAX;
+ float r;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal1;
+ __m256i intInputVal1;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
+
+ for (; number < eighthPoints; number++) {
+ inputVal1 = _mm256_load_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+
+ inputVal1 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+
+ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int32_t)rintf(r);
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32f_s32f_convert_32i_a_sse2(int32_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+
+ float min_val = INT_MIN;
+ float max_val = INT_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1;
+ __m128i intInputVal1;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for (; number < quarterPoints; number++) {
+ inputVal1 = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ inputVal1 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ intInputVal1 = _mm_cvtps_epi32(inputVal1);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int32_t)rintf(r);
+ }
+}
+
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_s32f_convert_32i_a_sse(int32_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int32_t* outputVectorPtr = outputVector;
+
+ float min_val = INT_MIN;
+ float max_val = INT_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ ret = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[0]);
+ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[1]);
+ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[2]);
+ *outputVectorPtr++ = (int32_t)rintf(outputFloatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ if (r > max_val)
+ r = max_val;
+ else if (r < min_val)
+ r = min_val;
+ outputVector[number] = (int32_t)rintf(r);
+ }
+}
+
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_s32f_convert_32i_a_generic(int32_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ volk_32f_s32f_convert_32i_generic(outputVector, inputVector, scalar, num_points);
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_s32f_convert_32i_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_s32f_convert_8i
+ *
+ * \b Overview
+ *
+ * Converts a floating point number to an 8-bit integer after applying a
+ * scaling factor.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_convert_8i(int8_t* outputVector, const float* inputVector, const
+ * float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li inputVector: the input vector of floats.
+ * \li scalar: The value multiplied against each point in the input buffer.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li outputVector: The output vector.
+ *
+ * \b Example
+ * Convert floats from [-1,1] to 8-bit integers with a scale of 5 to maintain the smallest
+ * delta
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * int8_t* out = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
+ * }
+ *
+ * // Normalize by the smallest delta (0.2 in this example)
+ * // With float -> 8 bit ints, be careful of scaling
+ * float scale = 5.1f;
+ *
+ * volk_32f_s32f_convert_8i(out, increasing, scale, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %i\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_s32f_convert_8i_u_H
+#define INCLUDED_volk_32f_s32f_convert_8i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+static inline void volk_32f_s32f_convert_8i_single(int8_t* out, const float in)
+{
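+    // Saturate the scaled value to [INT8_MIN, INT8_MAX], then round to the nearest integer.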
+ float min_val = INT8_MIN;
+ float max_val = INT8_MAX;
+ if (in > max_val) {
+ *out = (int8_t)(max_val);
+ } else if (in < min_val) {
+ *out = (int8_t)(min_val);
+ } else {
+ *out = (int8_t)(rintf(in));
+ }
+}
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_convert_8i_u_avx2(int8_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int thirtysecondPoints = num_points / 32;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int8_t* outputVectorPtr = outputVector;
+
+ float min_val = INT8_MIN;
+ float max_val = INT8_MAX;
+ float r;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal1, inputVal2, inputVal3, inputVal4;
+ __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
+ __m256i intInputVal;
+
+ for (; number < thirtysecondPoints; number++) {
+ inputVal1 = _mm256_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal2 = _mm256_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal3 = _mm256_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal4 = _mm256_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+
+ inputVal1 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ inputVal2 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+ inputVal3 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+ inputVal4 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+ intInputVal2 = _mm256_cvtps_epi32(inputVal2);
+ intInputVal3 = _mm256_cvtps_epi32(inputVal3);
+ intInputVal4 = _mm256_cvtps_epi32(inputVal4);
+
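+        // The saturating packs work within 128-bit lanes, so each pack is followed by a
+        // 64-bit-lane permute (order 0,2,1,3) to restore the original sample order.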
+ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+ intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
+ intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
+
+ intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
+ intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+
+ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
+ outputVectorPtr += 32;
+ }
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32f_s32f_convert_8i_u_sse2(int8_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int8_t* outputVectorPtr = outputVector;
+
+ float min_val = INT8_MIN;
+ float max_val = INT8_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal1 = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal2 = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal3 = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal4 = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ inputVal1 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ inputVal2 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+ inputVal3 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+ inputVal4 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(inputVal1);
+ intInputVal2 = _mm_cvtps_epi32(inputVal2);
+ intInputVal3 = _mm_cvtps_epi32(inputVal3);
+ intInputVal4 = _mm_cvtps_epi32(inputVal4);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+ intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+ intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+ }
+}
+
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_s32f_convert_8i_u_sse(int8_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ size_t inner_loop;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int8_t* outputVectorPtr = outputVector;
+
+ float min_val = INT8_MIN;
+ float max_val = INT8_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ ret = _mm_loadu_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ for (inner_loop = 0; inner_loop < 4; inner_loop++) {
+ *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
+ }
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+ }
+}
+
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ float r;
+
+ for (number = 0; number < num_points; number++) {
+ r = *inputVectorPtr++ * scalar;
+ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */
+#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
+#define INCLUDED_volk_32f_s32f_convert_8i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_convert_8i_a_avx2(int8_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int thirtysecondPoints = num_points / 32;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int8_t* outputVectorPtr = outputVector;
+
+ float min_val = INT8_MIN;
+ float max_val = INT8_MAX;
+ float r;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal1, inputVal2, inputVal3, inputVal4;
+ __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
+ __m256i intInputVal;
+
+ for (; number < thirtysecondPoints; number++) {
+ inputVal1 = _mm256_load_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal2 = _mm256_load_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal3 = _mm256_load_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal4 = _mm256_load_ps(inputVectorPtr);
+ inputVectorPtr += 8;
+
+ inputVal1 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ inputVal2 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+ inputVal3 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+ inputVal4 = _mm256_max_ps(
+ _mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+ intInputVal2 = _mm256_cvtps_epi32(inputVal2);
+ intInputVal3 = _mm256_cvtps_epi32(inputVal3);
+ intInputVal4 = _mm256_cvtps_epi32(inputVal4);
+
+ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+ intInputVal3 = _mm256_packs_epi32(intInputVal3, intInputVal4);
+ intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
+
+ intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
+ intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+
+ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
+ outputVectorPtr += 32;
+ }
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32f_s32f_convert_8i_a_sse2(int8_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int8_t* outputVectorPtr = outputVector;
+
+ float min_val = INT8_MIN;
+ float max_val = INT8_MAX;
+ float r;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal1 = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal2 = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal3 = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal4 = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ inputVal1 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
+ inputVal2 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
+ inputVal3 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal3, vScalar), vmax_val), vmin_val);
+ inputVal4 =
+ _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(inputVal1);
+ intInputVal2 = _mm_cvtps_epi32(inputVal2);
+ intInputVal3 = _mm_cvtps_epi32(inputVal3);
+ intInputVal4 = _mm_cvtps_epi32(inputVal4);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+ intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
+
+ intInputVal1 = _mm_packs_epi16(intInputVal1, intInputVal3);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_s32f_convert_8i_a_sse(int8_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ size_t inner_loop;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+
+ float min_val = INT8_MIN;
+ float max_val = INT8_MAX;
+ float r;
+
+ int8_t* outputVectorPtr = outputVector;
+ __m128 vScalar = _mm_set_ps1(scalar);
+ __m128 ret;
+ __m128 vmin_val = _mm_set_ps1(min_val);
+ __m128 vmax_val = _mm_set_ps1(max_val);
+
+ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ ret = _mm_load_ps(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
+
+ _mm_store_ps(outputFloatBuffer, ret);
+ for (inner_loop = 0; inner_loop < 4; inner_loop++) {
+ *outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
+ }
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ r = inputVector[number] * scalar;
+ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+ }
+}
+
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ const float* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ float r;
+
+ for (number = 0; number < num_points; number++) {
+ r = *inputVectorPtr++ * scalar;
+ volk_32f_s32f_convert_8i_single(&outputVector[number], r);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2017, 2020 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_VOLK_32F_S32F_MOD_RANGEPUPPET_32F_H
+#define INCLUDED_VOLK_32F_S32F_MOD_RANGEPUPPET_32F_H
+
+#include <volk/volk_32f_s32f_s32f_mod_range_32f.h>
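+
+/*
+ * This "puppet" wrapper exists so the single-scalar QA interface can exercise
+ * volk_32f_s32f_s32f_mod_range_32f, which takes two scalar bounds: each variant
+ * below simply forwards to the matching mod_range kernel with a window of width
+ * 3.141 ending at `bound`.
+ */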
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32f_s32f_mod_rangepuppet_32f_generic(float* output,
+ const float* input,
+ float bound,
+ unsigned int num_points)
+{
+ volk_32f_s32f_s32f_mod_range_32f_generic(
+ output, input, bound - 3.141f, bound, num_points);
+}
+#endif
+
+
+#ifdef LV_HAVE_SSE
+static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse(float* output,
+ const float* input,
+ float bound,
+ unsigned int num_points)
+{
+ volk_32f_s32f_s32f_mod_range_32f_u_sse(
+ output, input, bound - 3.141f, bound, num_points);
+}
+#endif
+#ifdef LV_HAVE_SSE
+static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse(float* output,
+ const float* input,
+ float bound,
+ unsigned int num_points)
+{
+ volk_32f_s32f_s32f_mod_range_32f_a_sse(
+ output, input, bound - 3.141f, bound, num_points);
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_32f_s32f_mod_rangepuppet_32f_u_sse2(float* output,
+ const float* input,
+ float bound,
+ unsigned int num_points)
+{
+ volk_32f_s32f_s32f_mod_range_32f_u_sse2(
+ output, input, bound - 3.141f, bound, num_points);
+}
+#endif
+#ifdef LV_HAVE_SSE2
+static inline void volk_32f_s32f_mod_rangepuppet_32f_a_sse2(float* output,
+ const float* input,
+ float bound,
+ unsigned int num_points)
+{
+ volk_32f_s32f_s32f_mod_range_32f_a_sse2(
+ output, input, bound - 3.141f, bound, num_points);
+}
+#endif
+
+#ifdef LV_HAVE_AVX
+static inline void volk_32f_s32f_mod_rangepuppet_32f_u_avx(float* output,
+ const float* input,
+ float bound,
+ unsigned int num_points)
+{
+ volk_32f_s32f_s32f_mod_range_32f_u_avx(
+ output, input, bound - 3.141f, bound, num_points);
+}
+#endif
+#ifdef LV_HAVE_AVX
+static inline void volk_32f_s32f_mod_rangepuppet_32f_a_avx(float* output,
+ const float* input,
+ float bound,
+ unsigned int num_points)
+{
+ volk_32f_s32f_s32f_mod_range_32f_a_avx(
+ output, input, bound - 3.141f, bound, num_points);
+}
+#endif
+#endif /* INCLUDED_VOLK_32F_S32F_MOD_RANGEPUPPET_32F_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_s32f_multiply_32f
+ *
+ * \b Overview
+ *
+ * Multiplies a floating point vector by a floating point scalar.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_multiply_32f(float* cVector, const float* aVector, const float
+ * scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li scalar: the scalar value to multiply against \p aVector.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector of floats.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
+ * }
+ *
+ * // Normalize by the smallest delta (0.2 in this example)
+ * float scale = 5.0f;
+ *
+ * volk_32f_s32f_multiply_32f(out, increasing, scale, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
+#define INCLUDED_volk_32f_s32f_multiply_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m128 aVal, bVal, cVal;
+ bVal = _mm_set_ps1(scalar);
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
+ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, bVal, cVal;
+ bVal = _mm256_set1_ps(scalar);
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_loadu_ps(aPtr);
+
+ cVal = _mm256_mul_ps(aVal, bVal);
+
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * scalar;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_s32f_multiply_32f_generic(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* inputPtr = aVector;
+ float* outputPtr = cVector;
+ for (number = 0; number < num_points; number++) {
+ *outputPtr = (*inputPtr) * scalar;
+ inputPtr++;
+ outputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */
+
+
+#ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H
+#define INCLUDED_volk_32f_s32f_multiply_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m128 aVal, bVal, cVal;
+ bVal = _mm_set_ps1(scalar);
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, bVal, cVal;
+ bVal = _mm256_set1_ps(scalar);
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+
+ cVal = _mm256_mul_ps(aVal, bVal);
+
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * scalar;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* inputPtr = aVector;
+ float* outputPtr = cVector;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float32x4_t aVal, cVal;
+
+ for (number = 0; number < quarterPoints; number++) {
+ aVal = vld1q_f32(inputPtr); // Load into NEON regs
+ cVal = vmulq_n_f32(aVal, scalar); // Do the multiply
+ vst1q_f32(outputPtr, cVal); // Store results back to output
+ inputPtr += 4;
+ outputPtr += 4;
+ }
+ for (number = quarterPoints * 4; number < num_points; number++) {
+ *outputPtr++ = (*inputPtr++) * scalar;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* inputPtr = aVector;
+ float* outputPtr = cVector;
+ for (number = 0; number < num_points; number++) {
+ *outputPtr = (*inputPtr) * scalar;
+ inputPtr++;
+ outputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+
+extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst,
+ const float* src,
+ const float scalar,
+ unsigned int num_points);
+
+static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector,
+ const float* aVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
+}
+
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_s32f_normalize
+ *
+ * \b Overview
+ *
+ * Normalizes all points in the buffer by the scalar value (divides
+ * each data point by the scalar value).
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_normalize(float* vecBuffer, const float scalar, unsigned int
+ * num_points) \endcode
+ *
+ * \b Inputs
+ * \li vecBuffer: The buffer of values to be normalized.
+ * \li scalar: The scale value to be applied to each buffer value.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li vecBuffer: The results are returned in place (the input buffer is overwritten).
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
+ * }
+ *
+ * // Divide each value by the scalar
+ * float scale = 5.0f;
+ *
+ * volk_32f_s32f_normalize(increasing, scale, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("increasing[%u] = %f\n", ii, increasing[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_s32f_normalize_a_H
+#define INCLUDED_volk_32f_s32f_normalize_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_normalize_a_avx(float* vecBuffer,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ float* inputPtr = vecBuffer;
+
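+    // Dividing by the scalar is implemented as a multiply by its reciprocal.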
+ const float invScalar = 1.0 / scalar;
+ __m256 vecScalar = _mm256_set1_ps(invScalar);
+
+ __m256 input1;
+
+ const uint64_t eighthPoints = num_points / 8;
+ for (; number < eighthPoints; number++) {
+
+ input1 = _mm256_load_ps(inputPtr);
+
+ input1 = _mm256_mul_ps(input1, vecScalar);
+
+ _mm256_store_ps(inputPtr, input1);
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *inputPtr *= invScalar;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_s32f_normalize_a_sse(float* vecBuffer,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ float* inputPtr = vecBuffer;
+
+ const float invScalar = 1.0 / scalar;
+ __m128 vecScalar = _mm_set_ps1(invScalar);
+
+ __m128 input1;
+
+ const uint64_t quarterPoints = num_points / 4;
+ for (; number < quarterPoints; number++) {
+
+ input1 = _mm_load_ps(inputPtr);
+
+ input1 = _mm_mul_ps(input1, vecScalar);
+
+ _mm_store_ps(inputPtr, input1);
+
+ inputPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *inputPtr *= invScalar;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_s32f_normalize_generic(float* vecBuffer,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ float* inputPtr = vecBuffer;
+ const float invScalar = 1.0 / scalar;
+ for (number = 0; number < num_points; number++) {
+ *inputPtr *= invScalar;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+
+extern void volk_32f_s32f_normalize_a_orc_impl(float* dst,
+ float* src,
+ const float scalar,
+ unsigned int num_points);
+static inline void volk_32f_s32f_normalize_u_orc(float* vecBuffer,
+ const float scalar,
+ unsigned int num_points)
+{
+ float invscalar = 1.0 / scalar;
+ volk_32f_s32f_normalize_a_orc_impl(vecBuffer, vecBuffer, invscalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_32f_s32f_normalize_a_H */
+
+#ifndef INCLUDED_volk_32f_s32f_normalize_u_H
+#define INCLUDED_volk_32f_s32f_normalize_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_normalize_u_avx(float* vecBuffer,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ float* inputPtr = vecBuffer;
+
+ const float invScalar = 1.0 / scalar;
+ __m256 vecScalar = _mm256_set1_ps(invScalar);
+
+ __m256 input1;
+
+ const uint64_t eighthPoints = num_points / 8;
+ for (; number < eighthPoints; number++) {
+
+ input1 = _mm256_loadu_ps(inputPtr);
+
+ input1 = _mm256_mul_ps(input1, vecScalar);
+
+ _mm256_storeu_ps(inputPtr, input1);
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *inputPtr *= invScalar;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#endif /* INCLUDED_volk_32f_s32f_normalize_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_s32f_power_32f
+ *
+ * \b Overview
+ *
+ * Takes each input vector value to the specified power and stores the
+ * results in the return vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_power_32f(float* cVector, const float* aVector, const float power,
+ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li power: The power to raise the input value to.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * Square the numbers (0,9)
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * }
+ *
+ * // The power to raise each input value to
+ * float scale = 2.0f;
+ *
+ * volk_32f_s32f_power_32f(out, increasing, scale, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_s32f_power_32f_a_H
+#define INCLUDED_volk_32f_s32f_power_32f_a_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+static inline void volk_32f_s32f_power_32f_a_sse4_1(float* cVector,
+ const float* aVector,
+ const float power,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 vPower = _mm_set_ps1(power);
+ __m128 zeroValue = _mm_setzero_ps();
+ __m128 signMask;
+ __m128 negatedValues;
+ __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
+ __m128 onesMask = _mm_set_ps1(1);
+
+ __m128 aVal, cVal;
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm_load_ps(aPtr);
+ signMask = _mm_cmplt_ps(aVal, zeroValue);
+ negatedValues = _mm_sub_ps(zeroValue, aVal);
+ aVal = _mm_blendv_ps(aVal, negatedValues, signMask);
+
+ // powf4 doesn't support negative values in the base, so we mask them off and then
+ // apply the negative after
+ cVal = powf4(aVal, vPower); // Takes each input value to the specified power
+
+ cVal = _mm_mul_ps(_mm_blendv_ps(onesMask, negativeOneToPower, signMask), cVal);
+
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+ for (; number < num_points; number++) {
+ *cPtr++ = powf((*aPtr++), power);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+static inline void volk_32f_s32f_power_32f_a_sse(float* cVector,
+ const float* aVector,
+ const float power,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 vPower = _mm_set_ps1(power);
+ __m128 zeroValue = _mm_setzero_ps();
+ __m128 signMask;
+ __m128 negatedValues;
+ __m128 negativeOneToPower = _mm_set_ps1(powf(-1, power));
+ __m128 onesMask = _mm_set_ps1(1);
+
+ __m128 aVal, cVal;
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm_load_ps(aPtr);
+ signMask = _mm_cmplt_ps(aVal, zeroValue);
+ negatedValues = _mm_sub_ps(zeroValue, aVal);
+ aVal =
+ _mm_or_ps(_mm_andnot_ps(signMask, aVal), _mm_and_ps(signMask, negatedValues));
+
+ // powf4 doesn't support negative values in the base, so we mask them off and then
+ // apply the negative after
+ cVal = powf4(aVal, vPower); // Takes each input value to the specified power
+
+ cVal = _mm_mul_ps(_mm_or_ps(_mm_andnot_ps(signMask, onesMask),
+ _mm_and_ps(signMask, negativeOneToPower)),
+ cVal);
+
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+ for (; number < num_points; number++) {
+ *cPtr++ = powf((*aPtr++), power);
+ }
+}
+
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_s32f_power_32f_generic(float* cVector,
+ const float* aVector,
+ const float power,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = powf((*aPtr++), power);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_power_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2017 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_s32f_s32f_mod_range_32f
+ *
+ * \b Overview
+ *
+ * Wraps floating point numbers to stay within a defined [min, max] range.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_s32f_mod_range_32f(float* outputVector, const float* inputVector,
+ * const float lower_bound, const float upper_bound, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The input vector
+ * \li lower_bound: The lower output boundary
+ * \li upper_bound: The upper output boundary
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li outputVector: The vector where the results will be stored.
+ *
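+ * \b Example
+ * A minimal sketch (values are illustrative): wrap phase samples into [-pi, pi).
+ * \code
+ * int N = 4;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * in[0] = 0.5f; in[1] = 4.0f; in[2] = -7.0f; in[3] = 3.0f;
+ *
+ * volk_32f_s32f_s32f_mod_range_32f(out, in, -3.14159f, 3.14159f, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ *     printf("out[%u] = %f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode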
+ */
+
+#ifndef INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
+#define INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_s32f_s32f_mod_range_32f_generic(float* outputVector,
+ const float* inputVector,
+ const float lower_bound,
+ const float upper_bound,
+ unsigned int num_points)
+{
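+    // Shift each out-of-range sample by an integer multiple of the range width
+    // (upper_bound - lower_bound) so it lands back inside [lower_bound, upper_bound];
+    // samples already inside the range are copied through unchanged.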
+ float* outPtr = outputVector;
+ const float* inPtr;
+ const float distance = upper_bound - lower_bound;
+
+ for (inPtr = inputVector; inPtr < inputVector + num_points; inPtr++) {
+ float val = *inPtr;
+ if (val < lower_bound) {
+ float excess = lower_bound - val;
+ signed int count = (int)(excess / distance);
+ *outPtr = val + (count + 1) * distance;
+ } else if (val > upper_bound) {
+ float excess = val - upper_bound;
+ signed int count = (int)(excess / distance);
+ *outPtr = val - (count + 1) * distance;
+ } else
+ *outPtr = val;
+ outPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_s32f_mod_range_32f_u_avx(float* outputVector,
+ const float* inputVector,
+ const float lower_bound,
+ const float upper_bound,
+ unsigned int num_points)
+{
+ const __m256 lower = _mm256_set1_ps(lower_bound);
+ const __m256 upper = _mm256_set1_ps(upper_bound);
+ const __m256 distance = _mm256_sub_ps(upper, lower);
+ __m256 input, output;
+ __m256 is_smaller, is_bigger;
+ __m256 excess, adj;
+
+ const float* inPtr = inputVector;
+ float* outPtr = outputVector;
+ const size_t eight_points = num_points / 8;
+ for (size_t counter = 0; counter < eight_points; counter++) {
+ input = _mm256_loadu_ps(inPtr);
+ // calculate mask: input < lower, input > upper
+ is_smaller = _mm256_cmp_ps(
+ input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
+ is_bigger = _mm256_cmp_ps(
+ input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
+ // find out how far we are out-of-bound – positive values!
+ excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
+ excess =
+ _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
+ // how many do we have to add? (int(excess/distance+1)*distance)
+ excess = _mm256_div_ps(excess, distance);
+ // round down
+ excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
+ // plus 1
+ adj = _mm256_set1_ps(1.0f);
+ excess = _mm256_add_ps(excess, adj);
+ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+ adj = _mm256_and_ps(adj, is_smaller);
+ adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
+ // scale by distance, sign
+ excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
+ output = _mm256_add_ps(input, excess);
+ _mm256_storeu_ps(outPtr, output);
+ inPtr += 8;
+ outPtr += 8;
+ }
+
+ volk_32f_s32f_s32f_mod_range_32f_generic(
+ outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
+}
+static inline void volk_32f_s32f_s32f_mod_range_32f_a_avx(float* outputVector,
+ const float* inputVector,
+ const float lower_bound,
+ const float upper_bound,
+ unsigned int num_points)
+{
+ const __m256 lower = _mm256_set1_ps(lower_bound);
+ const __m256 upper = _mm256_set1_ps(upper_bound);
+ const __m256 distance = _mm256_sub_ps(upper, lower);
+ __m256 input, output;
+ __m256 is_smaller, is_bigger;
+ __m256 excess, adj;
+
+ const float* inPtr = inputVector;
+ float* outPtr = outputVector;
+ const size_t eight_points = num_points / 8;
+ for (size_t counter = 0; counter < eight_points; counter++) {
+ input = _mm256_load_ps(inPtr);
+ // calculate mask: input < lower, input > upper
+ is_smaller = _mm256_cmp_ps(
+ input, lower, _CMP_LT_OQ); // 0x11: Less than, ordered, non-signalling
+ is_bigger = _mm256_cmp_ps(
+ input, upper, _CMP_GT_OQ); // 0x1e: greater than, ordered, non-signalling
+ // find out how far we are out-of-bound – positive values!
+ excess = _mm256_and_ps(_mm256_sub_ps(lower, input), is_smaller);
+ excess =
+ _mm256_or_ps(_mm256_and_ps(_mm256_sub_ps(input, upper), is_bigger), excess);
+ // how many do we have to add? (int(excess/distance+1)*distance)
+ excess = _mm256_div_ps(excess, distance);
+ // round down
+ excess = _mm256_cvtepi32_ps(_mm256_cvttps_epi32(excess));
+ // plus 1
+ adj = _mm256_set1_ps(1.0f);
+ excess = _mm256_add_ps(excess, adj);
+ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+ adj = _mm256_and_ps(adj, is_smaller);
+ adj = _mm256_or_ps(_mm256_and_ps(_mm256_set1_ps(-1.0f), is_bigger), adj);
+ // scale by distance, sign
+ excess = _mm256_mul_ps(_mm256_mul_ps(excess, adj), distance);
+ output = _mm256_add_ps(input, excess);
+ _mm256_store_ps(outPtr, output);
+ inPtr += 8;
+ outPtr += 8;
+ }
+
+ volk_32f_s32f_s32f_mod_range_32f_generic(
+ outPtr, inPtr, lower_bound, upper_bound, num_points - eight_points * 8);
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse2(float* outputVector,
+ const float* inputVector,
+ const float lower_bound,
+ const float upper_bound,
+ unsigned int num_points)
+{
+ const __m128 lower = _mm_set_ps1(lower_bound);
+ const __m128 upper = _mm_set_ps1(upper_bound);
+ const __m128 distance = _mm_sub_ps(upper, lower);
+ __m128 input, output;
+ __m128 is_smaller, is_bigger;
+ __m128 excess, adj;
+
+ const float* inPtr = inputVector;
+ float* outPtr = outputVector;
+ const size_t quarter_points = num_points / 4;
+ for (size_t counter = 0; counter < quarter_points; counter++) {
+        input = _mm_loadu_ps(inPtr);
+ // calculate mask: input < lower, input > upper
+ is_smaller = _mm_cmplt_ps(input, lower);
+ is_bigger = _mm_cmpgt_ps(input, upper);
+ // find out how far we are out-of-bound – positive values!
+ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
+ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
+ // how many do we have to add? (int(excess/distance+1)*distance)
+ excess = _mm_div_ps(excess, distance);
+ // round down
+ excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
+ // plus 1
+ adj = _mm_set_ps1(1.0f);
+ excess = _mm_add_ps(excess, adj);
+ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+ adj = _mm_and_ps(adj, is_smaller);
+ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
+ // scale by distance, sign
+ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
+ output = _mm_add_ps(input, excess);
+        _mm_storeu_ps(outPtr, output);
+ inPtr += 4;
+ outPtr += 4;
+ }
+
+ volk_32f_s32f_s32f_mod_range_32f_generic(
+ outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
+}
+static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse2(float* outputVector,
+ const float* inputVector,
+ const float lower_bound,
+ const float upper_bound,
+ unsigned int num_points)
+{
+ const __m128 lower = _mm_set_ps1(lower_bound);
+ const __m128 upper = _mm_set_ps1(upper_bound);
+ const __m128 distance = _mm_sub_ps(upper, lower);
+ __m128 input, output;
+ __m128 is_smaller, is_bigger;
+ __m128 excess, adj;
+
+ const float* inPtr = inputVector;
+ float* outPtr = outputVector;
+ const size_t quarter_points = num_points / 4;
+ for (size_t counter = 0; counter < quarter_points; counter++) {
+ input = _mm_load_ps(inPtr);
+ // calculate mask: input < lower, input > upper
+ is_smaller = _mm_cmplt_ps(input, lower);
+ is_bigger = _mm_cmpgt_ps(input, upper);
+ // find out how far we are out-of-bound – positive values!
+ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
+ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
+ // how many do we have to add? (int(excess/distance+1)*distance)
+ excess = _mm_div_ps(excess, distance);
+        // round down (truncate the quotient toward zero)
+ excess = _mm_cvtepi32_ps(_mm_cvttps_epi32(excess));
+ // plus 1
+ adj = _mm_set_ps1(1.0f);
+ excess = _mm_add_ps(excess, adj);
+ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+ adj = _mm_and_ps(adj, is_smaller);
+ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
+ // scale by distance, sign
+ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
+ output = _mm_add_ps(input, excess);
+ _mm_store_ps(outPtr, output);
+ inPtr += 4;
+ outPtr += 4;
+ }
+
+ volk_32f_s32f_s32f_mod_range_32f_generic(
+ outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_SSE
+#include <emmintrin.h>
+
+static inline void volk_32f_s32f_s32f_mod_range_32f_u_sse(float* outputVector,
+ const float* inputVector,
+ const float lower_bound,
+ const float upper_bound,
+ unsigned int num_points)
+{
+ const __m128 lower = _mm_set_ps1(lower_bound);
+ const __m128 upper = _mm_set_ps1(upper_bound);
+ const __m128 distance = _mm_sub_ps(upper, lower);
+ __m128 input, output;
+ __m128 is_smaller, is_bigger;
+ __m128 excess, adj;
+ __m128i rounddown;
+
+ const float* inPtr = inputVector;
+ float* outPtr = outputVector;
+ const size_t quarter_points = num_points / 4;
+ for (size_t counter = 0; counter < quarter_points; counter++) {
+        input = _mm_loadu_ps(inPtr);
+ // calculate mask: input < lower, input > upper
+ is_smaller = _mm_cmplt_ps(input, lower);
+ is_bigger = _mm_cmpgt_ps(input, upper);
+ // find out how far we are out-of-bound – positive values!
+ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
+ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
+ // how many do we have to add? (int(excess/distance+1)*distance)
+ excess = _mm_div_ps(excess, distance);
+        // round down
+ rounddown = _mm_cvttps_epi32(excess);
+ excess = _mm_cvtepi32_ps(rounddown);
+ // plus 1
+ adj = _mm_set_ps1(1.0f);
+ excess = _mm_add_ps(excess, adj);
+ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+ adj = _mm_and_ps(adj, is_smaller);
+ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
+ // scale by distance, sign
+ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
+ output = _mm_add_ps(input, excess);
+        _mm_storeu_ps(outPtr, output);
+ inPtr += 4;
+ outPtr += 4;
+ }
+
+ volk_32f_s32f_s32f_mod_range_32f_generic(
+ outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
+}
+static inline void volk_32f_s32f_s32f_mod_range_32f_a_sse(float* outputVector,
+ const float* inputVector,
+ const float lower_bound,
+ const float upper_bound,
+ unsigned int num_points)
+{
+ const __m128 lower = _mm_set_ps1(lower_bound);
+ const __m128 upper = _mm_set_ps1(upper_bound);
+ const __m128 distance = _mm_sub_ps(upper, lower);
+ __m128 input, output;
+ __m128 is_smaller, is_bigger;
+ __m128 excess, adj;
+ __m128i rounddown;
+
+ const float* inPtr = inputVector;
+ float* outPtr = outputVector;
+ const size_t quarter_points = num_points / 4;
+ for (size_t counter = 0; counter < quarter_points; counter++) {
+ input = _mm_load_ps(inPtr);
+ // calculate mask: input < lower, input > upper
+ is_smaller = _mm_cmplt_ps(input, lower);
+ is_bigger = _mm_cmpgt_ps(input, upper);
+ // find out how far we are out-of-bound – positive values!
+ excess = _mm_and_ps(_mm_sub_ps(lower, input), is_smaller);
+ excess = _mm_or_ps(_mm_and_ps(_mm_sub_ps(input, upper), is_bigger), excess);
+ // how many do we have to add? (int(excess/distance+1)*distance)
+ excess = _mm_div_ps(excess, distance);
+ // round down
+ rounddown = _mm_cvttps_epi32(excess);
+ excess = _mm_cvtepi32_ps(rounddown);
+ // plus 1
+ adj = _mm_set_ps1(1.0f);
+ excess = _mm_add_ps(excess, adj);
+ // get the sign right, adj is still {1.0f,1.0f,1.0f,1.0f}
+ adj = _mm_and_ps(adj, is_smaller);
+ adj = _mm_or_ps(_mm_and_ps(_mm_set_ps1(-1.0f), is_bigger), adj);
+ // scale by distance, sign
+ excess = _mm_mul_ps(_mm_mul_ps(excess, adj), distance);
+ output = _mm_add_ps(input, excess);
+ _mm_store_ps(outPtr, output);
+ inPtr += 4;
+ outPtr += 4;
+ }
+
+ volk_32f_s32f_s32f_mod_range_32f_generic(
+ outPtr, inPtr, lower_bound, upper_bound, num_points - quarter_points * 4);
+}
+#endif /* LV_HAVE_SSE */
+
+
+#endif /* INCLUDED_VOLK_32F_S32F_S32F_MOD_RANGE_32F_A_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_s32f_stddev_32f
+ *
+ * \b Overview
+ *
+ * Computes the standard deviation of the input buffer using the supplied mean.
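+ * The result is computed as stddev = sqrt(sum(x[i]^2) / num_points - mean * mean),
+ * i.e. the root of the mean of the squares minus the square of the supplied mean.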
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_s32f_stddev_32f(float* stddev, const float* inputBuffer, const float
+ * mean, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputBuffer: The input vector of floats.
+ * \li mean: The mean of the input buffer.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li stddev: The calculated standard deviation.
+ *
+ * \b Example
+ * Calculate the standard deviation of numbers generated with C++11's normal generator.
+ * \code
+ * int N = 1000;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float mean = 0.0f;
+ * float* stddev = (float*)volk_malloc(sizeof(float), alignment);
+ *
+ * // Use a normal generator with 0 mean, stddev = 1
+ * std::default_random_engine generator;
+ * std::normal_distribution<float> distribution(mean,1);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = distribution(generator);
+ * }
+ *
+ * volk_32f_s32f_stddev_32f(stddev, increasing, mean, N);
+ *
+ * printf("std. dev. = %f\n", *stddev);
+ *
+ * volk_free(increasing);
+ * volk_free(stddev);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H
+#define INCLUDED_volk_32f_s32f_stddev_32f_a_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev,
+ const float* inputBuffer,
+ const float mean,
+ unsigned int num_points)
+{
+ float returnValue = 0;
+ if (num_points > 0) {
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* aPtr = inputBuffer;
+
+ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+
+ __m128 squareAccumulator = _mm_setzero_ps();
+ __m128 aVal1, aVal2, aVal3, aVal4;
+ __m128 cVal1, cVal2, cVal3, cVal4;
+ for (; number < sixteenthPoints; number++) {
+ aVal1 = _mm_load_ps(aPtr);
+ aPtr += 4;
+ cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
+
+ aVal2 = _mm_load_ps(aPtr);
+ aPtr += 4;
+ cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
+
+ aVal3 = _mm_load_ps(aPtr);
+ aPtr += 4;
+ cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
+
+ aVal4 = _mm_load_ps(aPtr);
+ aPtr += 4;
+ cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
+
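+            // Each dp_ps mask (0xF1/0xF2/0xF4/0xF8) writes its dot product into a
+            // different output lane, so the ORs below pack four partial sums into one
+            // vector before accumulating.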
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ squareAccumulator =
+ _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+ }
+ _mm_store_ps(squareBuffer,
+ squareAccumulator); // Store the results back into the C container
+ returnValue = squareBuffer[0];
+ returnValue += squareBuffer[1];
+ returnValue += squareBuffer[2];
+ returnValue += squareBuffer[3];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr) * (*aPtr);
+ aPtr++;
+ }
+ returnValue /= num_points;
+ returnValue -= (mean * mean);
+ returnValue = sqrtf(returnValue);
+ }
+ *stddev = returnValue;
+}
+
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev,
+ const float* inputBuffer,
+ const float mean,
+ unsigned int num_points)
+{
+ float returnValue = 0;
+ if (num_points > 0) {
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* aPtr = inputBuffer;
+
+ __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
+
+ __m128 squareAccumulator = _mm_setzero_ps();
+ __m128 aVal = _mm_setzero_ps();
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr); // aVal = x
+ aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
+ squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
+ aPtr += 4;
+ }
+ _mm_store_ps(squareBuffer,
+ squareAccumulator); // Store the results back into the C container
+ returnValue = squareBuffer[0];
+ returnValue += squareBuffer[1];
+ returnValue += squareBuffer[2];
+ returnValue += squareBuffer[3];
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr) * (*aPtr);
+ aPtr++;
+ }
+ returnValue /= num_points;
+ returnValue -= (mean * mean);
+ returnValue = sqrtf(returnValue);
+ }
+ *stddev = returnValue;
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_stddev_32f_a_avx(float* stddev,
+ const float* inputBuffer,
+ const float mean,
+ unsigned int num_points)
+{
+ float stdDev = 0;
+ if (num_points > 0) {
+ unsigned int number = 0;
+ const unsigned int thirtySecondthPoints = num_points / 32;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
+
+ __m256 squareAccumulator = _mm256_setzero_ps();
+ __m256 aVal1, aVal2, aVal3, aVal4;
+ __m256 cVal1, cVal2, cVal3, cVal4;
+ for (; number < thirtySecondthPoints; number++) {
+ aVal1 = _mm256_load_ps(aPtr);
+ aPtr += 8;
+ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
+
+ aVal2 = _mm256_load_ps(aPtr);
+ aPtr += 8;
+ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
+
+ aVal3 = _mm256_load_ps(aPtr);
+ aPtr += 8;
+ cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
+
+ aVal4 = _mm256_load_ps(aPtr);
+ aPtr += 8;
+ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
+
+ cVal1 = _mm256_or_ps(cVal1, cVal2);
+ cVal3 = _mm256_or_ps(cVal3, cVal4);
+ cVal1 = _mm256_or_ps(cVal1, cVal3);
+
+ squareAccumulator =
+ _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+ }
+ _mm256_store_ps(squareBuffer,
+ squareAccumulator); // Store the results back into the C container
+ stdDev = squareBuffer[0];
+ stdDev += squareBuffer[1];
+ stdDev += squareBuffer[2];
+ stdDev += squareBuffer[3];
+ stdDev += squareBuffer[4];
+ stdDev += squareBuffer[5];
+ stdDev += squareBuffer[6];
+ stdDev += squareBuffer[7];
+
+ number = thirtySecondthPoints * 32;
+ for (; number < num_points; number++) {
+ stdDev += (*aPtr) * (*aPtr);
+ aPtr++;
+ }
+ stdDev /= num_points;
+ stdDev -= (mean * mean);
+ stdDev = sqrtf(stdDev);
+ }
+ *stddev = stdDev;
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_s32f_stddev_32f_generic(float* stddev,
+ const float* inputBuffer,
+ const float mean,
+ unsigned int num_points)
+{
+ float returnValue = 0;
+ if (num_points > 0) {
+ const float* aPtr = inputBuffer;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ returnValue += (*aPtr) * (*aPtr);
+ aPtr++;
+ }
+
+ returnValue /= num_points;
+ returnValue -= (mean * mean);
+ returnValue = sqrtf(returnValue);
+ }
+ *stddev = returnValue;
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_s32f_stddev_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_s32f_stddev_32f_u_H
+#define INCLUDED_volk_32f_s32f_stddev_32f_u_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_s32f_stddev_32f_u_avx(float* stddev,
+ const float* inputBuffer,
+ const float mean,
+ unsigned int num_points)
+{
+ float stdDev = 0;
+ if (num_points > 0) {
+ unsigned int number = 0;
+ const unsigned int thirtySecondthPoints = num_points / 32;
+
+ const float* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
+
+ __m256 squareAccumulator = _mm256_setzero_ps();
+ __m256 aVal1, aVal2, aVal3, aVal4;
+ __m256 cVal1, cVal2, cVal3, cVal4;
+ for (; number < thirtySecondthPoints; number++) {
+ aVal1 = _mm256_loadu_ps(aPtr);
+ aPtr += 8;
+ cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
+
+ aVal2 = _mm256_loadu_ps(aPtr);
+ aPtr += 8;
+ cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
+
+ aVal3 = _mm256_loadu_ps(aPtr);
+ aPtr += 8;
+ cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
+
+ aVal4 = _mm256_loadu_ps(aPtr);
+ aPtr += 8;
+ cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
+
+ cVal1 = _mm256_or_ps(cVal1, cVal2);
+ cVal3 = _mm256_or_ps(cVal3, cVal4);
+ cVal1 = _mm256_or_ps(cVal1, cVal3);
+
+ squareAccumulator =
+ _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
+ }
+ _mm256_storeu_ps(
+ squareBuffer,
+ squareAccumulator); // Store the results back into the C container
+ stdDev = squareBuffer[0];
+ stdDev += squareBuffer[1];
+ stdDev += squareBuffer[2];
+ stdDev += squareBuffer[3];
+ stdDev += squareBuffer[4];
+ stdDev += squareBuffer[5];
+ stdDev += squareBuffer[6];
+ stdDev += squareBuffer[7];
+
+ number = thirtySecondthPoints * 32;
+ for (; number < num_points; number++) {
+ stdDev += (*aPtr) * (*aPtr);
+ aPtr++;
+ }
+ stdDev /= num_points;
+ stdDev -= (mean * mean);
+ stdDev = sqrtf(stdDev);
+ }
+ *stddev = stdDev;
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_s32f_stddev_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_sin_32f
+ *
+ * \b Overview
+ *
+ * Computes the sine of the input vector and stores the results in the output vector.
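+ *
+ * b[i] = sin(a[i])
+ *
+ * The SIMD implementations reduce each argument by multiples of pi/4, evaluate a short
+ * Taylor polynomial on the remainder and select sine/cosine and sign from the octant;
+ * the scalar tail loops fall back to sinf().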
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_sin_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li bVector: The output vector.
+ *
+ * \b Example
+ * Calculate sin(theta) for several common angles.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * in[0] = 0.000;
+ * in[1] = 0.524;
+ * in[2] = 0.786;
+ * in[3] = 1.047;
+ * in[4] = 1.571;
+ * in[5] = 1.571;
+ * in[6] = 2.094;
+ * in[7] = 2.356;
+ * in[8] = 2.618;
+ * in[9] = 3.142;
+ *
+ * volk_32f_sin_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("sin(%1.3f) = %1.3f\n", in[ii], out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+
+#ifndef INCLUDED_volk_32f_sin_32f_a_H
+#define INCLUDED_volk_32f_sin_32f_a_H
+#ifdef LV_HAVE_AVX512F
+
+#include <immintrin.h>
+static inline void volk_32f_sin_32f_a_avx512f(float* sinVector,
+ const float* inVector,
+ unsigned int num_points)
+{
+ float* sinPtr = sinVector;
+ const float* inPtr = inVector;
+
+ unsigned int number = 0;
+ unsigned int sixteenPoints = num_points / 16;
+ unsigned int i = 0;
+
+ __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
+ fones;
+ __m512 sine, cosine;
+ __m512i q, zeros, ones, twos, fours;
+
+ m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
+ pio4A = _mm512_set1_ps(0.7853981554508209228515625);
+ pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
+ pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+ ffours = _mm512_set1_ps(4.0);
+ ftwos = _mm512_set1_ps(2.0);
+ fones = _mm512_set1_ps(1.0);
+ zeros = _mm512_setzero_epi32();
+ ones = _mm512_set1_epi32(1);
+ twos = _mm512_set1_epi32(2);
+ fours = _mm512_set1_epi32(4);
+
+ cp1 = _mm512_set1_ps(1.0);
+ cp2 = _mm512_set1_ps(0.08333333333333333);
+ cp3 = _mm512_set1_ps(0.002777777777777778);
+ cp4 = _mm512_set1_ps(4.96031746031746e-05);
+ cp5 = _mm512_set1_ps(5.511463844797178e-07);
+ __mmask16 condition1, condition2, ltZero;
+
+ for (; number < sixteenPoints; number++) {
+ aVal = _mm512_load_ps(inPtr);
+ // s = fabs(aVal)
+        s = _mm512_castsi512_ps(
+            _mm512_and_si512(_mm512_castps_si512(aVal), _mm512_set1_epi32(0x7fffffff)));
+
+ // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+ q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
+        // r = q + (q & 1): round q up to even; r*pi/4 is subtracted from s below
+ r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
+
+ s = _mm512_fnmadd_ps(r, pio4A, s);
+ s = _mm512_fnmadd_ps(r, pio4B, s);
+ s = _mm512_fnmadd_ps(r, pio4C, s);
+
+ s = _mm512_div_ps(
+ s,
+ _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm512_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm512_mul_ps(
+ _mm512_fmadd_ps(
+ _mm512_fmsub_ps(
+ _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+ s,
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++)
+ s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
+ s = _mm512_div_ps(s, ftwos);
+
+ sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
+ cosine = _mm512_sub_ps(fones, s);
+
+ condition1 = _mm512_cmpneq_epi32_mask(
+ _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
+ ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
+ condition2 = _mm512_kxor(
+ _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
+
+ sine = _mm512_mask_blend_ps(condition1, sine, cosine);
+ sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
+ _mm512_store_ps(sinPtr, sine);
+ inPtr += 16;
+ sinPtr += 16;
+ }
+
+ number = sixteenPoints * 16;
+ for (; number < num_points; number++) {
+ *sinPtr++ = sinf(*inPtr++);
+ }
+}
+#endif
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void
+volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m256 sine, cosine, condition1, condition2;
+ __m256i q, r, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239545);
+ pio4A = _mm256_set1_ps(0.78515625);
+ pio4B = _mm256_set1_ps(0.241876e-3);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ ones = _mm256_set1_epi32(1);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.83333333e-1);
+ cp3 = _mm256_set1_ps(0.2777778e-2);
+ cp4 = _mm256_set1_ps(0.49603e-4);
+ cp5 = _mm256_set1_ps(0.551e-6);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
+ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_fmadd_ps(
+ _mm256_fmsub_ps(
+ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+ s,
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
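+        // condition1 selects the cosine branch; condition2 decides whether the result
+        // must be negated, based on the octant bit (q & 4) and the sign of the input.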
+ condition1 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+ condition2 = _mm256_cmp_ps(
+ _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+ _CMP_NEQ_UQ);
+ // Need this condition only for cos
+ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
+ // twos), fours)), fzeroes);
+
+ sine =
+ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
+ sine = _mm256_sub_ps(
+ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+ _mm256_store_ps(bPtr, sine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+        *bPtr++ = sinf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m256 sine, cosine, condition1, condition2;
+ __m256i q, r, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239545);
+ pio4A = _mm256_set1_ps(0.78515625);
+ pio4B = _mm256_set1_ps(0.241876e-3);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ ones = _mm256_set1_epi32(1);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.83333333e-1);
+ cp3 = _mm256_set1_ps(0.2777778e-2);
+ cp4 = _mm256_set1_ps(0.49603e-4);
+ cp5 = _mm256_set1_ps(0.551e-6);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
+ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(
+ _mm256_sub_ps(
+ _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
+ s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ condition1 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+ condition2 = _mm256_cmp_ps(
+ _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+ _CMP_NEQ_UQ);
+ // Need this condition only for cos
+ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
+ // twos), fours)), fzeroes);
+
+ sine =
+ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
+ sine = _mm256_sub_ps(
+ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+ _mm256_store_ps(bPtr, sine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+        *bPtr++ = sinf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 for aligned */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m128 sine, cosine, condition1, condition2;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ s = _mm_sub_ps(aVal,
+ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(
+ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm_mul_ps(
+ _mm_add_ps(
+ _mm_mul_ps(
+ _mm_sub_ps(
+ _mm_mul_ps(
+ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 = _mm_cmpneq_ps(
+ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+ condition2 = _mm_cmpneq_ps(
+ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
+ _mm_cmplt_ps(aVal, fzeroes));
+ // Need this condition only for cos
+ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
+ // twos), fours)), fzeroes);
+
+ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
+ sine =
+ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+ _mm_store_ps(bPtr, sine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = sinf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+
+#endif /* INCLUDED_volk_32f_sin_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_sin_32f_u_H
+#define INCLUDED_volk_32f_sin_32f_u_H
+
+#ifdef LV_HAVE_AVX512F
+
+#include <immintrin.h>
+static inline void volk_32f_sin_32f_u_avx512f(float* sinVector,
+ const float* inVector,
+ unsigned int num_points)
+{
+ float* sinPtr = sinVector;
+ const float* inPtr = inVector;
+
+ unsigned int number = 0;
+ unsigned int sixteenPoints = num_points / 16;
+ unsigned int i = 0;
+
+ __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
+ fones;
+ __m512 sine, cosine;
+ __m512i q, zeros, ones, twos, fours;
+
+ m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
+ pio4A = _mm512_set1_ps(0.7853981554508209228515625);
+ pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
+ pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
+ ffours = _mm512_set1_ps(4.0);
+ ftwos = _mm512_set1_ps(2.0);
+ fones = _mm512_set1_ps(1.0);
+ zeros = _mm512_setzero_epi32();
+ ones = _mm512_set1_epi32(1);
+ twos = _mm512_set1_epi32(2);
+ fours = _mm512_set1_epi32(4);
+
+ cp1 = _mm512_set1_ps(1.0);
+ cp2 = _mm512_set1_ps(0.08333333333333333);
+ cp3 = _mm512_set1_ps(0.002777777777777778);
+ cp4 = _mm512_set1_ps(4.96031746031746e-05);
+ cp5 = _mm512_set1_ps(5.511463844797178e-07);
+ __mmask16 condition1, condition2, ltZero;
+
+ for (; number < sixteenPoints; number++) {
+ aVal = _mm512_loadu_ps(inPtr);
+ // s = fabs(aVal)
+        s = _mm512_castsi512_ps(
+            _mm512_and_si512(_mm512_castps_si512(aVal), _mm512_set1_epi32(0x7fffffff)));
+
+ // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
+ q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
+        // r = q + (q & 1): round q up to even; r*pi/4 is subtracted from s below
+ r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
+
+ s = _mm512_fnmadd_ps(r, pio4A, s);
+ s = _mm512_fnmadd_ps(r, pio4B, s);
+ s = _mm512_fnmadd_ps(r, pio4C, s);
+
+ s = _mm512_div_ps(
+ s,
+ _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm512_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm512_mul_ps(
+ _mm512_fmadd_ps(
+ _mm512_fmsub_ps(
+ _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+ s,
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++)
+ s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
+ s = _mm512_div_ps(s, ftwos);
+
+ sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
+ cosine = _mm512_sub_ps(fones, s);
+
+ condition1 = _mm512_cmpneq_epi32_mask(
+ _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
+ ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
+ condition2 = _mm512_kxor(
+ _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
+
+ sine = _mm512_mask_blend_ps(condition1, sine, cosine);
+ sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
+ _mm512_storeu_ps(sinPtr, sine);
+ inPtr += 16;
+ sinPtr += 16;
+ }
+
+ number = sixteenPoints * 16;
+ for (; number < num_points; number++) {
+ *sinPtr++ = sinf(*inPtr++);
+ }
+}
+#endif
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void
+volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m256 sine, cosine, condition1, condition2;
+ __m256i q, r, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239545);
+ pio4A = _mm256_set1_ps(0.78515625);
+ pio4B = _mm256_set1_ps(0.241876e-3);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ ones = _mm256_set1_epi32(1);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.83333333e-1);
+ cp3 = _mm256_set1_ps(0.2777778e-2);
+ cp4 = _mm256_set1_ps(0.49603e-4);
+ cp5 = _mm256_set1_ps(0.551e-6);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
+ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_fmadd_ps(
+ _mm256_fmsub_ps(
+ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+ s,
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ condition1 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+ condition2 = _mm256_cmp_ps(
+ _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+ _CMP_NEQ_UQ);
+ // Need this condition only for cos
+ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
+ // twos), fours)), fzeroes);
+
+ sine =
+ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
+ sine = _mm256_sub_ps(
+ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+ _mm256_storeu_ps(bPtr, sine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+        *bPtr++ = sinf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m256 sine, cosine, condition1, condition2;
+ __m256i q, r, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239545);
+ pio4A = _mm256_set1_ps(0.78515625);
+ pio4B = _mm256_set1_ps(0.241876e-3);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ ones = _mm256_set1_epi32(1);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.83333333e-1);
+ cp3 = _mm256_set1_ps(0.2777778e-2);
+ cp4 = _mm256_set1_ps(0.49603e-4);
+ cp5 = _mm256_set1_ps(0.551e-6);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
+ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(
+ _mm256_sub_ps(
+ _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
+ s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ condition1 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+ condition2 = _mm256_cmp_ps(
+ _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+ _CMP_NEQ_UQ);
+ // Need this condition only for cos
+ // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
+ // twos), fours)), fzeroes);
+
+ sine =
+ _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
+ sine = _mm256_sub_ps(
+ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+ _mm256_storeu_ps(bPtr, sine);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+        *bPtr++ = sinf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 for unaligned */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m128 sine, cosine, condition1, condition2;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
+ s = _mm_sub_ps(aVal,
+ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(
+ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm_mul_ps(
+ _mm_add_ps(
+ _mm_mul_ps(
+ _mm_sub_ps(
+ _mm_mul_ps(
+ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 = _mm_cmpneq_ps(
+ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+ condition2 = _mm_cmpneq_ps(
+ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
+ _mm_cmplt_ps(aVal, fzeroes));
+
+ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
+ sine =
+ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+ _mm_storeu_ps(bPtr, sine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = sinf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *bPtr++ = sinf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+#include <volk/volk_neon_intrinsics.h>
+
+static inline void
+volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ float* bVectorPtr = bVector;
+ const float* aVectorPtr = aVector;
+
+ float32x4_t b_vec;
+ float32x4_t a_vec;
+
+ for (number = 0; number < quarter_points; number++) {
+ a_vec = vld1q_f32(aVectorPtr);
+ // Prefetch next one, speeds things up
+ __VOLK_PREFETCH(aVectorPtr + 4);
+ b_vec = _vsinq_f32(a_vec);
+ vst1q_f32(bVectorPtr, b_vec);
+ // move pointers ahead
+ bVectorPtr += 4;
+ aVectorPtr += 4;
+ }
+
+ // Deal with the rest
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *bVectorPtr++ = sinf(*aVectorPtr++);
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+
+#endif /* INCLUDED_volk_32f_sin_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_sqrt_32f
+ *
+ * \b Overview
+ *
+ * Computes the square root of the input vector and stores the results
+ * in the output vector.
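+ *
+ * c[i] = sqrt(a[i])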
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_sqrt_32f(float* cVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * \code
+ int N = 10;
+ unsigned int alignment = volk_get_alignment();
+ float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+
+ for(unsigned int ii = 0; ii < N; ++ii){
+ in[ii] = (float)(ii*ii);
+ }
+
+ volk_32f_sqrt_32f(out, in, N);
+
+ for(unsigned int ii = 0; ii < N; ++ii){
+ printf("out(%i) = %f\n", ii, out[ii]);
+ }
+
+ volk_free(in);
+ volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_sqrt_32f_a_H
+#define INCLUDED_volk_32f_sqrt_32f_a_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m128 aVal, cVal;
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+
+ cVal = _mm_sqrt_ps(aVal);
+
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = sqrtf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+
+ cVal = _mm256_sqrt_ps(aVal);
+
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = sqrtf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ float32x4_t in_vec, out_vec;
+
+ for (number = 0; number < quarter_points; number++) {
+ in_vec = vld1q_f32(aPtr);
+        // Approximate sqrt(x) as 1/rsqrt(x) using the NEON estimate instructions; note
+        // that ARMv8 provides vsqrtq_f32, which computes an exact square root.
+        out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec));
+ vst1q_f32(cPtr, out_vec);
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cPtr++ = sqrtf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = sqrtf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+
+extern void volk_32f_sqrt_32f_a_orc_impl(float*, const float*, unsigned int);
+
+static inline void
+volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_points)
+{
+ volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points);
+}
+
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_32f_sqrt_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_sqrt_32f_u_H
+#define INCLUDED_volk_32f_sqrt_32f_u_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+
+ cVal = _mm256_sqrt_ps(aVal);
+
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = sqrtf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+#endif /* INCLUDED_volk_32f_sqrt_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014, 2021 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_stddev_and_mean_32f_x2
+ *
+ * \b Overview
+ *
+ * Computes the standard deviation and mean of the input buffer in a single pass
+ * using Youngs and Cramer's algorithm.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_stddev_and_mean_32f_x2(float* stddev, float* mean, const float*
+ * inputBuffer, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputBuffer: The buffer of points.
+ * \li num_points: The number of values in the input buffer.
+ *
+ * \b Outputs
+ * \li stddev: The calculated standard deviation.
+ * \li mean: The mean of the input buffer.
+ *
+ * \b Example
+ * Generate random numbers with C++11's normal distribution and estimate the mean and
+ * standard deviation
+ * \code
+ * int N = 1000;
+ * unsigned int alignment = volk_get_alignment();
+ * float* rand_numbers = (float*) volk_malloc(sizeof(float)*N, alignment);
+ * float* mean = (float*) volk_malloc(sizeof(float), alignment);
+ * float* stddev = (float*) volk_malloc(sizeof(float), alignment);
+ *
+ * // Use a normal generator with 0 mean, stddev 1000
+ * std::default_random_engine generator;
+ * std::normal_distribution<float> distribution(0, 1000);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii) {
+ * rand_numbers[ii] = distribution(generator);
+ * }
+ *
+ * volk_32f_stddev_and_mean_32f_x2(stddev, mean, rand_numbers, N);
+ *
+ * printf("std. dev. = %f\n", *stddev);
+ * printf("mean = %f\n", *mean);
+ *
+ * volk_free(rand_numbers);
+ * volk_free(mean);
+ * volk_free(stddev);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
+#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <volk/volk_common.h>
+
+// Youngs and Cramer's Algorithm for calculating std and mean
+// Using the methods discussed here:
+// https://doi.org/10.1145/3221269.3223036
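+//
+// With T_j = x_1 + ... + x_j, a new sample x_j (j >= 2) updates the running sums as
+//   T_j = T_{j-1} + x_j
+//   S_j = S_{j-1} + (j * x_j - T_j)^2 / (j * (j - 1))
+// and the final results are stddev = sqrt(S_N / N) and mean = T_N / N.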
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev,
+ float* mean,
+ const float* inputBuffer,
+ unsigned int num_points)
+{
+ const float* in_ptr = inputBuffer;
+ if (num_points == 0) {
+ return;
+ } else if (num_points == 1) {
+ *stddev = 0.f;
+ *mean = (*in_ptr);
+ return;
+ }
+
+ float Sum[2];
+ float SquareSum[2] = { 0.f, 0.f };
+ Sum[0] = (*in_ptr++);
+ Sum[1] = (*in_ptr++);
+
+ uint32_t half_points = num_points / 2;
+
+ for (uint32_t number = 1; number < half_points; number++) {
+ float Val0 = (*in_ptr++);
+ float Val1 = (*in_ptr++);
+ float n = (float)number;
+ float n_plus_one = n + 1.f;
+ float r = 1.f / (n * n_plus_one);
+
+ Sum[0] += Val0;
+ Sum[1] += Val1;
+
+ SquareSum[0] += r * powf(n_plus_one * Val0 - Sum[0], 2);
+ SquareSum[1] += r * powf(n_plus_one * Val1 - Sum[1], 2);
+ }
+
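+    // Merge the two interleaved streams: both Sum/SquareSum pairs cover half_points
+    // samples, so they combine with the same rule as add_square_sums() below.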
+    SquareSum[0] += SquareSum[1] + .5f / half_points * powf(Sum[0] - Sum[1], 2);
+ Sum[0] += Sum[1];
+
+ uint32_t points_done = half_points * 2;
+
+ for (; points_done < num_points; points_done++) {
+ float Val = (*in_ptr++);
+ float n = (float)points_done;
+ float n_plus_one = n + 1.f;
+ float r = 1.f / (n * n_plus_one);
+ Sum[0] += Val;
+ SquareSum[0] += r * powf(n_plus_one * Val - Sum[0], 2);
+ }
+ *stddev = sqrtf(SquareSum[0] / num_points);
+ *mean = Sum[0] / num_points;
+}
+#endif /* LV_HAVE_GENERIC */
+
+static inline float update_square_sum_1_val(const float SquareSum,
+ const float Sum,
+ const uint32_t len,
+ const float val)
+{
+ // Updates a sum of squares calculated over len values with the value val
+ float n = (float)len;
+ float n_plus_one = n + 1.f;
+ return SquareSum +
+ 1.f / (n * n_plus_one) * (n_plus_one * val - Sum) * (n_plus_one * val - Sum);
+}
+
+static inline float add_square_sums(const float SquareSum0,
+ const float Sum0,
+ const float SquareSum1,
+ const float Sum1,
+ const uint32_t len)
+{
+ // Add two sums of squares calculated over the same number of values, len
+ float n = (float)len;
+ return SquareSum0 + SquareSum1 + .5f / n * (Sum0 - Sum1) * (Sum0 - Sum1);
+}
+
+static inline void accrue_result(float* PartialSquareSums,
+ float* PartialSums,
+ const uint32_t NumberOfPartitions,
+ const uint32_t PartitionLen)
+{
+ // Add all partial sums and square sums into the first element of the arrays
+ uint32_t accumulators = NumberOfPartitions;
+ uint32_t stages = 0;
+ uint32_t offset = 1;
+ uint32_t partition_len = PartitionLen;
+
+ while (accumulators >>= 1) {
+ stages++;
+ } // Integer log2
+ accumulators = NumberOfPartitions;
+
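+    // Pairwise tree reduction: each stage combines partitions that are 'offset' apart,
+    // halving the number of accumulators and doubling the partition length.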
+ for (uint32_t s = 0; s < stages; s++) {
+ accumulators /= 2;
+ uint32_t idx = 0;
+ for (uint32_t a = 0; a < accumulators; a++) {
+ PartialSquareSums[idx] = add_square_sums(PartialSquareSums[idx],
+ PartialSums[idx],
+ PartialSquareSums[idx + offset],
+ PartialSums[idx + offset],
+ partition_len);
+ PartialSums[idx] += PartialSums[idx + offset];
+ idx += 2 * offset;
+ }
+ offset *= 2;
+ partition_len *= 2;
+ }
+}
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+#include <volk/volk_neon_intrinsics.h>
+
+static inline void volk_32f_stddev_and_mean_32f_x2_neon(float* stddev,
+ float* mean,
+ const float* inputBuffer,
+ unsigned int num_points)
+{
+ if (num_points < 8) {
+ volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
+ return;
+ }
+
+ const float* in_ptr = inputBuffer;
+
+ __VOLK_ATTR_ALIGNED(32) float SumLocal[8] = { 0.f };
+ __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[8] = { 0.f };
+
+ const uint32_t eigth_points = num_points / 8;
+
+ float32x4_t Sum0, Sum1;
+
+ Sum0 = vld1q_f32((const float32_t*)in_ptr);
+ in_ptr += 4;
+ __VOLK_PREFETCH(in_ptr + 4);
+
+ Sum1 = vld1q_f32((const float32_t*)in_ptr);
+ in_ptr += 4;
+ __VOLK_PREFETCH(in_ptr + 4);
+
+ float32x4_t SquareSum0 = { 0.f };
+ float32x4_t SquareSum1 = { 0.f };
+
+ float32x4_t Values0, Values1;
+ float32x4_t Aux0, Aux1;
+ float32x4_t Reciprocal;
+
+ for (uint32_t number = 1; number < eigth_points; number++) {
+ Values0 = vld1q_f32(in_ptr);
+ in_ptr += 4;
+ __VOLK_PREFETCH(in_ptr + 4);
+
+ Values1 = vld1q_f32(in_ptr);
+ in_ptr += 4;
+ __VOLK_PREFETCH(in_ptr + 4);
+
+ float n = (float)number;
+ float n_plus_one = n + 1.f;
+ Reciprocal = vdupq_n_f32(1.f / (n * n_plus_one));
+
+ Sum0 = vaddq_f32(Sum0, Values0);
+ Aux0 = vdupq_n_f32(n_plus_one);
+ SquareSum0 =
+ _neon_accumulate_square_sum_f32(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
+
+ Sum1 = vaddq_f32(Sum1, Values1);
+ Aux1 = vdupq_n_f32(n_plus_one);
+ SquareSum1 =
+ _neon_accumulate_square_sum_f32(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
+ }
+
+ vst1q_f32(&SumLocal[0], Sum0);
+ vst1q_f32(&SumLocal[4], Sum1);
+ vst1q_f32(&SquareSumLocal[0], SquareSum0);
+ vst1q_f32(&SquareSumLocal[4], SquareSum1);
+
+ accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
+
+ uint32_t points_done = eigth_points * 8;
+
+ for (; points_done < num_points; points_done++) {
+ float val = (*in_ptr++);
+ SumLocal[0] += val;
+ SquareSumLocal[0] =
+ update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
+ }
+
+ *stddev = sqrtf(SquareSumLocal[0] / num_points);
+ *mean = SumLocal[0] / num_points;
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_SSE
+#include <volk/volk_sse_intrinsics.h>
+#include <xmmintrin.h>
+
+static inline void volk_32f_stddev_and_mean_32f_x2_u_sse(float* stddev,
+ float* mean,
+ const float* inputBuffer,
+ unsigned int num_points)
+{
+ if (num_points < 8) {
+ volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
+ return;
+ }
+
+ const float* in_ptr = inputBuffer;
+
+ __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f };
+ __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f };
+
+
+ const uint32_t eigth_points = num_points / 8;
+
+ __m128 Sum0 = _mm_loadu_ps(in_ptr);
+ in_ptr += 4;
+ __m128 Sum1 = _mm_loadu_ps(in_ptr);
+ in_ptr += 4;
+ __m128 SquareSum0 = _mm_setzero_ps();
+ __m128 SquareSum1 = _mm_setzero_ps();
+ __m128 Values0, Values1;
+ __m128 Aux0, Aux1;
+ __m128 Reciprocal;
+
+ for (uint32_t number = 1; number < eigth_points; number++) {
+ Values0 = _mm_loadu_ps(in_ptr);
+ in_ptr += 4;
+ __VOLK_PREFETCH(in_ptr + 4);
+
+ Values1 = _mm_loadu_ps(in_ptr);
+ in_ptr += 4;
+ __VOLK_PREFETCH(in_ptr + 4);
+
+ float n = (float)number;
+ float n_plus_one = n + 1.f;
+ Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
+
+ Sum0 = _mm_add_ps(Sum0, Values0);
+ Aux0 = _mm_set_ps1(n_plus_one);
+ SquareSum0 =
+ _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
+
+ Sum1 = _mm_add_ps(Sum1, Values1);
+ Aux1 = _mm_set_ps1(n_plus_one);
+ SquareSum1 =
+ _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
+ }
+
+ _mm_store_ps(&SumLocal[0], Sum0);
+ _mm_store_ps(&SumLocal[4], Sum1);
+ _mm_store_ps(&SquareSumLocal[0], SquareSum0);
+ _mm_store_ps(&SquareSumLocal[4], SquareSum1);
+
+ accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
+
+ uint32_t points_done = eigth_points * 8;
+
+ for (; points_done < num_points; points_done++) {
+ float val = (*in_ptr++);
+ SumLocal[0] += val;
+ SquareSumLocal[0] =
+ update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
+ }
+
+ *stddev = sqrtf(SquareSumLocal[0] / num_points);
+ *mean = SumLocal[0] / num_points;
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev,
+ float* mean,
+ const float* inputBuffer,
+ unsigned int num_points)
+{
+ if (num_points < 16) {
+ volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
+ return;
+ }
+
+ const float* in_ptr = inputBuffer;
+
+ __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f };
+ __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f };
+
+ const unsigned int sixteenth_points = num_points / 16;
+
+ __m256 Sum0 = _mm256_loadu_ps(in_ptr);
+ in_ptr += 8;
+ __m256 Sum1 = _mm256_loadu_ps(in_ptr);
+ in_ptr += 8;
+
+ __m256 SquareSum0 = _mm256_setzero_ps();
+ __m256 SquareSum1 = _mm256_setzero_ps();
+ __m256 Values0, Values1;
+ __m256 Aux0, Aux1;
+ __m256 Reciprocal;
+
+ for (uint32_t number = 1; number < sixteenth_points; number++) {
+ Values0 = _mm256_loadu_ps(in_ptr);
+ in_ptr += 8;
+ __VOLK_PREFETCH(in_ptr + 8);
+
+ Values1 = _mm256_loadu_ps(in_ptr);
+ in_ptr += 8;
+ __VOLK_PREFETCH(in_ptr + 8);
+
+ float n = (float)number;
+ float n_plus_one = n + 1.f;
+
+ Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
+
+ Sum0 = _mm256_add_ps(Sum0, Values0);
+ Aux0 = _mm256_set1_ps(n_plus_one);
+ SquareSum0 =
+ _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
+
+ Sum1 = _mm256_add_ps(Sum1, Values1);
+ Aux1 = _mm256_set1_ps(n_plus_one);
+ SquareSum1 =
+ _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
+ }
+
+ _mm256_store_ps(&SumLocal[0], Sum0);
+ _mm256_store_ps(&SumLocal[8], Sum1);
+ _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
+ _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
+
+ accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
+
+ uint32_t points_done = sixteenth_points * 16;
+
+ for (; points_done < num_points; points_done++) {
+ float val = (*in_ptr++);
+ SumLocal[0] += val;
+ SquareSumLocal[0] =
+ update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
+ }
+
+ *stddev = sqrtf(SquareSumLocal[0] / num_points);
+ *mean = SumLocal[0] / num_points;
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev,
+ float* mean,
+ const float* inputBuffer,
+ unsigned int num_points)
+{
+ if (num_points < 8) {
+ volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
+ return;
+ }
+
+ const float* in_ptr = inputBuffer;
+
+ __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f };
+ __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f };
+
+
+ const uint32_t eigth_points = num_points / 8;
+
+ __m128 Sum0 = _mm_load_ps(in_ptr);
+ in_ptr += 4;
+ __m128 Sum1 = _mm_load_ps(in_ptr);
+ in_ptr += 4;
+ __m128 SquareSum0 = _mm_setzero_ps();
+ __m128 SquareSum1 = _mm_setzero_ps();
+ __m128 Values0, Values1;
+ __m128 Aux0, Aux1;
+ __m128 Reciprocal;
+
+ for (uint32_t number = 1; number < eigth_points; number++) {
+ Values0 = _mm_load_ps(in_ptr);
+ in_ptr += 4;
+ __VOLK_PREFETCH(in_ptr + 4);
+
+ Values1 = _mm_load_ps(in_ptr);
+ in_ptr += 4;
+ __VOLK_PREFETCH(in_ptr + 4);
+
+ float n = (float)number;
+ float n_plus_one = n + 1.f;
+ Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
+
+ Sum0 = _mm_add_ps(Sum0, Values0);
+ Aux0 = _mm_set_ps1(n_plus_one);
+ SquareSum0 =
+ _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
+
+ Sum1 = _mm_add_ps(Sum1, Values1);
+ Aux1 = _mm_set_ps1(n_plus_one);
+ SquareSum1 =
+ _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
+ }
+
+ _mm_store_ps(&SumLocal[0], Sum0);
+ _mm_store_ps(&SumLocal[4], Sum1);
+ _mm_store_ps(&SquareSumLocal[0], SquareSum0);
+ _mm_store_ps(&SquareSumLocal[4], SquareSum1);
+
+ accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
+
+ uint32_t points_done = eigth_points * 8;
+
+ for (; points_done < num_points; points_done++) {
+ float val = (*in_ptr++);
+ SumLocal[0] += val;
+ SquareSumLocal[0] =
+ update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
+ }
+
+ *stddev = sqrtf(SquareSumLocal[0] / num_points);
+ *mean = SumLocal[0] / num_points;
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev,
+ float* mean,
+ const float* inputBuffer,
+ unsigned int num_points)
+{
+ if (num_points < 16) {
+ volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
+ return;
+ }
+
+ const float* in_ptr = inputBuffer;
+
+ __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f };
+ __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f };
+
+ const unsigned int sixteenth_points = num_points / 16;
+
+ __m256 Sum0 = _mm256_load_ps(in_ptr);
+ in_ptr += 8;
+ __m256 Sum1 = _mm256_load_ps(in_ptr);
+ in_ptr += 8;
+
+ __m256 SquareSum0 = _mm256_setzero_ps();
+ __m256 SquareSum1 = _mm256_setzero_ps();
+ __m256 Values0, Values1;
+ __m256 Aux0, Aux1;
+ __m256 Reciprocal;
+
+ for (uint32_t number = 1; number < sixteenth_points; number++) {
+ Values0 = _mm256_load_ps(in_ptr);
+ in_ptr += 8;
+ __VOLK_PREFETCH(in_ptr + 8);
+
+ Values1 = _mm256_load_ps(in_ptr);
+ in_ptr += 8;
+ __VOLK_PREFETCH(in_ptr + 8);
+
+ float n = (float)number;
+ float n_plus_one = n + 1.f;
+
+ Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
+
+ Sum0 = _mm256_add_ps(Sum0, Values0);
+ Aux0 = _mm256_set1_ps(n_plus_one);
+ SquareSum0 =
+ _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
+
+ Sum1 = _mm256_add_ps(Sum1, Values1);
+ Aux1 = _mm256_set1_ps(n_plus_one);
+ SquareSum1 =
+ _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
+ }
+
+ _mm256_store_ps(&SumLocal[0], Sum0);
+ _mm256_store_ps(&SumLocal[8], Sum1);
+ _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
+ _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
+
+ accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
+
+ uint32_t points_done = sixteenth_points * 16;
+
+ for (; points_done < num_points; points_done++) {
+ float val = (*in_ptr++);
+ SumLocal[0] += val;
+ SquareSumLocal[0] =
+ update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
+ }
+
+ *stddev = sqrtf(SquareSumLocal[0] / num_points);
+ *mean = SumLocal[0] / num_points;
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_tan_32f
+ *
+ * \b Overview
+ *
+ * Computes the tangent of each element of the aVector.
+ *
+ * b[i] = tan(a[i])
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_tan_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The buffer of points.
+ * \li num_points: The number of values in the input buffer.
+ *
+ * \b Outputs
+ * \li bVector: The output buffer.
+ *
+ * \b Example
+ * Calculate tan(theta) for common angles.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * in[0] = 0.000;
+ * in[1] = 0.524;
+ * in[2] = 0.785;
+ * in[3] = 1.047;
+ * in[4] = 1.571;
+ * in[5] = 1.571;
+ * in[6] = -1.047;
+ * in[7] = -0.785;
+ * in[8] = -0.524;
+ * in[9] = -0.000;
+ *
+ * volk_32f_tan_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("tan(%1.3f) = %1.3f\n", in[ii], out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+
+#ifndef INCLUDED_volk_32f_tan_32f_a_H
+#define INCLUDED_volk_32f_tan_32f_a_H
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void
+volk_32f_tan_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m256 sine, cosine, tangent, condition1, condition2, condition3;
+ __m256i q, r, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239545);
+ pio4A = _mm256_set1_ps(0.78515625);
+ pio4B = _mm256_set1_ps(0.241876e-3);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ ones = _mm256_set1_epi32(1);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.83333333e-1);
+ cp3 = _mm256_set1_ps(0.2777778e-2);
+ cp4 = _mm256_set1_ps(0.49603e-4);
+ cp5 = _mm256_set1_ps(0.551e-6);
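+    // m4pi is 4/pi and selects the octant of the argument; pio4A + pio4B split
+    // pi/4 into two parts so that subtracting r*pi/4 keeps extra precision.
+    // cp1..cp5 are series coefficients of 2*(1 - cos(x)) for the reduced argument,
+    // which is first divided by 8; the s = s*(4 - s) loop below undoes that
+    // scaling through the double-angle identity.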
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
+ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_fmadd_ps(
+ _mm256_fmsub_ps(
+ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+ s,
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ condition1 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+ condition2 = _mm256_cmp_ps(
+ _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+ _CMP_NEQ_UQ);
+ condition3 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+
+ __m256 temp = cosine;
+ cosine =
+ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
+ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
+ sine = _mm256_sub_ps(
+ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+ cosine = _mm256_sub_ps(
+ cosine,
+ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
+ tangent = _mm256_div_ps(sine, cosine);
+ _mm256_store_ps(bPtr, tangent);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+        *bPtr++ = tanf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_32f_tan_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m256 sine, cosine, tangent, condition1, condition2, condition3;
+ __m256i q, r, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239545);
+ pio4A = _mm256_set1_ps(0.78515625);
+ pio4B = _mm256_set1_ps(0.241876e-3);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ ones = _mm256_set1_epi32(1);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.83333333e-1);
+ cp3 = _mm256_set1_ps(0.2777778e-2);
+ cp4 = _mm256_set1_ps(0.49603e-4);
+ cp5 = _mm256_set1_ps(0.551e-6);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
+ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(
+ _mm256_sub_ps(
+ _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
+ s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ condition1 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+ condition2 = _mm256_cmp_ps(
+ _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+ _CMP_NEQ_UQ);
+ condition3 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+
+ __m256 temp = cosine;
+ cosine =
+ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
+ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
+ sine = _mm256_sub_ps(
+ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+ cosine = _mm256_sub_ps(
+ cosine,
+ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
+ tangent = _mm256_div_ps(sine, cosine);
+ _mm256_store_ps(bPtr, tangent);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+        *bPtr++ = tanf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 for aligned */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_tan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m128 sine, cosine, tangent, condition1, condition2, condition3;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ s = _mm_sub_ps(aVal,
+ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(
+ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm_mul_ps(
+ _mm_add_ps(
+ _mm_mul_ps(
+ _mm_sub_ps(
+ _mm_mul_ps(
+ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 = _mm_cmpneq_ps(
+ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+ condition2 = _mm_cmpneq_ps(
+ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
+ _mm_cmplt_ps(aVal, fzeroes));
+ condition3 = _mm_cmpneq_ps(
+ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+
+ __m128 temp = cosine;
+ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
+ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
+ sine =
+ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+ cosine = _mm_sub_ps(
+ cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
+ tangent = _mm_div_ps(sine, cosine);
+ _mm_store_ps(bPtr, tangent);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = tanf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+
+#endif /* INCLUDED_volk_32f_tan_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_tan_32f_u_H
+#define INCLUDED_volk_32f_tan_32f_u_H
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void
+volk_32f_tan_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m256 sine, cosine, tangent, condition1, condition2, condition3;
+ __m256i q, r, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239545);
+ pio4A = _mm256_set1_ps(0.78515625);
+ pio4B = _mm256_set1_ps(0.241876e-3);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ ones = _mm256_set1_epi32(1);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.83333333e-1);
+ cp3 = _mm256_set1_ps(0.2777778e-2);
+ cp4 = _mm256_set1_ps(0.49603e-4);
+ cp5 = _mm256_set1_ps(0.551e-6);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
+ s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_fmadd_ps(
+ _mm256_fmsub_ps(
+ _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
+ s,
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ condition1 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+ condition2 = _mm256_cmp_ps(
+ _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+ _CMP_NEQ_UQ);
+ condition3 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+
+ __m256 temp = cosine;
+ cosine =
+ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
+ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
+ sine = _mm256_sub_ps(
+ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+ cosine = _mm256_sub_ps(
+ cosine,
+ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
+ tangent = _mm256_div_ps(sine, cosine);
+ _mm256_storeu_ps(bPtr, tangent);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+        *bPtr++ = tanf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_32f_tan_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int eighthPoints = num_points / 8;
+ unsigned int i = 0;
+
+ __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m256 sine, cosine, tangent, condition1, condition2, condition3;
+ __m256i q, r, ones, twos, fours;
+
+ m4pi = _mm256_set1_ps(1.273239545);
+ pio4A = _mm256_set1_ps(0.78515625);
+ pio4B = _mm256_set1_ps(0.241876e-3);
+ ffours = _mm256_set1_ps(4.0);
+ ftwos = _mm256_set1_ps(2.0);
+ fones = _mm256_set1_ps(1.0);
+ fzeroes = _mm256_setzero_ps();
+ ones = _mm256_set1_epi32(1);
+ twos = _mm256_set1_epi32(2);
+ fours = _mm256_set1_epi32(4);
+
+ cp1 = _mm256_set1_ps(1.0);
+ cp2 = _mm256_set1_ps(0.83333333e-1);
+ cp3 = _mm256_set1_ps(0.2777778e-2);
+ cp4 = _mm256_set1_ps(0.49603e-4);
+ cp5 = _mm256_set1_ps(0.551e-6);
+
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ s = _mm256_sub_ps(aVal,
+ _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
+ q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
+ r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
+
+ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
+ s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
+
+ s = _mm256_div_ps(
+ s,
+ _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm256_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(
+ _mm256_sub_ps(
+ _mm256_mul_ps(
+ _mm256_add_ps(
+ _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
+ s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
+ }
+ s = _mm256_div_ps(s, ftwos);
+
+ sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
+ cosine = _mm256_sub_ps(fones, s);
+
+ condition1 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+ condition2 = _mm256_cmp_ps(
+ _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
+ _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
+ _CMP_NEQ_UQ);
+ condition3 = _mm256_cmp_ps(
+ _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, twos), fours)),
+ fzeroes,
+ _CMP_NEQ_UQ);
+
+ __m256 temp = cosine;
+ cosine =
+ _mm256_add_ps(cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1));
+ sine = _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(temp, sine), condition1));
+ sine = _mm256_sub_ps(
+ sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
+ cosine = _mm256_sub_ps(
+ cosine,
+ _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)), condition3));
+ tangent = _mm256_div_ps(sine, cosine);
+ _mm256_storeu_ps(bPtr, tangent);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+        *bPtr++ = tanf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 for unaligned */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_tan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
+ fzeroes;
+ __m128 sine, cosine, tangent, condition1, condition2, condition3;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_loadu_ps(aPtr);
+ s = _mm_sub_ps(aVal,
+ _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(
+ s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s = _mm_mul_ps(
+ _mm_add_ps(
+ _mm_mul_ps(
+ _mm_sub_ps(
+ _mm_mul_ps(
+ _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
+ cp3),
+ s),
+ cp2),
+ s),
+ cp1),
+ s);
+
+ for (i = 0; i < 3; i++) {
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 = _mm_cmpneq_ps(
+ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
+ condition2 = _mm_cmpneq_ps(
+ _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
+ _mm_cmplt_ps(aVal, fzeroes));
+ condition3 = _mm_cmpneq_ps(
+ _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
+
+ __m128 temp = cosine;
+ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
+ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
+ sine =
+ _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
+ cosine = _mm_sub_ps(
+ cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
+ tangent = _mm_div_ps(sine, cosine);
+ _mm_storeu_ps(bPtr, tangent);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *bPtr++ = tanf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_tan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (; number < num_points; number++) {
+ *bPtr++ = tanf(*aPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+#include <volk/volk_neon_intrinsics.h>
+
+static inline void
+volk_32f_tan_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ float* bVectorPtr = bVector;
+ const float* aVectorPtr = aVector;
+
+ float32x4_t b_vec;
+ float32x4_t a_vec;
+
+ for (number = 0; number < quarter_points; number++) {
+ a_vec = vld1q_f32(aVectorPtr);
+ // Prefetch next one, speeds things up
+ __VOLK_PREFETCH(aVectorPtr + 4);
+ b_vec = _vtanq_f32(a_vec);
+ vst1q_f32(bVectorPtr, b_vec);
+ // move pointers ahead
+ bVectorPtr += 4;
+ aVectorPtr += 4;
+ }
+
+ // Deal with the rest
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *bVectorPtr++ = tanf(*aVectorPtr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#endif /* INCLUDED_volk_32f_tan_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_tanh_32f
+ *
+ * \b Overview
+ *
+ * Computes the hyperbolic tangent of each element of the aVector:
+ *
+ * c[i] = tanh(a[i])
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_tanh_32f(float* cVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The buffer of points.
+ * \li num_points: The number of values in the input buffer.
+ *
+ * \b Outputs
+ * \li cVector: The output buffer.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ *     // artanh(x) for x in [0, 1), so tanh() of this value recovers x
+ *     float x = (float)ii / (float)N;
+ *     in[ii] = 0.5f * logf((1.f+x)/(1.f-x));
+ * }
+ *
+ * volk_32f_tanh_32f(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_tanh_32f_a_H
+#define INCLUDED_volk_32f_tanh_32f_a_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_tanh_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ for (; number < num_points; number++) {
+ *cPtr++ = tanhf(*aPtr++);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_tanh_32f_series(float* cVector, const float* aVector, unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
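+    // Rational approximation used below:
+    //   tanh(x) ~= x * (135135 + 17325*x^2 + 378*x^4 + x^6)
+    //            / (135135 + 62370*x^2 + 3150*x^4 + 28*x^6)
+    // Outside roughly |x| > 4.97 the rational form drifts from tanh, so the
+    // output is clamped to +/-1 there.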
+ for (unsigned int number = 0; number < num_points; number++) {
+        if (*aPtr > 4.97)
+            *cPtr++ = 1;
+        else if (*aPtr <= -4.97)
+            *cPtr++ = -1;
+        else {
+            float x2 = (*aPtr) * (*aPtr);
+            float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+            float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+            *cPtr++ = a / b;
+        }
+        aPtr++; // advance the input pointer in every branch, not only the last one
+    }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m128 aVal, cVal, x2, a, b;
+ __m128 const1, const2, const3, const4, const5, const6;
+ const1 = _mm_set_ps1(135135.0f);
+ const2 = _mm_set_ps1(17325.0f);
+ const3 = _mm_set_ps1(378.0f);
+ const4 = _mm_set_ps1(62370.0f);
+ const5 = _mm_set_ps1(3150.0f);
+ const6 = _mm_set_ps1(28.0f);
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm_load_ps(aPtr);
+ x2 = _mm_mul_ps(aVal, aVal);
+ a = _mm_mul_ps(
+ aVal,
+ _mm_add_ps(
+ const1,
+ _mm_mul_ps(x2,
+ _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
+ b = _mm_add_ps(
+ const1,
+ _mm_mul_ps(
+ x2,
+ _mm_add_ps(const4,
+ _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
+
+ cVal = _mm_div_ps(a, b);
+
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, cVal, x2, a, b;
+ __m256 const1, const2, const3, const4, const5, const6;
+ const1 = _mm256_set1_ps(135135.0f);
+ const2 = _mm256_set1_ps(17325.0f);
+ const3 = _mm256_set1_ps(378.0f);
+ const4 = _mm256_set1_ps(62370.0f);
+ const5 = _mm256_set1_ps(3150.0f);
+ const6 = _mm256_set1_ps(28.0f);
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_load_ps(aPtr);
+ x2 = _mm256_mul_ps(aVal, aVal);
+ a = _mm256_mul_ps(
+ aVal,
+ _mm256_add_ps(
+ const1,
+ _mm256_mul_ps(
+ x2,
+ _mm256_add_ps(const2,
+ _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
+ b = _mm256_add_ps(
+ const1,
+ _mm256_mul_ps(
+ x2,
+ _mm256_add_ps(
+ const4,
+ _mm256_mul_ps(x2,
+ _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
+
+ cVal = _mm256_div_ps(a, b);
+
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
+}
+#endif /* LV_HAVE_AVX */
+
+#if LV_HAVE_AVX && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void
+volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, cVal, x2, a, b;
+ __m256 const1, const2, const3, const4, const5, const6;
+ const1 = _mm256_set1_ps(135135.0f);
+ const2 = _mm256_set1_ps(17325.0f);
+ const3 = _mm256_set1_ps(378.0f);
+ const4 = _mm256_set1_ps(62370.0f);
+ const5 = _mm256_set1_ps(3150.0f);
+ const6 = _mm256_set1_ps(28.0f);
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_load_ps(aPtr);
+ x2 = _mm256_mul_ps(aVal, aVal);
+ a = _mm256_mul_ps(
+ aVal,
+ _mm256_fmadd_ps(
+ x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
+ b = _mm256_fmadd_ps(
+ x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
+
+ cVal = _mm256_div_ps(a, b);
+
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
+}
+#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
+
+#endif /* INCLUDED_volk_32f_tanh_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_tanh_32f_u_H
+#define INCLUDED_volk_32f_tanh_32f_u_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m128 aVal, cVal, x2, a, b;
+ __m128 const1, const2, const3, const4, const5, const6;
+ const1 = _mm_set_ps1(135135.0f);
+ const2 = _mm_set_ps1(17325.0f);
+ const3 = _mm_set_ps1(378.0f);
+ const4 = _mm_set_ps1(62370.0f);
+ const5 = _mm_set_ps1(3150.0f);
+ const6 = _mm_set_ps1(28.0f);
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm_loadu_ps(aPtr);
+ x2 = _mm_mul_ps(aVal, aVal);
+ a = _mm_mul_ps(
+ aVal,
+ _mm_add_ps(
+ const1,
+ _mm_mul_ps(x2,
+ _mm_add_ps(const2, _mm_mul_ps(x2, _mm_add_ps(const3, x2))))));
+ b = _mm_add_ps(
+ const1,
+ _mm_mul_ps(
+ x2,
+ _mm_add_ps(const4,
+ _mm_mul_ps(x2, _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
+
+ cVal = _mm_div_ps(a, b);
+
+ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, cVal, x2, a, b;
+ __m256 const1, const2, const3, const4, const5, const6;
+ const1 = _mm256_set1_ps(135135.0f);
+ const2 = _mm256_set1_ps(17325.0f);
+ const3 = _mm256_set1_ps(378.0f);
+ const4 = _mm256_set1_ps(62370.0f);
+ const5 = _mm256_set1_ps(3150.0f);
+ const6 = _mm256_set1_ps(28.0f);
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_loadu_ps(aPtr);
+ x2 = _mm256_mul_ps(aVal, aVal);
+ a = _mm256_mul_ps(
+ aVal,
+ _mm256_add_ps(
+ const1,
+ _mm256_mul_ps(
+ x2,
+ _mm256_add_ps(const2,
+ _mm256_mul_ps(x2, _mm256_add_ps(const3, x2))))));
+ b = _mm256_add_ps(
+ const1,
+ _mm256_mul_ps(
+ x2,
+ _mm256_add_ps(
+ const4,
+ _mm256_mul_ps(x2,
+ _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
+
+ cVal = _mm256_div_ps(a, b);
+
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
+}
+#endif /* LV_HAVE_AVX */
+
+#if LV_HAVE_AVX && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void
+volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+
+ __m256 aVal, cVal, x2, a, b;
+ __m256 const1, const2, const3, const4, const5, const6;
+ const1 = _mm256_set1_ps(135135.0f);
+ const2 = _mm256_set1_ps(17325.0f);
+ const3 = _mm256_set1_ps(378.0f);
+ const4 = _mm256_set1_ps(62370.0f);
+ const5 = _mm256_set1_ps(3150.0f);
+ const6 = _mm256_set1_ps(28.0f);
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_loadu_ps(aPtr);
+ x2 = _mm256_mul_ps(aVal, aVal);
+ a = _mm256_mul_ps(
+ aVal,
+ _mm256_fmadd_ps(
+ x2, _mm256_fmadd_ps(x2, _mm256_add_ps(const3, x2), const2), const1));
+ b = _mm256_fmadd_ps(
+ x2, _mm256_fmadd_ps(x2, _mm256_fmadd_ps(x2, const6, const5), const4), const1);
+
+ cVal = _mm256_div_ps(a, b);
+
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ volk_32f_tanh_32f_series(cPtr, aPtr, num_points - number);
+}
+#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
+
+#endif /* INCLUDED_volk_32f_tanh_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_x2_add_32f
+ *
+ * \b Overview
+ *
+ * Adds two vectors together element by element:
+ *
+ * c[i] = a[i] + b[i]
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_add_32f(float* cVector, const float* aVector, const float* bVector,
+ * unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: First vector of input points.
+ * \li bVector: Second vector of input points.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ *
+ * The following example adds an increasing vector to a decreasing vector so that
+ * every summation pair equals 10.
+ *
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * decreasing[ii] = 10.f - (float)ii;
+ * }
+ *
+ * volk_32f_x2_add_32f(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_add_32f_u_H
+#define INCLUDED_volk_32f_x2_add_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_32f_x2_add_32f_u_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+
+ aVal = _mm512_loadu_ps(aPtr);
+ bVal = _mm512_loadu_ps(bPtr);
+
+ cVal = _mm512_add_ps(aVal, bVal);
+
+ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX512F */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_add_32f_u_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal = _mm256_loadu_ps(bPtr);
+
+ cVal = _mm256_add_ps(aVal, bVal);
+
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_x2_add_32f_u_sse(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm_loadu_ps(aPtr);
+ bVal = _mm_loadu_ps(bPtr);
+
+ cVal = _mm_add_ps(aVal, bVal);
+
+ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_x2_add_32f_generic(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_x2_add_32f_u_H */
+#ifndef INCLUDED_volk_32f_x2_add_32f_a_H
+#define INCLUDED_volk_32f_x2_add_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_32f_x2_add_32f_a_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+
+ aVal = _mm512_load_ps(aPtr);
+ bVal = _mm512_load_ps(bPtr);
+
+ cVal = _mm512_add_ps(aVal, bVal);
+
+ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX512F */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_add_32f_a_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_load_ps(aPtr);
+ bVal = _mm256_load_ps(bPtr);
+
+ cVal = _mm256_add_ps(aVal, bVal);
+
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_x2_add_32f_a_sse(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_add_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32f_x2_add_32f_u_neon(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ float32x4_t aVal, bVal, cVal;
+ for (number = 0; number < quarterPoints; number++) {
+ // Load in to NEON registers
+ aVal = vld1q_f32(aPtr);
+ bVal = vld1q_f32(bPtr);
+ __VOLK_PREFETCH(aPtr + 4);
+ __VOLK_PREFETCH(bPtr + 4);
+
+ // vector add
+ cVal = vaddq_f32(aVal, bVal);
+ // Store the results back into the C container
+ vst1q_f32(cPtr, cVal);
+
+ aPtr += 4; // q uses quadwords, 4 floats per vadd
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+    number = quarterPoints * 4; // first element left for the scalar tail loop
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_NEONV7
+extern void volk_32f_x2_add_32f_a_neonasm(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
+#endif /* LV_HAVE_NEONV7 */
+
+#ifdef LV_HAVE_NEONV7
+extern void volk_32f_x2_add_32f_a_neonpipeline(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
+#endif /* LV_HAVE_NEONV7 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_x2_add_32f_a_generic(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+
+extern void volk_32f_x2_add_32f_a_orc_impl(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
+
+static inline void volk_32f_x2_add_32f_u_orc(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ volk_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_add_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_x2_divide_32f
+ *
+ * \b Overview
+ *
+ * Divides aVector by bVector to produce cVector:
+ *
+ * c[i] = a[i] / b[i]
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_divide_32f(float* cVector, const float* aVector, const float* bVector,
+ * unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: First vector of input points.
+ * \li bVector: Second vector of input points.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * Divide an increasing vector by a decreasing vector
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * decreasing[ii] = 10.f - (float)ii;
+ * }
+ *
+ * volk_32f_x2_divide_32f(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_divide_32f_a_H
+#define INCLUDED_volk_32f_x2_divide_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_32f_x2_divide_32f_a_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+ aVal = _mm512_load_ps(aPtr);
+ bVal = _mm512_load_ps(bPtr);
+
+ cVal = _mm512_div_ps(aVal, bVal);
+
+ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_divide_32f_a_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ bVal = _mm256_load_ps(bPtr);
+
+ cVal = _mm256_div_ps(aVal, bVal);
+
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_x2_divide_32f_a_sse(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_div_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32f_x2_divide_32f_neon(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ float32x4x4_t aVal, bVal, bInv, cVal;
+
+    const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+    for (; number < sixteenthPoints; number++) {
+ aVal = vld4q_f32(aPtr);
+ aPtr += 16;
+ bVal = vld4q_f32(bPtr);
+ bPtr += 16;
+
+ __VOLK_PREFETCH(aPtr + 16);
+ __VOLK_PREFETCH(bPtr + 16);
+
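+        // vrecpeq_f32 returns a coarse reciprocal estimate; each vrecpsq_f32 /
+        // vmulq_f32 pair applies one Newton-Raphson refinement step before the
+        // final multiply by the numerator.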
+ bInv.val[0] = vrecpeq_f32(bVal.val[0]);
+ bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
+ bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
+ cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]);
+
+ bInv.val[1] = vrecpeq_f32(bVal.val[1]);
+ bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
+ bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
+ cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]);
+
+ bInv.val[2] = vrecpeq_f32(bVal.val[2]);
+ bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
+ bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
+ cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]);
+
+ bInv.val[3] = vrecpeq_f32(bVal.val[3]);
+ bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
+ bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
+ cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]);
+
+ vst4q_f32(cPtr, cVal);
+ cPtr += 16;
+ }
+
+    for (number = sixteenthPoints * 16; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_x2_divide_32f_generic(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+
+extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
+
+static inline void volk_32f_x2_divide_32f_u_orc(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_x2_divide_32f_u_H
+#define INCLUDED_volk_32f_x2_divide_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_32f_x2_divide_32f_u_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+ aVal = _mm512_loadu_ps(aPtr);
+ bVal = _mm512_loadu_ps(bPtr);
+
+ cVal = _mm512_div_ps(aVal, bVal);
+
+ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_divide_32f_u_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal = _mm256_loadu_ps(bPtr);
+
+ cVal = _mm256_div_ps(aVal, bVal);
+
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_x2_divide_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_x2_dot_prod_16i
+ *
+ * \b Overview
+ *
+ * This block computes the dot product (or inner product) between two
+ * vectors, the \p input and \p taps vectors. Given a set of \p
+ * num_points taps, the result is the sum of products between the two
+ * vectors. The result is a single value stored in the \p result
+ * address and is converted to a fixed-point short.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_dot_prod_16i(int16_t* result, const float* input, const float* taps,
+ * unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li input: vector of floats.
+ * \li taps: float taps.
+ * \li num_points: number of samples in both \p input and \p taps.
+ *
+ * \b Outputs
+ * \li result: pointer to a short value to hold the dot product result.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * float* input = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* taps = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * int16_t* result = (int16_t*)volk_malloc(sizeof(int16_t), alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ *     input[ii] = 1.f;
+ *     taps[ii] = (float)ii / (float)N;
+ * }
+ *
+ * volk_32f_x2_dot_prod_16i(result, input, taps, N);
+ *
+ * printf("dot product = %i\n", *result);
+ *
+ * volk_free(input);
+ * volk_free(taps);
+ * volk_free(result);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
+#define INCLUDED_volk_32f_x2_dot_prod_16i_H
+
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (int16_t)dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
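+    // Four independent accumulators keep successive iterations from serializing
+    // on a single add dependency chain; they are combined after the loop.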
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr + 4);
+ a2Val = _mm_load_ps(aPtr + 8);
+ a3Val = _mm_load_ps(aPtr + 12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr + 4);
+ b2Val = _mm_load_ps(bPtr + 8);
+ b3Val = _mm_load_ps(bPtr + 12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int thirtysecondPoints = num_points / 32;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < thirtysecondPoints; number++) {
+
+ a0Val = _mm256_load_ps(aPtr);
+ a1Val = _mm256_load_ps(aPtr + 8);
+ a2Val = _mm256_load_ps(aPtr + 16);
+ a3Val = _mm256_load_ps(aPtr + 24);
+ b0Val = _mm256_load_ps(bPtr);
+ b1Val = _mm256_load_ps(bPtr + 8);
+ b2Val = _mm256_load_ps(bPtr + 16);
+ b3Val = _mm256_load_ps(bPtr + 24);
+
+ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 32;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
+}
+
+#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int thirtysecondPoints = num_points / 32;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 c0Val, c1Val, c2Val, c3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < thirtysecondPoints; number++) {
+
+ a0Val = _mm256_load_ps(aPtr);
+ a1Val = _mm256_load_ps(aPtr + 8);
+ a2Val = _mm256_load_ps(aPtr + 16);
+ a3Val = _mm256_load_ps(aPtr + 24);
+ b0Val = _mm256_load_ps(bPtr);
+ b1Val = _mm256_load_ps(bPtr + 8);
+ b2Val = _mm256_load_ps(bPtr + 16);
+ b3Val = _mm256_load_ps(bPtr + 24);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ c2Val = _mm256_mul_ps(a2Val, b2Val);
+ c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 32;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#ifdef LV_HAVE_AVX512F
+
+static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixtyfourthPoints = num_points / 64;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m512 a0Val, a1Val, a2Val, a3Val;
+ __m512 b0Val, b1Val, b2Val, b3Val;
+
+ __m512 dotProdVal0 = _mm512_setzero_ps();
+ __m512 dotProdVal1 = _mm512_setzero_ps();
+ __m512 dotProdVal2 = _mm512_setzero_ps();
+ __m512 dotProdVal3 = _mm512_setzero_ps();
+
+ for (; number < sixtyfourthPoints; number++) {
+
+ a0Val = _mm512_load_ps(aPtr);
+ a1Val = _mm512_load_ps(aPtr + 16);
+ a2Val = _mm512_load_ps(aPtr + 32);
+ a3Val = _mm512_load_ps(aPtr + 48);
+ b0Val = _mm512_load_ps(bPtr);
+ b1Val = _mm512_load_ps(bPtr + 16);
+ b2Val = _mm512_load_ps(bPtr + 32);
+ b3Val = _mm512_load_ps(bPtr + 48);
+
+ dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
+ dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
+ dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
+ dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+ aPtr += 64;
+ bPtr += 64;
+ }
+
+ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
+
+ _mm512_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+ dotProduct += dotProductVector[8];
+ dotProduct += dotProductVector[9];
+ dotProduct += dotProductVector[10];
+ dotProduct += dotProductVector[11];
+ dotProduct += dotProductVector[12];
+ dotProduct += dotProductVector[13];
+ dotProduct += dotProductVector[14];
+ dotProduct += dotProductVector[15];
+
+ number = sixtyfourthPoints * 64;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
+}
+
+#endif /*LV_HAVE_AVX512F*/
+
+
+#ifdef LV_HAVE_SSE
+
+static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr + 4);
+ a2Val = _mm_loadu_ps(aPtr + 8);
+ a3Val = _mm_loadu_ps(aPtr + 12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr + 4);
+ b2Val = _mm_loadu_ps(bPtr + 8);
+ b3Val = _mm_loadu_ps(bPtr + 12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+
+static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int thirtysecondPoints = num_points / 32;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < thirtysecondPoints; number++) {
+
+ a0Val = _mm256_loadu_ps(aPtr);
+ a1Val = _mm256_loadu_ps(aPtr + 8);
+ a2Val = _mm256_loadu_ps(aPtr + 16);
+ a3Val = _mm256_loadu_ps(aPtr + 24);
+ b0Val = _mm256_loadu_ps(bPtr);
+ b1Val = _mm256_loadu_ps(bPtr + 8);
+ b2Val = _mm256_loadu_ps(bPtr + 16);
+ b3Val = _mm256_loadu_ps(bPtr + 24);
+
+ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 32;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
+}
+
+#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
+
+
+#ifdef LV_HAVE_AVX
+
+static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int thirtysecondPoints = num_points / 32;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 c0Val, c1Val, c2Val, c3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < thirtysecondPoints; number++) {
+
+ a0Val = _mm256_loadu_ps(aPtr);
+ a1Val = _mm256_loadu_ps(aPtr + 8);
+ a2Val = _mm256_loadu_ps(aPtr + 16);
+ a3Val = _mm256_loadu_ps(aPtr + 24);
+ b0Val = _mm256_loadu_ps(bPtr);
+ b1Val = _mm256_loadu_ps(bPtr + 8);
+ b2Val = _mm256_loadu_ps(bPtr + 16);
+ b3Val = _mm256_loadu_ps(bPtr + 24);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ c2Val = _mm256_mul_ps(a2Val, b2Val);
+ c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 32;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#ifdef LV_HAVE_AVX512F
+
+static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixtyfourthPoints = num_points / 64;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m512 a0Val, a1Val, a2Val, a3Val;
+ __m512 b0Val, b1Val, b2Val, b3Val;
+
+ __m512 dotProdVal0 = _mm512_setzero_ps();
+ __m512 dotProdVal1 = _mm512_setzero_ps();
+ __m512 dotProdVal2 = _mm512_setzero_ps();
+ __m512 dotProdVal3 = _mm512_setzero_ps();
+
+ for (; number < sixtyfourthPoints; number++) {
+
+ a0Val = _mm512_loadu_ps(aPtr);
+ a1Val = _mm512_loadu_ps(aPtr + 16);
+ a2Val = _mm512_loadu_ps(aPtr + 32);
+ a3Val = _mm512_loadu_ps(aPtr + 48);
+ b0Val = _mm512_loadu_ps(bPtr);
+ b1Val = _mm512_loadu_ps(bPtr + 16);
+ b2Val = _mm512_loadu_ps(bPtr + 32);
+ b3Val = _mm512_loadu_ps(bPtr + 48);
+
+ dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
+ dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
+ dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
+ dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+ aPtr += 64;
+ bPtr += 64;
+ }
+
+ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
+
+ _mm512_storeu_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+ dotProduct += dotProductVector[8];
+ dotProduct += dotProductVector[9];
+ dotProduct += dotProductVector[10];
+ dotProduct += dotProductVector[11];
+ dotProduct += dotProductVector[12];
+ dotProduct += dotProductVector[13];
+ dotProduct += dotProductVector[14];
+ dotProduct += dotProductVector[15];
+
+ number = sixtyfourthPoints * 64;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = (short)dotProduct;
+}
+
+#endif /*LV_HAVE_AVX512F*/
+
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_16i_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_x2_dot_prod_32f
+ *
+ * \b Overview
+ *
+ * This block computes the dot product (or inner product) between two
+ * vectors, the \p input and \p taps vectors. Given a set of \p
+ * num_points taps, the result is the sum of products between the two
+ * vectors. The result is a single float value stored at the \p result
+ * address.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_dot_prod_32f(float* result, const float* input, const float* taps,
+ * unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li input: vector of floats.
+ * \li taps: float taps.
+ * \li num_points: number of samples in both \p input and \p taps.
+ *
+ * \b Outputs
+ * \li result: pointer to a float value to hold the dot product result.
+ *
+ * \b Example
+ * Take the dot product of an increasing vector and a vector of ones. The result is the
+ * sum of the integers 0 through 9, i.e. 45.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* ones = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*1, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * ones[ii] = 1.f;
+ * }
+ *
+ * volk_32f_x2_dot_prod_32f(out, increasing, ones, N);
+ *
+ * printf("out = %1.2f\n", *out);
+ *
+ * volk_free(increasing);
+ * volk_free(ones);
+ * volk_free(out);
+ *
+ * return 0;
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+#define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
+
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32f_x2_dot_prod_32f_generic(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr + 4);
+ a2Val = _mm_loadu_ps(aPtr + 8);
+ a3Val = _mm_loadu_ps(aPtr + 12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr + 4);
+ b2Val = _mm_loadu_ps(bPtr + 8);
+ b3Val = _mm_loadu_ps(bPtr + 12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr + 4);
+ a2Val = _mm_loadu_ps(aPtr + 8);
+ a3Val = _mm_loadu_ps(aPtr + 12);
+ b0Val = _mm_loadu_ps(bPtr);
+ b1Val = _mm_loadu_ps(bPtr + 4);
+ b2Val = _mm_loadu_ps(bPtr + 8);
+ b3Val = _mm_loadu_ps(bPtr + 12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal1, bVal1, cVal1;
+ __m128 aVal2, bVal2, cVal2;
+ __m128 aVal3, bVal3, cVal3;
+ __m128 aVal4, bVal4, cVal4;
+
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ aVal1 = _mm_loadu_ps(aPtr);
+ aPtr += 4;
+ aVal2 = _mm_loadu_ps(aPtr);
+ aPtr += 4;
+ aVal3 = _mm_loadu_ps(aPtr);
+ aPtr += 4;
+ aVal4 = _mm_loadu_ps(aPtr);
+ aPtr += 4;
+
+ bVal1 = _mm_loadu_ps(bPtr);
+ bPtr += 4;
+ bVal2 = _mm_loadu_ps(bPtr);
+ bPtr += 4;
+ bVal3 = _mm_loadu_ps(bPtr);
+ bPtr += 4;
+ bVal4 = _mm_loadu_ps(bPtr);
+ bPtr += 4;
+
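+        // _mm_dp_ps computes a full 4-element dot product; the low nibble of the mask
+        // (0x1, 0x2, 0x4, 0x8) places each result in a distinct output lane with the
+        // other lanes zeroed, so the three ORs below merge the four scalars into one
+        // vector that a single add can accumulate.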
+ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+ }
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_u_avx(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val;
+ __m256 b0Val, b1Val;
+ __m256 c0Val, c1Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm256_loadu_ps(aPtr);
+ a1Val = _mm256_loadu_ps(aPtr + 8);
+ b0Val = _mm256_loadu_ps(bPtr);
+ b1Val = _mm256_loadu_ps(bPtr + 8);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_storeu_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+static inline void volk_32f_x2_dot_prod_32f_u_avx2_fma(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+ unsigned int number;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 dotProdVal = _mm256_setzero_ps();
+ __m256 aVal1, bVal1;
+
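+    // Single-accumulator FMA loop: each iteration fuses the multiply and add.
+    // The scalar loop after the reduction handles the remaining (num_points % 8) samples.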
+ for (number = 0; number < eighthPoints; number++) {
+
+ aVal1 = _mm256_loadu_ps(aPtr);
+ bVal1 = _mm256_loadu_ps(bPtr);
+ aPtr += 8;
+ bPtr += 8;
+
+ dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
+ }
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+ _mm256_storeu_ps(dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
+
+ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+ dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
+ dotProductVector[6] + dotProductVector[7];
+
+ for (number = eighthPoints * 8; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
+
+#if LV_HAVE_AVX512F
+#include <immintrin.h>
+static inline void volk_32f_x2_dot_prod_32f_u_avx512f(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+ unsigned int number;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m512 dotProdVal = _mm512_setzero_ps();
+ __m512 aVal1, bVal1;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+
+ aVal1 = _mm512_loadu_ps(aPtr);
+ bVal1 = _mm512_loadu_ps(bPtr);
+ aPtr += 16;
+ bPtr += 16;
+
+ dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
+ }
+
+ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
+ _mm512_storeu_ps(dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
+
+ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+ dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
+ dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
+ dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
+ dotProductVector[12] + dotProductVector[13] +
+ dotProductVector[14] + dotProductVector[15];
+
+ for (number = sixteenthPoints * 16; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+#endif /* LV_HAVE_AVX512F */
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_u_H*/
+
+#ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
+#define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
+
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32f_x2_dot_prod_32f_a_generic(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr + 4);
+ a2Val = _mm_load_ps(aPtr + 8);
+ a3Val = _mm_load_ps(aPtr + 12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr + 4);
+ b2Val = _mm_load_ps(bPtr + 8);
+ b3Val = _mm_load_ps(bPtr + 12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_a_sse3(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr + 4);
+ a2Val = _mm_load_ps(aPtr + 8);
+ a3Val = _mm_load_ps(aPtr + 12);
+ b0Val = _mm_load_ps(bPtr);
+ b1Val = _mm_load_ps(bPtr + 4);
+ b2Val = _mm_load_ps(bPtr + 8);
+ b3Val = _mm_load_ps(bPtr + 12);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
+ dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
+ dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
+ dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_SSE4_1
+
+#include <smmintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m128 aVal1, bVal1, cVal1;
+ __m128 aVal2, bVal2, cVal2;
+ __m128 aVal3, bVal3, cVal3;
+ __m128 aVal4, bVal4, cVal4;
+
+ __m128 dotProdVal = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ aVal1 = _mm_load_ps(aPtr);
+ aPtr += 4;
+ aVal2 = _mm_load_ps(aPtr);
+ aPtr += 4;
+ aVal3 = _mm_load_ps(aPtr);
+ aPtr += 4;
+ aVal4 = _mm_load_ps(aPtr);
+ aPtr += 4;
+
+ bVal1 = _mm_load_ps(bPtr);
+ bPtr += 4;
+ bVal2 = _mm_load_ps(bPtr);
+ bPtr += 4;
+ bVal3 = _mm_load_ps(bPtr);
+ bPtr += 4;
+ bVal4 = _mm_load_ps(bPtr);
+ bPtr += 4;
+
+ cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
+ cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
+ cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
+ cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);
+
+ cVal1 = _mm_or_ps(cVal1, cVal2);
+ cVal3 = _mm_or_ps(cVal3, cVal4);
+ cVal1 = _mm_or_ps(cVal1, cVal3);
+
+ dotProdVal = _mm_add_ps(dotProdVal, cVal1);
+ }
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+ _mm_store_ps(dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32f_x2_dot_prod_32f_a_avx(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val;
+ __m256 b0Val, b1Val;
+ __m256 c0Val, c1Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm256_load_ps(aPtr);
+ a1Val = _mm256_load_ps(aPtr + 8);
+ b0Val = _mm256_load_ps(bPtr);
+ b1Val = _mm256_load_ps(bPtr + 8);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+
+ aPtr += 16;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ dotProduct = dotProductVector[0];
+ dotProduct += dotProductVector[1];
+ dotProduct += dotProductVector[2];
+ dotProduct += dotProductVector[3];
+ dotProduct += dotProductVector[4];
+ dotProduct += dotProductVector[5];
+ dotProduct += dotProductVector[6];
+ dotProduct += dotProductVector[7];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+#endif /*LV_HAVE_AVX*/
+
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+static inline void volk_32f_x2_dot_prod_32f_a_avx2_fma(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+ unsigned int number;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m256 dotProdVal = _mm256_setzero_ps();
+ __m256 aVal1, bVal1;
+
+ for (number = 0; number < eighthPoints; number++) {
+
+ aVal1 = _mm256_load_ps(aPtr);
+ bVal1 = _mm256_load_ps(bPtr);
+ aPtr += 8;
+ bPtr += 8;
+
+ dotProdVal = _mm256_fmadd_ps(aVal1, bVal1, dotProdVal);
+ }
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+ _mm256_store_ps(dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
+
+ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+ dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
+ dotProductVector[6] + dotProductVector[7];
+
+ for (number = eighthPoints * 8; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
+
+#if LV_HAVE_AVX512F
+#include <immintrin.h>
+static inline void volk_32f_x2_dot_prod_32f_a_avx512f(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+ unsigned int number;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const float* aPtr = input;
+ const float* bPtr = taps;
+
+ __m512 dotProdVal = _mm512_setzero_ps();
+ __m512 aVal1, bVal1;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+
+ aVal1 = _mm512_load_ps(aPtr);
+ bVal1 = _mm512_load_ps(bPtr);
+ aPtr += 16;
+ bPtr += 16;
+
+ dotProdVal = _mm512_fmadd_ps(aVal1, bVal1, dotProdVal);
+ }
+
+ __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
+ _mm512_store_ps(dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
+
+ float dotProduct = dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+ dotProductVector[3] + dotProductVector[4] + dotProductVector[5] +
+ dotProductVector[6] + dotProductVector[7] + dotProductVector[8] +
+ dotProductVector[9] + dotProductVector[10] + dotProductVector[11] +
+ dotProductVector[12] + dotProductVector[13] +
+ dotProductVector[14] + dotProductVector[15];
+
+ for (number = sixteenthPoints * 16; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+#endif /* LV_HAVE_AVX512F */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32f_x2_dot_prod_32f_neonopts(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int quarter_points = num_points / 16;
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+ unsigned int number = 0;
+
+ float32x4x4_t a_val, b_val, accumulator0;
+ accumulator0.val[0] = vdupq_n_f32(0);
+ accumulator0.val[1] = vdupq_n_f32(0);
+ accumulator0.val[2] = vdupq_n_f32(0);
+ accumulator0.val[3] = vdupq_n_f32(0);
+ // factor of 4 loop unroll with independent accumulators
+ // uses 12 out of 16 neon q registers
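+    // Note: despite the name, quarter_points counts blocks of 16 floats here.
+    // vld4q_f32 de-interleaves the data with a stride of 4, but since both inputs are
+    // loaded the same way and a dot product is invariant to summation order (up to
+    // float rounding), the result is unaffected.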
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld4q_f32(aPtr);
+ b_val = vld4q_f32(bPtr);
+ accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
+ accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]);
+ accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]);
+ accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]);
+ aPtr += 16;
+ bPtr += 16;
+ }
+ accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]);
+ accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]);
+ accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]);
+ __VOLK_ATTR_ALIGNED(32) float accumulator[4];
+ vst1q_f32(accumulator, accumulator0.val[0]);
+ dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
+
+ for (number = quarter_points * 16; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif
+
+
+#ifdef LV_HAVE_NEON
+static inline void volk_32f_x2_dot_prod_32f_neon(float* result,
+ const float* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int quarter_points = num_points / 8;
+ float dotProduct = 0;
+ const float* aPtr = input;
+ const float* bPtr = taps;
+ unsigned int number = 0;
+
+ float32x4x2_t a_val, b_val, accumulator_val;
+ accumulator_val.val[0] = vdupq_n_f32(0);
+ accumulator_val.val[1] = vdupq_n_f32(0);
+ // factor of 2 loop unroll with independent accumulators
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32(aPtr);
+ b_val = vld2q_f32(bPtr);
+ accumulator_val.val[0] =
+ vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
+ accumulator_val.val[1] =
+ vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);
+ aPtr += 8;
+ bPtr += 8;
+ }
+ accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]);
+ __VOLK_ATTR_ALIGNED(32) float accumulator[4];
+ vst1q_f32(accumulator, accumulator_val.val[0]);
+ dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];
+
+ for (number = quarter_points * 8; number < num_points; number++) {
+ dotProduct += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = dotProduct;
+}
+
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_NEONV7
+extern void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
+#endif /* LV_HAVE_NEONV7 */
+
+#ifdef LV_HAVE_NEONV7
+extern void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
+#endif /* LV_HAVE_NEONV7 */
+
+#endif /*INCLUDED_volk_32f_x2_dot_prod_32f_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_volk_32f_x2_fm_detectpuppet_32f_a_H
+#define INCLUDED_volk_32f_x2_fm_detectpuppet_32f_a_H
+
+#include "volk_32f_s32f_32f_fm_detect_32f.h"
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_fm_detectpuppet_32f_a_avx(float* outputVector,
+ const float* inputVector,
+ float* saveValue,
+ unsigned int num_points)
+{
+ const float bound = 1.0f;
+
+ volk_32f_s32f_32f_fm_detect_32f_a_avx(
+ outputVector, inputVector, bound, saveValue, num_points);
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_x2_fm_detectpuppet_32f_a_sse(float* outputVector,
+ const float* inputVector,
+ float* saveValue,
+ unsigned int num_points)
+{
+ const float bound = 1.0f;
+
+ volk_32f_s32f_32f_fm_detect_32f_a_sse(
+ outputVector, inputVector, bound, saveValue, num_points);
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_x2_fm_detectpuppet_32f_generic(float* outputVector,
+ const float* inputVector,
+ float* saveValue,
+ unsigned int num_points)
+{
+ const float bound = 1.0f;
+
+ volk_32f_s32f_32f_fm_detect_32f_generic(
+ outputVector, inputVector, bound, saveValue, num_points);
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H
+#define INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H
+
+#include "volk_32f_s32f_32f_fm_detect_32f.h"
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_fm_detectpuppet_32f_u_avx(float* outputVector,
+ const float* inputVector,
+ float* saveValue,
+ unsigned int num_points)
+{
+ const float bound = 1.0f;
+
+ volk_32f_s32f_32f_fm_detect_32f_u_avx(
+ outputVector, inputVector, bound, saveValue, num_points);
+}
+#endif /* LV_HAVE_AVX */
+#endif /* INCLUDED_volk_32f_x2_fm_detectpuppet_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_x2_interleave_32fc
+ *
+ * \b Overview
+ *
+ * Takes the iBuffer input vector as the real (in-phase) part and the qBuffer
+ * input vector as the imaginary (quadrature) part and combines them into a
+ * complex output vector.
+ *
+ * c[i] = complex(a[i], b[i])
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_interleave_32fc(lv_32fc_t* complexVector, const float* iBuffer,
+ * const float* qBuffer, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li iBuffer: Input vector of samples for the real part.
+ * \li qBuffer: Input vector of samples for the imaginary part.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li complexVector: The output vector of complex numbers.
+ *
+ * \b Example
+ * Generate the top half of the unit circle with real points equally spaced.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* imag = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* real = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * real[ii] = 2.f * ((float)ii / (float)N) - 1.f;
+ * imag[ii] = std::sqrt(1.f - real[ii] * real[ii]);
+ * }
+ *
+ * volk_32f_x2_interleave_32fc(out, real, imag, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f + %1.2fj\n", ii, std::real(out[ii]), std::imag(out[ii]));
+ * }
+ *
+ * volk_free(imag);
+ * volk_free(real);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_interleave_32fc_a_H
+#define INCLUDED_volk_32f_x2_interleave_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_interleave_32fc_a_avx(lv_32fc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ float* complexVectorPtr = (float*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ const uint64_t eighthPoints = num_points / 8;
+
+ __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
+ for (; number < eighthPoints; number++) {
+ iValue = _mm256_load_ps(iBufferPtr);
+ qValue = _mm256_load_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
+
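+        // _mm256_unpacklo/hi_ps interleave within each 128-bit lane only, so the two
+        // permute2f128 selectors (0x20 and 0x31) are needed to write the results in
+        // memory order: the low lanes of both temporaries first, then the high lanes.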
+ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ _mm256_store_ps(complexVectorPtr, cplxValue);
+ complexVectorPtr += 8;
+
+ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+ _mm256_store_ps(complexVectorPtr, cplxValue);
+ complexVectorPtr += 8;
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *complexVectorPtr++ = *iBufferPtr++;
+ *complexVectorPtr++ = *qBufferPtr++;
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_x2_interleave_32fc_a_sse(lv_32fc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ float* complexVectorPtr = (float*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ const uint64_t quarterPoints = num_points / 4;
+
+ __m128 iValue, qValue, cplxValue;
+ for (; number < quarterPoints; number++) {
+ iValue = _mm_load_ps(iBufferPtr);
+ qValue = _mm_load_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue = _mm_unpacklo_ps(iValue, qValue);
+ _mm_store_ps(complexVectorPtr, cplxValue);
+ complexVectorPtr += 4;
+
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue = _mm_unpackhi_ps(iValue, qValue);
+ _mm_store_ps(complexVectorPtr, cplxValue);
+ complexVectorPtr += 4;
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *complexVectorPtr++ = *iBufferPtr++;
+ *complexVectorPtr++ = *qBufferPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32f_x2_interleave_32fc_neon(lv_32fc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ unsigned int num_points)
+{
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+ float* complexVectorPtr = (float*)complexVector;
+
+ float32x4x2_t complex_vec;
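+    // vst2q_f32 stores the two lanes interleaved (i0, q0, i1, q1, ...), which matches
+    // the packed lv_32fc_t layout, so no explicit shuffle is needed.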
+ for (number = 0; number < quarter_points; ++number) {
+ complex_vec.val[0] = vld1q_f32(iBuffer);
+ complex_vec.val[1] = vld1q_f32(qBuffer);
+ vst2q_f32(complexVectorPtr, complex_vec);
+ iBuffer += 4;
+ qBuffer += 4;
+ complexVectorPtr += 8;
+ }
+
+ for (number = quarter_points * 4; number < num_points; ++number) {
+ *complexVectorPtr++ = *iBuffer++;
+ *complexVectorPtr++ = *qBuffer++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_x2_interleave_32fc_generic(lv_32fc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ unsigned int num_points)
+{
+ float* complexVectorPtr = (float*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+ unsigned int number;
+
+ for (number = 0; number < num_points; number++) {
+ *complexVectorPtr++ = *iBufferPtr++;
+ *complexVectorPtr++ = *qBufferPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_x2_interleave_32fc_a_H */
+
+#ifndef INCLUDED_volk_32f_x2_interleave_32fc_u_H
+#define INCLUDED_volk_32f_x2_interleave_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_interleave_32fc_u_avx(lv_32fc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ float* complexVectorPtr = (float*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ const uint64_t eighthPoints = num_points / 8;
+
+ __m256 iValue, qValue, cplxValue1, cplxValue2, cplxValue;
+ for (; number < eighthPoints; number++) {
+ iValue = _mm256_loadu_ps(iBufferPtr);
+ qValue = _mm256_loadu_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
+
+ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ _mm256_storeu_ps(complexVectorPtr, cplxValue);
+ complexVectorPtr += 8;
+
+ cplxValue = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+ _mm256_storeu_ps(complexVectorPtr, cplxValue);
+ complexVectorPtr += 8;
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *complexVectorPtr++ = *iBufferPtr++;
+ *complexVectorPtr++ = *qBufferPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_x2_interleave_32fc_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_x2_max_32f
+ *
+ * \b Overview
+ *
+ * Selects the maximum value from each pair of entries in aVector and bVector
+ * and stores the results in the cVector.
+ *
+ * c[i] = max(a[i], b[i])
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_max_32f(float* cVector, const float* aVector, const float* bVector,
+ * unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+ * \li bVector: Second input vector.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * decreasing[ii] = 10.f - (float)ii;
+ * }
+ *
+ * volk_32f_x2_max_32f(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_max_32f_a_H
+#define INCLUDED_volk_32f_x2_max_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_32f_x2_max_32f_a_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+ aVal = _mm512_load_ps(aPtr);
+ bVal = _mm512_load_ps(bPtr);
+
+ cVal = _mm512_max_ps(aVal, bVal);
+
+ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_x2_max_32f_a_sse(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_max_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_max_32f_a_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ bVal = _mm256_load_ps(bPtr);
+
+ cVal = _mm256_max_ps(aVal, bVal);
+
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32f_x2_max_32f_neon(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int quarter_points = num_points / 4;
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ float32x4_t a_vec, b_vec, c_vec;
+ for (number = 0; number < quarter_points; number++) {
+ a_vec = vld1q_f32(aPtr);
+ b_vec = vld1q_f32(bPtr);
+ c_vec = vmaxq_f32(a_vec, b_vec);
+ vst1q_f32(cPtr, c_vec);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_x2_max_32f_generic(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
+
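+// The actual implementation is generated by ORC from the corresponding .orc source;
+// the unaligned variant simply forwards to it here, on the assumption that the
+// generated code does not require aligned pointers.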
+static inline void volk_32f_x2_max_32f_u_orc(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_max_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_x2_max_32f_u_H
+#define INCLUDED_volk_32f_x2_max_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_32f_x2_max_32f_u_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+ aVal = _mm512_loadu_ps(aPtr);
+ bVal = _mm512_loadu_ps(bPtr);
+
+ cVal = _mm512_max_ps(aVal, bVal);
+
+ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_max_32f_u_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal = _mm256_loadu_ps(bPtr);
+
+ cVal = _mm256_max_ps(aVal, bVal);
+
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_x2_max_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_x2_min_32f
+ *
+ * \b Overview
+ *
+ * Selects the minimum value from each pair of entries in aVector and bVector
+ * and stores the results in the cVector.
+ *
+ * c[i] = min(a[i], b[i])
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_min_32f(float* cVector, const float* aVector, const float* bVector,
+ * unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+ * \li bVector: Second input vector.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * decreasing[ii] = 10.f - (float)ii;
+ * }
+ *
+ * volk_32f_x2_min_32f(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_min_32f_a_H
+#define INCLUDED_volk_32f_x2_min_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_x2_min_32f_a_sse(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_min_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32f_x2_min_32f_neon(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ float32x4_t a_vec, b_vec, c_vec;
+ for (number = 0; number < quarter_points; number++) {
+ a_vec = vld1q_f32(aPtr);
+ b_vec = vld1q_f32(bPtr);
+
+ c_vec = vminq_f32(a_vec, b_vec);
+
+ vst1q_f32(cPtr, c_vec);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_x2_min_32f_generic(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+
+extern void volk_32f_x2_min_32f_a_orc_impl(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
+
+static inline void volk_32f_x2_min_32f_u_orc(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ volk_32f_x2_min_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_min_32f_a_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_load_ps(aPtr);
+ bVal = _mm256_load_ps(bPtr);
+
+ cVal = _mm256_min_ps(aVal, bVal);
+
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_32f_x2_min_32f_a_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+ aVal = _mm512_load_ps(aPtr);
+ bVal = _mm512_load_ps(bPtr);
+
+ cVal = _mm512_min_ps(aVal, bVal);
+
+ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+#endif /* INCLUDED_volk_32f_x2_min_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_x2_min_32f_u_H
+#define INCLUDED_volk_32f_x2_min_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_32f_x2_min_32f_u_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+ aVal = _mm512_loadu_ps(aPtr);
+ bVal = _mm512_loadu_ps(bPtr);
+
+ cVal = _mm512_min_ps(aVal, bVal);
+
+ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_min_32f_u_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal = _mm256_loadu_ps(bPtr);
+
+ cVal = _mm256_min_ps(aVal, bVal);
+
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ const float a = *aPtr++;
+ const float b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_x2_min_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_x2_multiply_32f
+ *
+ * \b Overview
+ *
+ * Multiplies two input floating point vectors together.
+ *
+ * c[i] = a[i] * b[i]
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_multiply_32f(float* cVector, const float* aVector, const float*
+ * bVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+ * \li bVector: Second input vector.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * Multiply elements of an increasing vector by those of a decreasing vector.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * decreasing[ii] = 10.f - (float)ii;
+ * }
+ *
+ * volk_32f_x2_multiply_32f(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
+#define INCLUDED_volk_32f_x2_multiply_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_x2_multiply_32f_u_sse(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm_loadu_ps(aPtr);
+ bVal = _mm_loadu_ps(bPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
+ _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_32f_x2_multiply_32f_u_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+
+ aVal = _mm512_loadu_ps(aPtr);
+ bVal = _mm512_loadu_ps(bPtr);
+
+ cVal = _mm512_mul_ps(aVal, bVal);
+
+ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_multiply_32f_u_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal = _mm256_loadu_ps(bPtr);
+
+ cVal = _mm256_mul_ps(aVal, bVal);
+
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_x2_multiply_32f_generic(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_x2_multiply_32f_u_H */
+
+
+#ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H
+#define INCLUDED_volk_32f_x2_multiply_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_x2_multiply_32f_a_sse(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_mul_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_32f_x2_multiply_32f_a_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+
+ aVal = _mm512_load_ps(aPtr);
+ bVal = _mm512_load_ps(bPtr);
+
+ cVal = _mm512_mul_ps(aVal, bVal);
+
+ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_multiply_32f_a_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_load_ps(aPtr);
+ bVal = _mm256_load_ps(bPtr);
+
+ cVal = _mm256_mul_ps(aVal, bVal);
+
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32f_x2_multiply_32f_neon(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ const unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+ float32x4_t avec, bvec, cvec;
+ for (number = 0; number < quarter_points; ++number) {
+ avec = vld1q_f32(aVector);
+ bvec = vld1q_f32(bVector);
+ cvec = vmulq_f32(avec, bvec);
+ vst1q_f32(cVector, cvec);
+ aVector += 4;
+ bVector += 4;
+ cVector += 4;
+ }
+ for (number = quarter_points * 4; number < num_points; ++number) {
+ *cVector++ = *aVector++ * *bVector++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_x2_multiply_32f_a_generic(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+extern void volk_32f_x2_multiply_32f_a_orc_impl(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
+
+static inline void volk_32f_x2_multiply_32f_u_orc(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_multiply_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_x2_pow_32f
+ *
+ * \b Overview
+ *
+ * Raises each sample in aVector to the power of the corresponding value in bVector.
+ *
+ * c[i] = pow(a[i], b[i])
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_pow_32f(float* cVector, const float* bVector, const float* aVector,
+ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li bVector: The input vector of exponents (power values).
+ * \li aVector: The input vector of base values.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * Calculate the first ten powers of two (2^x for x = 0, ..., 9).
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* twos = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * twos[ii] = 2.f;
+ * }
+ *
+ * volk_32f_x2_pow_32f(out, increasing, twos, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(twos);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
+#define INCLUDED_volk_32f_x2_pow_32f_a_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define POW_POLY_DEGREE 3
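+
+// Illustrative scalar sketch (not part of the dispatcher) of the strategy used
+// by the SIMD kernels below, valid for positive bases:
+//
+//   pow(a, b) = 2^(b * log2(a))   e.g.  float r = exp2f(b * log2f(a));
+//
+// The vector code computes log2(a) from the IEEE-754 exponent field plus a
+// degree-POW_POLY_DEGREE polynomial in the mantissa, then rebuilds 2^x with a
+// small exponential polynomial, so the whole computation stays in SIMD registers.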
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+
+#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
+#define POLY1_AVX2_FMA(x, c0, c1) \
+ _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
+#define POLY2_AVX2_FMA(x, c0, c1, c2) \
+ _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
+#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
+ _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
+#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
+ _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
+#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
+ _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
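+
+// The POLYn_AVX2_FMA macros expand to a Horner-scheme evaluation, e.g.
+//   POLY2_AVX2_FMA(x, c0, c1, c2) == c0 + x * (c1 + x * c2)
+// with each step performed as a single fused multiply-add.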
+
+static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector,
+ const float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+ __m256 tmp, fx, mask, pow2n, z, y;
+ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m256i bias, exp, emm0, pi32_0x7f;
+
+ one = _mm256_set1_ps(1.0);
+ exp_hi = _mm256_set1_ps(88.3762626647949);
+ exp_lo = _mm256_set1_ps(-88.3762626647949);
+ ln2 = _mm256_set1_ps(0.6931471805);
+ log2EF = _mm256_set1_ps(1.44269504088896341);
+ half = _mm256_set1_ps(0.5);
+ exp_C1 = _mm256_set1_ps(0.693359375);
+ exp_C2 = _mm256_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm256_set1_epi32(0x7f);
+
+ exp_p0 = _mm256_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm256_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm256_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm256_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm256_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm256_set1_ps(5.0000001201e-1);
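+ // These are the well-known Cephes-style expf() constants: ln(2) is split into
+ // exp_C1 + exp_C2 so the range reduction x - n*ln(2) loses less precision, and
+ // exp_p0..exp_p5 are the polynomial coefficients for the reduced argument.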
+
+ for (; number < eighthPoints; number++) {
+ // First compute the logarithm
+ aVal = _mm256_load_ps(aPtr);
+ bias = _mm256_set1_epi32(127);
+ leadingOne = _mm256_set1_ps(1.0f);
+ exp = _mm256_sub_epi32(
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+ _mm256_set1_epi32(0x7f800000)),
+ 23),
+ bias);
+ logarithm = _mm256_cvtepi32_ps(exp);
+
+ frac = _mm256_or_ps(
+ leadingOne,
+ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
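+ // The two statements above pull the IEEE-754 fields of aVal apart: masking
+ // with 0x7f800000 and shifting right by 23 isolates the biased exponent, and
+ // subtracting the bias (127) gives the integer part of log2(aVal); OR-ing the
+ // mantissa bits with 1.0f yields frac in [1, 2), whose log2 is approximated
+ // by the polynomial selected below.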
+
+#if POW_POLY_DEGREE == 6
+ mantissa = POLY5_AVX2_FMA(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
+#elif POW_POLY_DEGREE == 5
+ mantissa = POLY4_AVX2_FMA(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
+#elif POW_POLY_DEGREE == 4
+ mantissa = POLY3_AVX2_FMA(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
+#elif POW_POLY_DEGREE == 3
+ mantissa = POLY2_AVX2_FMA(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
+#else
+#error
+#endif
+
+ logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
+ logarithm = _mm256_mul_ps(logarithm, ln2);
+
+ // Now calculate b*lna
+ bVal = _mm256_load_ps(bPtr);
+ bVal = _mm256_mul_ps(bVal, logarithm);
+
+ // Now compute exp(b*lna)
+ bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
+
+ fx = _mm256_fmadd_ps(bVal, log2EF, half);
+
+ emm0 = _mm256_cvttps_epi32(fx);
+ tmp = _mm256_cvtepi32_ps(emm0);
+
+ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
+ fx = _mm256_sub_ps(tmp, mask);
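+ // _mm256_cvttps_epi32 truncates toward zero; subtracting 1.0 wherever the
+ // truncated value exceeds fx turns the truncation into floor(fx).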
+
+ tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
+ bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
+ z = _mm256_mul_ps(bVal, bVal);
+
+ y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
+ y = _mm256_fmadd_ps(y, bVal, exp_p2);
+ y = _mm256_fmadd_ps(y, bVal, exp_p3);
+ y = _mm256_fmadd_ps(y, bVal, exp_p4);
+ y = _mm256_fmadd_ps(y, bVal, exp_p5);
+ y = _mm256_fmadd_ps(y, z, bVal);
+ y = _mm256_add_ps(y, one);
+
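+ // Reconstruct 2^fx by adding the exponent bias (0x7f) to the integer fx and
+ // shifting it into the IEEE-754 exponent field (bit 23).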
+ emm0 =
+ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
+
+ pow2n = _mm256_castsi256_ps(emm0);
+ cVal = _mm256_mul_ps(y, pow2n);
+
+ _mm256_store_ps(cPtr, cVal);
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = powf(*aPtr++, *bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
+#define POLY1_AVX2(x, c0, c1) \
+ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
+#define POLY2_AVX2(x, c0, c1, c2) \
+ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
+#define POLY3_AVX2(x, c0, c1, c2, c3) \
+ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
+#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
+ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
+#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
+ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
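+
+// Non-FMA variants of the same Horner-scheme macros: each step is an explicit
+// multiply followed by an add for targets without FMA support.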
+
+static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
+ const float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+ __m256 tmp, fx, mask, pow2n, z, y;
+ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m256i bias, exp, emm0, pi32_0x7f;
+
+ one = _mm256_set1_ps(1.0);
+ exp_hi = _mm256_set1_ps(88.3762626647949);
+ exp_lo = _mm256_set1_ps(-88.3762626647949);
+ ln2 = _mm256_set1_ps(0.6931471805);
+ log2EF = _mm256_set1_ps(1.44269504088896341);
+ half = _mm256_set1_ps(0.5);
+ exp_C1 = _mm256_set1_ps(0.693359375);
+ exp_C2 = _mm256_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm256_set1_epi32(0x7f);
+
+ exp_p0 = _mm256_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm256_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm256_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm256_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm256_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm256_set1_ps(5.0000001201e-1);
+
+ for (; number < eighthPoints; number++) {
+ // First compute the logarithm
+ aVal = _mm256_load_ps(aPtr);
+ bias = _mm256_set1_epi32(127);
+ leadingOne = _mm256_set1_ps(1.0f);
+ exp = _mm256_sub_epi32(
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+ _mm256_set1_epi32(0x7f800000)),
+ 23),
+ bias);
+ logarithm = _mm256_cvtepi32_ps(exp);
+
+ frac = _mm256_or_ps(
+ leadingOne,
+ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+
+#if POW_POLY_DEGREE == 6
+ mantissa = POLY5_AVX2(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
+#elif POW_POLY_DEGREE == 5
+ mantissa = POLY4_AVX2(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
+#elif POW_POLY_DEGREE == 4
+ mantissa = POLY3_AVX2(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
+#elif POW_POLY_DEGREE == 3
+ mantissa = POLY2_AVX2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
+#else
+#error
+#endif
+
+ logarithm = _mm256_add_ps(
+ _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
+ logarithm = _mm256_mul_ps(logarithm, ln2);
+
+ // Now calculate b*lna
+ bVal = _mm256_load_ps(bPtr);
+ bVal = _mm256_mul_ps(bVal, logarithm);
+
+ // Now compute exp(b*lna)
+ bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
+
+ fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
+
+ emm0 = _mm256_cvttps_epi32(fx);
+ tmp = _mm256_cvtepi32_ps(emm0);
+
+ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
+ fx = _mm256_sub_ps(tmp, mask);
+
+ tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
+ bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
+ z = _mm256_mul_ps(bVal, bVal);
+
+ y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
+ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
+ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
+ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
+ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
+ y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
+ y = _mm256_add_ps(y, one);
+
+ emm0 =
+ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
+
+ pow2n = _mm256_castsi256_ps(emm0);
+ cVal = _mm256_mul_ps(y, pow2n);
+
+ _mm256_store_ps(cPtr, cVal);
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = powf(*aPtr++, *bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 for aligned */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) \
+ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) \
+ _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) \
+ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector,
+ const float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+ __m128 tmp, fx, mask, pow2n, z, y;
+ __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m128i bias, exp, emm0, pi32_0x7f;
+
+ one = _mm_set1_ps(1.0);
+ exp_hi = _mm_set1_ps(88.3762626647949);
+ exp_lo = _mm_set1_ps(-88.3762626647949);
+ ln2 = _mm_set1_ps(0.6931471805);
+ log2EF = _mm_set1_ps(1.44269504088896341);
+ half = _mm_set1_ps(0.5);
+ exp_C1 = _mm_set1_ps(0.693359375);
+ exp_C2 = _mm_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm_set1_epi32(0x7f);
+
+ exp_p0 = _mm_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm_set1_ps(5.0000001201e-1);
+
+ for (; number < quarterPoints; number++) {
+ // First compute the logarithm
+ aVal = _mm_load_ps(aPtr);
+ bias = _mm_set1_epi32(127);
+ leadingOne = _mm_set1_ps(1.0f);
+ exp = _mm_sub_epi32(
+ _mm_srli_epi32(
+ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
+ bias);
+ logarithm = _mm_cvtepi32_ps(exp);
+
+ frac = _mm_or_ps(leadingOne,
+ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+
+#if POW_POLY_DEGREE == 6
+ mantissa = POLY5(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
+#elif POW_POLY_DEGREE == 5
+ mantissa = POLY4(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
+#elif POW_POLY_DEGREE == 4
+ mantissa = POLY3(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
+#elif POW_POLY_DEGREE == 3
+ mantissa = POLY2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
+#else
+#error
+#endif
+
+ logarithm =
+ _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+ logarithm = _mm_mul_ps(logarithm, ln2);
+
+
+ // Now calculate b*lna
+ bVal = _mm_load_ps(bPtr);
+ bVal = _mm_mul_ps(bVal, logarithm);
+
+ // Now compute exp(b*lna)
+ bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
+
+ fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
+
+ emm0 = _mm_cvttps_epi32(fx);
+ tmp = _mm_cvtepi32_ps(emm0);
+
+ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+ fx = _mm_sub_ps(tmp, mask);
+
+ tmp = _mm_mul_ps(fx, exp_C1);
+ z = _mm_mul_ps(fx, exp_C2);
+ bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
+ z = _mm_mul_ps(bVal, bVal);
+
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
+ y = _mm_add_ps(y, one);
+
+ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
+
+ pow2n = _mm_castsi128_ps(emm0);
+ cVal = _mm_mul_ps(y, pow2n);
+
+ _mm_store_ps(cPtr, cVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = powf(*aPtr++, *bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
+#define INCLUDED_volk_32f_x2_pow_32f_u_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define POW_POLY_DEGREE 3
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_x2_pow_32f_generic(float* cVector,
+ const float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = powf(*aPtr++, *bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) \
+ _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) \
+ _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) \
+ _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector,
+ const float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+ __m128 tmp, fx, mask, pow2n, z, y;
+ __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m128i bias, exp, emm0, pi32_0x7f;
+
+ one = _mm_set1_ps(1.0);
+ exp_hi = _mm_set1_ps(88.3762626647949);
+ exp_lo = _mm_set1_ps(-88.3762626647949);
+ ln2 = _mm_set1_ps(0.6931471805);
+ log2EF = _mm_set1_ps(1.44269504088896341);
+ half = _mm_set1_ps(0.5);
+ exp_C1 = _mm_set1_ps(0.693359375);
+ exp_C2 = _mm_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm_set1_epi32(0x7f);
+
+ exp_p0 = _mm_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm_set1_ps(5.0000001201e-1);
+
+ for (; number < quarterPoints; number++) {
+ // First compute the logarithm
+ aVal = _mm_loadu_ps(aPtr);
+ bias = _mm_set1_epi32(127);
+ leadingOne = _mm_set1_ps(1.0f);
+ exp = _mm_sub_epi32(
+ _mm_srli_epi32(
+ _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
+ bias);
+ logarithm = _mm_cvtepi32_ps(exp);
+
+ frac = _mm_or_ps(leadingOne,
+ _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+
+#if POW_POLY_DEGREE == 6
+ mantissa = POLY5(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
+#elif POW_POLY_DEGREE == 5
+ mantissa = POLY4(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
+#elif POW_POLY_DEGREE == 4
+ mantissa = POLY3(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
+#elif POW_POLY_DEGREE == 3
+ mantissa = POLY2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
+#else
+#error
+#endif
+
+ logarithm =
+ _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
+ logarithm = _mm_mul_ps(logarithm, ln2);
+
+
+ // Now calculate b*lna
+ bVal = _mm_loadu_ps(bPtr);
+ bVal = _mm_mul_ps(bVal, logarithm);
+
+ // Now compute exp(b*lna)
+ bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
+
+ fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
+
+ emm0 = _mm_cvttps_epi32(fx);
+ tmp = _mm_cvtepi32_ps(emm0);
+
+ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+ fx = _mm_sub_ps(tmp, mask);
+
+ tmp = _mm_mul_ps(fx, exp_C1);
+ z = _mm_mul_ps(fx, exp_C2);
+ bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
+ z = _mm_mul_ps(bVal, bVal);
+
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
+ y = _mm_add_ps(y, one);
+
+ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
+
+ pow2n = _mm_castsi128_ps(emm0);
+ cVal = _mm_mul_ps(y, pow2n);
+
+ _mm_storeu_ps(cPtr, cVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = powf(*aPtr++, *bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+
+#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
+#define POLY1_AVX2_FMA(x, c0, c1) \
+ _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
+#define POLY2_AVX2_FMA(x, c0, c1, c2) \
+ _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
+#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
+ _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
+#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
+ _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
+#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
+ _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
+
+static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
+ const float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+ __m256 tmp, fx, mask, pow2n, z, y;
+ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m256i bias, exp, emm0, pi32_0x7f;
+
+ one = _mm256_set1_ps(1.0);
+ exp_hi = _mm256_set1_ps(88.3762626647949);
+ exp_lo = _mm256_set1_ps(-88.3762626647949);
+ ln2 = _mm256_set1_ps(0.6931471805);
+ log2EF = _mm256_set1_ps(1.44269504088896341);
+ half = _mm256_set1_ps(0.5);
+ exp_C1 = _mm256_set1_ps(0.693359375);
+ exp_C2 = _mm256_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm256_set1_epi32(0x7f);
+
+ exp_p0 = _mm256_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm256_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm256_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm256_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm256_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm256_set1_ps(5.0000001201e-1);
+
+ for (; number < eighthPoints; number++) {
+ // First compute the logarithm
+ aVal = _mm256_loadu_ps(aPtr);
+ bias = _mm256_set1_epi32(127);
+ leadingOne = _mm256_set1_ps(1.0f);
+ exp = _mm256_sub_epi32(
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+ _mm256_set1_epi32(0x7f800000)),
+ 23),
+ bias);
+ logarithm = _mm256_cvtepi32_ps(exp);
+
+ frac = _mm256_or_ps(
+ leadingOne,
+ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+
+#if POW_POLY_DEGREE == 6
+ mantissa = POLY5_AVX2_FMA(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
+#elif POW_POLY_DEGREE == 5
+ mantissa = POLY4_AVX2_FMA(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
+#elif POW_POLY_DEGREE == 4
+ mantissa = POLY3_AVX2_FMA(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
+#elif POW_POLY_DEGREE == 3
+ mantissa = POLY2_AVX2_FMA(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
+#else
+#error
+#endif
+
+ logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
+ logarithm = _mm256_mul_ps(logarithm, ln2);
+
+
+ // Now calculate b*lna
+ bVal = _mm256_loadu_ps(bPtr);
+ bVal = _mm256_mul_ps(bVal, logarithm);
+
+ // Now compute exp(b*lna)
+ bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
+
+ fx = _mm256_fmadd_ps(bVal, log2EF, half);
+
+ emm0 = _mm256_cvttps_epi32(fx);
+ tmp = _mm256_cvtepi32_ps(emm0);
+
+ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
+ fx = _mm256_sub_ps(tmp, mask);
+
+ tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
+ bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
+ z = _mm256_mul_ps(bVal, bVal);
+
+ y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
+ y = _mm256_fmadd_ps(y, bVal, exp_p2);
+ y = _mm256_fmadd_ps(y, bVal, exp_p3);
+ y = _mm256_fmadd_ps(y, bVal, exp_p4);
+ y = _mm256_fmadd_ps(y, bVal, exp_p5);
+ y = _mm256_fmadd_ps(y, z, bVal);
+ y = _mm256_add_ps(y, one);
+
+ emm0 =
+ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
+
+ pow2n = _mm256_castsi256_ps(emm0);
+ cVal = _mm256_mul_ps(y, pow2n);
+
+ _mm256_storeu_ps(cPtr, cVal);
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = powf(*aPtr++, *bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
+#define POLY1_AVX2(x, c0, c1) \
+ _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
+#define POLY2_AVX2(x, c0, c1, c2) \
+ _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
+#define POLY3_AVX2(x, c0, c1, c2, c3) \
+ _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
+#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
+ _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
+#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
+ _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
+
+static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector,
+ const float* bVector,
+ const float* aVector,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+ __m256 tmp, fx, mask, pow2n, z, y;
+ __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+ __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m256i bias, exp, emm0, pi32_0x7f;
+
+ one = _mm256_set1_ps(1.0);
+ exp_hi = _mm256_set1_ps(88.3762626647949);
+ exp_lo = _mm256_set1_ps(-88.3762626647949);
+ ln2 = _mm256_set1_ps(0.6931471805);
+ log2EF = _mm256_set1_ps(1.44269504088896341);
+ half = _mm256_set1_ps(0.5);
+ exp_C1 = _mm256_set1_ps(0.693359375);
+ exp_C2 = _mm256_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm256_set1_epi32(0x7f);
+
+ exp_p0 = _mm256_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm256_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm256_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm256_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm256_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm256_set1_ps(5.0000001201e-1);
+
+ for (; number < eighthPoints; number++) {
+ // First compute the logarithm
+ aVal = _mm256_loadu_ps(aPtr);
+ bias = _mm256_set1_epi32(127);
+ leadingOne = _mm256_set1_ps(1.0f);
+ exp = _mm256_sub_epi32(
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
+ _mm256_set1_epi32(0x7f800000)),
+ 23),
+ bias);
+ logarithm = _mm256_cvtepi32_ps(exp);
+
+ frac = _mm256_or_ps(
+ leadingOne,
+ _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
+
+#if POW_POLY_DEGREE == 6
+ mantissa = POLY5_AVX2(frac,
+ 3.1157899f,
+ -3.3241990f,
+ 2.5988452f,
+ -1.2315303f,
+ 3.1821337e-1f,
+ -3.4436006e-2f);
+#elif POW_POLY_DEGREE == 5
+ mantissa = POLY4_AVX2(frac,
+ 2.8882704548164776201f,
+ -2.52074962577807006663f,
+ 1.48116647521213171641f,
+ -0.465725644288844778798f,
+ 0.0596515482674574969533f);
+#elif POW_POLY_DEGREE == 4
+ mantissa = POLY3_AVX2(frac,
+ 2.61761038894603480148f,
+ -1.75647175389045657003f,
+ 0.688243882994381274313f,
+ -0.107254423828329604454f);
+#elif POW_POLY_DEGREE == 3
+ mantissa = POLY2_AVX2(frac,
+ 2.28330284476918490682f,
+ -1.04913055217340124191f,
+ 0.204446009836232697516f);
+#else
+#error
+#endif
+
+ logarithm = _mm256_add_ps(
+ _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
+ logarithm = _mm256_mul_ps(logarithm, ln2);
+
+ // Now calculate b*lna
+ bVal = _mm256_loadu_ps(bPtr);
+ bVal = _mm256_mul_ps(bVal, logarithm);
+
+ // Now compute exp(b*lna)
+ bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
+
+ fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
+
+ emm0 = _mm256_cvttps_epi32(fx);
+ tmp = _mm256_cvtepi32_ps(emm0);
+
+ mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
+ fx = _mm256_sub_ps(tmp, mask);
+
+ tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
+ bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
+ z = _mm256_mul_ps(bVal, bVal);
+
+ y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
+ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
+ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
+ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
+ y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
+ y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
+ y = _mm256_add_ps(y, one);
+
+ emm0 =
+ _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
+
+ pow2n = _mm256_castsi256_ps(emm0);
+ cVal = _mm256_mul_ps(y, pow2n);
+
+ _mm256_storeu_ps(cPtr, cVal);
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = powf(*aPtr++, *bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX2 for unaligned */
+
+#endif /* INCLUDED_volk_32f_x2_pow_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_x2_s32f_interleave_16ic
+ *
+ * \b Overview
+ *
+ * Takes the input vector iBuffer as the real (in-phase) part and the input
+ * vector qBuffer as the imaginary (quadrature) part and combines them into
+ * a complex output vector. The output is scaled by the input scalar value
+ * and converted to 16-bit short complex numbers.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_s32f_interleave_16ic(lv_16sc_t* complexVector, const float* iBuffer,
+ * const float* qBuffer, const float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li iBuffer: Input vector of samples for the real part.
+ * \li qBuffer: Input vector of samples for the imaginary part.
+ * \li scalar: The scalar value used to scale the values before converting to shorts.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li complexVector: The output vector of complex numbers.
+ *
+ * \b Example
+ * Generate points around the unit circle and convert to complex integers.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* imag = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* real = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * lv_16sc_t* out = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * real[ii] = 2.f * ((float)ii / (float)N) - 1.f;
+ * imag[ii] = std::sqrt(1.f - real[ii] * real[ii]);
+ * }
+ * // Normalize by smallest delta (0.02 in this example)
+ * float scale = 50.f;
+ *
+ * volk_32f_x2_s32f_interleave_16ic(out, imag, real, scale, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %i + %ij\n", ii, std::real(out[ii]), std::imag(out[ii]));
+ * }
+ *
+ * volk_free(imag);
+ * volk_free(real);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
+#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 iValue, qValue, cplxValue1, cplxValue2;
+ __m256i intValue1, intValue2;
+
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+ for (; number < eighthPoints; number++) {
+ iValue = _mm256_load_ps(iBufferPtr);
+ qValue = _mm256_load_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
+ cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
+
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
+ cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
+
+ intValue1 = _mm256_cvtps_epi32(cplxValue1);
+ intValue2 = _mm256_cvtps_epi32(cplxValue2);
+
+ intValue1 = _mm256_packs_epi32(intValue1, intValue2);
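+ // _mm256_packs_epi32 narrows to int16 with signed saturation and operates per
+ // 128-bit lane; combined with the lane-wise unpacklo/unpackhi above, this
+ // leaves the interleaved i/q pairs in their natural order.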
+
+ _mm256_store_si256((__m256i*)complexVectorPtr, intValue1);
+ complexVectorPtr += 16;
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ complexVectorPtr = (int16_t*)(&complexVector[number]);
+ for (; number < num_points; number++) {
+ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 iValue, qValue, cplxValue1, cplxValue2;
+ __m128i intValue1, intValue2;
+
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+ for (; number < quarterPoints; number++) {
+ iValue = _mm_load_ps(iBufferPtr);
+ qValue = _mm_load_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
+ cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
+
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
+ cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
+
+ intValue1 = _mm_cvtps_epi32(cplxValue1);
+ intValue2 = _mm_cvtps_epi32(cplxValue2);
+
+ intValue1 = _mm_packs_epi32(intValue1, intValue2);
+
+ _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
+ complexVectorPtr += 8;
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)(&complexVector[number]);
+ for (; number < num_points; number++) {
+ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 iValue, qValue, cplxValue;
+
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
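+ // This SSE-only path avoids SSE2 integer intrinsics: the scaled values are
+ // written to an aligned scratch buffer and converted one at a time with rintf().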
+
+ for (; number < quarterPoints; number++) {
+ iValue = _mm_load_ps(iBufferPtr);
+ qValue = _mm_load_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue = _mm_unpacklo_ps(iValue, qValue);
+ cplxValue = _mm_mul_ps(cplxValue, vScalar);
+
+ _mm_store_ps(floatBuffer, cplxValue);
+
+ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
+ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
+ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
+ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
+
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue = _mm_unpackhi_ps(iValue, qValue);
+ cplxValue = _mm_mul_ps(cplxValue, vScalar);
+
+ _mm_store_ps(floatBuffer, cplxValue);
+
+ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
+ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
+ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
+ *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ complexVectorPtr = (int16_t*)(&complexVector[number]);
+ for (; number < num_points; number++) {
+ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ const float scalar,
+ unsigned int num_points)
+{
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H */
+
+#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
+#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector,
+ const float* iBuffer,
+ const float* qBuffer,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* iBufferPtr = iBuffer;
+ const float* qBufferPtr = qBuffer;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 iValue, qValue, cplxValue1, cplxValue2;
+ __m256i intValue1, intValue2;
+
+ int16_t* complexVectorPtr = (int16_t*)complexVector;
+
+ for (; number < eighthPoints; number++) {
+ iValue = _mm256_loadu_ps(iBufferPtr);
+ qValue = _mm256_loadu_ps(qBufferPtr);
+
+ // Interleaves the lower two values in the i and q variables into one buffer
+ cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
+ cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
+
+ // Interleaves the upper two values in the i and q variables into one buffer
+ cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
+ cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
+
+ intValue1 = _mm256_cvtps_epi32(cplxValue1);
+ intValue2 = _mm256_cvtps_epi32(cplxValue2);
+
+ intValue1 = _mm256_packs_epi32(intValue1, intValue2);
+
+ _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1);
+ complexVectorPtr += 16;
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ complexVectorPtr = (int16_t*)(&complexVector[number]);
+ for (; number < num_points; number++) {
+ *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
+ *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_x2_subtract_32f
+ *
+ * \b Overview
+ *
+ * Subtracts values in bVector from values in aVector.
+ *
+ * c[i] = a[i] - b[i]
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x2_subtract_32f(float* cVector, const float* aVector, const float*
+ * bVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: The initial vector.
+ * \li bVector: The vector to be subtracted.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * Subtract an increasing vector from a decreasing vector.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (float)ii;
+ * decreasing[ii] = 10.f - (float)ii;
+ * }
+ *
+ * volk_32f_x2_subtract_32f(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32f_x2_subtract_32f_a_H
+#define INCLUDED_volk_32f_x2_subtract_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_32f_x2_subtract_32f_a_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+
+ aVal = _mm512_load_ps(aPtr);
+ bVal = _mm512_load_ps(bPtr);
+
+ cVal = _mm512_sub_ps(aVal, bVal);
+
+ _mm512_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_subtract_32f_a_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_load_ps(aPtr);
+ bVal = _mm256_load_ps(bPtr);
+
+ cVal = _mm256_sub_ps(aVal, bVal);
+
+ _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32f_x2_subtract_32f_a_sse(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_sub_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_x2_subtract_32f_generic(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32f_x2_subtract_32f_neon(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ float32x4_t a_vec, b_vec, c_vec;
+
+ for (number = 0; number < quarter_points; number++) {
+ a_vec = vld1q_f32(aPtr);
+ b_vec = vld1q_f32(bPtr);
+ c_vec = vsubq_f32(a_vec, b_vec);
+ vst1q_f32(cPtr, c_vec);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_ORC
+extern void volk_32f_x2_subtract_32f_a_orc_impl(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points);
+
+static inline void volk_32f_x2_subtract_32f_u_orc(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ volk_32f_x2_subtract_32f_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32f_x2_subtract_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_x2_subtract_32f_u_H
+#define INCLUDED_volk_32f_x2_subtract_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_32f_x2_subtract_32f_u_avx512f(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m512 aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+
+ aVal = _mm512_loadu_ps(aPtr);
+ bVal = _mm512_loadu_ps(bPtr);
+
+ cVal = _mm512_sub_ps(aVal, bVal);
+
+ _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x2_subtract_32f_u_avx(float* cVector,
+ const float* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* cPtr = cVector;
+ const float* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m256 aVal, bVal, cVal;
+ for (; number < eighthPoints; number++) {
+
+ aVal = _mm256_loadu_ps(aPtr);
+ bVal = _mm256_loadu_ps(bPtr);
+
+ cVal = _mm256_sub_ps(aVal, bVal);
+
+ _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) - (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_x2_subtract_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32f_x3_sum_of_poly_32f
+ *
+ * \b Overview
+ *
+ * Calculates the unscaled area under a fourth order polynomial using the
+ * rectangular method. The result is the sum of y-values. To get the area,
+ * multiply by the rectangle/bin width.
+ *
+ * Expressed as a formula, this function calculates
+ * \f$ \sum f(x) = \sum (c_0 + c_1 \cdot x + c_2 \cdot x^2 + c_3 \cdot x^3 + c_4 \cdot
+ * x^4)\f$
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_x3_sum_of_poly_32f(float* target, float* src0, float* center_point_array,
+ * float* cutoff, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li src0: x values
+ * \li center_point_array: polynomial coefficients in order {c1, c2, c3, c4, c0}
+ * \li cutoff: the minimum x value to use (will clamp to cutoff if input < cutoff)
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li target: The sum of y-values generated by the polynomial.
+ *
+ * \b Example
+ * The following estimates \f$\int_0^\pi e^x dx\f$ using the fourth-order Taylor
+ * expansion centered at \f$x = 1.5\f$,
+ * \f$ e^x \approx e^{1.5} \left(1 + u + \frac{1}{2} u^2 + \frac{1}{6} u^3 + \frac{1}{24} u^4\right) \f$
+ * where \f$ u = x - 1.5 \f$ (the input samples below are generated already shifted by 1.5).
+ * \code
+ * int npoints = 4096;
+ * float* coefficients = (float*)volk_malloc(sizeof(float) * 5, volk_get_alignment());
+ * float* input = (float*)volk_malloc(sizeof(float) * npoints, volk_get_alignment());
+ * float* result = (float*)volk_malloc(sizeof(float), volk_get_alignment());
+ * float* cutoff = (float*)volk_malloc(sizeof(float), volk_get_alignment());
+ * // load precomputed Taylor series coefficients
+ * coefficients[0] = 4.48168907033806f; // c1
+ * coefficients[1] = coefficients[0] * 0.5f; // c2
+ * coefficients[2] = coefficients[0] * 1.0f/6.0f; // c3
+ * coefficients[3] = coefficients[0] * 1.0f/24.0f; // c4
+ * coefficients[4] = coefficients[0]; // c0
+ * *cutoff = -2.0;
+ * *result = 0.0f;
+ * // generate uniform input data
+ * float dx = (float)M_PI/ (float)npoints;
+ * for(unsigned int ii=0; ii < npoints; ++ii){
+ * input[ii] = dx * (float)ii - 1.5f;
+ * }
+ * volk_32f_x3_sum_of_poly_32f(result, input, coefficients, cutoff, npoints);
+ * // multiply by bin width to get approximate area
+ * std::cout << "result is " << *result * (input[1]-input[0]) << std::endl;
+ * volk_free(coefficients);
+ * volk_free(input);
+ * volk_free(result);
+ * volk_free(cutoff);
+ * \endcode
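+ *
+ * For reference, a scalar sketch of what this kernel computes, assuming the
+ * coefficient ordering {c1, c2, c3, c4, c0} described above:
+ * \code
+ * float sum = 0.0f;
+ * for (unsigned int i = 0; i < num_points; ++i) {
+ *     float x = (src0[i] > *cutoff) ? src0[i] : *cutoff; // clamp to cutoff
+ *     sum += center_point_array[0] * x + center_point_array[1] * x * x +
+ *            center_point_array[2] * x * x * x + center_point_array[3] * x * x * x * x;
+ * }
+ * sum += (float)num_points * center_point_array[4]; // constant term, added once per point
+ * *target = sum;
+ * \endcode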
+ */
+
+#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
+#define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifndef MAX
+#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
+#endif
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <xmmintrin.h>
+
+static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target,
+ float* src0,
+ float* center_point_array,
+ float* cutoff,
+ unsigned int num_points)
+{
+ float result = 0.0f;
+ float fst = 0.0f;
+ float sq = 0.0f;
+ float thrd = 0.0f;
+ float frth = 0.0f;
+
+ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
+
+ xmm9 = _mm_setzero_ps();
+ xmm1 = _mm_setzero_ps();
+ xmm0 = _mm_load1_ps(&center_point_array[0]);
+ xmm6 = _mm_load1_ps(&center_point_array[1]);
+ xmm7 = _mm_load1_ps(&center_point_array[2]);
+ xmm8 = _mm_load1_ps(&center_point_array[3]);
+ xmm10 = _mm_load1_ps(cutoff);
+
+ int bound = num_points / 8;
+ int leftovers = num_points - 8 * bound;
+ int i = 0;
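+ // process eight samples per iteration, keeping two independent accumulators
+ // (xmm9 and xmm1) that are combined by the horizontal adds at the end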
+ for (; i < bound; ++i) {
+ // 1st
+ xmm2 = _mm_load_ps(src0);
+ xmm2 = _mm_max_ps(xmm10, xmm2);
+ xmm3 = _mm_mul_ps(xmm2, xmm2);
+ xmm4 = _mm_mul_ps(xmm2, xmm3);
+ xmm5 = _mm_mul_ps(xmm3, xmm3);
+
+ xmm2 = _mm_mul_ps(xmm2, xmm0);
+ xmm3 = _mm_mul_ps(xmm3, xmm6);
+ xmm4 = _mm_mul_ps(xmm4, xmm7);
+ xmm5 = _mm_mul_ps(xmm5, xmm8);
+
+ xmm2 = _mm_add_ps(xmm2, xmm3);
+ xmm3 = _mm_add_ps(xmm4, xmm5);
+
+ src0 += 4;
+
+ xmm9 = _mm_add_ps(xmm2, xmm9);
+ xmm9 = _mm_add_ps(xmm3, xmm9);
+
+ // 2nd
+ xmm2 = _mm_load_ps(src0);
+ xmm2 = _mm_max_ps(xmm10, xmm2);
+ xmm3 = _mm_mul_ps(xmm2, xmm2);
+ xmm4 = _mm_mul_ps(xmm2, xmm3);
+ xmm5 = _mm_mul_ps(xmm3, xmm3);
+
+ xmm2 = _mm_mul_ps(xmm2, xmm0);
+ xmm3 = _mm_mul_ps(xmm3, xmm6);
+ xmm4 = _mm_mul_ps(xmm4, xmm7);
+ xmm5 = _mm_mul_ps(xmm5, xmm8);
+
+ xmm2 = _mm_add_ps(xmm2, xmm3);
+ xmm3 = _mm_add_ps(xmm4, xmm5);
+
+ src0 += 4;
+
+ xmm1 = _mm_add_ps(xmm2, xmm1);
+ xmm1 = _mm_add_ps(xmm3, xmm1);
+ }
+ xmm2 = _mm_hadd_ps(xmm9, xmm1);
+ xmm3 = _mm_hadd_ps(xmm2, xmm2);
+ xmm4 = _mm_hadd_ps(xmm3, xmm3);
+ _mm_store_ss(&result, xmm4);
+
+ for (i = 0; i < leftovers; ++i) {
+ fst = *src0++;
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+ result += (center_point_array[0] * fst + center_point_array[1] * sq +
+ center_point_array[2] * thrd + center_point_array[3] * frth);
+ }
+
+ result += (float)(num_points)*center_point_array[4];
+ *target = result;
+}
+
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_AVX && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32f_x3_sum_of_poly_32f_a_avx2_fma(float* target,
+ float* src0,
+ float* center_point_array,
+ float* cutoff,
+ unsigned int num_points)
+{
+ const unsigned int eighth_points = num_points / 8;
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+
+ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+ __m256 target_vec;
+ __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+
+ cpa0 = _mm256_set1_ps(center_point_array[0]);
+ cpa1 = _mm256_set1_ps(center_point_array[1]);
+ cpa2 = _mm256_set1_ps(center_point_array[2]);
+ cpa3 = _mm256_set1_ps(center_point_array[3]);
+ cutoff_vec = _mm256_set1_ps(*cutoff);
+ target_vec = _mm256_setzero_ps();
+
+ unsigned int i;
+
+ for (i = 0; i < eighth_points; ++i) {
+ x_to_1 = _mm256_load_ps(src0);
+ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+ // x^1 * x^3 is slightly faster than x^2 * x^2
+ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+
+ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+
+ x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
+ x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
+ // this is slightly faster than result += (x_to_1 + x_to_3)
+ target_vec = _mm256_add_ps(x_to_1, target_vec);
+ target_vec = _mm256_add_ps(x_to_3, target_vec);
+
+ src0 += 8;
+ }
+
+ // the hadd for vector reduction has very very slight impact @ 50k iters
+ __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+ target_vec = _mm256_hadd_ps(
+ target_vec,
+ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+ _mm256_store_ps(temp_results, target_vec);
+ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
+
+ for (i = eighth_points * 8; i < num_points; ++i) {
+ fst = *src0++;
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+ *target += (center_point_array[0] * fst + center_point_array[1] * sq +
+ center_point_array[2] * thrd + center_point_array[3] * frth);
+ }
+ *target += (float)(num_points)*center_point_array[4];
+}
+#endif // LV_HAVE_AVX && LV_HAVE_FMA
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x3_sum_of_poly_32f_a_avx(float* target,
+ float* src0,
+ float* center_point_array,
+ float* cutoff,
+ unsigned int num_points)
+{
+ const unsigned int eighth_points = num_points / 8;
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+
+ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+ __m256 target_vec;
+ __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+
+ cpa0 = _mm256_set1_ps(center_point_array[0]);
+ cpa1 = _mm256_set1_ps(center_point_array[1]);
+ cpa2 = _mm256_set1_ps(center_point_array[2]);
+ cpa3 = _mm256_set1_ps(center_point_array[3]);
+ cutoff_vec = _mm256_set1_ps(*cutoff);
+ target_vec = _mm256_setzero_ps();
+
+ unsigned int i;
+
+ for (i = 0; i < eighth_points; ++i) {
+ x_to_1 = _mm256_load_ps(src0);
+ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+ // x^1 * x^3 is slightly faster than x^2 * x^2
+ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+
+ x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
+ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+ x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
+ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+
+ x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
+ x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
+ // this is slightly faster than result += (x_to_1 + x_to_3)
+ target_vec = _mm256_add_ps(x_to_1, target_vec);
+ target_vec = _mm256_add_ps(x_to_3, target_vec);
+
+ src0 += 8;
+ }
+
+ // the hadd for vector reduction has very very slight impact @ 50k iters
+ __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+ target_vec = _mm256_hadd_ps(
+ target_vec,
+ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+ _mm256_store_ps(temp_results, target_vec);
+ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
+
+ for (i = eighth_points * 8; i < num_points; ++i) {
+ fst = *src0++;
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+ *target += (center_point_array[0] * fst + center_point_array[1] * sq +
+ center_point_array[2] * thrd + center_point_array[3] * frth);
+ }
+ *target += (float)(num_points)*center_point_array[4];
+}
+#endif // LV_HAVE_AVX
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target,
+ float* src0,
+ float* center_point_array,
+ float* cutoff,
+ unsigned int num_points)
+{
+ const unsigned int eighth_points = num_points / 8;
+
+ float result[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
+ float fst = 0.0f;
+ float sq = 0.0f;
+ float thrd = 0.0f;
+ float frth = 0.0f;
+
+ unsigned int i = 0;
+ unsigned int k = 0;
+ for (i = 0; i < eighth_points; ++i) {
+ for (k = 0; k < 8; ++k) {
+ fst = *src0++;
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = fst * thrd;
+ result[k] += center_point_array[0] * fst + center_point_array[1] * sq;
+ result[k] += center_point_array[2] * thrd + center_point_array[3] * frth;
+ }
+ }
+ for (k = 0; k < 8; k += 2)
+ result[k] = result[k] + result[k + 1];
+
+ *target = result[0] + result[2] + result[4] + result[6];
+
+ for (i = eighth_points * 8; i < num_points; ++i) {
+ fst = *src0++;
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = fst * thrd;
+ *target += (center_point_array[0] * fst + center_point_array[1] * sq +
+ center_point_array[2] * thrd + center_point_array[3] * frth);
+ }
+ *target += (float)(num_points)*center_point_array[4];
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32f_x3_sum_of_poly_32f_a_neon(float* __restrict target,
+ float* __restrict src0,
+ float* __restrict center_point_array,
+ float* __restrict cutoff,
+ unsigned int num_points)
+{
+ unsigned int i;
+ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+
+ float32x2_t x_to_1, x_to_2, x_to_3, x_to_4;
+ float32x2_t cutoff_vector;
+ float32x2x2_t x_low, x_high;
+ float32x4_t x_qvector, c_qvector, cpa_qvector;
+ float accumulator;
+ float res_accumulators[4];
+
+ c_qvector = vld1q_f32(zero);
+ // load the cutoff in to a vector
+ cutoff_vector = vdup_n_f32(*cutoff);
+ // ... center point array
+ cpa_qvector = vld1q_f32(center_point_array);
+
+ for (i = 0; i < num_points; ++i) {
+ // load x (src0)
+ x_to_1 = vdup_n_f32(*src0++);
+
+ // Get a vector of max(src0, cutoff)
+ x_to_1 = vmax_f32(x_to_1, cutoff_vector); // x^1
+ x_to_2 = vmul_f32(x_to_1, x_to_1); // x^2
+ x_to_3 = vmul_f32(x_to_2, x_to_1); // x^3
+ x_to_4 = vmul_f32(x_to_3, x_to_1); // x^4
+ // zip up doubles to interleave
+ x_low = vzip_f32(x_to_1, x_to_2); // [x^2 | x^1 || x^2 | x^1]
+ x_high = vzip_f32(x_to_3, x_to_4); // [x^4 | x^3 || x^4 | x^3]
+ // float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
+ x_qvector = vcombine_f32(x_low.val[0], x_high.val[0]);
+ // now we finally have [x^4 | x^3 | x^2 | x] !
+
+ c_qvector = vmlaq_f32(c_qvector, x_qvector, cpa_qvector);
+ }
+ // there should be better vector reduction techniques
+ vst1q_f32(res_accumulators, c_qvector);
+ accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
+ res_accumulators[3];
+
+ *target = accumulator + (float)num_points * center_point_array[4];
+}
+
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_NEON
+
+static inline void
+volk_32f_x3_sum_of_poly_32f_neonvert(float* __restrict target,
+ float* __restrict src0,
+ float* __restrict center_point_array,
+ float* __restrict cutoff,
+ unsigned int num_points)
+{
+ unsigned int i;
+ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+
+ float accumulator;
+
+ float32x4_t accumulator1_vec, accumulator2_vec, accumulator3_vec, accumulator4_vec;
+ accumulator1_vec = vld1q_f32(zero);
+ accumulator2_vec = vld1q_f32(zero);
+ accumulator3_vec = vld1q_f32(zero);
+ accumulator4_vec = vld1q_f32(zero);
+ float32x4_t x_to_1, x_to_2, x_to_3, x_to_4;
+ float32x4_t cutoff_vector, cpa_0, cpa_1, cpa_2, cpa_3;
+
+ // load the cutoff in to a vector
+ cutoff_vector = vdupq_n_f32(*cutoff);
+ // ... center point array
+ cpa_0 = vdupq_n_f32(center_point_array[0]);
+ cpa_1 = vdupq_n_f32(center_point_array[1]);
+ cpa_2 = vdupq_n_f32(center_point_array[2]);
+ cpa_3 = vdupq_n_f32(center_point_array[3]);
+
+ // nathan is not sure why this is slower *and* wrong compared to neonvertfma
+ for (i = 0; i < num_points / 4; ++i) {
+ // load x
+ x_to_1 = vld1q_f32(src0);
+
+ // Get a vector of max(src0, cutoff)
+ x_to_1 = vmaxq_f32(x_to_1, cutoff_vector); // x^1
+ x_to_2 = vmulq_f32(x_to_1, x_to_1); // x^2
+ x_to_3 = vmulq_f32(x_to_2, x_to_1); // x^3
+ x_to_4 = vmulq_f32(x_to_3, x_to_1); // x^4
+ x_to_1 = vmulq_f32(x_to_1, cpa_0);
+ x_to_2 = vmulq_f32(x_to_2, cpa_1);
+ x_to_3 = vmulq_f32(x_to_3, cpa_2);
+ x_to_4 = vmulq_f32(x_to_4, cpa_3);
+ accumulator1_vec = vaddq_f32(accumulator1_vec, x_to_1);
+ accumulator2_vec = vaddq_f32(accumulator2_vec, x_to_2);
+ accumulator3_vec = vaddq_f32(accumulator3_vec, x_to_3);
+ accumulator4_vec = vaddq_f32(accumulator4_vec, x_to_4);
+
+ src0 += 4;
+ }
+ accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator2_vec);
+ accumulator3_vec = vaddq_f32(accumulator3_vec, accumulator4_vec);
+ accumulator1_vec = vaddq_f32(accumulator1_vec, accumulator3_vec);
+
+ __VOLK_ATTR_ALIGNED(32) float res_accumulators[4];
+ vst1q_f32(res_accumulators, accumulator1_vec);
+ accumulator = res_accumulators[0] + res_accumulators[1] + res_accumulators[2] +
+ res_accumulators[3];
+
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+
+ for (i = 4 * (num_points / 4); i < num_points; ++i) {
+ fst = *src0++;
+ fst = MAX(fst, *cutoff);
+
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+ // fifth = sq * thrd;
+
+ accumulator += (center_point_array[0] * fst + center_point_array[1] * sq +
+ center_point_array[2] * thrd + center_point_array[3] * frth);
+ }
+
+ *target = accumulator + (float)num_points * center_point_array[4];
+}
+
+#endif /* LV_HAVE_NEON */
+
+#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/
+
+#ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
+#define INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifndef MAX
+#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
+#endif
+
+#if LV_HAVE_AVX && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32f_x3_sum_of_poly_32f_u_avx_fma(float* target,
+ float* src0,
+ float* center_point_array,
+ float* cutoff,
+ unsigned int num_points)
+{
+ const unsigned int eighth_points = num_points / 8;
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+
+ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+ __m256 target_vec;
+ __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+
+ cpa0 = _mm256_set1_ps(center_point_array[0]);
+ cpa1 = _mm256_set1_ps(center_point_array[1]);
+ cpa2 = _mm256_set1_ps(center_point_array[2]);
+ cpa3 = _mm256_set1_ps(center_point_array[3]);
+ cutoff_vec = _mm256_set1_ps(*cutoff);
+ target_vec = _mm256_setzero_ps();
+
+ unsigned int i;
+
+ for (i = 0; i < eighth_points; ++i) {
+ x_to_1 = _mm256_loadu_ps(src0);
+ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+ // x^1 * x^3 is slightly faster than x^2 * x^2
+ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+
+ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+
+ x_to_1 = _mm256_fmadd_ps(x_to_1, cpa0, x_to_2);
+ x_to_3 = _mm256_fmadd_ps(x_to_3, cpa2, x_to_4);
+ // this is slightly faster than result += (x_to_1 + x_to_3)
+ target_vec = _mm256_add_ps(x_to_1, target_vec);
+ target_vec = _mm256_add_ps(x_to_3, target_vec);
+
+ src0 += 8;
+ }
+
+ // the hadd for vector reduction has very very slight impact @ 50k iters
+ __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+ target_vec = _mm256_hadd_ps(
+ target_vec,
+ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+ _mm256_storeu_ps(temp_results, target_vec);
+ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
+
+ for (i = eighth_points * 8; i < num_points; ++i) {
+ fst = *src0++;
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+ *target += (center_point_array[0] * fst + center_point_array[1] * sq +
+ center_point_array[2] * thrd + center_point_array[3] * frth);
+ }
+
+ *target += (float)(num_points)*center_point_array[4];
+}
+#endif // LV_HAVE_AVX && LV_HAVE_FMA
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32f_x3_sum_of_poly_32f_u_avx(float* target,
+ float* src0,
+ float* center_point_array,
+ float* cutoff,
+ unsigned int num_points)
+{
+ const unsigned int eighth_points = num_points / 8;
+ float fst = 0.0;
+ float sq = 0.0;
+ float thrd = 0.0;
+ float frth = 0.0;
+
+ __m256 cpa0, cpa1, cpa2, cpa3, cutoff_vec;
+ __m256 target_vec;
+ __m256 x_to_1, x_to_2, x_to_3, x_to_4;
+
+ cpa0 = _mm256_set1_ps(center_point_array[0]);
+ cpa1 = _mm256_set1_ps(center_point_array[1]);
+ cpa2 = _mm256_set1_ps(center_point_array[2]);
+ cpa3 = _mm256_set1_ps(center_point_array[3]);
+ cutoff_vec = _mm256_set1_ps(*cutoff);
+ target_vec = _mm256_setzero_ps();
+
+ unsigned int i;
+
+ for (i = 0; i < eighth_points; ++i) {
+ x_to_1 = _mm256_loadu_ps(src0);
+ x_to_1 = _mm256_max_ps(x_to_1, cutoff_vec);
+ x_to_2 = _mm256_mul_ps(x_to_1, x_to_1); // x^2
+ x_to_3 = _mm256_mul_ps(x_to_1, x_to_2); // x^3
+ // x^1 * x^3 is slightly faster than x^2 * x^2
+ x_to_4 = _mm256_mul_ps(x_to_1, x_to_3); // x^4
+
+ x_to_1 = _mm256_mul_ps(x_to_1, cpa0); // cpa[0] * x^1
+ x_to_2 = _mm256_mul_ps(x_to_2, cpa1); // cpa[1] * x^2
+ x_to_3 = _mm256_mul_ps(x_to_3, cpa2); // cpa[2] * x^3
+ x_to_4 = _mm256_mul_ps(x_to_4, cpa3); // cpa[3] * x^4
+
+ x_to_1 = _mm256_add_ps(x_to_1, x_to_2);
+ x_to_3 = _mm256_add_ps(x_to_3, x_to_4);
+ // this is slightly faster than result += (x_to_1 + x_to_3)
+ target_vec = _mm256_add_ps(x_to_1, target_vec);
+ target_vec = _mm256_add_ps(x_to_3, target_vec);
+
+ src0 += 8;
+ }
+
+ // the hadd for vector reduction has very very slight impact @ 50k iters
+ __VOLK_ATTR_ALIGNED(32) float temp_results[8];
+ target_vec = _mm256_hadd_ps(
+ target_vec,
+ target_vec); // x0+x1 | x2+x3 | x0+x1 | x2+x3 || x4+x5 | x6+x7 | x4+x5 | x6+x7
+ _mm256_storeu_ps(temp_results, target_vec);
+ *target = temp_results[0] + temp_results[1] + temp_results[4] + temp_results[5];
+
+ for (i = eighth_points * 8; i < num_points; ++i) {
+ fst = *src0++;
+ fst = MAX(fst, *cutoff);
+ sq = fst * fst;
+ thrd = fst * sq;
+ frth = sq * sq;
+
+ *target += (center_point_array[0] * fst + center_point_array[1] * sq +
+ center_point_array[2] * thrd + center_point_array[3] * frth);
+ }
+
+ *target += (float)(num_points)*center_point_array[4];
+}
+#endif // LV_HAVE_AVX
+
+#endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_u_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2018 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_32f_add_32fc
+ *
+ * \b Overview
+ *
+ * Adds a real vector to a complex vector element by element; each real value in
+ * bVector is added to the real part of the corresponding complex value:
+ *
+ * c[i] = a[i] + b[i]
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_32f_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const float*
+ * bVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: First vector of input points.
+ * \li bVector: Second vector of input points.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ *
+ * The following example adds an increasing vector to a decreasing vector so that
+ * every element of the result is 10.
+ *
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* increasing = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* decreasing = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (lv_32fc_t)ii;
+ * decreasing[ii] = 10.f - (float)ii;
+ * }
+ *
+ * volk_32fc_32f_add_32fc(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_32f_add_32fc_u_H
+#define INCLUDED_volk_32fc_32f_add_32fc_u_H
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_32f_add_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_32f_add_32fc_u_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m256 aVal1, aVal2, bVal, cVal1, cVal2;
+ __m256 cpx_b1, cpx_b2;
+ __m256 zero;
+ zero = _mm256_setzero_ps();
+ __m256 tmp1, tmp2;
+ for (; number < eighthPoints; number++) {
+
+ aVal1 = _mm256_loadu_ps((float*)aPtr);
+ aVal2 = _mm256_loadu_ps((float*)(aPtr + 4));
+ bVal = _mm256_loadu_ps(bPtr);
+ cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
+ cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
+
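+ // recombine the 128-bit halves so tmp1/tmp2 hold (b0,0,b1,0,b2,0,b3,0) and
+ // (b4,0,b5,0,b6,0,b7,0), matching the complex layout of aVal1/aVal2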
+ tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
+ tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));
+
+ cVal1 = _mm256_add_ps(aVal1, tmp1);
+ cVal2 = _mm256_add_ps(aVal2, tmp2);
+
+ _mm256_storeu_ps((float*)cPtr,
+ cVal1); // Store the results back into the C container
+ _mm256_storeu_ps((float*)(cPtr + 4),
+ cVal2); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_32f_add_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m256 aVal1, aVal2, bVal, cVal1, cVal2;
+ __m256 cpx_b1, cpx_b2;
+ __m256 zero;
+ zero = _mm256_setzero_ps();
+ __m256 tmp1, tmp2;
+ for (; number < eighthPoints; number++) {
+
+ aVal1 = _mm256_load_ps((float*)aPtr);
+ aVal2 = _mm256_load_ps((float*)(aPtr + 4));
+ bVal = _mm256_load_ps(bPtr);
+ cpx_b1 = _mm256_unpacklo_ps(bVal, zero); // b0, 0, b1, 0, b4, 0, b5, 0
+ cpx_b2 = _mm256_unpackhi_ps(bVal, zero); // b2, 0, b3, 0, b6, 0, b7, 0
+
+ tmp1 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x0 + (0x2 << 4));
+ tmp2 = _mm256_permute2f128_ps(cpx_b1, cpx_b2, 0x1 + (0x3 << 4));
+
+ cVal1 = _mm256_add_ps(aVal1, tmp1);
+ cVal2 = _mm256_add_ps(aVal2, tmp2);
+
+ _mm256_store_ps((float*)cPtr,
+ cVal1); // Store the results back into the C container
+ _mm256_store_ps((float*)(cPtr + 4),
+ cVal2); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_32f_add_32fc_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ float32x4x4_t aVal0, aVal1;
+ float32x4x2_t bVal0, bVal1;
+
+ const unsigned int sixteenthPoints = num_points / 16;
+ unsigned int number = 0;
+ for (; number < sixteenthPoints; number++) {
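+ // vld4q_f32 deinterleaves 8 complex samples so val[0]/val[2] hold the real
+ // parts; vld2q_f32 splits the 8 real addends into even/odd lanes, which are
+ // added to the matching real parts before vst4q_f32 re-interleaves the result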
+ aVal0 = vld4q_f32((const float*)aPtr);
+ aPtr += 8;
+ aVal1 = vld4q_f32((const float*)aPtr);
+ aPtr += 8;
+ __VOLK_PREFETCH(aPtr + 16);
+
+ bVal0 = vld2q_f32((const float*)bPtr);
+ bPtr += 8;
+ bVal1 = vld2q_f32((const float*)bPtr);
+ bPtr += 8;
+ __VOLK_PREFETCH(bPtr + 16);
+
+ aVal0.val[0] = vaddq_f32(aVal0.val[0], bVal0.val[0]);
+ aVal0.val[2] = vaddq_f32(aVal0.val[2], bVal0.val[1]);
+
+ aVal1.val[2] = vaddq_f32(aVal1.val[2], bVal1.val[1]);
+ aVal1.val[0] = vaddq_f32(aVal1.val[0], bVal1.val[0]);
+
+ vst4q_f32((float*)(cPtr), aVal0);
+ cPtr += 8;
+ vst4q_f32((float*)(cPtr), aVal1);
+ cPtr += 8;
+ }
+
+ for (number = sixteenthPoints * 16; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#endif /* INCLUDED_volk_32fc_32f_add_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_32f_dot_prod_32fc
+ *
+ * \b Overview
+ *
+ * This block computes the dot product (or inner product) between two
+ * vectors, the \p input and \p taps vectors. Given a set of \p
+ * num_points taps, the result is the sum of products between the two
+ * vectors. The result is a single value stored in the \p result
+ * address and will be complex.
+ *
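+ * Expressed as a formula, \f$ \mbox{result} = \sum_{i=0}^{N-1} \mbox{input}[i] \cdot \mbox{taps}[i] \f$.
+ *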
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_32f_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const float
+ * * taps, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li input: vector of complex samples
+ * \li taps: floating point taps
+ * \li num_points: number of samples in both \p input and \p taps
+ *
+ * \b Outputs
+ * \li result: pointer to a complex value to hold the dot product result.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * lv_32fc_t y;
+ * lv_32fc_t *x = (lv_32fc_t*)volk_malloc(N*sizeof(lv_32fc_t), volk_get_alignment());
+ * float *t = (float*)volk_malloc(N*sizeof(float), volk_get_alignment());
+ *
+ * <populate x and t with some values>
+ *
+ * volk_32fc_32f_dot_prod_32fc(&y, x, t, N);
+ *
+ * volk_free(x);
+ * volk_free(t);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
+#define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
+
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_32f_dot_prod_32fc_generic(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+ unsigned int number = 0;
+
+ *realpt = 0;
+ *imagpt = 0;
+
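+ // walk the complex input as interleaved floats: each iteration consumes one
+ // (re, im) pair and one real tap, accumulating the real and imaginary parts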
+ for (number = 0; number < num_points; number++) {
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+
+#include <immintrin.h>
+
+static inline void volk_32fc_32f_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm256_load_ps(aPtr);
+ a1Val = _mm256_load_ps(aPtr + 8);
+ a2Val = _mm256_load_ps(aPtr + 16);
+ a3Val = _mm256_load_ps(aPtr + 24);
+
+ x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+ x1Val = _mm256_load_ps(bPtr + 8);
+ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+
+ // TODO: it may be possible to rearrange swizzling to better pipeline data
+ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+
+ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32fc_32f_dot_prod_32fc_a_avx(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+ __m256 c0Val, c1Val, c2Val, c3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm256_load_ps(aPtr);
+ a1Val = _mm256_load_ps(aPtr + 8);
+ a2Val = _mm256_load_ps(aPtr + 16);
+ a3Val = _mm256_load_ps(aPtr + 24);
+
+ x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+ x1Val = _mm256_load_ps(bPtr + 8);
+ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+
+ // TODO: it may be possible to rearrange swizzling to better pipeline data
+ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ c2Val = _mm256_mul_ps(a2Val, b2Val);
+ c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_AVX*/
+
+
+#ifdef LV_HAVE_SSE
+
+
+static inline void volk_32fc_32f_dot_prod_32fc_a_sse(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 8;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 x0Val, x1Val, x2Val, x3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm_load_ps(aPtr);
+ a1Val = _mm_load_ps(aPtr + 4);
+ a2Val = _mm_load_ps(aPtr + 8);
+ a3Val = _mm_load_ps(aPtr + 12);
+
+ x0Val = _mm_load_ps(bPtr);
+ x1Val = _mm_load_ps(bPtr);
+ x2Val = _mm_load_ps(bPtr + 4);
+ x3Val = _mm_load_ps(bPtr + 4);
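+ // each taps register is loaded twice so the unpacks duplicate every tap:
+ // b0Val = t0|t0|t1|t1, b1Val = t2|t2|t3|t3, aligning taps with (re, im) pairs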
+ b0Val = _mm_unpacklo_ps(x0Val, x1Val);
+ b1Val = _mm_unpackhi_ps(x0Val, x1Val);
+ b2Val = _mm_unpacklo_ps(x2Val, x3Val);
+ b3Val = _mm_unpackhi_ps(x2Val, x3Val);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 8;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+
+ number = sixteenthPoints * 8;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE*/
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+
+#include <immintrin.h>
+
+static inline void volk_32fc_32f_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm256_loadu_ps(aPtr);
+ a1Val = _mm256_loadu_ps(aPtr + 8);
+ a2Val = _mm256_loadu_ps(aPtr + 16);
+ a3Val = _mm256_loadu_ps(aPtr + 24);
+
+ x0Val = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+ x1Val = _mm256_load_ps(bPtr + 8);
+ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+
+ // TODO: it may be possible to rearrange swizzling to better pipeline data
+ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+
+ dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
+ dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
+ dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
+ dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32fc_32f_dot_prod_32fc_u_avx(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+
+ __m256 a0Val, a1Val, a2Val, a3Val;
+ __m256 b0Val, b1Val, b2Val, b3Val;
+ __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
+ __m256 c0Val, c1Val, c2Val, c3Val;
+
+ __m256 dotProdVal0 = _mm256_setzero_ps();
+ __m256 dotProdVal1 = _mm256_setzero_ps();
+ __m256 dotProdVal2 = _mm256_setzero_ps();
+ __m256 dotProdVal3 = _mm256_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm256_loadu_ps(aPtr);
+ a1Val = _mm256_loadu_ps(aPtr + 8);
+ a2Val = _mm256_loadu_ps(aPtr + 16);
+ a3Val = _mm256_loadu_ps(aPtr + 24);
+
+ x0Val = _mm256_loadu_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
+ x1Val = _mm256_loadu_ps(bPtr + 8);
+ x0loVal = _mm256_unpacklo_ps(x0Val, x0Val); // t0|t0|t1|t1|t4|t4|t5|t5
+ x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val); // t2|t2|t3|t3|t6|t6|t7|t7
+ x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
+ x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
+
+ // TODO: it may be possible to rearrange swizzling to better pipeline data
+ b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
+ b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
+ b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
+ b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
+
+ c0Val = _mm256_mul_ps(a0Val, b0Val);
+ c1Val = _mm256_mul_ps(a1Val, b1Val);
+ c2Val = _mm256_mul_ps(a2Val, b2Val);
+ c3Val = _mm256_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 32;
+ bPtr += 16;
+ }
+
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
+
+ _mm256_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+ *realpt += dotProductVector[4];
+ *imagpt += dotProductVector[5];
+ *realpt += dotProductVector[6];
+ *imagpt += dotProductVector[7];
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+#endif /*LV_HAVE_AVX*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32fc_32f_dot_prod_32fc_neon_unroll(lv_32fc_t* __restrict result,
+ const lv_32fc_t* __restrict input,
+ const float* __restrict taps,
+ unsigned int num_points)
+{
+
+ unsigned int number;
+ const unsigned int quarterPoints = num_points / 8;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* inputPtr = (float*)input;
+ const float* tapsPtr = taps;
+ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+ float accVector_real[4];
+ float accVector_imag[4];
+
+ float32x4x2_t inputVector0, inputVector1;
+ float32x4_t tapsVector0, tapsVector1;
+ float32x4_t tmp_real0, tmp_imag0;
+ float32x4_t tmp_real1, tmp_imag1;
+ float32x4_t real_accumulator0, imag_accumulator0;
+ float32x4_t real_accumulator1, imag_accumulator1;
+
+ // zero out accumulators
+ // take a *float, return float32x4_t
+ real_accumulator0 = vld1q_f32(zero);
+ imag_accumulator0 = vld1q_f32(zero);
+ real_accumulator1 = vld1q_f32(zero);
+ imag_accumulator1 = vld1q_f32(zero);
+
+ for (number = 0; number < quarterPoints; number++) {
+ // load eight real taps (four per vector)
+ tapsVector0 = vld1q_f32(tapsPtr);
+ tapsVector1 = vld1q_f32(tapsPtr + 4);
+
+ // load eight complex samples deinterleaved into two lanes: val[0] real, val[1] imag
+ inputVector0 = vld2q_f32(inputPtr);
+ inputVector1 = vld2q_f32(inputPtr + 8);
+ // inputVector is now a struct of two vectors, 0th is real, 1st is imag
+
+ tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
+ tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
+
+ tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
+ tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
+
+ real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
+ imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
+
+ real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
+ imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
+
+ tapsPtr += 8;
+ inputPtr += 16;
+ }
+
+ real_accumulator0 = vaddq_f32(real_accumulator0, real_accumulator1);
+ imag_accumulator0 = vaddq_f32(imag_accumulator0, imag_accumulator1);
+ // void vst1q_f32( float32_t * ptr, float32x4_t val);
+ // store results back to a complex (array of 2 floats)
+ vst1q_f32(accVector_real, real_accumulator0);
+ vst1q_f32(accVector_imag, imag_accumulator0);
+ *realpt =
+ accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3];
+
+ *imagpt =
+ accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3];
+
+ // clean up the remainder
+ for (number = quarterPoints * 8; number < num_points; number++) {
+ *realpt += ((*inputPtr++) * (*tapsPtr));
+ *imagpt += ((*inputPtr++) * (*tapsPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_32f_dot_prod_32fc_a_neon(lv_32fc_t* __restrict result,
+ const lv_32fc_t* __restrict input,
+ const float* __restrict taps,
+ unsigned int num_points)
+{
+
+ unsigned int number;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* inputPtr = (float*)input;
+ const float* tapsPtr = taps;
+ float zero[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+ float accVector_real[4];
+ float accVector_imag[4];
+
+ float32x4x2_t inputVector;
+ float32x4_t tapsVector;
+ float32x4_t tmp_real, tmp_imag;
+ float32x4_t real_accumulator, imag_accumulator;
+
+
+ // zero out accumulators
+ // take a *float, return float32x4_t
+ real_accumulator = vld1q_f32(zero);
+ imag_accumulator = vld1q_f32(zero);
+
+ for (number = 0; number < quarterPoints; number++) {
+ // load four real taps
+ tapsVector = vld1q_f32(tapsPtr);
+
+ // load four complex samples deinterleaved into two lanes: val[0] real, val[1] imag
+ inputVector = vld2q_f32(inputPtr);
+
+ tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
+ tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
+
+ real_accumulator = vaddq_f32(real_accumulator, tmp_real);
+ imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
+
+
+ tapsPtr += 4;
+ inputPtr += 8;
+ }
+
+ // store results back to a complex (array of 2 floats)
+ vst1q_f32(accVector_real, real_accumulator);
+ vst1q_f32(accVector_imag, imag_accumulator);
+ *realpt =
+ accVector_real[0] + accVector_real[1] + accVector_real[2] + accVector_real[3];
+
+ *imagpt =
+ accVector_imag[0] + accVector_imag[1] + accVector_imag[2] + accVector_imag[3];
+
+ // clean up the remainder
+ for (number = quarterPoints * 4; number < num_points; number++) {
+ *realpt += ((*inputPtr++) * (*tapsPtr));
+ *imagpt += ((*inputPtr++) * (*tapsPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_NEONV7
+extern void volk_32fc_32f_dot_prod_32fc_a_neonasm(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points);
+#endif /*LV_HAVE_NEONV7*/
+
+#ifdef LV_HAVE_NEONV7
+extern void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points);
+#endif /*LV_HAVE_NEONV7*/
+
+#ifdef LV_HAVE_NEONV7
+extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points);
+#endif /*LV_HAVE_NEONV7*/
+
+#ifdef LV_HAVE_SSE
+
+static inline void volk_32fc_32f_dot_prod_32fc_u_sse(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const float* taps,
+ unsigned int num_points)
+{
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 8;
+
+ float res[2];
+ float *realpt = &res[0], *imagpt = &res[1];
+ const float* aPtr = (float*)input;
+ const float* bPtr = taps;
+
+ __m128 a0Val, a1Val, a2Val, a3Val;
+ __m128 b0Val, b1Val, b2Val, b3Val;
+ __m128 x0Val, x1Val, x2Val, x3Val;
+ __m128 c0Val, c1Val, c2Val, c3Val;
+
+ __m128 dotProdVal0 = _mm_setzero_ps();
+ __m128 dotProdVal1 = _mm_setzero_ps();
+ __m128 dotProdVal2 = _mm_setzero_ps();
+ __m128 dotProdVal3 = _mm_setzero_ps();
+
+ for (; number < sixteenthPoints; number++) {
+
+ a0Val = _mm_loadu_ps(aPtr);
+ a1Val = _mm_loadu_ps(aPtr + 4);
+ a2Val = _mm_loadu_ps(aPtr + 8);
+ a3Val = _mm_loadu_ps(aPtr + 12);
+
+ x0Val = _mm_loadu_ps(bPtr);
+ x1Val = _mm_loadu_ps(bPtr);
+ x2Val = _mm_loadu_ps(bPtr + 4);
+ x3Val = _mm_loadu_ps(bPtr + 4);
+ b0Val = _mm_unpacklo_ps(x0Val, x1Val);
+ b1Val = _mm_unpackhi_ps(x0Val, x1Val);
+ b2Val = _mm_unpacklo_ps(x2Val, x3Val);
+ b3Val = _mm_unpackhi_ps(x2Val, x3Val);
+
+ c0Val = _mm_mul_ps(a0Val, b0Val);
+ c1Val = _mm_mul_ps(a1Val, b1Val);
+ c2Val = _mm_mul_ps(a2Val, b2Val);
+ c3Val = _mm_mul_ps(a3Val, b3Val);
+
+ dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
+ dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
+ dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
+ dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
+
+ aPtr += 16;
+ bPtr += 8;
+ }
+
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
+ dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
+
+ __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
+
+ _mm_store_ps(dotProductVector,
+ dotProdVal0); // Store the results back into the dot product vector
+
+ *realpt = dotProductVector[0];
+ *imagpt = dotProductVector[1];
+ *realpt += dotProductVector[2];
+ *imagpt += dotProductVector[3];
+
+ number = sixteenthPoints * 8;
+ for (; number < num_points; number++) {
+ *realpt += ((*aPtr++) * (*bPtr));
+ *imagpt += ((*aPtr++) * (*bPtr++));
+ }
+
+ *result = *(lv_32fc_t*)(&res[0]);
+}
+
+#endif /*LV_HAVE_SSE*/
+
+
+#endif /*INCLUDED_volk_32fc_32f_dot_prod_32fc_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_32f_multiply_32fc
+ *
+ * \b Overview
+ *
+ * Multiplies a complex vector by a floating point vector and returns
+ * the complex result.
+ *
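+ * c[i] = a[i] * b[i]
+ *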
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_32f_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
+ * float* bVector, unsigned int num_points); \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of complex floats.
+ * \li bVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector of complex floats.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* x = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* t = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *
+ * // <populate x and t with some values>
+ *
+ * volk_32fc_32f_multiply_32fc(out, x, t, N);
+ *
+ * volk_free(x);
+ * volk_free(t);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a_H
+#define INCLUDED_volk_32fc_32f_multiply_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
+
+ __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
+
+ for (; number < eighthPoints; number++) {
+
+ aVal1 = _mm256_load_ps((float*)aPtr);
+ aPtr += 4;
+
+ aVal2 = _mm256_load_ps((float*)aPtr);
+ aPtr += 4;
+
+ bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7
+ bPtr += 8;
+
+ bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3
+ bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7
+
+ bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3
+ bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7
+
+ cVal1 = _mm256_mul_ps(aVal1, bVal1);
+ cVal2 = _mm256_mul_ps(aVal2, bVal2);
+
+ _mm256_store_ps((float*)cPtr,
+ cVal1); // Store the results back into the C container
+ cPtr += 4;
+
+ _mm256_store_ps((float*)cPtr,
+ cVal2); // Store the results back into the C container
+ cPtr += 4;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; ++number) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr = bVector;
+
+ __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
+ for (; number < quarterPoints; number++) {
+
+ aVal1 = _mm_load_ps((const float*)aPtr);
+ aPtr += 2;
+
+ aVal2 = _mm_load_ps((const float*)aPtr);
+ aPtr += 2;
+
+ bVal = _mm_load_ps(bPtr);
+ bPtr += 4;
+
+ bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1, 1, 0, 0));
+ bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3, 3, 2, 2));
+
+ cVal = _mm_mul_ps(aVal1, bVal1);
+
+ _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
+ cPtr += 2;
+
+ cVal = _mm_mul_ps(aVal2, bVal2);
+
+ _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
+
+ cPtr += 2;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr);
+ bPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const float* bPtr = bVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ float32x4x2_t inputVector, outputVector;
+ float32x4_t tapsVector;
+ for (number = 0; number < quarter_points; number++) {
+ inputVector = vld2q_f32((float*)aPtr);
+ tapsVector = vld1q_f32(bPtr);
+
+ outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
+ outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);
+
+ vst2q_f32((float*)cPtr, outputVector);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_ORC
+
+extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points);
+
+static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float* bVector,
+ unsigned int num_points)
+{
+ volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2019 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_accumulator_s32fc
+ *
+ * \b Overview
+ *
+ * Accumulates the values in the input buffer.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_accumulator_s32fc(lv_32fc_t* result, const lv_32fc_t* inputBuffer,
+ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputBuffer: The buffer of data to be accumulated
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li result: The accumulated result.
+ *
+ * \b Example
+ * Calculate the sum of numbers 0 through 99
+ * \code
+ * int N = 100;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* vec = (lv_32fc_t*) volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*) volk_malloc(sizeof(lv_32fc_t), alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * vec[ii] = lv_cmake( (float) ii, (float) -ii );
+ * }
+ *
+ * volk_32fc_accumulator_s32fc(out, vec, N);
+ *
+ * printf("sum(0..99)+1j*sum(0..-99) = %1.2f %1.2f \n", lv_creal(*out) , lv_cimag(*out)
+ * );
+ *
+ * volk_free(vec);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_accumulator_s32fc_a_H
+#define INCLUDED_volk_32fc_accumulator_s32fc_a_H
+
+#include <inttypes.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32fc_accumulator_s32fc_generic(lv_32fc_t* result,
+ const lv_32fc_t* inputBuffer,
+ unsigned int num_points)
+{
+ const lv_32fc_t* aPtr = inputBuffer;
+ unsigned int number = 0;
+ lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
+
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_accumulator_s32fc_u_avx(lv_32fc_t* result,
+ const lv_32fc_t* inputBuffer,
+ unsigned int num_points)
+{
+ lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const lv_32fc_t* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
+
+ __m256 accumulator = _mm256_setzero_ps();
+ __m256 aVal = _mm256_setzero_ps();
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm256_loadu_ps((float*)aPtr);
+ accumulator = _mm256_add_ps(accumulator, aVal);
+ aPtr += 4;
+ }
+
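+    // spill the vector accumulator and reduce its four partial complex sums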
+ _mm256_store_ps(tempBuffer, accumulator);
+
+ returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
+ returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
+ returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
+ returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32fc_accumulator_s32fc_u_sse(lv_32fc_t* result,
+ const lv_32fc_t* inputBuffer,
+ unsigned int num_points)
+{
+ lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ const lv_32fc_t* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
+
+ __m128 accumulator = _mm_setzero_ps();
+ __m128 aVal = _mm_setzero_ps();
+
+ for (; number < halfPoints; number++) {
+ aVal = _mm_loadu_ps((float*)aPtr);
+ accumulator = _mm_add_ps(accumulator, aVal);
+ aPtr += 2;
+ }
+
+ _mm_store_ps(tempBuffer, accumulator);
+
+ returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
+ returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
+
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_accumulator_s32fc_a_avx(lv_32fc_t* result,
+ const lv_32fc_t* inputBuffer,
+ unsigned int num_points)
+{
+ lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const lv_32fc_t* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
+
+ __m256 accumulator = _mm256_setzero_ps();
+ __m256 aVal = _mm256_setzero_ps();
+
+ for (; number < quarterPoints; number++) {
+ aVal = _mm256_load_ps((float*)aPtr);
+ accumulator = _mm256_add_ps(accumulator, aVal);
+ aPtr += 4;
+ }
+
+ _mm256_store_ps(tempBuffer, accumulator);
+
+ returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
+ returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
+ returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
+ returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32fc_accumulator_s32fc_a_sse(lv_32fc_t* result,
+ const lv_32fc_t* inputBuffer,
+ unsigned int num_points)
+{
+ lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ const lv_32fc_t* aPtr = inputBuffer;
+ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
+
+ __m128 accumulator = _mm_setzero_ps();
+ __m128 aVal = _mm_setzero_ps();
+
+ for (; number < halfPoints; number++) {
+ aVal = _mm_load_ps((float*)aPtr);
+ accumulator = _mm_add_ps(accumulator, aVal);
+ aPtr += 2;
+ }
+
+ _mm_store_ps(tempBuffer, accumulator);
+
+ returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
+ returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
+
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_32fc_accumulator_s32fc_neon(lv_32fc_t* result,
+ const lv_32fc_t* inputBuffer,
+ unsigned int num_points)
+{
+ const lv_32fc_t* aPtr = inputBuffer;
+ unsigned int number = 0;
+ lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
+ unsigned int eighthPoints = num_points / 8;
+ float32x4_t in_vec;
+ float32x4_t out_vec0 = { 0.f, 0.f, 0.f, 0.f };
+ float32x4_t out_vec1 = { 0.f, 0.f, 0.f, 0.f };
+ float32x4_t out_vec2 = { 0.f, 0.f, 0.f, 0.f };
+ float32x4_t out_vec3 = { 0.f, 0.f, 0.f, 0.f };
+ __VOLK_ATTR_ALIGNED(32) float tempBuffer[4];
+
+ for (; number < eighthPoints; number++) {
+ in_vec = vld1q_f32((float*)aPtr);
+ out_vec0 = vaddq_f32(in_vec, out_vec0);
+ aPtr += 2;
+
+ in_vec = vld1q_f32((float*)aPtr);
+ out_vec1 = vaddq_f32(in_vec, out_vec1);
+ aPtr += 2;
+
+ in_vec = vld1q_f32((float*)aPtr);
+ out_vec2 = vaddq_f32(in_vec, out_vec2);
+ aPtr += 2;
+
+ in_vec = vld1q_f32((float*)aPtr);
+ out_vec3 = vaddq_f32(in_vec, out_vec3);
+ aPtr += 2;
+ }
+ vst1q_f32(tempBuffer, out_vec0);
+ returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
+ returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
+
+ vst1q_f32(tempBuffer, out_vec1);
+ returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
+ returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
+
+ vst1q_f32(tempBuffer, out_vec2);
+ returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
+ returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
+
+ vst1q_f32(tempBuffer, out_vec3);
+ returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
+ returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ returnValue += (*aPtr++);
+ }
+ *result = returnValue;
+}
+#endif /* LV_HAVE_NEON */
+
+#endif /* INCLUDED_volk_32fc_accumulator_s32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_conjugate_32fc
+ *
+ * \b Overview
+ *
+ * Takes the conjugate of a complex vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned
+ * int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of complex floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector of complex floats.
+ *
+ * \b Example
+ * Generate points around the top half of the unit circle and conjugate them
+ * to give bottom half of the unit circle.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * }
+ *
+ * volk_32fc_conjugate_32fc(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %.1f + %.1fi\n", ii, lv_creal(out[ii]), lv_cimag(out[ii]));
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H
+#define INCLUDED_volk_32fc_conjugate_32fc_u_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
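+    // -0.0f has only the sign bit set, so XOR-ing with this mask flips the
+    // sign of every imaginary component, i.e. conjugates each element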
+ __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+
+ for (; number < quarterPoints; number++) {
+
+ x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
+
+ x = _mm256_xor_ps(x, conjugator); // conjugate register
+
+ _mm256_storeu_ps((float*)c, x); // Store the results back into the C container
+
+ a += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for (; number < num_points; number++) {
+ *c++ = lv_conj(*a++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+
+ for (; number < halfPoints; number++) {
+
+ x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
+
+ x = _mm_xor_ps(x, conjugator); // conjugate register
+
+ _mm_storeu_ps((float*)c, x); // Store the results back into the C container
+
+ a += 2;
+ c += 2;
+ }
+
+ if ((num_points % 2) != 0) {
+ *c = lv_conj(*a);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = lv_conj(*aPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */
+#ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H
+#define INCLUDED_volk_32fc_conjugate_32fc_a_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+
+ for (; number < quarterPoints; number++) {
+
+ x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
+
+ x = _mm256_xor_ps(x, conjugator); // conjugate register
+
+ _mm256_store_ps((float*)c, x); // Store the results back into the C container
+
+ a += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for (; number < num_points; number++) {
+ *c++ = lv_conj(*a++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+
+ for (; number < halfPoints; number++) {
+
+ x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
+
+ x = _mm_xor_ps(x, conjugator); // conjugate register
+
+ _mm_store_ps((float*)c, x); // Store the results back into the C container
+
+ a += 2;
+ c += 2;
+ }
+
+ if ((num_points % 2) != 0) {
+ *c = lv_conj(*a);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ unsigned int num_points)
+{
+ unsigned int number;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float32x4x2_t x;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ for (number = 0; number < quarterPoints; number++) {
+ __VOLK_PREFETCH(a + 4);
+ x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di
+
+        // negate the imaginary lane to conjugate each element
+ x.val[1] = vnegq_f32(x.val[1]);
+
+ vst2q_f32((float*)c, x); // Store the results back into the C container
+
+ a += 4;
+ c += 4;
+ }
+
+ for (number = quarterPoints * 4; number < num_points; number++) {
+ *c++ = lv_conj(*a++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = lv_conj(*aPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_conjugate_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2016 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_convert_16ic
+ *
+ * \b Overview
+ *
+ * Converts a complex vector with 32-bit float components into a complex
+ * vector with 16-bit integer components.
+ * Values are saturated to the limits of the output data type.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_convert_16ic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector,
+ * unsigned int num_points); \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The complex 32-bit float input data buffer.
+ * \li num_points: The number of data values to be converted.
+ *
+ * \b Outputs
+ * \li outputVector: The complex 16-bit integer output data buffer.
+ *
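+ * \b Example
+ * A minimal usage sketch (illustrative only, following the allocation pattern
+ * of the other kernel examples):
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_16sc_t* out = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ *     // arbitrary values; anything outside [SHRT_MIN, SHRT_MAX] is saturated
+ *     in[ii] = lv_cmake((float)(1000 * ii), (float)(-1000 * ii));
+ * }
+ *
+ * volk_32fc_convert_16ic(out, in, N);
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode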
+ */
+
+#ifndef INCLUDED_volk_32fc_convert_16ic_a_H
+#define INCLUDED_volk_32fc_convert_16ic_a_H
+
+#include "volk/volk_complex.h"
+#include <limits.h>
+#include <math.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector,
+ const lv_32fc_t* inputVector,
+ unsigned int num_points)
+{
+ const unsigned int avx_iters = num_points / 8;
+
+ float* inputVectorPtr = (float*)inputVector;
+ int16_t* outputVectorPtr = (int16_t*)outputVector;
+ float aux;
+
+ const float min_val = (float)SHRT_MIN;
+ const float max_val = (float)SHRT_MAX;
+
+ __m256 inputVal1, inputVal2;
+ __m256i intInputVal1, intInputVal2;
+ __m256 ret1, ret2;
+ const __m256 vmin_val = _mm256_set1_ps(min_val);
+ const __m256 vmax_val = _mm256_set1_ps(max_val);
+ unsigned int i;
+
+ for (i = 0; i < avx_iters; i++) {
+ inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
+ inputVectorPtr += 8;
+ __VOLK_PREFETCH(inputVectorPtr + 16);
+
+ // Clip
+ ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
+ ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
+
+ intInputVal1 = _mm256_cvtps_epi32(ret1);
+ intInputVal2 = _mm256_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
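+        // packs_epi32 saturates and packs within each 128-bit lane, so reorder
+        // the 64-bit lanes afterwards to restore the original sample order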
+ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
+
+ _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 16;
+ }
+
+ for (i = avx_iters * 16; i < num_points * 2; i++) {
+ aux = *inputVectorPtr++;
+ if (aux > max_val)
+ aux = max_val;
+ else if (aux < min_val)
+ aux = min_val;
+ *outputVectorPtr++ = (int16_t)rintf(aux);
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector,
+ const lv_32fc_t* inputVector,
+ unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 4;
+
+ float* inputVectorPtr = (float*)inputVector;
+ int16_t* outputVectorPtr = (int16_t*)outputVector;
+ float aux;
+
+ const float min_val = (float)SHRT_MIN;
+ const float max_val = (float)SHRT_MAX;
+
+ __m128 inputVal1, inputVal2;
+ __m128i intInputVal1, intInputVal2;
+ __m128 ret1, ret2;
+ const __m128 vmin_val = _mm_set_ps1(min_val);
+ const __m128 vmax_val = _mm_set_ps1(max_val);
+ unsigned int i;
+
+ for (i = 0; i < sse_iters; i++) {
+ inputVal1 = _mm_load_ps((float*)inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal2 = _mm_load_ps((float*)inputVectorPtr);
+ inputVectorPtr += 4;
+ __VOLK_PREFETCH(inputVectorPtr + 8);
+
+ // Clip
+ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(ret1);
+ intInputVal2 = _mm_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ for (i = sse_iters * 8; i < num_points * 2; i++) {
+ aux = *inputVectorPtr++;
+ if (aux > max_val)
+ aux = max_val;
+ else if (aux < min_val)
+ aux = min_val;
+ *outputVectorPtr++ = (int16_t)rintf(aux);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#if LV_HAVE_NEONV7
+#include <arm_neon.h>
+
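+// NEONv7 has no vector float-to-int conversion that honors the current rounding
+// mode, so this helper applies the scalar VCVTR.S32.F32 instruction per lane.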
+#define VCVTRQ_S32_F32(result, value) \
+ __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[0]) : "t"(value[0]) :); \
+ __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[1]) : "t"(value[1]) :); \
+ __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[2]) : "t"(value[2]) :); \
+ __VOLK_ASM("VCVTR.S32.F32 %0, %1" : "=t"(result[3]) : "t"(value[3]) :);
+
+static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
+ const lv_32fc_t* inputVector,
+ unsigned int num_points)
+{
+
+ const unsigned int neon_iters = num_points / 4;
+
+ float32_t* inputVectorPtr = (float32_t*)inputVector;
+ int16_t* outputVectorPtr = (int16_t*)outputVector;
+
+ const float min_val_f = (float)SHRT_MIN;
+ const float max_val_f = (float)SHRT_MAX;
+ float32_t aux;
+ unsigned int i;
+
+ const float32x4_t min_val = vmovq_n_f32(min_val_f);
+ const float32x4_t max_val = vmovq_n_f32(max_val_f);
+ float32x4_t ret1, ret2, a, b;
+
+ int32x4_t toint_a = { 0, 0, 0, 0 };
+ int32x4_t toint_b = { 0, 0, 0, 0 };
+ int16x4_t intInputVal1, intInputVal2;
+ int16x8_t res;
+
+ for (i = 0; i < neon_iters; i++) {
+ a = vld1q_f32((const float32_t*)(inputVectorPtr));
+ inputVectorPtr += 4;
+ b = vld1q_f32((const float32_t*)(inputVectorPtr));
+ inputVectorPtr += 4;
+ __VOLK_PREFETCH(inputVectorPtr + 8);
+
+ ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
+ ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
+
+ // vcvtr takes into account the current rounding mode (as does rintf)
+ VCVTRQ_S32_F32(toint_a, ret1);
+ VCVTRQ_S32_F32(toint_b, ret2);
+
+ intInputVal1 = vqmovn_s32(toint_a);
+ intInputVal2 = vqmovn_s32(toint_b);
+
+ res = vcombine_s16(intInputVal1, intInputVal2);
+ vst1q_s16((int16_t*)outputVectorPtr, res);
+ outputVectorPtr += 8;
+ }
+
+ for (i = neon_iters * 8; i < num_points * 2; i++) {
+ aux = *inputVectorPtr++;
+ if (aux > max_val_f)
+ aux = max_val_f;
+ else if (aux < min_val_f)
+ aux = min_val_f;
+ *outputVectorPtr++ = (int16_t)rintf(aux);
+ }
+}
+
+#undef VCVTRQ_S32_F32
+#endif /* LV_HAVE_NEONV7 */
+
+#if LV_HAVE_NEONV8
+#include <arm_neon.h>
+
+static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector,
+ const lv_32fc_t* inputVector,
+ unsigned int num_points)
+{
+ const unsigned int neon_iters = num_points / 4;
+
+ float32_t* inputVectorPtr = (float32_t*)inputVector;
+ int16_t* outputVectorPtr = (int16_t*)outputVector;
+
+ const float min_val_f = (float)SHRT_MIN;
+ const float max_val_f = (float)SHRT_MAX;
+ float32_t aux;
+ unsigned int i;
+
+ const float32x4_t min_val = vmovq_n_f32(min_val_f);
+ const float32x4_t max_val = vmovq_n_f32(max_val_f);
+ float32x4_t ret1, ret2, a, b;
+
+ int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 };
+ int16x4_t intInputVal1, intInputVal2;
+ int16x8_t res;
+
+ for (i = 0; i < neon_iters; i++) {
+ a = vld1q_f32((const float32_t*)(inputVectorPtr));
+ inputVectorPtr += 4;
+ b = vld1q_f32((const float32_t*)(inputVectorPtr));
+ inputVectorPtr += 4;
+ __VOLK_PREFETCH(inputVectorPtr + 8);
+
+ ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
+ ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
+
+ // vrndiq takes into account the current rounding mode (as does rintf)
+ toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
+ toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));
+
+ intInputVal1 = vqmovn_s32(toint_a);
+ intInputVal2 = vqmovn_s32(toint_b);
+
+ res = vcombine_s16(intInputVal1, intInputVal2);
+ vst1q_s16((int16_t*)outputVectorPtr, res);
+ outputVectorPtr += 8;
+ }
+
+ for (i = neon_iters * 8; i < num_points * 2; i++) {
+ aux = *inputVectorPtr++;
+ if (aux > max_val_f)
+ aux = max_val_f;
+ else if (aux < min_val_f)
+ aux = min_val_f;
+ *outputVectorPtr++ = (int16_t)rintf(aux);
+ }
+}
+#endif /* LV_HAVE_NEONV8 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector,
+ const lv_32fc_t* inputVector,
+ unsigned int num_points)
+{
+ float* inputVectorPtr = (float*)inputVector;
+ int16_t* outputVectorPtr = (int16_t*)outputVector;
+ const float min_val = (float)SHRT_MIN;
+ const float max_val = (float)SHRT_MAX;
+ float aux;
+ unsigned int i;
+ for (i = 0; i < num_points * 2; i++) {
+ aux = *inputVectorPtr++;
+ if (aux > max_val)
+ aux = max_val;
+ else if (aux < min_val)
+ aux = min_val;
+ *outputVectorPtr++ = (int16_t)rintf(aux);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_convert_16ic_a_H */
+
+#ifndef INCLUDED_volk_32fc_convert_16ic_u_H
+#define INCLUDED_volk_32fc_convert_16ic_u_H
+
+#include "volk/volk_complex.h"
+#include <limits.h>
+#include <math.h>
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector,
+ const lv_32fc_t* inputVector,
+ unsigned int num_points)
+{
+ const unsigned int avx_iters = num_points / 8;
+
+ float* inputVectorPtr = (float*)inputVector;
+ int16_t* outputVectorPtr = (int16_t*)outputVector;
+ float aux;
+
+ const float min_val = (float)SHRT_MIN;
+ const float max_val = (float)SHRT_MAX;
+
+ __m256 inputVal1, inputVal2;
+ __m256i intInputVal1, intInputVal2;
+ __m256 ret1, ret2;
+ const __m256 vmin_val = _mm256_set1_ps(min_val);
+ const __m256 vmax_val = _mm256_set1_ps(max_val);
+ unsigned int i;
+
+ for (i = 0; i < avx_iters; i++) {
+ inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
+ inputVectorPtr += 8;
+ __VOLK_PREFETCH(inputVectorPtr + 16);
+
+ // Clip
+ ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
+ ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
+
+ intInputVal1 = _mm256_cvtps_epi32(ret1);
+ intInputVal2 = _mm256_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+ intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
+
+ _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 16;
+ }
+
+ for (i = avx_iters * 16; i < num_points * 2; i++) {
+ aux = *inputVectorPtr++;
+ if (aux > max_val)
+ aux = max_val;
+ else if (aux < min_val)
+ aux = min_val;
+ *outputVectorPtr++ = (int16_t)rintf(aux);
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector,
+ const lv_32fc_t* inputVector,
+ unsigned int num_points)
+{
+ const unsigned int sse_iters = num_points / 4;
+
+ float* inputVectorPtr = (float*)inputVector;
+ int16_t* outputVectorPtr = (int16_t*)outputVector;
+ float aux;
+
+ const float min_val = (float)SHRT_MIN;
+ const float max_val = (float)SHRT_MAX;
+
+ __m128 inputVal1, inputVal2;
+ __m128i intInputVal1, intInputVal2;
+ __m128 ret1, ret2;
+ const __m128 vmin_val = _mm_set_ps1(min_val);
+ const __m128 vmax_val = _mm_set_ps1(max_val);
+
+ unsigned int i;
+ for (i = 0; i < sse_iters; i++) {
+ inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
+ inputVectorPtr += 4;
+ __VOLK_PREFETCH(inputVectorPtr + 8);
+
+ // Clip
+ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+
+ intInputVal1 = _mm_cvtps_epi32(ret1);
+ intInputVal2 = _mm_cvtps_epi32(ret2);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ for (i = sse_iters * 8; i < num_points * 2; i++) {
+ aux = *inputVectorPtr++;
+ if (aux > max_val)
+ aux = max_val;
+ else if (aux < min_val)
+ aux = min_val;
+ *outputVectorPtr++ = (int16_t)rintf(aux);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+#endif /* INCLUDED_volk_32fc_convert_16ic_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_deinterleave_32f_x2
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex floating point vector into I & Q vector
+ * data.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_32fc_t*
+ * complexVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ * \li qBuffer: The Q buffer output data.
+ *
+ * \b Example
+ * Generate complex numbers around the top half of the unit circle and
+ * deinterleave in to real and imaginary buffers.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* re = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float* im = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * }
+ *
+ * volk_32fc_deinterleave_32f_x2(re, im, in, N);
+ *
+ * printf(" re | im\n");
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %+.1f | %+.1f\n", ii, re[ii], im[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(re);
+ * volk_free(im);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_a_H
+#define INCLUDED_volk_32fc_deinterleave_32f_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+static inline void volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer,
+ float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ unsigned int number = 0;
+ // Mask for real and imaginary parts
+ const unsigned int eighthPoints = num_points / 8;
+ __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
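+        // pair the low 128-bit halves and the high 128-bit halves of the two
+        // loads so the in-lane shuffles below yield I and Q in original order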
+ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
+ // Arrange in q1q2q3q4 format
+ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
+
+ _mm256_store_ps(iBufferPtr, iValue);
+ _mm256_store_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32fc_deinterleave_32f_x2_a_sse(float* iBuffer,
+ float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 cplxValue1, cplxValue2, iValue, qValue;
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+
+ _mm_store_ps(iBufferPtr, iValue);
+ _mm_store_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_deinterleave_32f_x2_neon(float* iBuffer,
+ float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+ float32x4x2_t complexInput;
+
+ for (number = 0; number < quarter_points; number++) {
+ complexInput = vld2q_f32(complexVectorPtr);
+ vst1q_f32(iBufferPtr, complexInput.val[0]);
+ vst1q_f32(qBufferPtr, complexInput.val[1]);
+ complexVectorPtr += 8;
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_deinterleave_32f_x2_generic(float* iBuffer,
+ float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+ unsigned int number;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_a_H */
+
+
+#ifndef INCLUDED_volk_32fc_deinterleave_32f_x2_u_H
+#define INCLUDED_volk_32fc_deinterleave_32f_x2_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+static inline void volk_32fc_deinterleave_32f_x2_u_avx(float* iBuffer,
+ float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ unsigned int number = 0;
+ // Mask for real and imaginary parts
+ const unsigned int eighthPoints = num_points / 8;
+ __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm256_shuffle_ps(complex1, complex2, 0x88);
+ // Arrange in q1q2q3q4 format
+ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
+
+ _mm256_storeu_ps(iBufferPtr, iValue);
+ _mm256_storeu_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+#endif /* INCLUDED_volk_32fc_deinterleave_32f_x2_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_deinterleave_64f_x2
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex floating point vector into I & Q vector
+ * data. The output vectors are converted to doubles.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_deinterleave_64f_x2(double* iBuffer, double* qBuffer, const
+ * lv_32fc_t* complexVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ * \li qBuffer: The Q buffer output data.
+ *
+ * \b Example
+ * Generate complex numbers around the top half of the unit circle and
+ * deinterleave in to real and imaginary double buffers.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * double* re = (double*)volk_malloc(sizeof(double)*N, alignment);
+ * double* im = (double*)volk_malloc(sizeof(double)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * }
+ *
+ * volk_32fc_deinterleave_64f_x2(re, im, in, N);
+ *
+ * printf(" re | im\n");
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %+.1g | %+.1g\n", ii, re[ii], im[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(re);
+ * volk_free(im);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_u_H
+#define INCLUDED_volk_32fc_deinterleave_64f_x2_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_deinterleave_64f_x2_u_avx(double* iBuffer,
+ double* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ const unsigned int quarterPoints = num_points / 4;
+ __m256 cplxValue;
+ __m128 complexH, complexL, fVal;
+ __m256d dVal;
+
+ for (; number < quarterPoints; number++) {
+
+ cplxValue = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ complexH = _mm256_extractf128_ps(cplxValue, 1);
+ complexL = _mm256_extractf128_ps(cplxValue, 0);
+
+ // Arrange in i1i2i1i2 format
+ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0));
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_storeu_pd(iBufferPtr, dVal);
+
+ // Arrange in q1q2q1q2 format
+ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1));
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_storeu_pd(qBufferPtr, dVal);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32fc_deinterleave_64f_x2_u_sse2(double* iBuffer,
+ double* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ const unsigned int halfPoints = num_points / 2;
+ __m128 cplxValue, fVal;
+ __m128d dVal;
+
+ for (; number < halfPoints; number++) {
+
+ cplxValue = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i1i2 format
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
+ dVal = _mm_cvtps_pd(fVal);
+ _mm_storeu_pd(iBufferPtr, dVal);
+
+ // Arrange in q1q2q1q2 format
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1));
+ dVal = _mm_cvtps_pd(fVal);
+ _mm_storeu_pd(qBufferPtr, dVal);
+
+ iBufferPtr += 2;
+ qBufferPtr += 2;
+ }
+
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_deinterleave_64f_x2_generic(double* iBuffer,
+ double* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ *qBufferPtr++ = (double)*complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_u_H */
+#ifndef INCLUDED_volk_32fc_deinterleave_64f_x2_a_H
+#define INCLUDED_volk_32fc_deinterleave_64f_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_deinterleave_64f_x2_a_avx(double* iBuffer,
+ double* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ const unsigned int quarterPoints = num_points / 4;
+ __m256 cplxValue;
+ __m128 complexH, complexL, fVal;
+ __m256d dVal;
+
+ for (; number < quarterPoints; number++) {
+
+ cplxValue = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ complexH = _mm256_extractf128_ps(cplxValue, 1);
+ complexL = _mm256_extractf128_ps(cplxValue, 0);
+
+ // Arrange in i1i2i1i2 format
+ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2, 0, 2, 0));
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_store_pd(iBufferPtr, dVal);
+
+ // Arrange in q1q2q1q2 format
+ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3, 1, 3, 1));
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_store_pd(qBufferPtr, dVal);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32fc_deinterleave_64f_x2_a_sse2(double* iBuffer,
+ double* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ const unsigned int halfPoints = num_points / 2;
+ __m128 cplxValue, fVal;
+ __m128d dVal;
+
+ for (; number < halfPoints; number++) {
+
+ cplxValue = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i1i2 format
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
+ dVal = _mm_cvtps_pd(fVal);
+ _mm_store_pd(iBufferPtr, dVal);
+
+ // Arrange in q1q2q1q2 format
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(3, 1, 3, 1));
+ dVal = _mm_cvtps_pd(fVal);
+ _mm_store_pd(qBufferPtr, dVal);
+
+ iBufferPtr += 2;
+ qBufferPtr += 2;
+ }
+
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_deinterleave_64f_x2_a_generic(double* iBuffer,
+ double* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ *qBufferPtr++ = (double)*complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_NEONV8
+#include <arm_neon.h>
+
+static inline void volk_32fc_deinterleave_64f_x2_neon(double* iBuffer,
+ double* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int half_points = num_points / 2;
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+ float32x2x2_t complexInput;
+ float64x2_t iVal, qVal;
+
+ for (number = 0; number < half_points; number++) {
+ complexInput = vld2_f32(complexVectorPtr);
+
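+        // vld2 splits two complex samples into I and Q pairs; widen each to double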
+ iVal = vcvt_f64_f32(complexInput.val[0]);
+ qVal = vcvt_f64_f32(complexInput.val[1]);
+
+ vst1q_f64(iBufferPtr, iVal);
+ vst1q_f64(qBufferPtr, qVal);
+
+ complexVectorPtr += 4;
+ iBufferPtr += 2;
+ qBufferPtr += 2;
+ }
+
+ for (number = half_points * 2; number < num_points; number++) {
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ *qBufferPtr++ = (double)*complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_NEONV8 */
+
+#endif /* INCLUDED_volk_32fc_deinterleave_64f_x2_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_deinterleave_imag_32f
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex floating point vector and returns the imaginary
+ * part (quadrature) of the samples.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_deinterleave_imag_32f(float* qBuffer, const lv_32fc_t* complexVector,
+ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li qBuffer: The Q buffer output data.
+ *
+ * \b Example
+ * Generate complex numbers around the top half of the unit circle and
+ * extract all of the imaginary parts to a float buffer.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* im = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * }
+ *
+ * volk_32fc_deinterleave_imag_32f(im, in, N);
+ *
+ * printf(" imaginary part\n");
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %+.1f\n", ii, im[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(im);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_deinterleave_imag_32f_a_H
+#define INCLUDED_volk_32fc_deinterleave_imag_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_deinterleave_imag_32f_a_avx(float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+ const float* complexVectorPtr = (const float*)complexVector;
+ float* qBufferPtr = qBuffer;
+
+ __m256 cplxValue1, cplxValue2, complex1, complex2, qValue;
+ for (; number < eighthPoints; number++) {
+
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+ // Arrange in q1q2q3q4 format
+ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
+
+ _mm256_store_ps(qBufferPtr, qValue);
+
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32fc_deinterleave_imag_32f_a_sse(float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (const float*)complexVector;
+ float* qBufferPtr = qBuffer;
+
+    __m128 cplxValue1, cplxValue2, qValue;
+ for (; number < quarterPoints; number++) {
+
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in q1q2q3q4 format
+        qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+
+        _mm_store_ps(qBufferPtr, qValue);
+
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_deinterleave_imag_32f_neon(float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* qBufferPtr = qBuffer;
+ float32x4x2_t complexInput;
+
+ for (number = 0; number < quarter_points; number++) {
+ complexInput = vld2q_f32(complexVectorPtr);
+ vst1q_f32(qBufferPtr, complexInput.val[1]);
+ complexVectorPtr += 8;
+ qBufferPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_deinterleave_imag_32f_generic(float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* qBufferPtr = qBuffer;
+ for (number = 0; number < num_points; number++) {
+ complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_a_H */
+
+#ifndef INCLUDED_volk_32fc_deinterleave_imag_32f_u_H
+#define INCLUDED_volk_32fc_deinterleave_imag_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_deinterleave_imag_32f_u_avx(float* qBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+ const float* complexVectorPtr = (const float*)complexVector;
+ float* qBufferPtr = qBuffer;
+
+ __m256 cplxValue1, cplxValue2, complex1, complex2, qValue;
+ for (; number < eighthPoints; number++) {
+
+ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+ // Arrange in q1q2q3q4 format
+ qValue = _mm256_shuffle_ps(complex1, complex2, 0xdd);
+
+ _mm256_storeu_ps(qBufferPtr, qValue);
+
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+#endif /* INCLUDED_volk_32fc_deinterleave_imag_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_deinterleave_real_32f
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex floating point vector and returns the real
+ * part (inphase) of the samples.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_deinterleave_real_32f(float* iBuffer, const lv_32fc_t* complexVector,
+ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ *
+ * \b Example
+ * Generate complex numbers around the top half of the unit circle and
+ * extract all of the real parts to a float buffer.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* re = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * }
+ *
+ * volk_32fc_deinterleave_real_32f(re, in, N);
+ *
+ * printf(" real part\n");
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %+.1f\n", ii, re[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(re);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_deinterleave_real_32f_a_H
+#define INCLUDED_volk_32fc_deinterleave_real_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32fc_deinterleave_real_32f_a_avx2(float* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (const float*)complexVector;
+ float* iBufferPtr = iBuffer;
+
+ __m256 cplxValue1, cplxValue2;
+ __m256 iValue;
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+ for (; number < eighthPoints; number++) {
+
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
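+        // the in-lane shuffle leaves the lane halves interleaved; permute
+        // across 128-bit lanes to restore the original sample order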
+ iValue = _mm256_permutevar8x32_ps(iValue, idx);
+
+ _mm256_store_ps(iBufferPtr, iValue);
+
+ iBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32fc_deinterleave_real_32f_a_sse(float* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (const float*)complexVector;
+ float* iBufferPtr = iBuffer;
+
+ __m128 cplxValue1, cplxValue2, iValue;
+ for (; number < quarterPoints; number++) {
+
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+
+ _mm_store_ps(iBufferPtr, iValue);
+
+ iBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_deinterleave_real_32f_generic(float* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_deinterleave_real_32f_neon(float* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float32x4x2_t complexInput;
+
+ for (number = 0; number < quarter_points; number++) {
+ complexInput = vld2q_f32(complexVectorPtr);
+ vst1q_f32(iBufferPtr, complexInput.val[0]);
+ complexVectorPtr += 8;
+ iBufferPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32fc_deinterleave_real_32f_u_H
+#define INCLUDED_volk_32fc_deinterleave_real_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32fc_deinterleave_real_32f_u_avx2(float* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (const float*)complexVector;
+ float* iBufferPtr = iBuffer;
+
+ __m256 cplxValue1, cplxValue2;
+ __m256 iValue;
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+ for (; number < eighthPoints; number++) {
+
+ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ iValue = _mm256_permutevar8x32_ps(iValue, idx);
+
+ _mm256_storeu_ps(iBufferPtr, iValue);
+
+ iBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#endif /* INCLUDED_volk_32fc_deinterleave_real_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_deinterleave_real_64f
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex floating point vector and returns the real
+ * part (inphase) of the samples, converted to doubles.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_deinterleave_real_64f(double* iBuffer, const lv_32fc_t*
+ * complexVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ *
+ * \b Example
+ * Generate complex numbers around the top half of the unit circle and
+ * extract all of the real parts to a double buffer.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * double* re = (double*)volk_malloc(sizeof(double)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * }
+ *
+ * volk_32fc_deinterleave_real_64f(re, in, N);
+ *
+ * printf(" real part\n");
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %+.1g\n", ii, re[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(re);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_deinterleave_real_64f_a_H
+#define INCLUDED_volk_32fc_deinterleave_real_64f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32fc_deinterleave_real_64f_a_avx2(double* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+
+ const unsigned int quarterPoints = num_points / 4;
+ __m256 cplxValue;
+ __m128 fVal;
+ __m256d dVal;
+ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
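+ // Inside the loop, the permute gathers the four real parts (floats 0, 2, 4, 6)
+ // into the low 128 bits, which are then widened to double precision.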
+ for (; number < quarterPoints; number++) {
+
+ cplxValue = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ // Arrange in i1i2i1i2 format
+ cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx);
+ fVal = _mm256_extractf128_ps(cplxValue, 0);
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_store_pd(iBufferPtr, dVal);
+
+ iBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32fc_deinterleave_real_64f_a_sse2(double* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+
+ const unsigned int halfPoints = num_points / 2;
+ __m128 cplxValue, fVal;
+ __m128d dVal;
+ for (; number < halfPoints; number++) {
+
+ cplxValue = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i1i2 format
+ fVal = _mm_shuffle_ps(cplxValue, cplxValue, _MM_SHUFFLE(2, 0, 2, 0));
+ dVal = _mm_cvtps_pd(fVal);
+ _mm_store_pd(iBufferPtr, dVal);
+
+ iBufferPtr += 2;
+ }
+
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_deinterleave_real_64f_generic(double* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_NEONV8
+#include <arm_neon.h>
+
+static inline void volk_32fc_deinterleave_real_64f_neon(double* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ float32x2x4_t complexInput;
+ float64x2_t iVal1;
+ float64x2_t iVal2;
+ float64x2x2_t iVal;
+
+ for (number = 0; number < quarter_points; number++) {
+ // Load data into register
+ complexInput = vld4_f32(complexVectorPtr);
+
+ // Perform single to double precision conversion
+ iVal1 = vcvt_f64_f32(complexInput.val[0]);
+ iVal2 = vcvt_f64_f32(complexInput.val[2]);
+ iVal.val[0] = iVal1;
+ iVal.val[1] = iVal2;
+
+ // Store results into memory buffer
+ vst2q_f64(iBufferPtr, iVal);
+
+ // Update pointers
+ iBufferPtr += 4;
+ complexVectorPtr += 8;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_NEONV8 */
+
+#endif /* INCLUDED_volk_32fc_deinterleave_real_64f_a_H */
+
+#ifndef INCLUDED_volk_32fc_deinterleave_real_64f_u_H
+#define INCLUDED_volk_32fc_deinterleave_real_64f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32fc_deinterleave_real_64f_u_avx2(double* iBuffer,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+
+ const unsigned int quarterPoints = num_points / 4;
+ __m256 cplxValue;
+ __m128 fVal;
+ __m256d dVal;
+ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);
+ for (; number < quarterPoints; number++) {
+
+ cplxValue = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ // Arrange in i1i2i1i2 format
+ cplxValue = _mm256_permutevar8x32_ps(cplxValue, idx);
+ fVal = _mm256_extractf128_ps(cplxValue, 0);
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_storeu_pd(iBufferPtr, dVal);
+
+ iBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (double)*complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#endif /* INCLUDED_volk_32fc_deinterleave_real_64f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014-2016, 2018-2020 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_index_max_16u
+ *
+ * \b Overview
+ *
+ * Returns Argmax_i mag(x[i]). Finds and returns the index which contains the
+ * maximum magnitude for complex points in the given vector.
+ *
+ * Note that num_points is a uint32_t, but the return value is
+ * uint16_t. Providing a vector longer than the maximum value of a
+ * uint16_t (65535) would miss a maximum located beyond that boundary.
+ * The kernel caps num_points to this maximum value anyway.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_index_max_16u(uint16_t* target, lv_32fc_t* src0, uint32_t
+ * num_points) \endcode
+ *
+ * \b Inputs
+ * \li src0: The complex input vector.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li target: The index of the point with maximum magnitude.
+ *
+ * \b Example
+ * Calculate the index of the maximum value of \f$x^2 + x\f$ for points around
+ * the unit circle.
+ * \code
+ * int N = 10;
+ * uint32_t alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * uint16_t* max = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
+ *
+ * for(uint32_t ii = 0; ii < N/2; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii] = in[ii] * in[ii] + in[ii];
+ * in[N-ii-1] = lv_cmake(real, imag);
+ * in[N-ii-1] = in[N-ii-1] * in[N-ii-1] + in[N-ii-1];
+ * }
+ *
+ * volk_32fc_index_max_16u(max, in, N);
+ *
+ * printf("index of max value = %u\n", *max);
+ *
+ * volk_free(in);
+ * volk_free(max);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_index_max_16u_a_H
+#define INCLUDED_volk_32fc_index_max_16u_a_H
+
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_max_16u_a_avx2_variant_0(uint16_t* target,
+ lv_32fc_t* src0,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_max_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
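+ // In short: the helper sums the squared re/im pairs with _mm256_hadd_ps,
+ // which operates per 128-bit lane, so the magnitudes come out in element
+ // order 0 1 4 5 2 3 6 7; the index vector above is laid out to match.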
+
+ __m256 max_values = _mm256_setzero_ps();
+ __m256i max_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_load_ps((float*)src0);
+ __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
+ vector_32fc_index_max_variant0(
+ in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
+ src0 += 8;
+ }
+
+ // determine maximum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
+ _mm256_store_ps(max_values_buffer, max_values);
+ _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
+
+ float max = 0.f;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (max_values_buffer[i] > max) {
+ max = max_values_buffer[i];
+ index = max_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ if (abs_squared > max) {
+ max = abs_squared;
+ index = i;
+ }
+ ++src0;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_max_16u_a_avx2_variant_1(uint16_t* target,
+ lv_32fc_t* src0,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_max_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 max_values = _mm256_setzero_ps();
+ __m256i max_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_load_ps((float*)src0);
+ __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
+ vector_32fc_index_max_variant1(
+ in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
+ src0 += 8;
+ }
+
+ // determine maximum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
+ _mm256_store_ps(max_values_buffer, max_values);
+ _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
+
+ float max = 0.f;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (max_values_buffer[i] > max) {
+ max = max_values_buffer[i];
+ index = max_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ if (abs_squared > max) {
+ max = abs_squared;
+ index = i;
+ }
+ ++src0;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <xmmintrin.h>
+
+static inline void
+volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+ const uint32_t num_bytes = num_points * 8;
+
+ union bit128 holderf;
+ union bit128 holderi;
+ float sq_dist = 0.0;
+
+ union bit128 xmm5, xmm4;
+ __m128 xmm1, xmm2, xmm3;
+ __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
+
+ xmm5.int_vec = _mm_setzero_si128();
+ xmm4.int_vec = _mm_setzero_si128();
+ holderf.int_vec = _mm_setzero_si128();
+ holderi.int_vec = _mm_setzero_si128();
+
+ int bound = num_bytes >> 5;
+ int i = 0;
+
+ xmm8 = _mm_setr_epi32(0, 1, 2, 3);
+ xmm9 = _mm_setzero_si128();
+ xmm10 = _mm_setr_epi32(4, 4, 4, 4);
+ xmm3 = _mm_setzero_ps();
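+ // xmm8: indices of the four values currently examined, xmm9: running argmax
+ // indices, xmm10: per-iteration index increment, xmm3: running maxima of the
+ // squared magnitudes.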
+
+ for (; i < bound; ++i) {
+ xmm1 = _mm_load_ps((float*)src0);
+ xmm2 = _mm_load_ps((float*)&src0[2]);
+
+ src0 += 4;
+
+ xmm1 = _mm_mul_ps(xmm1, xmm1);
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+ xmm1 = _mm_hadd_ps(xmm1, xmm2);
+
+ xmm3 = _mm_max_ps(xmm1, xmm3);
+
+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
+ }
+
+ if (num_bytes >> 4 & 1) {
+ xmm2 = _mm_load_ps((float*)src0);
+
+ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
+ xmm8 = bit128_p(&xmm1)->int_vec;
+
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+ src0 += 2;
+
+ xmm1 = _mm_hadd_ps(xmm2, xmm2);
+
+ xmm3 = _mm_max_ps(xmm1, xmm3);
+
+ xmm10 = _mm_setr_epi32(2, 2, 2, 2);
+
+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
+ }
+
+ if (num_bytes >> 3 & 1) {
+ sq_dist =
+ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+
+ xmm2 = _mm_load1_ps(&sq_dist);
+
+ xmm1 = xmm3;
+
+ xmm3 = _mm_max_ss(xmm3, xmm2);
+
+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
+
+ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+ }
+
+ _mm_store_ps((float*)&(holderf.f), xmm3);
+ _mm_store_si128(&(holderi.int_vec), xmm9);
+
+ target[0] = holderi.i[0];
+ sq_dist = holderf.f[0];
+ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ const uint32_t num_bytes = num_points * 8;
+
+ float sq_dist = 0.0;
+ float max = 0.0;
+ uint16_t index = 0;
+
+ uint32_t i = 0;
+
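+ // Each lv_32fc_t occupies 8 bytes, so num_bytes >> 3 equals num_points.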
+ for (; i < (num_bytes >> 3); ++i) {
+ sq_dist =
+ lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+
+ if (sq_dist > max) {
+ index = i;
+ max = sq_dist;
+ }
+ }
+ target[0] = index;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/
+
+#ifndef INCLUDED_volk_32fc_index_max_16u_u_H
+#define INCLUDED_volk_32fc_index_max_16u_u_H
+
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_max_16u_u_avx2_variant_0(uint16_t* target,
+ lv_32fc_t* src0,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_max_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 max_values = _mm256_setzero_ps();
+ __m256i max_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_loadu_ps((float*)src0);
+ __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
+ vector_32fc_index_max_variant0(
+ in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
+ src0 += 8;
+ }
+
+ // determine maximum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
+ _mm256_store_ps(max_values_buffer, max_values);
+ _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
+
+ float max = 0.f;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (max_values_buffer[i] > max) {
+ max = max_values_buffer[i];
+ index = max_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ if (abs_squared > max) {
+ max = abs_squared;
+ index = i;
+ }
+ ++src0;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_max_16u_u_avx2_variant_1(uint16_t* target,
+ lv_32fc_t* src0,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_max_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 max_values = _mm256_setzero_ps();
+ __m256i max_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_loadu_ps((float*)src0);
+ __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
+ vector_32fc_index_max_variant1(
+ in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
+ src0 += 8;
+ }
+
+ // determine maximum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
+ _mm256_store_ps(max_values_buffer, max_values);
+ _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
+
+ float max = 0.f;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (max_values_buffer[i] > max) {
+ max = max_values_buffer[i];
+ index = max_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ if (abs_squared > max) {
+ max = abs_squared;
+ index = i;
+ }
+ ++src0;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#endif /*INCLUDED_volk_32fc_index_max_16u_u_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2016, 2018-2020 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_index_max_32u
+ *
+ * \b Overview
+ *
+ * Returns Argmax_i mag(x[i]). Finds and returns the index which contains the
+ * maximum magnitude for complex points in the given vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_index_max_32u(uint32_t* target, lv_32fc_t* src0, uint32_t
+ * num_points) \endcode
+ *
+ * \b Inputs
+ * \li src0: The complex input vector.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li target: The index of the point with maximum magnitude.
+ *
+ * \b Example
+ * Calculate the index of the maximum value of \f$x^2 + x\f$ for points around
+ * the unit circle.
+ * \code
+ * int N = 10;
+ * uint32_t alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * uint32_t* max = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
+ *
+ * for(uint32_t ii = 0; ii < N/2; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii] = in[ii] * in[ii] + in[ii];
+ * in[N-ii-1] = lv_cmake(real, imag);
+ * in[N-ii-1] = in[N-ii-1] * in[N-ii-1] + in[N-ii-1];
+ * }
+ *
+ * volk_32fc_index_max_32u(max, in, N);
+ *
+ * printf("index of max value = %u\n", *max);
+ *
+ * volk_free(in);
+ * volk_free(max);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_index_max_32u_a_H
+#define INCLUDED_volk_32fc_index_max_32u_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_max_32u_a_avx2_variant_0(uint32_t* target,
+ lv_32fc_t* src0,
+ uint32_t num_points)
+{
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_max_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 max_values = _mm256_setzero_ps();
+ __m256i max_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_load_ps((float*)src0);
+ __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
+ vector_32fc_index_max_variant0(
+ in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
+ src0 += 8;
+ }
+
+ // determine maximum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
+ _mm256_store_ps(max_values_buffer, max_values);
+ _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
+
+ float max = 0.f;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (max_values_buffer[i] > max) {
+ max = max_values_buffer[i];
+ index = max_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ if (abs_squared > max) {
+ max = abs_squared;
+ index = i;
+ }
+ ++src0;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_max_32u_a_avx2_variant_1(uint32_t* target,
+ lv_32fc_t* src0,
+ uint32_t num_points)
+{
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_max_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 max_values = _mm256_setzero_ps();
+ __m256i max_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_load_ps((float*)src0);
+ __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
+ vector_32fc_index_max_variant1(
+ in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
+ src0 += 8;
+ }
+
+ // determine maximum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
+ _mm256_store_ps(max_values_buffer, max_values);
+ _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
+
+ float max = 0.f;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (max_values_buffer[i] > max) {
+ max = max_values_buffer[i];
+ index = max_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ if (abs_squared > max) {
+ max = abs_squared;
+ index = i;
+ }
+ ++src0;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <xmmintrin.h>
+
+static inline void
+volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+{
+ const uint32_t num_bytes = num_points * 8;
+
+ union bit128 holderf;
+ union bit128 holderi;
+ float sq_dist = 0.0;
+
+ union bit128 xmm5, xmm4;
+ __m128 xmm1, xmm2, xmm3;
+ __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
+
+ xmm5.int_vec = _mm_setzero_si128();
+ xmm4.int_vec = _mm_setzero_si128();
+ holderf.int_vec = _mm_setzero_si128();
+ holderi.int_vec = _mm_setzero_si128();
+
+ int bound = num_bytes >> 5;
+ int i = 0;
+
+ xmm8 = _mm_setr_epi32(0, 1, 2, 3);
+ xmm9 = _mm_setzero_si128();
+ xmm10 = _mm_setr_epi32(4, 4, 4, 4);
+ xmm3 = _mm_setzero_ps();
+
+ for (; i < bound; ++i) {
+ xmm1 = _mm_load_ps((float*)src0);
+ xmm2 = _mm_load_ps((float*)&src0[2]);
+
+ src0 += 4;
+
+ xmm1 = _mm_mul_ps(xmm1, xmm1);
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+ xmm1 = _mm_hadd_ps(xmm1, xmm2);
+
+ xmm3 = _mm_max_ps(xmm1, xmm3);
+
+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
+ }
+
+ if (num_bytes >> 4 & 1) {
+ xmm2 = _mm_load_ps((float*)src0);
+
+ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
+ xmm8 = bit128_p(&xmm1)->int_vec;
+
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+ src0 += 2;
+
+ xmm1 = _mm_hadd_ps(xmm2, xmm2);
+
+ xmm3 = _mm_max_ps(xmm1, xmm3);
+
+ xmm10 = _mm_setr_epi32(2, 2, 2, 2);
+
+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
+ }
+
+ if (num_bytes >> 3 & 1) {
+ sq_dist =
+ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
+
+ xmm2 = _mm_load1_ps(&sq_dist);
+
+ xmm1 = xmm3;
+
+ xmm3 = _mm_max_ss(xmm3, xmm2);
+
+ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
+
+ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+ }
+
+ _mm_store_ps((float*)&(holderf.f), xmm3);
+ _mm_store_si128(&(holderi.int_vec), xmm9);
+
+ target[0] = holderi.i[0];
+ sq_dist = holderf.f[0];
+ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
+ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
+ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
+ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
+ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
+ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+{
+ const uint32_t num_bytes = num_points * 8;
+
+ float sq_dist = 0.0;
+ float max = 0.0;
+ uint32_t index = 0;
+
+ uint32_t i = 0;
+
+ for (; i < (num_bytes >> 3); ++i) {
+ sq_dist =
+ lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+
+ if (sq_dist > max) {
+ index = i;
+ max = sq_dist;
+ }
+ }
+ target[0] = index;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/
+
+#ifndef INCLUDED_volk_32fc_index_max_32u_u_H
+#define INCLUDED_volk_32fc_index_max_32u_u_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_max_32u_u_avx2_variant_0(uint32_t* target,
+ lv_32fc_t* src0,
+ uint32_t num_points)
+{
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_max_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 max_values = _mm256_setzero_ps();
+ __m256i max_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_loadu_ps((float*)src0);
+ __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
+ vector_32fc_index_max_variant0(
+ in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
+ src0 += 8;
+ }
+
+ // determine maximum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
+ _mm256_store_ps(max_values_buffer, max_values);
+ _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
+
+ float max = 0.f;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (max_values_buffer[i] > max) {
+ max = max_values_buffer[i];
+ index = max_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ if (abs_squared > max) {
+ max = abs_squared;
+ index = i;
+ }
+ ++src0;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_max_32u_u_avx2_variant_1(uint32_t* target,
+ lv_32fc_t* src0,
+ uint32_t num_points)
+{
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_max_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 max_values = _mm256_setzero_ps();
+ __m256i max_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_loadu_ps((float*)src0);
+ __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
+ vector_32fc_index_max_variant1(
+ in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
+ src0 += 8;
+ }
+
+ // determine maximum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
+ _mm256_store_ps(max_values_buffer, max_values);
+ _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);
+
+ float max = 0.f;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (max_values_buffer[i] > max) {
+ max = max_values_buffer[i];
+ index = max_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
+ if (abs_squared > max) {
+ max = abs_squared;
+ index = i;
+ }
+ ++src0;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+#include <volk/volk_neon_intrinsics.h>
+
+static inline void
+volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
+{
+ unsigned int number = 0;
+ const uint32_t quarter_points = num_points / 4;
+ const lv_32fc_t* src0Ptr = src0;
+
+ uint32_t indices[4] = { 0, 1, 2, 3 };
+ const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
+ uint32x4_t vec_indices = vld1q_u32(indices);
+ uint32x4_t vec_max_indices = vec_indices;
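+ // Track a running maximum and its index independently in each of the four
+ // lanes; the lanes are reduced to a single winner after the loop.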
+
+ if (num_points) {
+ float max = FLT_MIN;
+ uint32_t index = 0;
+
+ float32x4_t vec_max = vdupq_n_f32(FLT_MIN);
+
+ for (; number < quarter_points; number++) {
+ // Load complex and compute magnitude squared
+ const float32x4_t vec_mag2 =
+ _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
+ __VOLK_PREFETCH(src0Ptr += 4);
+ // a > b?
+ const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max);
+ vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max);
+ vec_max_indices = vbslq_u32(gt_mask, vec_indices, vec_max_indices);
+ vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
+ }
+ uint32_t tmp_max_indices[4];
+ float tmp_max[4];
+ vst1q_u32(tmp_max_indices, vec_max_indices);
+ vst1q_f32(tmp_max, vec_max);
+
+ for (int i = 0; i < 4; i++) {
+ if (tmp_max[i] > max) {
+ max = tmp_max[i];
+ index = tmp_max_indices[i];
+ }
+ }
+
+ // Deal with the rest
+ for (number = quarter_points * 4; number < num_points; number++) {
+ const float re = lv_creal(*src0Ptr);
+ const float im = lv_cimag(*src0Ptr);
+ const float sq_dist = re * re + im * im;
+ if (sq_dist > max) {
+ max = sq_dist;
+ index = number;
+ }
+ src0Ptr++;
+ }
+ *target = index;
+ }
+}
+
+#endif /*LV_HAVE_NEON*/
+
+#endif /*INCLUDED_volk_32fc_index_max_32u_u_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2021 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_index_min_16u
+ *
+ * \b Overview
+ *
+ * Returns Argmin_i mag(x[i]). Finds and returns the index which contains the
+ * minimum magnitude for complex points in the given vector.
+ *
+ * Note that num_points is a uint32_t, but the return value is
+ * uint16_t. Providing a vector longer than the maximum value of a
+ * uint16_t (65535) would miss a minimum located beyond that boundary.
+ * The kernel caps num_points to this maximum value anyway.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_index_min_16u(uint16_t* target, lv_32fc_t* source, uint32_t
+ * num_points) \endcode
+ *
+ * \b Inputs
+ * \li source: The complex input vector.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li target: The index of the point with minimum magnitude.
+ *
+ * \b Example
+ * Calculate the index of the minimum value of \f$x^2 + x\f$ for points around
+ * the unit circle.
+ * \code
+ * int N = 10;
+ * uint32_t alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * uint16_t* min = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
+ *
+ * for(uint32_t ii = 0; ii < N/2; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii] = in[ii] * in[ii] + in[ii];
+ * in[N-ii-1] = lv_cmake(real, imag);
+ * in[N-ii-1] = in[N-ii-1] * in[N-ii-1] + in[N-ii-1];
+ * }
+ *
+ * volk_32fc_index_min_16u(min, in, N);
+ *
+ * printf("index of min value = %u\n", *min);
+ *
+ * volk_free(in);
+ * volk_free(min);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_index_min_16u_a_H
+#define INCLUDED_volk_32fc_index_min_16u_a_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target,
+ const lv_32fc_t* source,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_min_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 min_values = _mm256_set1_ps(FLT_MAX);
+ __m256i min_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_load_ps((float*)source);
+ __m256 in1 = _mm256_load_ps((float*)(source + 4));
+ vector_32fc_index_min_variant0(
+ in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+ source += 8;
+ }
+
+ // determine minimum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+ _mm256_store_ps(min_values_buffer, min_values);
+ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+ float min = FLT_MAX;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (min_values_buffer[i] < min) {
+ min = min_values_buffer[i];
+ index = min_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
+ if (abs_squared < min) {
+ min = abs_squared;
+ index = i;
+ }
+ ++source;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target,
+ const lv_32fc_t* source,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_min_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 min_values = _mm256_set1_ps(FLT_MAX);
+ __m256i min_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_load_ps((float*)source);
+ __m256 in1 = _mm256_load_ps((float*)(source + 4));
+ vector_32fc_index_min_variant1(
+ in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+ source += 8;
+ }
+
+ // determine minimum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+ _mm256_store_ps(min_values_buffer, min_values);
+ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+ float min = FLT_MAX;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (min_values_buffer[i] < min) {
+ min = min_values_buffer[i];
+ index = min_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
+ if (abs_squared < min) {
+ min = abs_squared;
+ index = i;
+ }
+ ++source;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <xmmintrin.h>
+
+static inline void volk_32fc_index_min_16u_a_sse3(uint16_t* target,
+ const lv_32fc_t* source,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ union bit128 holderf;
+ union bit128 holderi;
+ float sq_dist = 0.0;
+
+ union bit128 xmm5, xmm4;
+ __m128 xmm1, xmm2, xmm3;
+ __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
+
+ xmm5.int_vec = _mm_setzero_si128();
+ xmm4.int_vec = _mm_setzero_si128();
+ holderf.int_vec = _mm_setzero_si128();
+ holderi.int_vec = _mm_setzero_si128();
+
+ xmm8 = _mm_setr_epi32(0, 1, 2, 3);
+ xmm9 = _mm_setzero_si128();
+ xmm10 = _mm_setr_epi32(4, 4, 4, 4);
+ xmm3 = _mm_set_ps1(FLT_MAX);
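+ // Mirror of the argmax kernel: xmm3 holds the running minima (seeded with
+ // FLT_MAX), xmm8 the candidate indices, xmm9 the running argmin indices.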
+
+ int bound = num_points >> 2;
+
+ for (int i = 0; i < bound; ++i) {
+ xmm1 = _mm_load_ps((float*)source);
+ xmm2 = _mm_load_ps((float*)&source[2]);
+
+ source += 4;
+
+ xmm1 = _mm_mul_ps(xmm1, xmm1);
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+ xmm1 = _mm_hadd_ps(xmm1, xmm2);
+
+ xmm3 = _mm_min_ps(xmm1, xmm3);
+
+ xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
+ }
+
+ if (num_points >> 1 & 1) {
+ xmm2 = _mm_load_ps((float*)source);
+
+ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
+ xmm8 = bit128_p(&xmm1)->int_vec;
+
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+ source += 2;
+
+ xmm1 = _mm_hadd_ps(xmm2, xmm2);
+
+ xmm3 = _mm_min_ps(xmm1, xmm3);
+
+ xmm10 = _mm_setr_epi32(2, 2, 2, 2);
+
+ xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
+ }
+
+ if (num_points & 1) {
+ sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
+ lv_cimag(source[0]) * lv_cimag(source[0]);
+
+ xmm2 = _mm_load1_ps(&sq_dist);
+
+ xmm1 = xmm3;
+
+ xmm3 = _mm_min_ss(xmm3, xmm2);
+
+ xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
+
+ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+ }
+
+ _mm_store_ps((float*)&(holderf.f), xmm3);
+ _mm_store_si128(&(holderi.int_vec), xmm9);
+
+ target[0] = holderi.i[0];
+ sq_dist = holderf.f[0];
+ target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
+ sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
+ target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
+ sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
+ target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
+ sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32fc_index_min_16u_generic(uint16_t* target,
+ const lv_32fc_t* source,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ float sq_dist = 0.0;
+ float min = FLT_MAX;
+ uint16_t index = 0;
+
+ for (uint32_t i = 0; i < num_points; ++i) {
+ sq_dist = lv_creal(source[i]) * lv_creal(source[i]) +
+ lv_cimag(source[i]) * lv_cimag(source[i]);
+
+ if (sq_dist < min) {
+ index = i;
+ min = sq_dist;
+ }
+ }
+ target[0] = index;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_32fc_index_min_16u_a_H*/
+
+#ifndef INCLUDED_volk_32fc_index_min_16u_u_H
+#define INCLUDED_volk_32fc_index_min_16u_u_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target,
+ const lv_32fc_t* source,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_min_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 min_values = _mm256_set1_ps(FLT_MAX);
+ __m256i min_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_loadu_ps((float*)source);
+ __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
+ vector_32fc_index_min_variant0(
+ in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+ source += 8;
+ }
+
+ // determine minimum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+ _mm256_store_ps(min_values_buffer, min_values);
+ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+ float min = FLT_MAX;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (min_values_buffer[i] < min) {
+ min = min_values_buffer[i];
+ index = min_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
+ if (abs_squared < min) {
+ min = abs_squared;
+ index = i;
+ }
+ ++source;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target,
+ const lv_32fc_t* source,
+ uint32_t num_points)
+{
+ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_min_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 min_values = _mm256_set1_ps(FLT_MAX);
+ __m256i min_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_loadu_ps((float*)source);
+ __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
+ vector_32fc_index_min_variant1(
+ in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+ source += 8;
+ }
+
+ // determine minimum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+ _mm256_store_ps(min_values_buffer, min_values);
+ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+ float min = FLT_MAX;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (min_values_buffer[i] < min) {
+ min = min_values_buffer[i];
+ index = min_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
+ if (abs_squared < min) {
+ min = abs_squared;
+ index = i;
+ }
+ ++source;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#endif /*INCLUDED_volk_32fc_index_min_16u_u_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2021 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_index_min_32u
+ *
+ * \b Overview
+ *
+ * Returns Argmin_i mag(x[i]). Finds and returns the index which contains the
+ * minimum magnitude for complex points in the given vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_index_min_32u(uint32_t* target, lv_32fc_t* source, uint32_t
+ * num_points) \endcode
+ *
+ * \b Inputs
+ * \li source: The complex input vector.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li target: The index of the point with minimum magnitude.
+ *
+ * \b Example
+ * Calculate the index of the minimum value of \f$x^2 + x\f$ for points around
+ * the unit circle.
+ * \code
+ * int N = 10;
+ * uint32_t alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * uint32_t* min = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
+ *
+ * for(uint32_t ii = 0; ii < N/2; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii] = in[ii] * in[ii] + in[ii];
+ * in[N-ii-1] = lv_cmake(real, imag);
+ * in[N-ii-1] = in[N-ii-1] * in[N-ii-1] + in[N-ii-1];
+ * }
+ *
+ * volk_32fc_index_min_32u(min, in, N);
+ *
+ * printf("index of min value = %u\n", *min);
+ *
+ * volk_free(in);
+ * volk_free(min);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_index_min_32u_a_H
+#define INCLUDED_volk_32fc_index_min_32u_a_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target,
+ const lv_32fc_t* source,
+ uint32_t num_points)
+{
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_min_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 min_values = _mm256_set1_ps(FLT_MAX);
+ __m256i min_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_load_ps((float*)source);
+ __m256 in1 = _mm256_load_ps((float*)(source + 4));
+ vector_32fc_index_min_variant0(
+ in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+ source += 8;
+ }
+
+ // determine minimum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+ _mm256_store_ps(min_values_buffer, min_values);
+ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+ float min = FLT_MAX;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (min_values_buffer[i] < min) {
+ min = min_values_buffer[i];
+ index = min_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
+ if (abs_squared < min) {
+ min = abs_squared;
+ index = i;
+ }
+ ++source;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target,
+ const lv_32fc_t* source,
+ uint32_t num_points)
+{
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_min_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 min_values = _mm256_set1_ps(FLT_MAX);
+ __m256i min_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_load_ps((float*)source);
+ __m256 in1 = _mm256_load_ps((float*)(source + 4));
+ vector_32fc_index_min_variant1(
+ in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+ source += 8;
+ }
+
+ // determine minimum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+ _mm256_store_ps(min_values_buffer, min_values);
+ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+ float min = FLT_MAX;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (min_values_buffer[i] < min) {
+ min = min_values_buffer[i];
+ index = min_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
+ if (abs_squared < min) {
+ min = abs_squared;
+ index = i;
+ }
+ ++source;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <xmmintrin.h>
+
+static inline void volk_32fc_index_min_32u_a_sse3(uint32_t* target,
+ const lv_32fc_t* source,
+ uint32_t num_points)
+{
+ union bit128 holderf;
+ union bit128 holderi;
+ float sq_dist = 0.0;
+
+ union bit128 xmm5, xmm4;
+ __m128 xmm1, xmm2, xmm3;
+ __m128i xmm8, xmm11, xmm12, xmm9, xmm10;
+
+ xmm5.int_vec = _mm_setzero_si128();
+ xmm4.int_vec = _mm_setzero_si128();
+ holderf.int_vec = _mm_setzero_si128();
+ holderi.int_vec = _mm_setzero_si128();
+
+ xmm8 = _mm_setr_epi32(0, 1, 2, 3);
+ xmm9 = _mm_setzero_si128();
+ xmm10 = _mm_setr_epi32(4, 4, 4, 4);
+ xmm3 = _mm_set_ps1(FLT_MAX);
+
+ int bound = num_points >> 2;
+
+ for (int i = 0; i < bound; ++i) {
+ xmm1 = _mm_load_ps((float*)source);
+ xmm2 = _mm_load_ps((float*)&source[2]);
+
+ source += 4;
+
+ xmm1 = _mm_mul_ps(xmm1, xmm1);
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+ xmm1 = _mm_hadd_ps(xmm1, xmm2);
+
+ xmm3 = _mm_min_ps(xmm1, xmm3);
+
+ xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
+ }
+
+ if (num_points >> 1 & 1) {
+ xmm2 = _mm_load_ps((float*)source);
+
+ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
+ xmm8 = bit128_p(&xmm1)->int_vec;
+
+ xmm2 = _mm_mul_ps(xmm2, xmm2);
+
+ source += 2;
+
+ xmm1 = _mm_hadd_ps(xmm2, xmm2);
+
+ xmm3 = _mm_min_ps(xmm1, xmm3);
+
+ xmm10 = _mm_setr_epi32(2, 2, 2, 2);
+
+ xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+
+ xmm8 = _mm_add_epi32(xmm8, xmm10);
+ }
+
+ if (num_points & 1) {
+ sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
+ lv_cimag(source[0]) * lv_cimag(source[0]);
+
+ xmm2 = _mm_load1_ps(&sq_dist);
+
+ xmm1 = xmm3;
+
+ xmm3 = _mm_min_ss(xmm3, xmm2);
+
+ xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
+ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
+
+ xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
+
+ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
+ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
+
+ xmm9 = _mm_add_epi32(xmm11, xmm12);
+ }
+
+ _mm_store_ps((float*)&(holderf.f), xmm3);
+ _mm_store_si128(&(holderi.int_vec), xmm9);
+
+ target[0] = holderi.i[0];
+ sq_dist = holderf.f[0];
+ target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
+ sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
+ target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
+ sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
+ target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
+ sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32fc_index_min_32u_generic(uint32_t* target,
+ const lv_32fc_t* source,
+ uint32_t num_points)
+{
+ float sq_dist = 0.0;
+ float min = FLT_MAX;
+ uint32_t index = 0;
+
+ for (uint32_t i = 0; i < num_points; ++i) {
+ sq_dist = lv_creal(source[i]) * lv_creal(source[i]) +
+ lv_cimag(source[i]) * lv_cimag(source[i]);
+
+ if (sq_dist < min) {
+ index = i;
+ min = sq_dist;
+ }
+ }
+ target[0] = index;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_32fc_index_min_32u_a_H*/
+
+#ifndef INCLUDED_volk_32fc_index_min_32u_u_H
+#define INCLUDED_volk_32fc_index_min_32u_u_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target,
+ const lv_32fc_t* source,
+ uint32_t num_points)
+{
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_min_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 min_values = _mm256_set1_ps(FLT_MAX);
+ __m256i min_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_loadu_ps((float*)source);
+ __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
+ vector_32fc_index_min_variant0(
+ in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+ source += 8;
+ }
+
+ // determine minimum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+ _mm256_store_ps(min_values_buffer, min_values);
+ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+ float min = FLT_MAX;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (min_values_buffer[i] < min) {
+ min = min_values_buffer[i];
+ index = min_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
+ if (abs_squared < min) {
+ min = abs_squared;
+ index = i;
+ }
+ ++source;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target,
+ const lv_32fc_t* source,
+ uint32_t num_points)
+{
+ const __m256i indices_increment = _mm256_set1_epi32(8);
+ /*
+ * At the start of each loop iteration current_indices holds the indices of
+ * the complex numbers loaded from memory. Explanation for odd order is given
+ * in implementation of vector_32fc_index_min_variant0().
+ */
+ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ __m256 min_values = _mm256_set1_ps(FLT_MAX);
+ __m256i min_indices = _mm256_setzero_si256();
+
+ for (unsigned i = 0; i < num_points / 8u; ++i) {
+ __m256 in0 = _mm256_loadu_ps((float*)source);
+ __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
+ vector_32fc_index_min_variant1(
+ in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
+ source += 8;
+ }
+
+ // determine minimum value and index in the result of the vectorized loop
+ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
+ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
+ _mm256_store_ps(min_values_buffer, min_values);
+ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);
+
+ float min = FLT_MAX;
+ uint32_t index = 0;
+ for (unsigned i = 0; i < 8; i++) {
+ if (min_values_buffer[i] < min) {
+ min = min_values_buffer[i];
+ index = min_indices_buffer[i];
+ }
+ }
+
+ // handle tail not processed by the vectorized loop
+ for (unsigned i = num_points & (~7u); i < num_points; ++i) {
+ const float abs_squared =
+ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
+ if (abs_squared < min) {
+ min = abs_squared;
+ index = i;
+ }
+ ++source;
+ }
+
+ *target = index;
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+#include <volk/volk_neon_intrinsics.h>
+
+static inline void volk_32fc_index_min_32u_neon(uint32_t* target,
+ const lv_32fc_t* source,
+ uint32_t num_points)
+{
+ const uint32_t quarter_points = num_points / 4;
+ const lv_32fc_t* sourcePtr = source;
+
+ uint32_t indices[4] = { 0, 1, 2, 3 };
+ const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
+ uint32x4_t vec_indices = vld1q_u32(indices);
+ uint32x4_t vec_min_indices = vec_indices;
+
+ if (num_points) {
+ float min = FLT_MAX;
+ uint32_t index = 0;
+
+ float32x4_t vec_min = vdupq_n_f32(FLT_MAX);
+
+ for (uint32_t number = 0; number < quarter_points; number++) {
+ // Load complex and compute magnitude squared
+ const float32x4_t vec_mag2 =
+ _vmagnitudesquaredq_f32(vld2q_f32((float*)sourcePtr));
+ __VOLK_PREFETCH(sourcePtr += 4);
+ // a < b?
+ const uint32x4_t lt_mask = vcltq_f32(vec_mag2, vec_min);
+ vec_min = vbslq_f32(lt_mask, vec_mag2, vec_min);
+ vec_min_indices = vbslq_u32(lt_mask, vec_indices, vec_min_indices);
+ vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
+ }
+ uint32_t tmp_min_indices[4];
+ float tmp_min[4];
+ vst1q_u32(tmp_min_indices, vec_min_indices);
+ vst1q_f32(tmp_min, vec_min);
+
+ for (int i = 0; i < 4; i++) {
+ if (tmp_min[i] < min) {
+ min = tmp_min[i];
+ index = tmp_min_indices[i];
+ }
+ }
+
+ // Deal with the rest
+ for (uint32_t number = quarter_points * 4; number < num_points; number++) {
+ const float re = lv_creal(*sourcePtr);
+ const float im = lv_cimag(*sourcePtr);
+ const float sq_dist = re * re + im * im;
+ if (sq_dist < min) {
+ min = sq_dist;
+ index = number;
+ }
+ sourcePtr++;
+ }
+ *target = index;
+ }
+}
+
+#endif /*LV_HAVE_NEON*/
+
+#endif /*INCLUDED_volk_32fc_index_min_32u_u_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_magnitude_32f
+ *
+ * \b Overview
+ *
+ * Calculates the magnitude of the complexVector and stores the
+ * results in the magnitudeVector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_magnitude_32f(float* magnitudeVector, const lv_32fc_t* complexVector,
+ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li magnitudeVector: The output value.
+ *
+ * \b Example
+ * Calculate the magnitude of \f$x^2 + x\f$ for points around the unit circle.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* magnitude = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N/2; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii] = in[ii] * in[ii] + in[ii];
+ * in[N-ii-1] = lv_cmake(real, imag);
+ * in[N-ii-1] = in[N-ii-1] * in[N-ii-1] + in[N-ii-1];
+ * }
+ *
+ * volk_32fc_magnitude_32f(magnitude, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %+.1f\n", ii, magnitude[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(magnitude);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_magnitude_32f_u_H
+#define INCLUDED_volk_32fc_magnitude_32f_u_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void volk_32fc_magnitude_32f_u_avx(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 cplxValue1, cplxValue2, result;
+
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+ cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
+ result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
+ _mm256_storeu_ps(magnitudeVectorPtr, result);
+
+ complexVectorPtr += 16;
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void volk_32fc_magnitude_32f_u_sse3(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
+
+ _mm_storeu_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_SSE
+#include <volk/volk_sse_intrinsics.h>
+#include <xmmintrin.h>
+
+static inline void volk_32fc_magnitude_32f_u_sse(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitude_ps(cplxValue1, cplxValue2);
+ _mm_storeu_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_magnitude_32f_generic(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ for (number = 0; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_magnitude_32f_u_H */
+#ifndef INCLUDED_volk_32fc_magnitude_32f_a_H
+#define INCLUDED_volk_32fc_magnitude_32f_a_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void volk_32fc_magnitude_32f_a_avx(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 cplxValue1, cplxValue2, result;
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ result = _mm256_magnitude_ps(cplxValue1, cplxValue2);
+ _mm256_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void volk_32fc_magnitude_32f_a_sse3(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitude_ps_sse3(cplxValue1, cplxValue2);
+ _mm_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_SSE
+#include <volk/volk_sse_intrinsics.h>
+#include <xmmintrin.h>
+
+static inline void volk_32fc_magnitude_32f_a_sse(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitude_ps(cplxValue1, cplxValue2);
+ _mm_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag * val1Imag));
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_magnitude_32f_a_generic(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ for (number = 0; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_magnitude_32f_neon(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ float32x4x2_t complex_vec;
+ float32x4_t magnitude_vec;
+ for (number = 0; number < quarter_points; number++) {
+ complex_vec = vld2q_f32(complexVectorPtr);
+ complex_vec.val[0] = vmulq_f32(complex_vec.val[0], complex_vec.val[0]);
+ magnitude_vec =
+ vmlaq_f32(complex_vec.val[0], complex_vec.val[1], complex_vec.val[1]);
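+ // Approximate sqrt(x) as the reciprocal estimate of the reciprocal
+ // square-root estimate.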
+ magnitude_vec = vrsqrteq_f32(magnitude_vec);
+ magnitude_vec = vrecpeq_f32(magnitude_vec); // no plain ol' sqrt
+ vst1q_f32(magnitudeVectorPtr, magnitude_vec);
+
+ complexVectorPtr += 8;
+ magnitudeVectorPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_NEON
+/*!
+ \brief Calculates the magnitude of the complexVector and stores the results in the
+ magnitudeVector
+
+ This is an approximation from "Streamlining Digital Signal Processing" by
+ Richard Lyons. Apparently max error is about 1% and mean error is about 0.6%.
+ The basic idea is to take a weighted sum of the absolute values of the real and
+ imaginary parts, where weight A is always applied to max(|real|, |imag|) and B to
+ min(|real|, |imag|). There are two pairs of coefficients, chosen based on whether
+ min <= 0.4142 * max. This is the equiripple-error magnitude estimation proposed by
+ Filip in '73.
+
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be calculated and
+ stored into magnitudeVector
+*/
+static inline void volk_32fc_magnitude_32f_neon_fancy_sweet(
+ float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
+{
+ unsigned int number;
+ unsigned int quarter_points = num_points / 4;
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ const float threshold = 0.4142135;
+
+ float32x4_t a_vec, b_vec, a_high, a_low, b_high, b_low;
+ a_high = vdupq_n_f32(0.84);
+ b_high = vdupq_n_f32(0.561);
+ a_low = vdupq_n_f32(0.99);
+ b_low = vdupq_n_f32(0.197);
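+ // Per sample this computes approximately a * max(|re|, |im|) + b * min(|re|, |im|),
+ // with (a, b) = (0.99, 0.197) when min <= 0.4142 * max and (0.84, 0.561) otherwise.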
+
+ uint32x4_t comp0, comp1;
+
+ float32x4x2_t complex_vec;
+ float32x4_t min_vec, max_vec, magnitude_vec;
+ float32x4_t real_abs, imag_abs;
+ for (number = 0; number < quarter_points; number++) {
+ complex_vec = vld2q_f32(complexVectorPtr);
+
+ real_abs = vabsq_f32(complex_vec.val[0]);
+ imag_abs = vabsq_f32(complex_vec.val[1]);
+
+ min_vec = vminq_f32(real_abs, imag_abs);
+ max_vec = vmaxq_f32(real_abs, imag_abs);
+
+ // effective branch to choose coefficient pair.
+ comp0 = vcgtq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
+ comp1 = vcleq_f32(min_vec, vmulq_n_f32(max_vec, threshold));
+
+ // and 0s or 1s with coefficients from previous effective branch
+ a_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)a_high),
+ vandq_s32((int32x4_t)comp1, (int32x4_t)a_low));
+ b_vec = (float32x4_t)vaddq_s32(vandq_s32((int32x4_t)comp0, (int32x4_t)b_high),
+ vandq_s32((int32x4_t)comp1, (int32x4_t)b_low));
+
+ // coefficients chosen, do the weighted sum
+ min_vec = vmulq_f32(min_vec, b_vec);
+ max_vec = vmulq_f32(max_vec, a_vec);
+
+ magnitude_vec = vaddq_f32(min_vec, max_vec);
+ vst1q_f32(magnitudeVectorPtr, magnitude_vec);
+
+ complexVectorPtr += 8;
+ magnitudeVectorPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((real * real) + (imag * imag));
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_ORC
+
+extern void volk_32fc_magnitude_32f_a_orc_impl(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points);
+
+static inline void volk_32fc_magnitude_32f_u_orc(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ volk_32fc_magnitude_32f_a_orc_impl(magnitudeVector, complexVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32fc_magnitude_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_magnitude_squared_32f
+ *
+ * \b Overview
+ *
+ * Calculates the magnitude squared of the complexVector and stores
+ * the results in the magnitudeVector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_magnitude_squared_32f(float* magnitudeVector, const lv_32fc_t*
+ * complexVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li magnitudeVector: The output value.
+ *
+ * \b Example
+ * Calculate the magnitude squared of \f$x^2 + x\f$ for points around the unit circle.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* magnitude = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N/2; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii] = in[ii] * in[ii] + in[ii];
+ * in[N-ii-1] = lv_cmake(real, imag);
+ * in[N-ii-1] = in[N-ii-1] * in[N-ii-1] + in[N-ii-1];
+ * }
+ *
+ * volk_32fc_magnitude_squared_32f(magnitude, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out(%i) = %+.1f\n", ii, magnitude[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(magnitude);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_u_H
+#define INCLUDED_volk_32fc_magnitude_squared_32f_u_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void volk_32fc_magnitude_squared_32f_u_avx(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 cplxValue1, cplxValue2, result;
+
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+ cplxValue2 = _mm256_loadu_ps(complexVectorPtr + 8);
+ result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
+ _mm256_storeu_ps(magnitudeVectorPtr, result);
+
+ complexVectorPtr += 16;
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void volk_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
+ _mm_storeu_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_SSE
+#include <volk/volk_sse_intrinsics.h>
+#include <xmmintrin.h>
+
+static inline void volk_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
+ _mm_storeu_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_magnitude_squared_32f_generic(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ for (number = 0; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (real * real) + (imag * imag);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_magnitude_squared_32f_u_H */
+#ifndef INCLUDED_volk_32fc_magnitude_squared_32f_a_H
+#define INCLUDED_volk_32fc_magnitude_squared_32f_a_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void volk_32fc_magnitude_squared_32f_a_avx(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 cplxValue1, cplxValue2, result;
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ result = _mm256_magnitudesquared_ps(cplxValue1, cplxValue2);
+ _mm256_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void volk_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2);
+ _mm_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_SSE
+#include <volk/volk_sse_intrinsics.h>
+#include <xmmintrin.h>
+
+static inline void volk_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 cplxValue1, cplxValue2, result;
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ result = _mm_magnitudesquared_ps(cplxValue1, cplxValue2);
+ _mm_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_magnitude_squared_32f_neon(float* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ float32x4x2_t cmplx_val;
+ float32x4_t result;
+ for (; number < quarterPoints; number++) {
+ cmplx_val = vld2q_f32(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cmplx_val.val[0] =
+ vmulq_f32(cmplx_val.val[0], cmplx_val.val[0]); // Square the values
+ cmplx_val.val[1] =
+ vmulq_f32(cmplx_val.val[1], cmplx_val.val[1]); // Square the values
+
+ result =
+ vaddq_f32(cmplx_val.val[0], cmplx_val.val[1]); // Add the I2 and Q2 values
+
+ vst1q_f32(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_magnitude_squared_32f_a_generic(
+ float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ for (number = 0; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (real * real) + (imag * imag);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_magnitude_squared_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_s32f_atan2_32f
+ *
+ * \b Overview
+ *
+ * Computes the arctan for each value in a complex vector and applies
+ * a normalization factor.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_s32f_atan2_32f(float* outputVector, const lv_32fc_t* complexVector,
+ * const float normalizeFactor, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The byte-aligned input vector containing interleaved IQ data
+ * (I = cos, Q = sin).
+ * \li normalizeFactor: The atan results are divided by this normalization factor.
+ * \li num_points: The number of complex values in \p complexVector.
+ *
+ * \b Outputs
+ * \li outputVector: The vector where the results will be stored.
+ *
+ * \b Example
+ * Calculate the arctangent of points around the unit circle.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float scale = 1.f; // we want unit circle
+ *
+ * for(unsigned int ii = 0; ii < N/2; ++ii){
+ * // Generate points around the unit circle
+ * float real = -4.f * ((float)ii / (float)N) + 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii+N/2] = lv_cmake(-real, -imag);
+ * }
+ *
+ * volk_32fc_s32f_atan2_32f(out, in, scale, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("atan2(%1.2f, %1.2f) = %1.2f\n",
+ * lv_cimag(in[ii]), lv_creal(in[ii]), out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+
+#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a_H
+#define INCLUDED_volk_32fc_s32f_atan2_32f_a_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+static inline void volk_32fc_s32f_atan2_32f_a_sse4_1(float* outputVector,
+ const lv_32fc_t* complexVector,
+ const float normalizeFactor,
+ unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ float* outPtr = outputVector;
+
+ unsigned int number = 0;
+ const float invNormalizeFactor = 1.0 / normalizeFactor;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 testVector = _mm_set_ps1(2 * M_PI);
+ __m128 correctVector = _mm_set_ps1(M_PI);
+ __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
+ __m128 phase;
+ __m128 complex1, complex2, iValue, qValue;
+ __m128 keepMask;
+
+ for (; number < quarterPoints; number++) {
+ // Load IQ data:
+ complex1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ complex2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ // Deinterleave IQ data:
+ iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2, 0, 2, 0));
+ qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3, 1, 3, 1));
+ // Arctan to get phase:
+ phase = atan2f4(qValue, iValue);
+ // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
+ // Compare to 2pi:
+ keepMask = _mm_cmpneq_ps(phase, testVector);
+ phase = _mm_blendv_ps(correctVector, phase, keepMask);
+ // done with above correction.
+ phase = _mm_mul_ps(phase, vNormalizeFactor);
+ _mm_store_ps((float*)outPtr, phase);
+ outPtr += 4;
+ }
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+ for (; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+static inline void volk_32fc_s32f_atan2_32f_a_sse(float* outputVector,
+ const lv_32fc_t* complexVector,
+ const float normalizeFactor,
+ unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ float* outPtr = outputVector;
+
+ unsigned int number = 0;
+ const float invNormalizeFactor = 1.0 / normalizeFactor;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 testVector = _mm_set_ps1(2 * M_PI);
+ __m128 correctVector = _mm_set_ps1(M_PI);
+ __m128 vNormalizeFactor = _mm_set_ps1(invNormalizeFactor);
+ __m128 phase;
+ __m128 complex1, complex2, iValue, qValue;
+ __m128 mask;
+ __m128 keepMask;
+
+ for (; number < quarterPoints; number++) {
+ // Load IQ data:
+ complex1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ complex2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+ // Deinterleave IQ data:
+ iValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(2, 0, 2, 0));
+ qValue = _mm_shuffle_ps(complex1, complex2, _MM_SHUFFLE(3, 1, 3, 1));
+ // Arctan to get phase:
+ phase = atan2f4(qValue, iValue);
+ // When Q = 0 and I < 0, atan2f4 sucks and returns 2pi vice pi.
+ // Compare to 2pi:
+ keepMask = _mm_cmpneq_ps(phase, testVector);
+ phase = _mm_and_ps(phase, keepMask);
+ mask = _mm_andnot_ps(keepMask, correctVector);
+ phase = _mm_or_ps(phase, mask);
+ // done with above correction.
+ phase = _mm_mul_ps(phase, vNormalizeFactor);
+ _mm_store_ps((float*)outPtr, phase);
+ outPtr += 4;
+ }
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+ for (; number < num_points; number++) {
+ const float real = *complexVectorPtr++;
+ const float imag = *complexVectorPtr++;
+ *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector,
+ const lv_32fc_t* inputVector,
+ const float normalizeFactor,
+ unsigned int num_points)
+{
+ float* outPtr = outputVector;
+ const float* inPtr = (float*)inputVector;
+ const float invNormalizeFactor = 1.0 / normalizeFactor;
+ unsigned int number;
+ for (number = 0; number < num_points; number++) {
+ const float real = *inPtr++;
+ const float imag = *inPtr++;
+ *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_s32f_deinterleave_real_16i
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex floating point vector and returns the real
+ * part (inphase) of the samples, scaled and converted to 16-bit shorts.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_s32f_deinterleave_real_16i(int16_t* iBuffer, const lv_32fc_t*
+ * complexVector, const float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li scalar: The value to be multiplied against each of the input values.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ *
+ * \b Example
+ * Generate points around the unit circle and map them to integers with
+ * magnitude 50 to preserve smallest deltas.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
+ * float scale = 50.f;
+ *
+ * for(unsigned int ii = 0; ii < N/2; ++ii){
+ * // Generate points around the unit circle
+ * float real = -4.f * ((float)ii / (float)N) + 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii+N/2] = lv_cmake(-real, -imag);
+ * }
+ *
+ * volk_32fc_s32f_deinterleave_real_16i(out, in, scale, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %i\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
+#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_32fc_s32f_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+
+ __m256 cplxValue1, cplxValue2, iValue;
+ __m256i a;
+ __m128i b;
+
+ __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0);
+
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+
+ iValue = _mm256_mul_ps(iValue, vScalar);
+
+ iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO);
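+ // Convert to int32, saturate-pack to int16 within each 128-bit lane, then
+ // permute the valid shorts into the low 128 bits before storing.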
+ a = _mm256_cvtps_epi32(iValue);
+ a = _mm256_packs_epi32(a, a);
+ a = _mm256_permutevar8x32_epi32(a, idx);
+ b = _mm256_extracti128_si256(a, 0);
+
+ _mm_store_si128((__m128i*)iBufferPtr, b);
+ iBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ iBufferPtr = &iBuffer[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+ complexVectorPtr++;
+ }
+}
+
+
+#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_32fc_s32f_deinterleave_real_16i_a_sse(int16_t* iBuffer,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+
+ __m128 cplxValue1, cplxValue2, iValue;
+
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+
+ iValue = _mm_mul_ps(iValue, vScalar);
+
+ _mm_store_ps(floatBuffer, iValue);
+ *iBufferPtr++ = (int16_t)(floatBuffer[0]);
+ *iBufferPtr++ = (int16_t)(floatBuffer[1]);
+ *iBufferPtr++ = (int16_t)(floatBuffer[2]);
+ *iBufferPtr++ = (int16_t)(floatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ iBufferPtr = &iBuffer[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+ complexVectorPtr++;
+ }
+}
+
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_s32f_deinterleave_real_16i_generic(int16_t* iBuffer,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ unsigned int number = 0;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+ complexVectorPtr++;
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_a_H */
+
+#ifndef INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H
+#define INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_32fc_s32f_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+
+ __m256 cplxValue1, cplxValue2, iValue;
+ __m256i a;
+ __m128i b;
+
+ __m256i idx = _mm256_set_epi32(3, 3, 3, 3, 5, 1, 4, 0);
+
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm256_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+
+ iValue = _mm256_mul_ps(iValue, vScalar);
+
+ iValue = _mm256_round_ps(iValue, _MM_FROUND_TO_ZERO);
+ a = _mm256_cvtps_epi32(iValue);
+ a = _mm256_packs_epi32(a, a);
+ a = _mm256_permutevar8x32_epi32(a, idx);
+ b = _mm256_extracti128_si256(a, 0);
+
+ _mm_storeu_si128((__m128i*)iBufferPtr, b);
+ iBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ iBufferPtr = &iBuffer[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (int16_t)(*complexVectorPtr++ * scalar);
+ complexVectorPtr++;
+ }
+}
+
+#endif /* LV_HAVE_AVX2 */
+
+#endif /* INCLUDED_volk_32fc_s32f_deinterleave_real_16i_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_s32f_magnitude_16i
+ *
+ * \b Overview
+ *
+ * Calculates the magnitude of the complexVector and stores the
+ * results in the magnitudeVector. The results are scaled and
+ * converted into 16-bit shorts.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_s32f_magnitude_16i(int16_t* magnitudeVector, const lv_32fc_t*
+ * complexVector, const float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li scalar: The value multiplied against each computed magnitude before conversion.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li magnitudeVector: The output value as 16-bit shorts.
+ *
+ * \b Example
+ * Generate points around the unit circle and map them to integers with
+ * magnitude 50 to preserve smallest deltas.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
+ * float scale = 50.f;
+ *
+ * for(unsigned int ii = 0; ii < N/2; ++ii){
+ * // Generate points around the unit circle
+ * float real = -4.f * ((float)ii / (float)N) + 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii+N/2] = lv_cmake(-real, -imag);
+ * }
+ *
+ * volk_32fc_s32f_magnitude_16i(out, in, scale, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %i\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifdef LV_HAVE_GENERIC
+#include <volk/volk_common.h>
+
+static inline void volk_32fc_s32f_magnitude_16i_generic(int16_t* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ const float* complexVectorPtr = (float*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+ unsigned int number = 0;
+ for (number = 0; number < num_points; number++) {
+ __VOLK_VOLATILE float real = *complexVectorPtr++;
+ __VOLK_VOLATILE float imag = *complexVectorPtr++;
+ real *= real;
+ imag *= imag;
+ *magnitudeVectorPtr++ = (int16_t)rintf(scalar * sqrtf(real + imag));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
+#define INCLUDED_volk_32fc_s32f_magnitude_16i_a_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32fc_s32f_magnitude_16i_a_avx2(int16_t* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (const float*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
+ __m256 cplxValue1, cplxValue2, result;
+ __m256i resultInt;
+ __m128i resultShort;
+
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm256_sqrt_ps(result);
+
+ result = _mm256_mul_ps(result, vScalar);
+
+ resultInt = _mm256_cvtps_epi32(result);
+ resultInt = _mm256_packs_epi32(resultInt, resultInt);
+ resultInt = _mm256_permutevar8x32_epi32(
+ resultInt, idx); // permute to compensate for shuffling in hadd and packs
+ resultShort = _mm256_extracti128_si256(resultInt, 0);
+ _mm_store_si128((__m128i*)magnitudeVectorPtr, resultShort);
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ volk_32fc_s32f_magnitude_16i_generic(
+ magnitudeVector + number, complexVector + number, scalar, num_points - number);
+}
+#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+static inline void volk_32fc_s32f_magnitude_16i_a_sse3(int16_t* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (const float*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+
+ __m128 cplxValue1, cplxValue2, result;
+
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result);
+
+ result = _mm_mul_ps(result, vScalar);
+
+ _mm_store_ps(floatBuffer, result);
+ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ volk_32fc_s32f_magnitude_16i_generic(
+ magnitudeVector + number, complexVector + number, scalar, num_points - number);
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32fc_s32f_magnitude_16i_a_sse(int16_t* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* complexVectorPtr = (const float*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+ __m128 vScalar = _mm_set_ps1(scalar);
+
+ __m128 cplxValue1, cplxValue2, result;
+ __m128 iValue, qValue;
+
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ cplxValue1 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ cplxValue2 = _mm_load_ps(complexVectorPtr);
+ complexVectorPtr += 4;
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+
+ __VOLK_VOLATILE __m128 iValue2 =
+ _mm_mul_ps(iValue, iValue); // Square the I values
+ __VOLK_VOLATILE __m128 qValue2 =
+ _mm_mul_ps(qValue, qValue); // Square the Q Values
+
+ result = _mm_add_ps(iValue2, qValue2); // Add the I2 and Q2 values
+
+ result = _mm_sqrt_ps(result);
+
+ result = _mm_mul_ps(result, vScalar);
+
+ _mm_store_ps(floatBuffer, result);
+ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
+ *magnitudeVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
+ }
+
+ number = quarterPoints * 4;
+ volk_32fc_s32f_magnitude_16i_generic(
+ magnitudeVector + number, complexVector + number, scalar, num_points - number);
+}
+#endif /* LV_HAVE_SSE */
+
+
+#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_a_H */
+
+#ifndef INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
+#define INCLUDED_volk_32fc_s32f_magnitude_16i_u_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32fc_s32f_magnitude_16i_u_avx2(int16_t* magnitudeVector,
+ const lv_32fc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (const float*)complexVector;
+ int16_t* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
+ __m256 cplxValue1, cplxValue2, result;
+ __m256i resultInt;
+ __m128i resultShort;
+
+ for (; number < eighthPoints; number++) {
+ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+
+ result = _mm256_sqrt_ps(result);
+
+ result = _mm256_mul_ps(result, vScalar);
+
+ resultInt = _mm256_cvtps_epi32(result);
+ resultInt = _mm256_packs_epi32(resultInt, resultInt);
+ resultInt = _mm256_permutevar8x32_epi32(
+ resultInt, idx); // permute to compensate for shuffling in hadd and packs
+ resultShort = _mm256_extracti128_si256(resultInt, 0);
+ _mm_storeu_si128((__m128i*)magnitudeVectorPtr, resultShort);
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ volk_32fc_s32f_magnitude_16i_generic(
+ magnitudeVector + number, complexVector + number, scalar, num_points - number);
+}
+#endif /* LV_HAVE_AVX2 */
+
+#endif /* INCLUDED_volk_32fc_s32f_magnitude_16i_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_s32f_power_32fc
+ *
+ * \b Overview
+ *
+ * Raises each value of the input complex vector to the specified power
+ * and stores the results in the return vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_s32f_power_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
+ * float power, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: The complex input vector.
+ * \li power: The power value to be applied to each data point.
+ * \li num_points: The number of samples.
+ *
+ * \b Outputs
+ * \li cVector: The complex output vector.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * in[ii] = lv_cmake((float)ii, (float)ii);
+ * }
+ *
+ * volk_32fc_s32f_power_32fc(out, in, 2.f, N);
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_s32f_power_32fc_a_H
+#define INCLUDED_volk_32fc_s32f_power_32fc_a_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+
+//! raise a complex float to a real float power
+static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp,
+ const float power)
+{
+ const float arg = power * atan2f(lv_creal(exp), lv_cimag(exp));
+ const float mag =
+ powf(lv_creal(exp) * lv_creal(exp) + lv_cimag(exp) * lv_cimag(exp), power / 2);
+ return mag * lv_cmake(-cosf(arg), sinf(arg));
+}
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+static inline void volk_32fc_s32f_power_32fc_a_sse(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float power,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 vPower = _mm_set_ps1(power);
+
+ __m128 cplxValue1, cplxValue2, magnitude, phase, iValue, qValue;
+ for (; number < quarterPoints; number++) {
+
+ cplxValue1 = _mm_load_ps((float*)aPtr);
+ aPtr += 2;
+
+ cplxValue2 = _mm_load_ps((float*)aPtr);
+ aPtr += 2;
+
+ // Convert to polar coordinates
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ // Arrange in q1q2q3q4 format
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+
+ phase = atan2f4(qValue, iValue); // Calculate the Phase
+
+ magnitude = _mm_sqrt_ps(
+ _mm_add_ps(_mm_mul_ps(iValue, iValue),
+ _mm_mul_ps(qValue, qValue))); // Calculate the magnitude by square
+ // rooting the added I2 and Q2 values
+
+ // Now calculate the power of the polar coordinate data
+ magnitude = powf4(magnitude, vPower); // Take the magnitude to the specified power
+
+ phase = _mm_mul_ps(phase, vPower); // Multiply the phase by the specified power
+
+ // Convert back to cartesian coordinates
+ iValue = _mm_mul_ps(cosf4(phase),
+ magnitude); // Multiply the cos of the phase by the magnitude
+ qValue = _mm_mul_ps(sinf4(phase),
+ magnitude); // Multiply the sin of the phase by the magnitude
+
+ cplxValue1 =
+ _mm_unpacklo_ps(iValue, qValue); // Interleave the lower two i & q values
+ cplxValue2 =
+ _mm_unpackhi_ps(iValue, qValue); // Interleave the upper two i & q values
+
+ _mm_store_ps((float*)cPtr,
+ cplxValue1); // Store the results back into the C container
+
+ cPtr += 2;
+
+ _mm_store_ps((float*)cPtr,
+ cplxValue2); // Store the results back into the C container
+
+ cPtr += 2;
+ }
+
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+ for (; number < num_points; number++) {
+ *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const float power,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_s32f_power_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2020 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+
+#ifndef INCLUDED_volk_32fc_s32f_power_spectral_densitypuppet_32f_a_H
+#define INCLUDED_volk_32fc_s32f_power_spectral_densitypuppet_32f_a_H
+
+
+#include <volk/volk_32fc_s32f_x2_power_spectral_density_32f.h>
+
+
+#ifdef LV_HAVE_AVX
+
+static inline void
+volk_32fc_s32f_power_spectral_densitypuppet_32f_a_avx(float* logPowerOutput,
+ const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor,
+ unsigned int num_points)
+{
+ volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(
+ logPowerOutput, complexFFTInput, normalizationFactor, 2.5, num_points);
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE3
+
+static inline void
+volk_32fc_s32f_power_spectral_densitypuppet_32f_a_sse3(float* logPowerOutput,
+ const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor,
+ unsigned int num_points)
+{
+ volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(
+ logPowerOutput, complexFFTInput, normalizationFactor, 2.5, num_points);
+}
+
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_s32f_power_spectral_densitypuppet_32f_generic(float* logPowerOutput,
+ const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor,
+ unsigned int num_points)
+{
+ volk_32fc_s32f_x2_power_spectral_density_32f_generic(
+ logPowerOutput, complexFFTInput, normalizationFactor, 2.5, num_points);
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_s32f_power_spectral_densitypuppet_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_s32f_power_spectrum_32f
+ *
+ * \b Overview
+ *
+ * Calculates the log10 power value for each input point.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_s32f_power_spectrum_32f(float* logPowerOutput, const lv_32fc_t*
+ * complexFFTInput, const float normalizationFactor, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexFFTInput: The complex output of the FFT.
+ * \li normalizationFactor: Each input value is divided by this value before the
+ * power is calculated.
+ * \li num_points: The number of FFT data points.
+ *
+ * \b Outputs
+ * \li logPowerOutput: The 10.0 * log10(r*r + i*i) for each data point.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * // ... fill in with the N complex outputs of an FFT ...
+ *
+ * volk_32fc_s32f_power_spectrum_32f(out, in, (float)N, N);
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H
+#define INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_s32f_power_spectrum_32f_generic(float* logPowerOutput,
+ const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor,
+ unsigned int num_points)
+{
+ // Calculate the Power of the complex point
+ const float normFactSq = 1.0 / (normalizationFactor * normalizationFactor);
+
+ // Calculate dBm
+ // 50 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+ // 75 ohm load assumption
+    // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 6.67)
+
+ /*
+ * For generic reference, the code below is a volk-optimized
+ * approach that also leverages a faster log2 calculation
+ * to calculate the log10:
+ * n*log10(x) = n*log2(x)/log2(10) = (n/log2(10)) * log2(x)
+ *
+ * Generic code:
+ *
+ * const float real = *inputPtr++ * iNormalizationFactor;
+ * const float imag = *inputPtr++ * iNormalizationFactor;
+ * realFFTDataPointsPtr = 10.0*log10f(((real * real) + (imag * imag)) + 1e-20);
+ * realFFTDataPointsPtr++;
+ *
+ */
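+
+    // volk_log2to10factor used below is this (10 / log2(10)) constant,
+    // i.e. 10 * log10(2), roughly 3.0103.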
+
+ // Calc mag^2
+ volk_32fc_magnitude_squared_32f(logPowerOutput, complexFFTInput, num_points);
+
+ // Finish ((real * real) + (imag * imag)) calculation:
+ volk_32f_s32f_multiply_32f(logPowerOutput, logPowerOutput, normFactSq, num_points);
+
+ // The following calculates 10*log10(x) = 10*log2(x)/log2(10) = (10/log2(10))
+ // * log2(x)
+ volk_32f_log2_32f(logPowerOutput, logPowerOutput, num_points);
+ volk_32f_s32f_multiply_32f(
+ logPowerOutput, logPowerOutput, volk_log2to10factor, num_points);
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+static inline void
+volk_32fc_s32f_power_spectrum_32f_a_sse3(float* logPowerOutput,
+ const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor,
+ unsigned int num_points)
+{
+ const float* inputPtr = (const float*)complexFFTInput;
+ float* destPtr = logPowerOutput;
+ uint64_t number = 0;
+ const float iNormalizationFactor = 1.0 / normalizationFactor;
+#ifdef LV_HAVE_LIB_SIMDMATH
+ __m128 magScalar = _mm_set_ps1(10.0);
+ magScalar = _mm_div_ps(magScalar, logf4(magScalar));
+
+ __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+
+ __m128 power;
+ __m128 input1, input2;
+ const uint64_t quarterPoints = num_points / 4;
+ for (; number < quarterPoints; number++) {
+ // Load the complex values
+ input1 = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ input2 = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+
+ // Apply the normalization factor
+ input1 = _mm_mul_ps(input1, invNormalizationFactor);
+ input2 = _mm_mul_ps(input2, invNormalizationFactor);
+
+ // Multiply each value by itself
+ // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+ input1 = _mm_mul_ps(input1, input1);
+ // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+ input2 = _mm_mul_ps(input2, input2);
+
+ // Horizontal add, to add (r*r) + (i*i) for each complex value
+ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+ power = _mm_hadd_ps(input1, input2);
+
+ // Calculate the natural log power
+ power = logf4(power);
+
+ // Convert to log10 and multiply by 10.0
+ power = _mm_mul_ps(power, magScalar);
+
+ // Store the floating point results
+ _mm_store_ps(destPtr, power);
+
+ destPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+ // Calculate the FFT for any remaining points
+
+ for (; number < num_points; number++) {
+ // Calculate dBm
+ // 50 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+ // 75 ohm load assumption
+        // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 6.67)
+
+ const float real = *inputPtr++ * iNormalizationFactor;
+ const float imag = *inputPtr++ * iNormalizationFactor;
+
+ *destPtr = volk_log2to10factor * log2f_non_ieee(((real * real) + (imag * imag)));
+
+ destPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+#include <volk/volk_neon_intrinsics.h>
+
+static inline void
+volk_32fc_s32f_power_spectrum_32f_neon(float* logPowerOutput,
+ const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor,
+ unsigned int num_points)
+{
+ float* logPowerOutputPtr = logPowerOutput;
+ const lv_32fc_t* complexFFTInputPtr = complexFFTInput;
+ const float iNormalizationFactor = 1.0 / normalizationFactor;
+ unsigned int number;
+ unsigned int quarter_points = num_points / 4;
+ float32x4x2_t fft_vec;
+ float32x4_t log_pwr_vec;
+ float32x4_t mag_squared_vec;
+
+ const float inv_ln10_10 = 4.34294481903f; // 10.0/ln(10.)
+
+ for (number = 0; number < quarter_points; number++) {
+ // Load
+ fft_vec = vld2q_f32((float*)complexFFTInputPtr);
+ // Prefetch next 4
+ __VOLK_PREFETCH(complexFFTInputPtr + 4);
+ // Normalize
+ fft_vec.val[0] = vmulq_n_f32(fft_vec.val[0], iNormalizationFactor);
+ fft_vec.val[1] = vmulq_n_f32(fft_vec.val[1], iNormalizationFactor);
+ mag_squared_vec = _vmagnitudesquaredq_f32(fft_vec);
+ log_pwr_vec = vmulq_n_f32(_vlogq_f32(mag_squared_vec), inv_ln10_10);
+ // Store
+ vst1q_f32(logPowerOutputPtr, log_pwr_vec);
+ // Move pointers ahead
+ complexFFTInputPtr += 4;
+ logPowerOutputPtr += 4;
+ }
+
+ // deal with the rest
+ for (number = quarter_points * 4; number < num_points; number++) {
+ const float real = lv_creal(*complexFFTInputPtr) * iNormalizationFactor;
+ const float imag = lv_cimag(*complexFFTInputPtr) * iNormalizationFactor;
+
+ *logPowerOutputPtr =
+ volk_log2to10factor * log2f_non_ieee(((real * real) + (imag * imag)));
+ complexFFTInputPtr++;
+ logPowerOutputPtr++;
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+#endif /* INCLUDED_volk_32fc_s32f_power_spectrum_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_s32f_x2_power_spectral_density_32f
+ *
+ * \b Overview
+ *
+ * Calculates the log10 power value divided by the RBW for each input point.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_s32f_x2_power_spectral_density_32f(float* logPowerOutput, const
+ * lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned
+ * int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexFFTInput: The complex output of the FFT.
+ * \li normalizationFactor: Each input value is divided by this value before the
+ * power is calculated.
+ * \li rbw: The resolution bandwidth of the FFT spectrum.
+ * \li num_points: The number of FFT data points.
+ *
+ * \b Outputs
+ * \li logPowerOutput: The 10.0 * log10((r*r + i*i)/RBW) for each data point.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float rbw = 1.0f;
+ *
+ * // ... fill in with the N complex outputs of an FFT ...
+ *
+ * volk_32fc_s32f_x2_power_spectral_density_32f(out, in, (float)N, rbw, N);
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H
+#define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H
+
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+static inline void
+volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float* logPowerOutput,
+ const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor,
+ const float rbw,
+ unsigned int num_points)
+{
+ const float* inputPtr = (const float*)complexFFTInput;
+ float* destPtr = logPowerOutput;
+ uint64_t number = 0;
+ const float iRBW = 1.0 / rbw;
+ const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+ __m256 magScalar = _mm256_set1_ps(10.0);
+ magScalar = _mm256_div_ps(magScalar, logf4(magScalar));
+
+ __m256 invRBW = _mm256_set1_ps(iRBW);
+
+ __m256 invNormalizationFactor = _mm256_set1_ps(iNormalizationFactor);
+
+ __m256 power;
+    __m256 input1, input2, inputVal1, inputVal2;
+ const uint64_t eighthPoints = num_points / 8;
+ for (; number < eighthPoints; number++) {
+ // Load the complex values
+ input1 = _mm256_load_ps(inputPtr);
+ inputPtr += 8;
+ input2 = _mm256_load_ps(inputPtr);
+ inputPtr += 8;
+
+ // Apply the normalization factor
+ input1 = _mm256_mul_ps(input1, invNormalizationFactor);
+ input2 = _mm256_mul_ps(input2, invNormalizationFactor);
+
+ // Multiply each value by itself
+ // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+ input1 = _mm256_mul_ps(input1, input1);
+ // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+ input2 = _mm256_mul_ps(input2, input2);
+
+ // Horizontal add, to add (r*r) + (i*i) for each complex value
+ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
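+        // Note: _mm256_hadd_ps adds pairs within each 128-bit lane, so the two
+        // lane permutes below put matching halves together and keep the sums in
+        // input order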
+ inputVal1 = _mm256_permute2f128_ps(input1, input2, 0x20);
+ inputVal2 = _mm256_permute2f128_ps(input1, input2, 0x31);
+
+ power = _mm256_hadd_ps(inputVal1, inputVal2);
+
+ // Divide by the rbw
+ power = _mm256_mul_ps(power, invRBW);
+
+ // Calculate the natural log power
+ power = logf4(power);
+
+ // Convert to log10 and multiply by 10.0
+ power = _mm256_mul_ps(power, magScalar);
+
+ // Store the floating point results
+ _mm256_store_ps(destPtr, power);
+
+ destPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+ // Calculate the FFT for any remaining points
+ for (; number < num_points; number++) {
+ // Calculate dBm
+ // 50 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+ // 75 ohm load assumption
+        // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 6.67)
+
+ const float real = *inputPtr++ * iNormalizationFactor;
+ const float imag = *inputPtr++ * iNormalizationFactor;
+
+ *destPtr = volk_log2to10factor *
+ log2f_non_ieee((((real * real) + (imag * imag))) * iRBW);
+ destPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+static inline void
+volk_32fc_s32f_x2_power_spectral_density_32f_a_sse3(float* logPowerOutput,
+ const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor,
+ const float rbw,
+ unsigned int num_points)
+{
+ const float* inputPtr = (const float*)complexFFTInput;
+ float* destPtr = logPowerOutput;
+ uint64_t number = 0;
+ const float iRBW = 1.0 / rbw;
+ const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+ __m128 magScalar = _mm_set_ps1(10.0);
+ magScalar = _mm_div_ps(magScalar, logf4(magScalar));
+
+ __m128 invRBW = _mm_set_ps1(iRBW);
+
+ __m128 invNormalizationFactor = _mm_set_ps1(iNormalizationFactor);
+
+ __m128 power;
+ __m128 input1, input2;
+ const uint64_t quarterPoints = num_points / 4;
+ for (; number < quarterPoints; number++) {
+ // Load the complex values
+ input1 = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+ input2 = _mm_load_ps(inputPtr);
+ inputPtr += 4;
+
+ // Apply the normalization factor
+ input1 = _mm_mul_ps(input1, invNormalizationFactor);
+ input2 = _mm_mul_ps(input2, invNormalizationFactor);
+
+ // Multiply each value by itself
+ // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+ input1 = _mm_mul_ps(input1, input1);
+ // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+ input2 = _mm_mul_ps(input2, input2);
+
+ // Horizontal add, to add (r*r) + (i*i) for each complex value
+ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+ power = _mm_hadd_ps(input1, input2);
+
+ // Divide by the rbw
+ power = _mm_mul_ps(power, invRBW);
+
+ // Calculate the natural log power
+ power = logf4(power);
+
+ // Convert to log10 and multiply by 10.0
+ power = _mm_mul_ps(power, magScalar);
+
+ // Store the floating point results
+ _mm_store_ps(destPtr, power);
+
+ destPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+ // Calculate the FFT for any remaining points
+ for (; number < num_points; number++) {
+ // Calculate dBm
+ // 50 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+ // 75 ohm load assumption
+        // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 6.67)
+
+ const float real = *inputPtr++ * iNormalizationFactor;
+ const float imag = *inputPtr++ * iNormalizationFactor;
+
+ *destPtr = volk_log2to10factor *
+ log2f_non_ieee((((real * real) + (imag * imag))) * iRBW);
+ destPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput,
+ const lv_32fc_t* complexFFTInput,
+ const float normalizationFactor,
+ const float rbw,
+ unsigned int num_points)
+{
+ if (rbw != 1.0)
+ volk_32fc_s32f_power_spectrum_32f(
+ logPowerOutput, complexFFTInput, normalizationFactor * sqrt(rbw), num_points);
+ else
+ volk_32fc_s32f_power_spectrum_32f(
+ logPowerOutput, complexFFTInput, normalizationFactor, num_points);
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_s32fc_multiply_32fc
+ *
+ * \b Overview
+ *
+ * Multiplies the input complex vector by a complex scalar and returns
+ * the results.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_s32fc_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
+ * lv_32fc_t scalar, unsigned int num_points); \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector to be multiplied.
+ * \li scalar: The complex scalar to multiply against aVector.
+ * \li num_points: The number of complex values in aVector.
+ *
+ * \b Outputs
+ * \li cVector: The vector where the results will be stored.
+ *
+ * \b Example
+ * Generate points around the unit circle and shift the phase pi/3 rad.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t scalar = lv_cmake((float)std::cos(M_PI/3.f), (float)std::sin(M_PI/3.f));
+ *
+ * float delta = 2.f*M_PI / (float)N;
+ * for(unsigned int ii = 0; ii < N/2; ++ii){
+ * // Generate points around the unit circle
+ * float real = std::cos(delta * (float)ii);
+ * float imag = std::sin(delta * (float)ii);
+ * in[ii] = lv_cmake(real, imag);
+ * in[ii+N/2] = lv_cmake(-real, -imag);
+ * }
+ *
+ * volk_32fc_s32fc_multiply_32fc(out, in, scalar, N);
+ *
+ * printf(" mag phase | mag phase\n");
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("%+1.2f %+1.2f | %+1.2f %+1.2f\n",
+ * std::abs(in[ii]), std::arg(in[ii]),
+ * std::abs(out[ii]), std::arg(out[ii]));
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
+#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#if LV_HAVE_AVX && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int isodd = num_points & 3;
+ __m256 x, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // Set up constant scalar vector
+ yl = _mm256_set1_ps(lv_creal(scalar));
+ yh = _mm256_set1_ps(lv_cimag(scalar));
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+ tmp1 = x;
+
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_fmaddsub_ps(
+ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
+
+ a += 4;
+ c += 4;
+ }
+
+ for (i = num_points - isodd; i < num_points; i++) {
+ *c++ = (*a++) * scalar;
+ }
+}
+#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int isodd = num_points & 3;
+ __m256 x, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // Set up constant scalar vector
+ yl = _mm256_set1_ps(lv_creal(scalar));
+ yh = _mm256_set1_ps(lv_cimag(scalar));
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
+
+ a += 4;
+ c += 4;
+ }
+
+ for (i = num_points - isodd; i < num_points; i++) {
+ *c++ = (*a++) * scalar;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+static inline void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // Set up constant scalar vector
+ yl = _mm_set_ps1(lv_creal(scalar));
+ yh = _mm_set_ps1(lv_cimag(scalar));
+
+ for (; number < halfPoints; number++) {
+
+ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm_storeu_ps((float*)c, z); // Store the results back into the C container
+
+ a += 2;
+ c += 2;
+ }
+
+ if ((num_points % 2) != 0) {
+ *c = (*a) * scalar;
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = num_points;
+
+ // unwrap loop
+ while (number >= 8) {
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ number -= 8;
+ }
+
+ // clean up any remaining
+ while (number-- > 0)
+ *cPtr++ = *aPtr++ * scalar;
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H */
+#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
+#define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#if LV_HAVE_AVX && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int isodd = num_points & 3;
+ __m256 x, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // Set up constant scalar vector
+ yl = _mm256_set1_ps(lv_creal(scalar));
+ yh = _mm256_set1_ps(lv_cimag(scalar));
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+ tmp1 = x;
+
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_fmaddsub_ps(
+ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm256_store_ps((float*)c, z); // Store the results back into the C container
+
+ a += 4;
+ c += 4;
+ }
+
+ for (i = num_points - isodd; i < num_points; i++) {
+ *c++ = (*a++) * scalar;
+ }
+}
+#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int isodd = num_points & 3;
+ __m256 x, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // Set up constant scalar vector
+ yl = _mm256_set1_ps(lv_creal(scalar));
+ yh = _mm256_set1_ps(lv_cimag(scalar));
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm256_store_ps((float*)c, z); // Store the results back into the C container
+
+ a += 4;
+ c += 4;
+ }
+
+ for (i = num_points - isodd; i < num_points; i++) {
+ *c++ = (*a++) * scalar;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+static inline void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // Set up constant scalar vector
+ yl = _mm_set_ps1(lv_creal(scalar));
+ yh = _mm_set_ps1(lv_cimag(scalar));
+
+ for (; number < halfPoints; number++) {
+
+ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+
+ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm_store_ps((float*)c, z); // Store the results back into the C container
+
+ a += 2;
+ c += 2;
+ }
+
+ if ((num_points % 2) != 0) {
+ *c = (*a) * scalar;
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = num_points;
+ unsigned int quarter_points = num_points / 4;
+
+ float32x4x2_t a_val, scalar_val;
+ float32x4x2_t tmp_imag;
+
+ scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
+ scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)aPtr);
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]);
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]);
+
+ tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]);
+ tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], scalar_val.val[1]);
+
+ vst2q_f32((float*)cPtr, tmp_imag);
+ aPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cPtr++ = *aPtr++ * scalar;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ unsigned int number = num_points;
+
+ // unwrap loop
+ while (number >= 8) {
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ *cPtr++ = (*aPtr++) * scalar;
+ number -= 8;
+ }
+
+ // clean up any remaining
+ while (number-- > 0)
+ *cPtr++ = *aPtr++ * scalar;
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+
+#ifndef INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H
+#define INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H
+
+
+#include <stdio.h>
+#include <volk/volk_32fc_s32fc_x2_rotator_32fc.h>
+#include <volk/volk_complex.h>
+
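+/*
+ * Note: this "puppet" wrapper only exists so the generic QA machinery can drive
+ * volk_32fc_s32fc_x2_rotator_32fc; it supplies a fixed initial phase and
+ * normalizes phase_inc to unit magnitude before delegating.
+ */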
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_generic(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3f, 0.95393f) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ const lv_32fc_t phase_inc_n =
+ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+ volk_32fc_s32fc_x2_rotator_32fc_generic(
+ outVector, inVector, phase_inc_n, phase, num_points);
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+#include <volk/volk_neon_intrinsics.h>
+
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_neon(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3f, 0.95393f) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ const lv_32fc_t phase_inc_n =
+ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+ volk_32fc_s32fc_x2_rotator_32fc_neon(
+ outVector, inVector, phase_inc_n, phase, num_points);
+}
+
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_sse4_1(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ const lv_32fc_t phase_inc_n =
+ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+ volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(
+ outVector, inVector, phase_inc_n, phase, num_points);
+}
+
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_sse4_1(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ const lv_32fc_t phase_inc_n =
+ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+ volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(
+ outVector, inVector, phase_inc_n, phase, num_points);
+}
+
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ const lv_32fc_t phase_inc_n =
+ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+ volk_32fc_s32fc_x2_rotator_32fc_a_avx(
+ outVector, inVector, phase_inc_n, phase, num_points);
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ const lv_32fc_t phase_inc_n =
+ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+ volk_32fc_s32fc_x2_rotator_32fc_u_avx(
+ outVector, inVector, phase_inc_n, phase, num_points);
+}
+
+#endif /* LV_HAVE_AVX */
+
+#if LV_HAVE_AVX && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_a_avx_fma(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ const lv_32fc_t phase_inc_n =
+ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+ volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(
+ outVector, inVector, phase_inc_n, phase, num_points);
+}
+
+#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
+
+
+#if LV_HAVE_AVX && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_rotatorpuppet_32fc_u_avx_fma(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ unsigned int num_points)
+{
+ lv_32fc_t phase[1] = { lv_cmake(.3f, .95393f) };
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ const lv_32fc_t phase_inc_n =
+ phase_inc / hypotf(lv_creal(phase_inc), lv_cimag(phase_inc));
+ volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(
+ outVector, inVector, phase_inc_n, phase, num_points);
+}
+
+#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
+
+#endif /* INCLUDED_volk_32fc_s32fc_rotatorpuppet_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_s32fc_x2_rotator_32fc
+ *
+ * \b Overview
+ *
+ * Rotate input vector at fixed rate per sample from initial phase
+ * offset.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_s32fc_x2_rotator_32fc(lv_32fc_t* outVector, const lv_32fc_t* inVector,
+ * const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inVector: Vector to be rotated.
+ * \li phase_inc: rotational velocity.
+ * \li phase: initial phase offset.
+ * \li num_points: The number of values in inVector to be rotated and stored into
+ * outVector.
+ *
+ * \b Outputs
+ * \li outVector: The vector where the results will be stored.
+ *
+ * \b Example
+ * Generate a tone at f=0.3 (normalized frequency) and use the rotator with
+ * f=0.1 to shift the tone to f=0.4. Change this example to start with a DC
+ * tone (initialize in with lv_cmake(1, 0)) to observe rotator signal generation.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * // Generate a tone at f=0.3
+ * float real = std::cos(0.3f * (float)ii);
+ * float imag = std::sin(0.3f * (float)ii);
+ * in[ii] = lv_cmake(real, imag);
+ * }
+ * // The oscillator rotates at f=0.1
+ * float frequency = 0.1f;
+ * lv_32fc_t phase_increment = lv_cmake(std::cos(frequency), std::sin(frequency));
+ * lv_32fc_t phase = lv_cmake(1.f, 0.0f); // start at 1 (0 rad phase)
+ *
+ * // rotate so the output is a tone at f=0.4
+ * volk_32fc_s32fc_x2_rotator_32fc(out, in, phase_increment, &phase, N);
+ *
+ * // print results for inspection
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %+1.2f %+1.2fj\n",
+ * ii, lv_creal(out[ii]), lv_cimag(out[ii]));
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
+#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
+
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <volk/volk_complex.h>
+#define ROTATOR_RELOAD 512
+#define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2)
+#define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4)
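+
+/*
+ * The kernels below renormalize the running phase every ROTATOR_RELOAD samples
+ * (ROTATOR_RELOAD_2 / ROTATOR_RELOAD_4 SIMD iterations) so that accumulated
+ * floating-point rounding error cannot pull its magnitude away from 1.0.
+ */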
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
+ unsigned int i = 0;
+ int j = 0;
+ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
+ for (j = 0; j < ROTATOR_RELOAD; ++j) {
+ *outVector++ = *inVector++ * (*phase);
+ (*phase) *= phase_inc;
+ }
+
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ }
+ for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
+ *outVector++ = *inVector++ * (*phase);
+ (*phase) *= phase_inc;
+ }
+ if (i) {
+ // Make sure, we normalize phase on every call!
+ (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+#include <volk/volk_neon_intrinsics.h>
+
+static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+
+{
+ lv_32fc_t* outputVectorPtr = outVector;
+ const lv_32fc_t* inputVectorPtr = inVector;
+ lv_32fc_t incr = 1;
+ lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
+ float32x4x2_t input_vec;
+ float32x4x2_t output_vec;
+
+ unsigned int i = 0, j = 0;
+ // const unsigned int quarter_points = num_points / 4;
+
+ for (i = 0; i < 4; ++i) {
+ phasePtr[i] *= incr;
+ incr *= (phase_inc);
+ }
+
+    // Notice that incr has been incremented in the previous loop
+ const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
+ const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
+ float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr);
+
+ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
+ for (j = 0; j < ROTATOR_RELOAD_4; j++) {
+ input_vec = vld2q_f32((float*)inputVectorPtr);
+ // Prefetch next one, speeds things up
+ __VOLK_PREFETCH(inputVectorPtr + 4);
+ // Rotate
+ output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
+ // Increase phase
+ phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
+ // Store output
+ vst2q_f32((float*)outputVectorPtr, output_vec);
+
+ outputVectorPtr += 4;
+ inputVectorPtr += 4;
+ }
+ // normalize phase so magnitude doesn't grow because of
+ // floating point rounding error
+ const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
+ const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
+ // Multiply complex with real
+ phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
+ phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
+ }
+
+ for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; i++) {
+ input_vec = vld2q_f32((float*)inputVectorPtr);
+ // Prefetch next one, speeds things up
+ __VOLK_PREFETCH(inputVectorPtr + 4);
+ // Rotate
+ output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
+ // Increase phase
+ phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
+ // Store output
+ vst2q_f32((float*)outputVectorPtr, output_vec);
+
+ outputVectorPtr += 4;
+ inputVectorPtr += 4;
+ }
+ // if(i) == true means we looped above
+ if (i) {
+ // normalize phase so magnitude doesn't grow because of
+ // floating point rounding error
+ const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
+ const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
+ // Multiply complex with real
+ phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
+ phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
+ }
+ // Store current phase
+ vst2q_f32((float*)phasePtr, phase_vec);
+
+ // Deal with the rest
+ for (i = 0; i < num_points % 4; i++) {
+ *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
+ phasePtr[0] *= (phase_inc);
+ }
+
+ // For continuous phase next time we need to call this function
+ (*phase) = phasePtr[0];
+}
+
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = 1;
+ lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
+
+ unsigned int i, j = 0;
+
+ for (i = 0; i < 2; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (phase_inc);
+ }
+
+ __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+ phase_Val = _mm_loadu_ps((float*)phase_Ptr);
+ inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
+
+ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
+ for (j = 0; j < ROTATOR_RELOAD_2; ++j) {
+
+ aVal = _mm_load_ps((float*)aPtr);
+
+ yl = _mm_moveldup_ps(phase_Val);
+ yh = _mm_movehdup_ps(phase_Val);
+ ylp = _mm_moveldup_ps(inc_Val);
+ yhp = _mm_movehdup_ps(inc_Val);
+
+ tmp1 = _mm_mul_ps(aVal, yl);
+ tmp1p = _mm_mul_ps(phase_Val, ylp);
+
+ aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm_mul_ps(aVal, yh);
+ tmp2p = _mm_mul_ps(phase_Val, yhp);
+
+ z = _mm_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
+
+ _mm_store_ps((float*)cPtr, z);
+
+ aPtr += 2;
+ cPtr += 2;
+ }
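+        // Renormalize the phase accumulator: square and horizontally add to get
+        // |phase|^2 per complex value, duplicate it into the real and imaginary
+        // slots, then take the square root and divide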
+ tmp1 = _mm_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm_sqrt_ps(tmp1);
+ phase_Val = _mm_div_ps(phase_Val, tmp2);
+ }
+ for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
+ aVal = _mm_load_ps((float*)aPtr);
+
+ yl = _mm_moveldup_ps(phase_Val);
+ yh = _mm_movehdup_ps(phase_Val);
+ ylp = _mm_moveldup_ps(inc_Val);
+ yhp = _mm_movehdup_ps(inc_Val);
+
+ tmp1 = _mm_mul_ps(aVal, yl);
+
+ tmp1p = _mm_mul_ps(phase_Val, ylp);
+
+ aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm_mul_ps(aVal, yh);
+ tmp2p = _mm_mul_ps(phase_Val, yhp);
+
+ z = _mm_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
+
+ _mm_store_ps((float*)cPtr, z);
+
+ aPtr += 2;
+ cPtr += 2;
+ }
+ if (i) {
+ tmp1 = _mm_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm_sqrt_ps(tmp1);
+ phase_Val = _mm_div_ps(phase_Val, tmp2);
+ }
+
+ _mm_storeu_ps((float*)phase_Ptr, phase_Val);
+ if (num_points & 1) {
+ *cPtr++ = *aPtr++ * phase_Ptr[0];
+ phase_Ptr[0] *= (phase_inc);
+ }
+
+ (*phase) = phase_Ptr[0];
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = 1;
+ lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };
+
+ unsigned int i, j = 0;
+
+ for (i = 0; i < 2; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (phase_inc);
+ }
+
+ __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+ phase_Val = _mm_loadu_ps((float*)phase_Ptr);
+ inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));
+
+ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
+ for (j = 0; j < ROTATOR_RELOAD_2; ++j) {
+
+ aVal = _mm_loadu_ps((float*)aPtr);
+
+ yl = _mm_moveldup_ps(phase_Val);
+ yh = _mm_movehdup_ps(phase_Val);
+ ylp = _mm_moveldup_ps(inc_Val);
+ yhp = _mm_movehdup_ps(inc_Val);
+
+ tmp1 = _mm_mul_ps(aVal, yl);
+ tmp1p = _mm_mul_ps(phase_Val, ylp);
+
+ aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm_mul_ps(aVal, yh);
+ tmp2p = _mm_mul_ps(phase_Val, yhp);
+
+ z = _mm_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
+
+ _mm_storeu_ps((float*)cPtr, z);
+
+ aPtr += 2;
+ cPtr += 2;
+ }
+ tmp1 = _mm_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm_sqrt_ps(tmp1);
+ phase_Val = _mm_div_ps(phase_Val, tmp2);
+ }
+ for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
+ aVal = _mm_loadu_ps((float*)aPtr);
+
+ yl = _mm_moveldup_ps(phase_Val);
+ yh = _mm_movehdup_ps(phase_Val);
+ ylp = _mm_moveldup_ps(inc_Val);
+ yhp = _mm_movehdup_ps(inc_Val);
+
+ tmp1 = _mm_mul_ps(aVal, yl);
+
+ tmp1p = _mm_mul_ps(phase_Val, ylp);
+
+ aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm_mul_ps(aVal, yh);
+ tmp2p = _mm_mul_ps(phase_Val, yhp);
+
+ z = _mm_addsub_ps(tmp1, tmp2);
+ phase_Val = _mm_addsub_ps(tmp1p, tmp2p);
+
+ _mm_storeu_ps((float*)cPtr, z);
+
+ aPtr += 2;
+ cPtr += 2;
+ }
+ if (i) {
+ tmp1 = _mm_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm_sqrt_ps(tmp1);
+ phase_Val = _mm_div_ps(phase_Val, tmp2);
+ }
+
+ _mm_storeu_ps((float*)phase_Ptr, phase_Val);
+ if (num_points & 1) {
+ *cPtr++ = *aPtr++ * phase_Ptr[0];
+ phase_Ptr[0] *= (phase_inc);
+ }
+
+ (*phase) = phase_Ptr[0];
+}
+
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
+ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
+
+ unsigned int i, j = 0;
+
+ for (i = 0; i < 4; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (phase_inc);
+ }
+
+ __m256 aVal, phase_Val, z;
+
+ phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
+
+ const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr));
+
+ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
+ for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
+
+ aVal = _mm256_load_ps((float*)aPtr);
+
+ z = _mm256_complexmul_ps(aVal, phase_Val);
+ phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
+
+ _mm256_store_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ phase_Val = _mm256_normalize_ps(phase_Val);
+ }
+
+ for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
+ aVal = _mm256_load_ps((float*)aPtr);
+
+ z = _mm256_complexmul_ps(aVal, phase_Val);
+ phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
+
+ _mm256_store_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ if (i) {
+ phase_Val = _mm256_normalize_ps(phase_Val);
+ }
+
+ _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
+ (*phase) = phase_Ptr[0];
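+    // Hand the remaining (num_points % 4) samples to the generic kernel, which
+    // also leaves the final phase in *phase for the caller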
+ volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
+}
+
+#endif /* LV_HAVE_AVX for aligned */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
+ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
+
+ unsigned int i, j = 0;
+
+ for (i = 0; i < 4; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (phase_inc);
+ }
+
+ __m256 aVal, phase_Val, z;
+
+ phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
+
+ const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr));
+
+ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
+ for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
+
+ aVal = _mm256_loadu_ps((float*)aPtr);
+
+ z = _mm256_complexmul_ps(aVal, phase_Val);
+ phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
+
+ _mm256_storeu_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ phase_Val = _mm256_normalize_ps(phase_Val);
+ }
+
+ for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
+ aVal = _mm256_loadu_ps((float*)aPtr);
+
+ z = _mm256_complexmul_ps(aVal, phase_Val);
+ phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);
+
+ _mm256_storeu_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ if (i) {
+ phase_Val = _mm256_normalize_ps(phase_Val);
+ }
+
+ _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
+ (*phase) = phase_Ptr[0];
+ volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
+}
+
+#endif /* LV_HAVE_AVX */
+
+#if LV_HAVE_AVX && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = 1;
+ __VOLK_ATTR_ALIGNED(32)
+ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
+
+ unsigned int i, j = 0;
+
+ for (i = 0; i < 4; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (phase_inc);
+ }
+
+ __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+ phase_Val = _mm256_load_ps((float*)phase_Ptr);
+ inc_Val = _mm256_set_ps(lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr));
+
+ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
+ for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
+
+ aVal = _mm256_load_ps((float*)aPtr);
+
+ yl = _mm256_moveldup_ps(phase_Val);
+ yh = _mm256_movehdup_ps(phase_Val);
+ ylp = _mm256_moveldup_ps(inc_Val);
+ yhp = _mm256_movehdup_ps(inc_Val);
+
+ tmp1 = aVal;
+ tmp1p = phase_Val;
+
+ aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm256_mul_ps(aVal, yh);
+ tmp2p = _mm256_mul_ps(phase_Val, yhp);
+
+ z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
+ phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
+
+ _mm256_store_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm256_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm256_sqrt_ps(tmp1);
+ phase_Val = _mm256_div_ps(phase_Val, tmp2);
+ }
+ for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
+ aVal = _mm256_load_ps((float*)aPtr);
+
+ yl = _mm256_moveldup_ps(phase_Val);
+ yh = _mm256_movehdup_ps(phase_Val);
+ ylp = _mm256_moveldup_ps(inc_Val);
+ yhp = _mm256_movehdup_ps(inc_Val);
+
+ tmp1 = aVal;
+ tmp1p = phase_Val;
+
+ aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm256_mul_ps(aVal, yh);
+ tmp2p = _mm256_mul_ps(phase_Val, yhp);
+
+ z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
+ phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
+
+ _mm256_store_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ if (i) {
+ tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm256_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm256_sqrt_ps(tmp1);
+ phase_Val = _mm256_div_ps(phase_Val, tmp2);
+ }
+
+ _mm256_store_ps((float*)phase_Ptr, phase_Val);
+ for (i = 0; i < num_points % 4; ++i) {
+ *cPtr++ = *aPtr++ * phase_Ptr[0];
+ phase_Ptr[0] *= (phase_inc);
+ }
+
+ (*phase) = phase_Ptr[0];
+}
+
+#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned*/
+
+#if LV_HAVE_AVX && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector,
+ const lv_32fc_t* inVector,
+ const lv_32fc_t phase_inc,
+ lv_32fc_t* phase,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = outVector;
+ const lv_32fc_t* aPtr = inVector;
+ lv_32fc_t incr = 1;
+ lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };
+
+ unsigned int i, j = 0;
+
+ for (i = 0; i < 4; ++i) {
+ phase_Ptr[i] *= incr;
+ incr *= (phase_inc);
+ }
+
+ __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;
+
+ phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
+ inc_Val = _mm256_set_ps(lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr),
+ lv_cimag(incr),
+ lv_creal(incr));
+
+ for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
+ for (j = 0; j < ROTATOR_RELOAD_4; ++j) {
+
+ aVal = _mm256_loadu_ps((float*)aPtr);
+
+ yl = _mm256_moveldup_ps(phase_Val);
+ yh = _mm256_movehdup_ps(phase_Val);
+ ylp = _mm256_moveldup_ps(inc_Val);
+ yhp = _mm256_movehdup_ps(inc_Val);
+
+ tmp1 = aVal;
+ tmp1p = phase_Val;
+
+ aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm256_mul_ps(aVal, yh);
+ tmp2p = _mm256_mul_ps(phase_Val, yhp);
+
+ z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
+ phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
+
+ _mm256_storeu_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm256_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm256_sqrt_ps(tmp1);
+ phase_Val = _mm256_div_ps(phase_Val, tmp2);
+ }
+ for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
+ aVal = _mm256_loadu_ps((float*)aPtr);
+
+ yl = _mm256_moveldup_ps(phase_Val);
+ yh = _mm256_movehdup_ps(phase_Val);
+ ylp = _mm256_moveldup_ps(inc_Val);
+ yhp = _mm256_movehdup_ps(inc_Val);
+
+ tmp1 = aVal;
+ tmp1p = phase_Val;
+
+ aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
+ phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
+ tmp2 = _mm256_mul_ps(aVal, yh);
+ tmp2p = _mm256_mul_ps(phase_Val, yhp);
+
+ z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
+ phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);
+
+ _mm256_storeu_ps((float*)cPtr, z);
+
+ aPtr += 4;
+ cPtr += 4;
+ }
+ if (i) {
+ tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
+ tmp2 = _mm256_hadd_ps(tmp1, tmp1);
+ tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
+ tmp2 = _mm256_sqrt_ps(tmp1);
+ phase_Val = _mm256_div_ps(phase_Val, tmp2);
+ }
+
+ _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
+ for (i = 0; i < num_points % 4; ++i) {
+ *cPtr++ = *aPtr++ * phase_Ptr[0];
+ phase_Ptr[0] *= (phase_inc);
+ }
+
+ (*phase) = phase_Ptr[0];
+}
+
+#endif /* LV_HAVE_AVX && LV_HAVE_FMA*/
+
+#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2018 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_x2_add_32fc
+ *
+ * \b Overview
+ *
+ * Adds two vectors together element by element:
+ *
+ * c[i] = a[i] + b[i]
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_x2_add_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
+ * lv_32fc_t* bVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: First vector of input points.
+ * \li bVector: Second vector of input points.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ *
+ * The following example adds an increasing and a decreasing vector so that every
+ * summed pair equals 10.
+ *
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* increasing = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* decreasing = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (lv_32fc_t)ii;
+ * decreasing[ii] = 10.f - (lv_32fc_t)ii;
+ * }
+ *
+ * volk_32fc_x2_add_32fc(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_x2_add_32fc_u_H
+#define INCLUDED_volk_32fc_x2_add_32fc_u_H
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_x2_add_32fc_u_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+
+ __m256 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm256_loadu_ps((float*)aPtr);
+ bVal = _mm256_loadu_ps((float*)bPtr);
+
+ cVal = _mm256_add_ps(aVal, bVal);
+
+ _mm256_storeu_ps((float*)cPtr,
+ cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_x2_add_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+
+ __m256 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm256_load_ps((float*)aPtr);
+ bVal = _mm256_load_ps((float*)bPtr);
+
+ cVal = _mm256_add_ps(aVal, bVal);
+
+ _mm256_store_ps((float*)cPtr,
+ cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32fc_x2_add_32fc_u_sse(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+
+ __m128 aVal, bVal, cVal;
+ for (; number < halfPoints; number++) {
+
+ aVal = _mm_loadu_ps((float*)aPtr);
+ bVal = _mm_loadu_ps((float*)bPtr);
+
+ cVal = _mm_add_ps(aVal, bVal);
+
+ _mm_storeu_ps((float*)cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 2;
+ bPtr += 2;
+ cPtr += 2;
+ }
+
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_x2_add_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32fc_x2_add_32fc_a_sse(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+
+ __m128 aVal, bVal, cVal;
+ for (; number < halfPoints; number++) {
+ aVal = _mm_load_ps((float*)aPtr);
+ bVal = _mm_load_ps((float*)bPtr);
+
+ cVal = _mm_add_ps(aVal, bVal);
+
+ _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 2;
+ bPtr += 2;
+ cPtr += 2;
+ }
+
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_x2_add_32fc_u_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+ float32x4_t aVal, bVal, cVal;
+ for (number = 0; number < halfPoints; number++) {
+ // Load in to NEON registers
+ aVal = vld1q_f32((const float32_t*)(aPtr));
+ bVal = vld1q_f32((const float32_t*)(bPtr));
+ __VOLK_PREFETCH(aPtr + 2);
+ __VOLK_PREFETCH(bPtr + 2);
+
+ // vector add
+ cVal = vaddq_f32(aVal, bVal);
+ // Store the results back into the C container
+ vst1q_f32((float*)(cPtr), cVal);
+
+        aPtr += 2; // a quadword register holds 4 floats, i.e. 2 lv_32fc_t per vadd
+ bPtr += 2;
+ cPtr += 2;
+ }
+
+    number = halfPoints * 2; // equals num_points when num_points is even
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+
+#endif /* INCLUDED_volk_32fc_x2_add_32fc_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_x2_conjugate_dot_prod_32fc
+ *
+ * \b Overview
+ *
+ * This block computes the conjugate dot product (or inner product)
+ * between two vectors, the \p input and \p taps vectors. Given a set
+ * of \p num_points taps, the result is the sum of products between
+ * the input vector and the conjugate of the taps. The result is a
+ * single value stored in the \p result address and is returned as a
+ * complex float.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_x2_conjugate_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input,
+ * const lv_32fc_t* taps, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li input: vector of complex floats.
+ * \li taps: complex float taps.
+ * \li num_points: number of samples in both \p input and \p taps.
+ *
+ * \b Outputs
+ * \li result: pointer to a complex float value to hold the dot product result.
+ *
+ * \b Example
+ * \code
+ * unsigned int N = 1000;
+ * unsigned int alignment = volk_get_alignment();
+ *
+ * lv_32fc_t* a = (lv_32fc_t*) volk_malloc(sizeof(lv_32fc_t) * N, alignment);
+ * lv_32fc_t* b = (lv_32fc_t*) volk_malloc(sizeof(lv_32fc_t) * N, alignment);
+ *
+ * for (int i = 0; i < N; ++i) {
+ * a[i] = lv_cmake(.50f, .50f);
+ * b[i] = lv_cmake(.50f, .75f);
+ * }
+ *
+ * lv_32fc_t e = (float) N * a[0] * lv_conj(b[0]); // expected result when a and b are constant
+ * lv_32fc_t res;
+ *
+ * volk_32fc_x2_conjugate_dot_prod_32fc(&res, a, b, N);
+ *
+ * printf("Expected: %8.2f%+8.2fi\n", lv_creal(e), lv_cimag(e));
+ * printf("Result: %8.2f%+8.2fi\n", lv_creal(res), lv_cimag(res));
+ *
+ * volk_free(a);
+ * volk_free(b);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
+#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
+
+
+#include <volk/volk_complex.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+ lv_32fc_t res = lv_cmake(0.f, 0.f);
+ for (unsigned int i = 0; i < num_points; ++i) {
+ res += (*input++) * lv_conj((*taps++));
+ }
+ *result = res;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_block(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ const unsigned int num_bytes = num_points * 8;
+
+ float* res = (float*)result;
+ float* in = (float*)input;
+ float* tp = (float*)taps;
+ unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+
+ float sum0[2] = { 0, 0 };
+ float sum1[2] = { 0, 0 };
+ unsigned int i = 0;
+
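+    // Process two complex samples per iteration, expanded into real arithmetic.
+    // For in * conj(tp): Re += in_r*tp_r + in_i*tp_i,  Im += in_i*tp_r - in_r*tp_i.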
+ for (i = 0; i < n_2_ccomplex_blocks; ++i) {
+ sum0[0] += in[0] * tp[0] + in[1] * tp[1];
+ sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
+ sum1[0] += in[2] * tp[2] + in[3] * tp[3];
+ sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
+
+ in += 4;
+ tp += 4;
+ }
+
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
+
+ if (num_bytes >> 3 & 1) {
+ *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_avx(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+ // Partial sums for indices i, i+1, i+2 and i+3.
+ __m256 sum_a_mult_b_real = _mm256_setzero_ps();
+ __m256 sum_a_mult_b_imag = _mm256_setzero_ps();
+
+ for (long unsigned i = 0; i < (num_points & ~3u); i += 4) {
+ /* Four complex elements a time are processed.
+ * (ar + j⋅ai)*conj(br + j⋅bi) =
+ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
+ */
+
+        /* Load input and taps, split and duplicate real and imaginary parts of taps.
+ * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
+ * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
+ * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
+ * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
+ */
+ __m256 a = _mm256_loadu_ps((const float*)&input[i]);
+ __m256 b = _mm256_loadu_ps((const float*)&taps[i]);
+ __m256 b_real = _mm256_moveldup_ps(b);
+ __m256 b_imag = _mm256_movehdup_ps(b);
+
+ // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
+ sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real));
+ // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
+ sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag));
+ }
+
+ // Swap position of −ar⋅bi and ai⋅bi.
+ sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
+    // | ai⋅br − ar⋅bi | ar⋅br + ai⋅bi | (imag | real), sum contains four such partial sums.
+ __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
+ /* Sum the four partial sums: Add high half of vector sum to the low one, i.e.
+ * s1 + s3 and s0 + s2 …
+ */
+ sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01));
+ // … and now (s0 + s2) + (s1 + s3)
+ sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2)));
+ // Store result.
+ __m128 lower = _mm256_extractf128_ps(sum, 0);
+ _mm_storel_pi((__m64*)result, lower);
+
+ // Handle the last elements if num_points mod 4 is bigger than 0.
+ for (long unsigned i = num_points & ~3u; i < num_points; ++i) {
+ *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) +
+ lv_cimag(input[i]) * lv_cimag(taps[i]),
+ lv_cimag(input[i]) * lv_creal(taps[i]) -
+ lv_creal(input[i]) * lv_cimag(taps[i]));
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+#include <xmmintrin.h>
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+ // Partial sums for indices i and i+1.
+ __m128 sum_a_mult_b_real = _mm_setzero_ps();
+ __m128 sum_a_mult_b_imag = _mm_setzero_ps();
+
+ for (long unsigned i = 0; i < (num_points & ~1u); i += 2) {
+ /* Two complex elements a time are processed.
+ * (ar + j⋅ai)*conj(br + j⋅bi) =
+ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
+ */
+
+        /* Load input and taps, split and duplicate real and imaginary parts of taps.
+ * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
+ * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
+ * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
+ * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
+ */
+ __m128 a = _mm_loadu_ps((const float*)&input[i]);
+ __m128 b = _mm_loadu_ps((const float*)&taps[i]);
+ __m128 b_real = _mm_moveldup_ps(b);
+ __m128 b_imag = _mm_movehdup_ps(b);
+
+ // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
+ sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real));
+ // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
+ sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag));
+ }
+
+ // Swap position of −ar⋅bi and ai⋅bi.
+ sum_a_mult_b_imag =
+ _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
+    // | ai⋅br − ar⋅bi | ar⋅br + ai⋅bi | (imag | real), sum contains two such partial sums.
+ __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
+ // Sum the two partial sums.
+ sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2)));
+ // Store result.
+ _mm_storel_pi((__m64*)result, sum);
+
+ // Handle the last element if num_points mod 2 is 1.
+ if (num_points & 1u) {
+ *result += lv_cmake(
+ lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) +
+ lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]),
+ lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) -
+ lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1]));
+ }
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*)input;
+    // vld2q de-interleaves the complex samples: val[0] holds the real parts,
+    // val[1] holds the imaginary parts
+ float32x4x2_t a_val, b_val, accumulator;
+ float32x4x2_t tmp_imag;
+ accumulator.val[0] = vdupq_n_f32(0);
+ accumulator.val[1] = vdupq_n_f32(0);
+
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
+
+ // do the first multiply
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+
+ // use multiply accumulate/subtract to get result
+ tmp_imag.val[1] = vmlsq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
+ tmp_imag.val[0] = vmlaq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
+
+ accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
+ accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);
+
+ // increment pointers
+ a_ptr += 4;
+ b_ptr += 4;
+ }
+ lv_32fc_t accum_result[4];
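+    // vst2q_f32 re-interleaves the separate real/imag accumulators into four
+    // complex partial sums, which are then reduced to a single value.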
+ vst2q_f32((float*)accum_result, accumulator);
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+ for (number = quarter_points * 4; number < num_points; ++number) {
+ *result += (*a_ptr++) * lv_conj(*b_ptr++);
+ }
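+    // The loop and tail above accumulate sum(taps * conj(input)); a single final
+    // conjugation converts this into the desired sum(input * conj(taps)).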
+ *result = lv_conj(*result);
+}
+#endif /*LV_HAVE_NEON*/
+
+#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
+
+#ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
+#define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
+
+#include <stdio.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_avx(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+ // Partial sums for indices i, i+1, i+2 and i+3.
+ __m256 sum_a_mult_b_real = _mm256_setzero_ps();
+ __m256 sum_a_mult_b_imag = _mm256_setzero_ps();
+
+ for (long unsigned i = 0; i < (num_points & ~3u); i += 4) {
+ /* Four complex elements a time are processed.
+ * (ar + j⋅ai)*conj(br + j⋅bi) =
+ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
+ */
+
+        /* Load input and taps, split and duplicate real and imaginary parts of taps.
+ * a: | ai,i+3 | ar,i+3 | … | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
+ * b: | bi,i+3 | br,i+3 | … | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
+ * b_real: | br,i+3 | br,i+3 | … | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
+ * b_imag: | bi,i+3 | bi,i+3 | … | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
+ */
+ __m256 a = _mm256_load_ps((const float*)&input[i]);
+ __m256 b = _mm256_load_ps((const float*)&taps[i]);
+ __m256 b_real = _mm256_moveldup_ps(b);
+ __m256 b_imag = _mm256_movehdup_ps(b);
+
+ // Add | ai⋅br,i+3 | ar⋅br,i+3 | … | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
+ sum_a_mult_b_real = _mm256_add_ps(sum_a_mult_b_real, _mm256_mul_ps(a, b_real));
+ // Add | ai⋅bi,i+3 | −ar⋅bi,i+3 | … | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
+ sum_a_mult_b_imag = _mm256_addsub_ps(sum_a_mult_b_imag, _mm256_mul_ps(a, b_imag));
+ }
+
+ // Swap position of −ar⋅bi and ai⋅bi.
+ sum_a_mult_b_imag = _mm256_permute_ps(sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
+    // | ai⋅br − ar⋅bi | ar⋅br + ai⋅bi | (imag | real), sum contains four such partial sums.
+ __m256 sum = _mm256_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
+ /* Sum the four partial sums: Add high half of vector sum to the low one, i.e.
+ * s1 + s3 and s0 + s2 …
+ */
+ sum = _mm256_add_ps(sum, _mm256_permute2f128_ps(sum, sum, 0x01));
+ // … and now (s0 + s2) + (s1 + s3)
+ sum = _mm256_add_ps(sum, _mm256_permute_ps(sum, _MM_SHUFFLE(1, 0, 3, 2)));
+ // Store result.
+ __m128 lower = _mm256_extractf128_ps(sum, 0);
+ _mm_storel_pi((__m64*)result, lower);
+
+ // Handle the last elements if num_points mod 4 is bigger than 0.
+ for (long unsigned i = num_points & ~3u; i < num_points; ++i) {
+ *result += lv_cmake(lv_creal(input[i]) * lv_creal(taps[i]) +
+ lv_cimag(input[i]) * lv_cimag(taps[i]),
+ lv_cimag(input[i]) * lv_creal(taps[i]) -
+ lv_creal(input[i]) * lv_cimag(taps[i]));
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+#include <xmmintrin.h>
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse3(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+ // Partial sums for indices i and i+1.
+ __m128 sum_a_mult_b_real = _mm_setzero_ps();
+ __m128 sum_a_mult_b_imag = _mm_setzero_ps();
+
+ for (long unsigned i = 0; i < (num_points & ~1u); i += 2) {
+ /* Two complex elements a time are processed.
+ * (ar + j⋅ai)*conj(br + j⋅bi) =
+ * ar⋅br + ai⋅bi + j⋅(ai⋅br − ar⋅bi)
+ */
+
+        /* Load input and taps, split and duplicate real and imaginary parts of taps.
+ * a: | ai,i+1 | ar,i+1 | ai,i+0 | ar,i+0 |
+ * b: | bi,i+1 | br,i+1 | bi,i+0 | br,i+0 |
+ * b_real: | br,i+1 | br,i+1 | br,i+0 | br,i+0 |
+ * b_imag: | bi,i+1 | bi,i+1 | bi,i+0 | bi,i+0 |
+ */
+ __m128 a = _mm_load_ps((const float*)&input[i]);
+ __m128 b = _mm_load_ps((const float*)&taps[i]);
+ __m128 b_real = _mm_moveldup_ps(b);
+ __m128 b_imag = _mm_movehdup_ps(b);
+
+ // Add | ai⋅br,i+1 | ar⋅br,i+1 | ai⋅br,i+0 | ar⋅br,i+0 | to partial sum.
+ sum_a_mult_b_real = _mm_add_ps(sum_a_mult_b_real, _mm_mul_ps(a, b_real));
+ // Add | ai⋅bi,i+1 | −ar⋅bi,i+1 | ai⋅bi,i+0 | −ar⋅bi,i+0 | to partial sum.
+ sum_a_mult_b_imag = _mm_addsub_ps(sum_a_mult_b_imag, _mm_mul_ps(a, b_imag));
+ }
+
+ // Swap position of −ar⋅bi and ai⋅bi.
+ sum_a_mult_b_imag =
+ _mm_shuffle_ps(sum_a_mult_b_imag, sum_a_mult_b_imag, _MM_SHUFFLE(2, 3, 0, 1));
+    // | ai⋅br − ar⋅bi | ar⋅br + ai⋅bi | (imag | real), sum contains two such partial sums.
+ __m128 sum = _mm_add_ps(sum_a_mult_b_real, sum_a_mult_b_imag);
+ // Sum the two partial sums.
+ sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2)));
+ // Store result.
+ _mm_storel_pi((__m64*)result, sum);
+
+ // Handle the last element if num_points mod 2 is 1.
+ if (num_points & 1u) {
+ *result += lv_cmake(
+ lv_creal(input[num_points - 1]) * lv_creal(taps[num_points - 1]) +
+ lv_cimag(input[num_points - 1]) * lv_cimag(taps[num_points - 1]),
+ lv_cimag(input[num_points - 1]) * lv_creal(taps[num_points - 1]) -
+ lv_creal(input[num_points - 1]) * lv_cimag(taps[num_points - 1]));
+ }
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ const unsigned int num_bytes = num_points * 8;
+
+ float* res = (float*)result;
+ float* in = (float*)input;
+ float* tp = (float*)taps;
+ unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+
+ float sum0[2] = { 0, 0 };
+ float sum1[2] = { 0, 0 };
+ unsigned int i = 0;
+
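+    // Process two complex samples per iteration, expanded into real arithmetic.
+    // For in * conj(tp): Re += in_r*tp_r + in_i*tp_i,  Im += in_i*tp_r - in_r*tp_i.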
+ for (i = 0; i < n_2_ccomplex_blocks; ++i) {
+ sum0[0] += in[0] * tp[0] + in[1] * tp[1];
+ sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
+ sum1[0] += in[2] * tp[2] + in[3] * tp[3];
+ sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
+
+ in += 4;
+ tp += 4;
+ }
+
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
+
+ if (num_bytes >> 3 & 1) {
+ *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ const unsigned int num_bytes = num_points * 8;
+
+ __VOLK_ATTR_ALIGNED(16)
+ static const uint32_t conjugator[4] = {
+ 0x00000000, 0x80000000, 0x00000000, 0x80000000
+ };
+
+ __VOLK_ASM __VOLK_VOLATILE(
+ "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
+ "# const float *taps, unsigned num_bytes)\n\t"
+ "# float sum0 = 0;\n\t"
+ "# float sum1 = 0;\n\t"
+ "# float sum2 = 0;\n\t"
+ "# float sum3 = 0;\n\t"
+ "# do {\n\t"
+ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+ "# input += 4;\n\t"
+ "# taps += 4; \n\t"
+ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+ "# result[0] = sum0 + sum2;\n\t"
+ "# result[1] = sum1 + sum3;\n\t"
+ "# TODO: prefetch and better scheduling\n\t"
+ " xor %%r9, %%r9\n\t"
+ " xor %%r10, %%r10\n\t"
+ " movq %[conjugator], %%r9\n\t"
+ " movq %%rcx, %%rax\n\t"
+ " movaps 0(%%r9), %%xmm8\n\t"
+ " movq %%rcx, %%r8\n\t"
+ " movq %[rsi], %%r9\n\t"
+ " movq %[rdx], %%r10\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+ " shr $4, %%r8\n\t"
+ " xorps %%xmm8, %%xmm2\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%%r9), %%xmmA\n\t"
+ "# movaps (%%r10), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 0(%%r9), %%xmm0\n\t"
+ " movaps 16(%%r9), %%xmm1\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " movaps 0(%%r10), %%xmm2\n\t"
+ " xorps %%xmm8, %%xmm2\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps 16(%%r10), %%xmm3\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " xorps %%xmm8, %%xmm3\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " add $32, %%r9\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " add $32, %%r10\n\t"
+ ".%=L1_test:\n\t"
+ " dec %%rax\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+        " # Let's see if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " and $1, %%r8\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps 0(%%r9), %%xmm0\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " movaps 0(%%r10), %%xmm2\n\t"
+ " xorps %%xmm8, %%xmm2\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " mov $0x80000000, %%r9\n\t"
+ " movd %%r9, %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) "
+ "to memory\n\t"
+ :
+ : [rsi] "r"(input),
+ [rdx] "r"(taps),
+ "c"(num_bytes),
+ [rdi] "r"(result),
+ [conjugator] "r"(conjugator)
+ : "rax", "r8", "r9", "r10");
+
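+    // The asm block above consumes the input in 16-byte chunks (two complex samples
+    // at a time); handle at most one leftover complex sample here.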
+ int getem = num_bytes % 16;
+
+ for (; getem > 0; getem -= 8) {
+ *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
+ }
+}
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_32
+static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ const unsigned int num_bytes = num_points * 8;
+
+ __VOLK_ATTR_ALIGNED(16)
+ static const uint32_t conjugator[4] = {
+ 0x00000000, 0x80000000, 0x00000000, 0x80000000
+ };
+
+ int bound = num_bytes >> 4;
+ int leftovers = num_bytes % 16;
+
+ __VOLK_ASM __VOLK_VOLATILE(
+ " #pushl %%ebp\n\t"
+ " #movl %%esp, %%ebp\n\t"
+ " #movl 12(%%ebp), %%eax # input\n\t"
+ " #movl 16(%%ebp), %%edx # taps\n\t"
+ " #movl 20(%%ebp), %%ecx # n_bytes\n\t"
+ " movaps 0(%[conjugator]), %%xmm1\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movaps 0(%[eax]), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movaps 0(%[edx]), %%xmm2\n\t"
+ " movl %[ecx], (%[out])\n\t"
+ " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t"
+
+ " xorps %%xmm1, %%xmm2\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%[eax]), %%xmmA\n\t"
+ "# movaps (%[edx]), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 16(%[edx]), %%xmm3\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " xorps %%xmm1, %%xmm3\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " movaps 16(%[eax]), %%xmm1\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " movaps 0(%[conjugator]), %%xmm1\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movaps 32(%[eax]), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " addl $32, %[eax]\n\t"
+ " movaps 32(%[edx]), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " xorps %%xmm1, %%xmm2\n\t"
+ " addl $32, %[edx]\n\t"
+ ".%=L1_test:\n\t"
+ " decl %[ecx]\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+        " # Let's see if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t"
+ " shrl $4, %[ecx]\n\t"
+ " andl $1, %[ecx]\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " #movl 8(%%ebp), %[eax] \n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " movl $0x80000000, (%[out])\n\t"
+ " movss (%[out]), %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " #movl 8(%%ebp), %[eax] # @result\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) "
+ "to memory\n\t"
+ " #popl %%ebp\n\t"
+ :
+ : [eax] "r"(input),
+ [edx] "r"(taps),
+ [ecx] "r"(num_bytes),
+ [out] "r"(result),
+ [conjugator] "r"(conjugator));
+
+ for (; leftovers > 0; leftovers -= 8) {
+ *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
+ }
+}
+#endif /*LV_HAVE_SSE*/
+
+
+#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2016 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_x2_divide_32fc
+ *
+ * \b Overview
+ *
+ * Divides the first vector of complex floats element-wise by the second.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
+ * const lv_32fc_t* denumeratorVector, unsigned int num_points); \endcode
+ *
+ * \b Inputs
+ * \li numeratorVector: The numerator complex values.
+ * \li denumeratorVector: The denominator complex values.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector of complex floats.
+ *
+ * \b Example
+ * Divide a complex vector by itself; the result should be very close to 1+0j.
+ *
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* input_vector = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *
+ * float delta = 2.f*M_PI / (float)N;
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ *     float real_1 = cosf(delta * (float)ii);
+ *     float imag_1 = sinf(delta * (float)ii);
+ *     input_vector[ii] = lv_cmake(real_1, imag_1);
+ * }
+ *
+ * volk_32fc_x2_divide_32fc(out, input_vector, input_vector, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("%1.4f%+1.4fj,", lv_creal(out[ii]), lv_cimag(out[ii]));
+ * }
+ * printf("\n");
+ *
+ * volk_free(input_vector);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
+#define INCLUDED_volk_32fc_x2_divide_32fc_u_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <volk/volk_complex.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+
+ for (unsigned int number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* numeratorVector,
+ const lv_32fc_t* denumeratorVector,
+ unsigned int num_points)
+{
+ /*
+ * we'll do the "classical"
+ * a a b*
+ * --- = -------
+ * b |b|^2
+ * */
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 num01, num23, den01, den23, norm, result;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = numeratorVector;
+ const lv_32fc_t* b = denumeratorVector;
+
+ for (; number < quarterPoints; number++) {
+ num01 = _mm_loadu_ps((float*)a); // first pair
+ den01 = _mm_loadu_ps((float*)b); // first pair
+ num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
+ a += 2;
+ b += 2;
+
+ num23 = _mm_loadu_ps((float*)a); // second pair
+ den23 = _mm_loadu_ps((float*)b); // second pair
+ num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
+ a += 2;
+ b += 2;
+
+ norm = _mm_magnitudesquared_ps_sse3(den01, den23);
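+        // the unpacks below duplicate each |b|^2 into the real and imaginary
+        // lanes of its complex value before the division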
+ den01 = _mm_unpacklo_ps(norm, norm);
+ den23 = _mm_unpackhi_ps(norm, norm);
+
+ result = _mm_div_ps(num01, den01);
+ _mm_storeu_ps((float*)c, result); // Store the results back into the C container
+ c += 2;
+ result = _mm_div_ps(num23, den23);
+ _mm_storeu_ps((float*)c, result); // Store the results back into the C container
+ c += 2;
+ }
+
+ number *= 4;
+ for (; number < num_points; number++) {
+ *c = (*a) / (*b);
+ a++;
+ b++;
+ c++;
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* numeratorVector,
+ const lv_32fc_t* denumeratorVector,
+ unsigned int num_points)
+{
+ /*
+ * we'll do the "classical"
+ * a a b*
+ * --- = -------
+ * b |b|^2
+ * */
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = numeratorVector;
+ const lv_32fc_t* b = denumeratorVector;
+
+ for (; number < quarterPoints; number++) {
+ num = _mm256_loadu_ps(
+ (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+ denum = _mm256_loadu_ps(
+ (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+ mul_conj = _mm256_complexconjugatemul_ps(num, denum);
+ sq = _mm256_mul_ps(denum, denum); // Square the values
+ mag_sq_un = _mm256_hadd_ps(
+ sq, sq); // obtain the actual squared magnitude, although out of order
+        // reorder so each |b|^2 is duplicated across the real/imag lanes of its value
+        mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8);
+ // best guide I found on using these functions:
+ // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
+ div = _mm256_div_ps(mul_conj, mag_sq);
+
+ _mm256_storeu_ps((float*)c, div); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for (; number < num_points; number++) {
+ *c++ = (*a++) / (*b++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
+
+
+#ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
+#define INCLUDED_volk_32fc_x2_divide_32fc_a_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* numeratorVector,
+ const lv_32fc_t* denumeratorVector,
+ unsigned int num_points)
+{
+ /*
+ * we'll do the "classical"
+ * a a b*
+ * --- = -------
+ * b |b|^2
+ * */
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 num01, num23, den01, den23, norm, result;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = numeratorVector;
+ const lv_32fc_t* b = denumeratorVector;
+
+ for (; number < quarterPoints; number++) {
+ num01 = _mm_load_ps((float*)a); // first pair
+ den01 = _mm_load_ps((float*)b); // first pair
+ num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
+ a += 2;
+ b += 2;
+
+ num23 = _mm_load_ps((float*)a); // second pair
+ den23 = _mm_load_ps((float*)b); // second pair
+ num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
+ a += 2;
+ b += 2;
+
+ norm = _mm_magnitudesquared_ps_sse3(den01, den23);
+
+ den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice
+ den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice
+
+ result = _mm_div_ps(num01, den01);
+ _mm_store_ps((float*)c, result); // Store the results back into the C container
+ c += 2;
+ result = _mm_div_ps(num23, den23);
+ _mm_store_ps((float*)c, result); // Store the results back into the C container
+ c += 2;
+ }
+
+ number *= 4;
+ for (; number < num_points; number++) {
+ *c = (*a) / (*b);
+ a++;
+ b++;
+ c++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* numeratorVector,
+ const lv_32fc_t* denumeratorVector,
+ unsigned int num_points)
+{
+ /*
+ * Guide to AVX intrisics:
+ * https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
+ *
+ * we'll do the "classical"
+ * a a b*
+ * --- = -------
+ * b |b|^2
+ *
+ */
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = numeratorVector;
+ const lv_32fc_t* b = denumeratorVector;
+
+ const unsigned int eigthPoints = num_points / 8;
+
+ __m256 num01, num23, denum01, denum23, complex_result, result0, result1;
+
+ for (unsigned int number = 0; number < eigthPoints; number++) {
+ // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+ num01 = _mm256_load_ps((float*)a);
+ denum01 = _mm256_load_ps((float*)b);
+
+ num01 = _mm256_complexconjugatemul_ps(num01, denum01);
+ a += 4;
+ b += 4;
+
+ num23 = _mm256_load_ps((float*)a);
+ denum23 = _mm256_load_ps((float*)b);
+ num23 = _mm256_complexconjugatemul_ps(num23, denum23);
+ a += 4;
+ b += 4;
+
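+        // hadd of the squared denominators yields |b|^2 per complex value; the
+        // shuffles below broadcast each |b|^2 to its real and imaginary lanes.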
+ complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
+ _mm256_mul_ps(denum23, denum23));
+
+ denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
+ denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);
+
+ result0 = _mm256_div_ps(num01, denum01);
+ result1 = _mm256_div_ps(num23, denum23);
+
+ _mm256_store_ps((float*)c, result0);
+ c += 4;
+ _mm256_store_ps((float*)c, result1);
+ c += 4;
+ }
+
+ volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - eigthPoints * 8);
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+
+ float32x4x2_t aVal, bVal, cVal;
+ float32x4_t bAbs, bAbsInv;
+
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ for (; number < quarterPoints; number++) {
+ aVal = vld2q_f32((const float*)(aPtr));
+ bVal = vld2q_f32((const float*)(bPtr));
+ aPtr += 4;
+ bPtr += 4;
+ __VOLK_PREFETCH(aPtr + 4);
+ __VOLK_PREFETCH(bPtr + 4);
+
+ bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
+ bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
+
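+        // vrecpeq_f32 provides only a coarse estimate of 1/|b|^2; each vrecpsq_f32
+        // step below performs one Newton-Raphson refinement of that estimate.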
+ bAbsInv = vrecpeq_f32(bAbs);
+ bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
+ bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
+
+ cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
+ cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
+ cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
+
+ cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
+ cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
+ cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
+
+ vst2q_f32((float*)(cPtr), cVal);
+ cPtr += 4;
+ }
+
+ for (number = quarterPoints * 4; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) / (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_x2_dot_prod_32fc
+ *
+ * \b Overview
+ *
+ * This block computes the dot product (or inner product) between two
+ * vectors, the \p input and \p taps vectors. Given a set of \p
+ * num_points taps, the result is the sum of products between the two
+ * vectors. The result is a single value stored in the \p result
+ * address and is returned as a complex float.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_x2_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const
+ * lv_32fc_t* taps, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li input: vector of complex floats.
+ * \li taps: complex float taps.
+ * \li num_points: number of samples in both \p input and \p taps.
+ *
+ * \b Outputs
+ * \li result: pointer to a complex float value to hold the dot product result.
+ *
+ * \b Example
+ * \code
+ * unsigned int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* a = (lv_32fc_t*) volk_malloc(sizeof(lv_32fc_t) * N, alignment);
+ * lv_32fc_t* b = (lv_32fc_t*) volk_malloc(sizeof(lv_32fc_t) * N, alignment);
+ * for (unsigned int i = 0; i < N; ++i) {
+ *     a[i] = lv_cmake(.50f, .50f);
+ *     b[i] = lv_cmake(.50f, .75f);
+ * }
+ *
+ * lv_32fc_t res;
+ * volk_32fc_x2_dot_prod_32fc(&res, a, b, N);
+ * printf("Result: %8.2f%+8.2fi\n", lv_creal(res), lv_cimag(res));
+ * volk_free(a);
+ * volk_free(b);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
+#define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
+
+#include <stdio.h>
+#include <string.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ float* res = (float*)result;
+ float* in = (float*)input;
+ float* tp = (float*)taps;
+ unsigned int n_2_ccomplex_blocks = num_points / 2;
+
+ float sum0[2] = { 0, 0 };
+ float sum1[2] = { 0, 0 };
+ unsigned int i = 0;
+
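+    // Process two complex samples per iteration, expanded into real arithmetic.
+    // For in * tp: Re += in_r*tp_r - in_i*tp_i,  Im += in_r*tp_i + in_i*tp_r.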
+ for (i = 0; i < n_2_ccomplex_blocks; ++i) {
+ sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+ sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+
+ in += 4;
+ tp += 4;
+ }
+
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
+
+ // Cleanup if we had an odd number of points
+ if (num_points & 1) {
+ *result += input[num_points - 1] * taps[num_points - 1];
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+static inline void volk_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ const unsigned int num_bytes = num_points * 8;
+ unsigned int isodd = num_points & 1;
+
+ __VOLK_ASM(
+ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+ "# const float *taps, unsigned num_bytes)\n\t"
+ "# float sum0 = 0;\n\t"
+ "# float sum1 = 0;\n\t"
+ "# float sum2 = 0;\n\t"
+ "# float sum3 = 0;\n\t"
+ "# do {\n\t"
+ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+ "# input += 4;\n\t"
+ "# taps += 4; \n\t"
+ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+ "# result[0] = sum0 + sum2;\n\t"
+ "# result[1] = sum1 + sum3;\n\t"
+ "# TODO: prefetch and better scheduling\n\t"
+ " xor %%r9, %%r9\n\t"
+ " xor %%r10, %%r10\n\t"
+ " movq %%rcx, %%rax\n\t"
+ " movq %%rcx, %%r8\n\t"
+ " movq %[rsi], %%r9\n\t"
+ " movq %[rdx], %%r10\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+ " shr $4, %%r8\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movups (%%r9), %%xmmA\n\t"
+ "# movups (%%r10), %%xmmB\n\t"
+ "# movups %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movups %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movups %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movups 0(%%r9), %%xmm0\n\t"
+ " movups 16(%%r9), %%xmm1\n\t"
+ " movups %%xmm0, %%xmm4\n\t"
+ " movups 0(%%r10), %%xmm2\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movups 16(%%r10), %%xmm3\n\t"
+ " movups %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " add $32, %%r9\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " add $32, %%r10\n\t"
+ ".%=L1_test:\n\t"
+ " dec %%rax\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+        " # Let's see if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " and $1, %%r8\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " movups 0(%%r9), %%xmm0\n\t"
+ " movups %%xmm0, %%xmm4\n\t"
+ " movups 0(%%r10), %%xmm2\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " mov $0x80000000, %%r9\n\t"
+ " movd %%r9, %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movups %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movups %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) "
+ "to memory\n\t"
+ :
+ : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result)
+ : "rax", "r8", "r9", "r10");
+
+
+ if (isodd) {
+ *result += input[num_points - 1] * taps[num_points - 1];
+ }
+
+ return;
+}
+
+#endif /* LV_HAVE_SSE && LV_HAVE_64 */
+
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2 * sizeof(float));
+
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+ unsigned int isodd = num_points & 1;
+
+ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
+
+ dotProdVal = _mm_setzero_ps();
+
+ for (; number < halfPoints; number++) {
+
+ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ dotProdVal =
+ _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+
+ a += 2;
+ b += 2;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
+
+ _mm_storeu_ps((float*)dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct += (dotProductVector[0] + dotProductVector[1]);
+
+ if (isodd) {
+ dotProduct += input[num_points - 1] * taps[num_points - 1];
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+// #ifdef LV_HAVE_SSE4_1
+
+// #include <smmintrin.h>
+
+// static inline void volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result,
+// const lv_32fc_t* input,
+// const lv_32fc_t* taps,
+// unsigned int num_points)
+// {
+
+// unsigned int i = 0;
+// const unsigned int qtr_points = num_points / 4;
+// const unsigned int isodd = num_points & 3;
+
+// __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+// float *p_input, *p_taps;
+// __m64* p_result;
+
+// p_result = (__m64*)result;
+// p_input = (float*)input;
+// p_taps = (float*)taps;
+
+// static const __m128i neg = { 0x000000000000000080000000 };
+
+// real0 = _mm_setzero_ps();
+// real1 = _mm_setzero_ps();
+// im0 = _mm_setzero_ps();
+// im1 = _mm_setzero_ps();
+
+// for (; i < qtr_points; ++i) {
+// xmm0 = _mm_loadu_ps(p_input);
+// xmm1 = _mm_loadu_ps(p_taps);
+
+// p_input += 4;
+// p_taps += 4;
+
+// xmm2 = _mm_loadu_ps(p_input);
+// xmm3 = _mm_loadu_ps(p_taps);
+
+// p_input += 4;
+// p_taps += 4;
+
+// xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+// xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+// xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+// xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+
+// // imaginary vector from input
+// xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+// // real vector from input
+// xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+// // imaginary vector from taps
+// xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+// // real vector from taps
+// xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+
+// xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
+// xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
+
+// xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
+// xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
+
+// real0 = _mm_add_ps(xmm4, real0);
+// real1 = _mm_add_ps(xmm5, real1);
+// im0 = _mm_add_ps(xmm6, im0);
+// im1 = _mm_add_ps(xmm7, im1);
+// }
+
+// real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
+
+// im0 = _mm_add_ps(im0, im1);
+// real0 = _mm_add_ps(real0, real1);
+
+// im0 = _mm_add_ps(im0, real0);
+
+// _mm_storel_pi(p_result, im0);
+
+// for (i = num_points - isodd; i < num_points; i++) {
+// *result += input[i] * taps[i];
+// }
+// }
+
+// #endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int isodd = num_points & 3;
+ unsigned int i = 0;
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2 * sizeof(float));
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
+
+ dotProdVal = _mm256_setzero_ps();
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+ y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+
+ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
+
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+
+ z = _mm256_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ dotProdVal = _mm256_add_ps(dotProdVal,
+ z); // Add the complex multiplication results together
+
+ a += 4;
+ b += 4;
+ }
+
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+
+ _mm256_storeu_ps((float*)dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+ dotProductVector[3]);
+
+ for (i = num_points - isodd; i < num_points; i++) {
+ dotProduct += input[i] * taps[i];
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#if LV_HAVE_AVX && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_u_avx_fma(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int isodd = num_points & 3;
+ unsigned int i = 0;
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2 * sizeof(float));
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
+
+ dotProdVal = _mm256_setzero_ps();
+
+ for (; number < quarterPoints; number++) {
+
+ x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+ y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+
+ tmp1 = x;
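+        // keep an unshuffled copy of x; _mm256_fmaddsub_ps below fuses the multiply
+        // by yl with the alternating subtract/add of tmp2 into one instruction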
+
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
+
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+
+ z = _mm256_fmaddsub_ps(
+ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ dotProdVal = _mm256_add_ps(dotProdVal,
+ z); // Add the complex multiplication results together
+
+ a += 4;
+ b += 4;
+ }
+
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+
+ _mm256_storeu_ps((float*)dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+ dotProductVector[3]);
+
+ for (i = num_points - isodd; i < num_points; i++) {
+ dotProduct += input[i] * taps[i];
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_AVX && LV_HAVE_FMA*/
+
+#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/
+
+#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
+#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
+
+#include <stdio.h>
+#include <string.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ const unsigned int num_bytes = num_points * 8;
+
+ float* res = (float*)result;
+ float* in = (float*)input;
+ float* tp = (float*)taps;
+ unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
+
+ float sum0[2] = { 0, 0 };
+ float sum1[2] = { 0, 0 };
+ unsigned int i = 0;
+
+ for (i = 0; i < n_2_ccomplex_blocks; ++i) {
+ sum0[0] += in[0] * tp[0] - in[1] * tp[1];
+ sum0[1] += in[0] * tp[1] + in[1] * tp[0];
+ sum1[0] += in[2] * tp[2] - in[3] * tp[3];
+ sum1[1] += in[2] * tp[3] + in[3] * tp[2];
+
+ in += 4;
+ tp += 4;
+ }
+
+ res[0] = sum0[0] + sum1[0];
+ res[1] = sum0[1] + sum1[1];
+
+ if (num_points & 1) {
+ *result += input[num_points - 1] * taps[num_points - 1];
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE && LV_HAVE_64
+
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ const unsigned int num_bytes = num_points * 8;
+ unsigned int isodd = num_points & 1;
+
+ __VOLK_ASM(
+ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
+ "# const float *taps, unsigned num_bytes)\n\t"
+ "# float sum0 = 0;\n\t"
+ "# float sum1 = 0;\n\t"
+ "# float sum2 = 0;\n\t"
+ "# float sum3 = 0;\n\t"
+ "# do {\n\t"
+ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
+ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
+ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
+ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
+ "# input += 4;\n\t"
+ "# taps += 4; \n\t"
+ "# } while (--n_2_ccomplex_blocks != 0);\n\t"
+ "# result[0] = sum0 + sum2;\n\t"
+ "# result[1] = sum1 + sum3;\n\t"
+ "# TODO: prefetch and better scheduling\n\t"
+ " xor %%r9, %%r9\n\t"
+ " xor %%r10, %%r10\n\t"
+ " movq %%rcx, %%rax\n\t"
+ " movq %%rcx, %%r8\n\t"
+ " movq %[rsi], %%r9\n\t"
+ " movq %[rdx], %%r10\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
+ " shr $4, %%r8\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%%r9), %%xmmA\n\t"
+ "# movaps (%%r10), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 0(%%r9), %%xmm0\n\t"
+ " movaps 16(%%r9), %%xmm1\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " movaps 0(%%r10), %%xmm2\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps 16(%%r10), %%xmm3\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " add $32, %%r9\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " add $32, %%r10\n\t"
+ ".%=L1_test:\n\t"
+ " dec %%rax\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+        " # Let's see if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " and $1, %%r8\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " movaps 0(%%r9), %%xmm0\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " movaps 0(%%r10), %%xmm2\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " mov $0x80000000, %%r9\n\t"
+ " movd %%r9, %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) "
+ "to memory\n\t"
+ :
+ : [rsi] "r"(input), [rdx] "r"(taps), "c"(num_bytes), [rdi] "r"(result)
+ : "rax", "r8", "r9", "r10");
+
+
+ if (isodd) {
+ *result += input[num_points - 1] * taps[num_points - 1];
+ }
+
+ return;
+}
+
+#endif
+
+#if LV_HAVE_SSE && LV_HAVE_32
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ volk_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);
+
+#if 0
+ const unsigned int num_bytes = num_points*8;
+ unsigned int isodd = num_points & 1;
+
+ __VOLK_ASM __VOLK_VOLATILE
+ (
+ " #pushl %%ebp\n\t"
+ " #movl %%esp, %%ebp\n\t"
+ " movl 12(%%ebp), %%eax # input\n\t"
+ " movl 16(%%ebp), %%edx # taps\n\t"
+ " movl 20(%%ebp), %%ecx # n_bytes\n\t"
+ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
+ " movaps 0(%%eax), %%xmm0\n\t"
+ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
+ " movaps 0(%%edx), %%xmm2\n\t"
+ " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t"
+ " jmp .%=L1_test\n\t"
+ " # 4 taps / loop\n\t"
+ " # something like ?? cycles / loop\n\t"
+ ".%=Loop1: \n\t"
+ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
+ "# movaps (%%eax), %%xmmA\n\t"
+ "# movaps (%%edx), %%xmmB\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
+ "# mulps %%xmmB, %%xmmA\n\t"
+ "# mulps %%xmmZ, %%xmmB\n\t"
+ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
+ "# xorps %%xmmPN, %%xmmA\n\t"
+ "# movaps %%xmmA, %%xmmZ\n\t"
+ "# unpcklps %%xmmB, %%xmmA\n\t"
+ "# unpckhps %%xmmB, %%xmmZ\n\t"
+ "# movaps %%xmmZ, %%xmmY\n\t"
+ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
+ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
+ "# addps %%xmmZ, %%xmmA\n\t"
+ "# addps %%xmmA, %%xmmC\n\t"
+ "# A=xmm0, B=xmm2, Z=xmm4\n\t"
+ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
+ " movaps 16(%%eax), %%xmm1\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " movaps 16(%%edx), %%xmm3\n\t"
+ " movaps %%xmm1, %%xmm5\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm3, %%xmm1\n\t"
+ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
+ " addps %%xmm1, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " movaps 32(%%eax), %%xmm0\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ " mulps %%xmm5, %%xmm3\n\t"
+ " addl $32, %%eax\n\t"
+ " movaps 32(%%edx), %%xmm2\n\t"
+ " addps %%xmm3, %%xmm7\n\t"
+ " addl $32, %%edx\n\t"
+ ".%=L1_test:\n\t"
+ " decl %%ecx\n\t"
+ " jge .%=Loop1\n\t"
+ " # We've handled the bulk of multiplies up to here.\n\t"
+ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
+ " # If so, we've got 2 more taps to do.\n\t"
+ " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t"
+ " shrl $4, %%ecx\n\t"
+ " andl $1, %%ecx\n\t"
+ " je .%=Leven\n\t"
+ " # The count was odd, do 2 more taps.\n\t"
+ " # Note that we've already got mm0/mm2 preloaded\n\t"
+ " # from the main loop.\n\t"
+ " movaps %%xmm0, %%xmm4\n\t"
+ " mulps %%xmm2, %%xmm0\n\t"
+ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
+ " addps %%xmm0, %%xmm6\n\t"
+ " mulps %%xmm4, %%xmm2\n\t"
+ " addps %%xmm2, %%xmm7\n\t"
+ ".%=Leven:\n\t"
+ " # neg inversor\n\t"
+ " movl 8(%%ebp), %%eax \n\t"
+ " xorps %%xmm1, %%xmm1\n\t"
+ " movl $0x80000000, (%%eax)\n\t"
+ " movss (%%eax), %%xmm1\n\t"
+ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
+ " # pfpnacc\n\t"
+ " xorps %%xmm1, %%xmm6\n\t"
+ " movaps %%xmm6, %%xmm2\n\t"
+ " unpcklps %%xmm7, %%xmm6\n\t"
+ " unpckhps %%xmm7, %%xmm2\n\t"
+ " movaps %%xmm2, %%xmm3\n\t"
+ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
+ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
+ " addps %%xmm2, %%xmm6\n\t"
+ " # xmm6 = r1 i2 r3 i4\n\t"
+ " #movl 8(%%ebp), %%eax # @result\n\t"
+ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
+ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
+ " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t"
+ " #popl %%ebp\n\t"
+ :
+ :
+ : "eax", "ecx", "edx"
+ );
+
+
+ int getem = num_bytes % 16;
+
+ if(isodd) {
+ *result += (input[num_points - 1] * taps[num_points - 1]);
+ }
+
+ return;
+#endif
+}
+
+#endif /*LV_HAVE_SSE && LV_HAVE_32*/
+
+#ifdef LV_HAVE_SSE3
+
+#include <pmmintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ const unsigned int num_bytes = num_points * 8;
+ unsigned int isodd = num_points & 1;
+
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2 * sizeof(float));
+
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_bytes >> 4;
+
+ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
+
+ dotProdVal = _mm_setzero_ps();
+
+ for (; number < halfPoints; number++) {
+
+ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ dotProdVal =
+ _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
+
+ a += 2;
+ b += 2;
+ }
+
+ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
+
+ _mm_store_ps((float*)dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct += (dotProductVector[0] + dotProductVector[1]);
+
+ if (isodd) {
+ dotProduct += input[num_points - 1] * taps[num_points - 1];
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_SSE3*/
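+
+/*
+ * The SSE3 kernel above uses the moveldup/movehdup/shuffle/addsub sequence to
+ * multiply two complex floats per 128-bit register. A minimal scalar sketch of
+ * the product accumulated each iteration (the helper name is illustrative only,
+ * not part of the VOLK API):
+ *
+ * \code
+ * // acc += (ar + j*ai) * (cr + j*ci)
+ * static inline void accumulate_complex_product(
+ *     float ar, float ai, float cr, float ci, float* acc_re, float* acc_im)
+ * {
+ *     *acc_re += ar * cr - ai * ci; // even lanes of _mm_addsub_ps
+ *     *acc_im += ai * cr + ar * ci; // odd lanes of _mm_addsub_ps
+ * }
+ * \endcode
+ */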
+
+
+// #ifdef LV_HAVE_SSE4_1
+
+// #include <smmintrin.h>
+
+// static inline void volk_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result,
+// const lv_32fc_t* input,
+// const lv_32fc_t* taps,
+// unsigned int num_points)
+// {
+
+// unsigned int i = 0;
+// const unsigned int qtr_points = num_points / 4;
+// const unsigned int isodd = num_points & 3;
+
+// __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
+// float *p_input, *p_taps;
+// __m64* p_result;
+
+// static const __m128i neg = { 0x000000000000000080000000 };
+
+// p_result = (__m64*)result;
+// p_input = (float*)input;
+// p_taps = (float*)taps;
+
+// real0 = _mm_setzero_ps();
+// real1 = _mm_setzero_ps();
+// im0 = _mm_setzero_ps();
+// im1 = _mm_setzero_ps();
+
+// for (; i < qtr_points; ++i) {
+// xmm0 = _mm_load_ps(p_input);
+// xmm1 = _mm_load_ps(p_taps);
+
+// p_input += 4;
+// p_taps += 4;
+
+// xmm2 = _mm_load_ps(p_input);
+// xmm3 = _mm_load_ps(p_taps);
+
+// p_input += 4;
+// p_taps += 4;
+
+// xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
+// xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
+// xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
+// xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
+
+// // imaginary vector from input
+// xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
+// // real vector from input
+// xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
+// // imaginary vector from taps
+// xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
+// // real vector from taps
+// xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
+
+// xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
+// xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
+
+// xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
+// xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
+
+// real0 = _mm_add_ps(xmm4, real0);
+// real1 = _mm_add_ps(xmm5, real1);
+// im0 = _mm_add_ps(xmm6, im0);
+// im1 = _mm_add_ps(xmm7, im1);
+// }
+
+// real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
+
+// im0 = _mm_add_ps(im0, im1);
+// real0 = _mm_add_ps(real0, real1);
+
+// im0 = _mm_add_ps(im0, real0);
+
+// _mm_storel_pi(p_result, im0);
+
+// for (i = num_points - isodd; i < num_points; i++) {
+// *result += input[i] * taps[i];
+// }
+// }
+
+// #endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_neon(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*)input;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ float32x4x2_t a_val, b_val, c_val, accumulator;
+ float32x4x2_t tmp_real, tmp_imag;
+ accumulator.val[0] = vdupq_n_f32(0);
+ accumulator.val[1] = vdupq_n_f32(0);
+
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
+
+ // multiply the real*real and imag*imag to get real result
+ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+ tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
+
+ // Multiply cross terms to get the imaginary result
+ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
+ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+
+ c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+ c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+
+ accumulator.val[0] = vaddq_f32(accumulator.val[0], c_val.val[0]);
+ accumulator.val[1] = vaddq_f32(accumulator.val[1], c_val.val[1]);
+
+ a_ptr += 4;
+ b_ptr += 4;
+ }
+ lv_32fc_t accum_result[4];
+ vst2q_f32((float*)accum_result, accumulator);
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+ for (number = quarter_points * 4; number < num_points; ++number) {
+ *result += (*a_ptr++) * (*b_ptr++);
+ }
+}
+#endif /*LV_HAVE_NEON*/
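+
+/*
+ * The NEON dot-product kernels in this file work on de-interleaved data:
+ * vld2q_f32 splits four complex samples into one vector of real parts and one
+ * vector of imaginary parts. A minimal scalar sketch of the per-sample update,
+ * assuming split real/imag storage (the helper name is illustrative only):
+ *
+ * \code
+ * // acc += a * b for one complex sample stored as separate re/im floats
+ * static inline void neon_style_accumulate(float a_re, float a_im,
+ *                                          float b_re, float b_im,
+ *                                          float* acc_re, float* acc_im)
+ * {
+ *     *acc_re += a_re * b_re - a_im * b_im; // vmulq_f32 / vmlsq_f32 path
+ *     *acc_im += a_re * b_im + a_im * b_re; // vmulq_f32 / vmlaq_f32 path
+ * }
+ * \endcode
+ */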
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_32fc_x2_dot_prod_32fc_neon_opttests(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*)input;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ float32x4x2_t a_val, b_val, accumulator;
+ float32x4x2_t tmp_imag;
+ accumulator.val[0] = vdupq_n_f32(0);
+ accumulator.val[1] = vdupq_n_f32(0);
+
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
+
+ // do the first multiply
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+
+ // use multiply accumulate/subtract to get result
+ tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
+ tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
+
+ accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
+ accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);
+
+ // increment pointers
+ a_ptr += 4;
+ b_ptr += 4;
+ }
+ lv_32fc_t accum_result[4];
+ vst2q_f32((float*)accum_result, accumulator);
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+ for (number = quarter_points * 4; number < num_points; ++number) {
+ *result += (*a_ptr++) * (*b_ptr++);
+ }
+}
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_NEON
+static inline void volk_32fc_x2_dot_prod_32fc_neon_optfma(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*)input;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ float32x4x2_t a_val, b_val, accumulator1, accumulator2;
+ accumulator1.val[0] = vdupq_n_f32(0);
+ accumulator1.val[1] = vdupq_n_f32(0);
+ accumulator2.val[0] = vdupq_n_f32(0);
+ accumulator2.val[1] = vdupq_n_f32(0);
+
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
+
+ // use 2 accumulators to remove inter-instruction data dependencies
+ accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
+ accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
+ accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
+ accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);
+ // increment pointers
+ a_ptr += 4;
+ b_ptr += 4;
+ }
+ accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
+ accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
+ lv_32fc_t accum_result[4];
+ vst2q_f32((float*)accum_result, accumulator1);
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+ for (number = quarter_points * 4; number < num_points; ++number) {
+ *result += (*a_ptr++) * (*b_ptr++);
+ }
+}
+#endif /*LV_HAVE_NEON*/
+
+#ifdef LV_HAVE_NEON
+static inline void volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+ // NOTE: GCC does a poor job with this kernel, but the equivalent ASM code is very
+ // fast
+
+ unsigned int quarter_points = num_points / 8;
+ unsigned int number;
+
+ lv_32fc_t* a_ptr = (lv_32fc_t*)taps;
+ lv_32fc_t* b_ptr = (lv_32fc_t*)input;
+ // for 2-lane vectors, 1st lane holds the real part,
+ // 2nd lane holds the imaginary part
+ float32x4x4_t a_val, b_val, accumulator1, accumulator2;
+ float32x4x2_t reduced_accumulator;
+ accumulator1.val[0] = vdupq_n_f32(0);
+ accumulator1.val[1] = vdupq_n_f32(0);
+ accumulator1.val[2] = vdupq_n_f32(0);
+ accumulator1.val[3] = vdupq_n_f32(0);
+ accumulator2.val[0] = vdupq_n_f32(0);
+ accumulator2.val[1] = vdupq_n_f32(0);
+ accumulator2.val[2] = vdupq_n_f32(0);
+ accumulator2.val[3] = vdupq_n_f32(0);
+
+ // 8 input regs, 8 accumulators -> 16/16 neon regs are used
+ for (number = 0; number < quarter_points; ++number) {
+ // vld4q de-interleaves 8 samples: val[0]/val[1] = re/im of even-indexed
+ // samples, val[2]/val[3] = re/im of odd-indexed samples
+ a_val = vld4q_f32((float*)a_ptr);
+ b_val = vld4q_f32((float*)b_ptr);
+ __VOLK_PREFETCH(a_ptr + 8);
+ __VOLK_PREFETCH(b_ptr + 8);
+
+ // use 2 accumulators to remove inter-instruction data dependencies
+ accumulator1.val[0] = vmlaq_f32(accumulator1.val[0], a_val.val[0], b_val.val[0]);
+ accumulator1.val[1] = vmlaq_f32(accumulator1.val[1], a_val.val[0], b_val.val[1]);
+
+ accumulator1.val[2] = vmlaq_f32(accumulator1.val[2], a_val.val[2], b_val.val[2]);
+ accumulator1.val[3] = vmlaq_f32(accumulator1.val[3], a_val.val[2], b_val.val[3]);
+
+ accumulator2.val[0] = vmlsq_f32(accumulator2.val[0], a_val.val[1], b_val.val[1]);
+ accumulator2.val[1] = vmlaq_f32(accumulator2.val[1], a_val.val[1], b_val.val[0]);
+
+ accumulator2.val[2] = vmlsq_f32(accumulator2.val[2], a_val.val[3], b_val.val[3]);
+ accumulator2.val[3] = vmlaq_f32(accumulator2.val[3], a_val.val[3], b_val.val[2]);
+ // increment pointers
+ a_ptr += 8;
+ b_ptr += 8;
+ }
+ // reduce 8 accumulator lanes down to 2 (1 real and 1 imag)
+ accumulator1.val[0] = vaddq_f32(accumulator1.val[0], accumulator1.val[2]);
+ accumulator1.val[1] = vaddq_f32(accumulator1.val[1], accumulator1.val[3]);
+ accumulator2.val[0] = vaddq_f32(accumulator2.val[0], accumulator2.val[2]);
+ accumulator2.val[1] = vaddq_f32(accumulator2.val[1], accumulator2.val[3]);
+ reduced_accumulator.val[0] = vaddq_f32(accumulator1.val[0], accumulator2.val[0]);
+ reduced_accumulator.val[1] = vaddq_f32(accumulator1.val[1], accumulator2.val[1]);
+ // now reduce accumulators to scalars
+ lv_32fc_t accum_result[4];
+ vst2q_f32((float*)accum_result, reduced_accumulator);
+ *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
+
+ // tail case
+ for (number = quarter_points * 8; number < num_points; ++number) {
+ *result += (*a_ptr++) * (*b_ptr++);
+ }
+}
+#endif /*LV_HAVE_NEON*/
+
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int isodd = num_points & 3;
+ unsigned int i = 0;
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2 * sizeof(float));
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
+
+ dotProdVal = _mm256_setzero_ps();
+
+ for (; number < quarterPoints; number++) {
+
+ x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+ y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+
+ tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
+
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+
+ z = _mm256_addsub_ps(tmp1,
+ tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ dotProdVal = _mm256_add_ps(dotProdVal,
+ z); // Add the complex multiplication results together
+
+ a += 4;
+ b += 4;
+ }
+
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+
+ _mm256_store_ps((float*)dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+ dotProductVector[3]);
+
+ for (i = num_points - isodd; i < num_points; i++) {
+ dotProduct += input[i] * taps[i];
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#if LV_HAVE_AVX && LV_HAVE_FMA
+#include <immintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_avx_fma(lv_32fc_t* result,
+ const lv_32fc_t* input,
+ const lv_32fc_t* taps,
+ unsigned int num_points)
+{
+
+ unsigned int isodd = num_points & 3;
+ unsigned int i = 0;
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2 * sizeof(float));
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
+
+ dotProdVal = _mm256_setzero_ps();
+
+ for (; number < quarterPoints; number++) {
+
+ x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+ y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+
+ tmp1 = x;
+
+ x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br,ei,er,fi,fr
+
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+
+ z = _mm256_fmaddsub_ps(
+ tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ dotProdVal = _mm256_add_ps(dotProdVal,
+ z); // Add the complex multiplication results together
+
+ a += 4;
+ b += 4;
+ }
+
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+
+ _mm256_store_ps((float*)dotProductVector,
+ dotProdVal); // Store the results back into the dot product vector
+
+ dotProduct += (dotProductVector[0] + dotProductVector[1] + dotProductVector[2] +
+ dotProductVector[3]);
+
+ for (i = num_points - isodd; i < num_points; i++) {
+ dotProduct += input[i] * taps[i];
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_AVX && LV_HAVE_FMA*/
+
+
+#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_x2_multiply_32fc
+ *
+ * \b Overview
+ *
+ * Multiplies two complex vectors and returns the complex result.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const
+ * lv_32fc_t* bVector, unsigned int num_points); \endcode
+ *
+ * \b Inputs
+ * \li aVector: The first input vector of complex floats.
+ * \li bVector: The second input vector of complex floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector of complex floats.
+ *
+ * \b Example
+ * Mix two signals at f=0.3 and 0.1.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* sig_1 = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* sig_2 = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * // Generate two tones
+ * float real_1 = std::cos(0.3f * (float)ii);
+ * float imag_1 = std::sin(0.3f * (float)ii);
+ * sig_1[ii] = lv_cmake(real_1, imag_1);
+ * float real_2 = std::cos(0.1f * (float)ii);
+ * float imag_2 = std::sin(0.1f * (float)ii);
+ * sig_2[ii] = lv_cmake(real_2, imag_2);
+ * }
+ *
+ * volk_32fc_x2_multiply_32fc(out, sig_1, sig_2, N);
+ *
+ * volk_free(sig_1);
+ * volk_free(sig_2);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
+#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+/*!
+ \brief Multiplies the two input complex vectors and stores their results in the
+ third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be
+ multiplied together and stored into cVector
+*/
+static inline void volk_32fc_x2_multiply_32fc_u_avx2_fma(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for (; number < quarterPoints; number++) {
+
+ const __m256 x =
+ _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ const __m256 y =
+ _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ const __m256 z = _mm256_fmaddsub_ps(
+ x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *c++ = (*a++) * (*b++);
+ }
+}
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
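+
+/*
+ * The AVX2/FMA kernel above replaces the separate multiply and addsub with a
+ * single _mm256_fmaddsub_ps: even lanes compute x*yl - tmp2, odd lanes compute
+ * x*yl + tmp2. A minimal scalar sketch for one complex pair (the helper name is
+ * illustrative only, not part of the VOLK API):
+ *
+ * \code
+ * // out = (ar + j*ai) * (cr + j*ci) via the fmaddsub lane pattern
+ * static inline void complex_mul_fmaddsub_sketch(
+ *     float ar, float ai, float cr, float ci, float* out_re, float* out_im)
+ * {
+ *     float tmp2_re = ai * ci; // from the shuffled x and yh = (ci, ci)
+ *     float tmp2_im = ar * ci;
+ *     *out_re = ar * cr - tmp2_re; // even lane: multiply-subtract
+ *     *out_im = ai * cr + tmp2_im; // odd lane: multiply-add
+ * }
+ * \endcode
+ */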
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_loadu_ps(
+ (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+ y = _mm256_loadu_ps(
+ (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+ z = _mm256_complexmul_ps(x, y);
+ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for (; number < num_points; number++) {
+ *c++ = (*a++) * (*b++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for (; number < halfPoints; number++) {
+ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ z = _mm_complexmul_ps(x, y);
+ _mm_storeu_ps((float*)c, z); // Store the results back into the C container
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+ if ((num_points % 2) != 0) {
+ *c = (*a) * (*b);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
+#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
+#define INCLUDED_volk_32fc_x2_multiply_32fc_a_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#if LV_HAVE_AVX2 && LV_HAVE_FMA
+#include <immintrin.h>
+/*!
+ \brief Multiplies the two input complex vectors and stores their results in the
+ third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to be
+ multiplied together and stored into cVector
+*/
+static inline void volk_32fc_x2_multiply_32fc_a_avx2_fma(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for (; number < quarterPoints; number++) {
+
+ const __m256 x =
+ _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ const __m256 y =
+ _mm256_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+ const __m256 yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+ const __m256 yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+ const __m256 tmp2x = _mm256_permute_ps(x, 0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ const __m256 tmp2 = _mm256_mul_ps(tmp2x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ const __m256 z = _mm256_fmaddsub_ps(
+ x, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+ _mm256_store_ps((float*)c, z); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *c++ = (*a++) * (*b++);
+ }
+}
+#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+ y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+ z = _mm256_complexmul_ps(x, y);
+ _mm256_store_ps((float*)c, z); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for (; number < num_points; number++) {
+ *c++ = (*a++) * (*b++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for (; number < halfPoints; number++) {
+ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ z = _mm_complexmul_ps(x, y);
+ _mm_store_ps((float*)c, z); // Store the results back into the C container
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+ if ((num_points % 2) != 0) {
+ *c = (*a) * (*b);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
+ lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
+ unsigned int quarter_points = num_points / 4;
+ float32x4x2_t a_val, b_val, c_val;
+ float32x4x2_t tmp_real, tmp_imag;
+ unsigned int number = 0;
+
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 4);
+ __VOLK_PREFETCH(b_ptr + 4);
+
+ // multiply the real*real and imag*imag to get real result
+ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+ tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
+
+ // Multiply cross terms to get the imaginary result
+ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
+ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+
+ // store the results
+ c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+ c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+ vst2q_f32((float*)cVector, c_val);
+
+ a_ptr += 4;
+ b_ptr += 4;
+ cVector += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cVector++ = (*a_ptr++) * (*b_ptr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_NEON
+
+static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
+ lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
+ unsigned int quarter_points = num_points / 4;
+ float32x4x2_t a_val, b_val;
+ float32x4x2_t tmp_imag;
+ unsigned int number = 0;
+
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ __VOLK_PREFETCH(a_ptr + 4);
+ __VOLK_PREFETCH(b_ptr + 4);
+
+ // do the first multiply
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+
+ // use multiply accumulate/subtract to get result
+ tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
+ tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
+
+ // store
+ vst2q_f32((float*)cVector, tmp_imag);
+ // increment pointers
+ a_ptr += 4;
+ b_ptr += 4;
+ cVector += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cVector++ = (*a_ptr++) * (*b_ptr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_NEONV7
+
+extern void volk_32fc_x2_multiply_32fc_a_neonasm(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points);
+#endif /* LV_HAVE_NEONV7 */
+
+
+#ifdef LV_HAVE_ORC
+
+extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points);
+
+static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_x2_multiply_conjugate_32fc
+ *
+ * \b Overview
+ *
+ * Multiplies a complex vector by the conjugate of a second complex
+ * vector and returns the complex result.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_x2_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector,
+ * const lv_32fc_t* bVector, unsigned int num_points); \endcode
+ *
+ * \b Inputs
+ * \li aVector: The first input vector of complex floats.
+ * \li bVector: The second input vector of complex floats that is conjugated.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li cVector: The output vector of complex floats.
+ *
+ * \b Example
+ * Calculate mag^2 of a signal using x * conj(x).
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* sig_1 = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ *
+ * float delta = 2.f*M_PI / (float)N;
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * float real_1 = std::cos(0.3f * (float)ii);
+ * float imag_1 = std::sin(0.3f * (float)ii);
+ * sig_1[ii] = lv_cmake(real_1, imag_1);
+ * }
+ *
+ * volk_32fc_x2_multiply_conjugate_32fc(out, sig_1, sig_1, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("%1.4f%+1.4fj,", lv_creal(out[ii]), lv_cimag(out[ii]));
+ * }
+ * printf("\n");
+ *
+ * volk_free(sig_1);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
+#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_loadu_ps(
+ (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+ y = _mm256_loadu_ps(
+ (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+ z = _mm256_complexconjugatemul_ps(x, y);
+ _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for (; number < num_points; number++) {
+ *c++ = (*a++) * lv_conj(*b++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void volk_32fc_x2_multiply_conjugate_32fc_u_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for (; number < halfPoints; number++) {
+ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ z = _mm_complexconjugatemul_ps(x, y);
+ _mm_storeu_ps((float*)c, z); // Store the results back into the C container
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+ if ((num_points % 2) != 0) {
+ *c = (*a) * lv_conj(*b);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H */
+#ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
+#define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
+ y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
+ z = _mm256_complexconjugatemul_ps(x, y);
+ _mm256_store_ps((float*)c, z); // Store the results back into the C container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for (; number < num_points; number++) {
+ *c++ = (*a++) * lv_conj(*b++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, y, z;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for (; number < halfPoints; number++) {
+ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
+ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
+ z = _mm_complexconjugatemul_ps(x, y);
+ _mm_store_ps((float*)c, z); // Store the results back into the C container
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+ if ((num_points % 2) != 0) {
+ *c = (*a) * lv_conj(*b);
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32fc_x2_multiply_conjugate_32fc_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
+ lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
+ unsigned int quarter_points = num_points / 4;
+ float32x4x2_t a_val, b_val, c_val;
+ float32x4x2_t tmp_real, tmp_imag;
+ unsigned int number = 0;
+
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+ b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+ b_val.val[1] = vnegq_f32(b_val.val[1]);
+ __VOLK_PREFETCH(a_ptr + 4);
+ __VOLK_PREFETCH(b_ptr + 4);
+
+ // multiply the real*real and imag*imag to get real result
+ // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+ tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
+ // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+ tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
+
+ // Multiply cross terms to get the imaginary result
+ // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+ tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
+ // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+ tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
+
+ // store the results
+ c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+ c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+ vst2q_f32((float*)cVector, c_val);
+
+ a_ptr += 4;
+ b_ptr += 4;
+ cVector += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cVector++ = (*a_ptr++) * lv_conj(*b_ptr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_x2_multiply_conjugate_32fc_a_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ unsigned int num_points)
+{
+ lv_32fc_t* cPtr = cVector;
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * lv_conj(*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014, 2019 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_x2_s32f_square_dist_scalar_mult_32f
+ *
+ * \b Overview
+ *
+ * Calculates the squared distance between a single complex input and each
+ * point in a complex vector, scaled by a scalar value.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_x2_s32f_square_dist_scalar_mult_32f(float* target, lv_32fc_t* src0,
+ * lv_32fc_t* points, float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li src0: The complex input. Only the first point is used.
+ * \li points: A complex vector of reference points.
+ * \li scalar: A float to scale the distances by
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target: A vector of distances between src0 and the vector of points.
+ *
+ * \b Example
+ * Calculate the distance between an input and the reference points of a square
+ * 16-QAM constellation. Normalize the distances by the area of the constellation.
+ * \code
+ * int N = 16;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* constellation = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* rx = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float const_vals[] = {-3, -1, 1, 3};
+ *
+ * unsigned int jj = 0;
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * constellation[ii] = lv_cmake(const_vals[ii%4], const_vals[jj]);
+ * if((ii+1)%4 == 0) ++jj;
+ * }
+ *
+ * *rx = lv_cmake(0.5f, 2.f);
+ * float scale = 1.f/64.f; // 1 / constellation area
+ *
+ * volk_32fc_x2_s32f_square_dist_scalar_mult_32f(out, rx, constellation, scale, N);
+ *
+ * printf("Distance from each constellation point:\n");
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("%.4f ", out[ii]);
+ * if((ii+1)%4 == 0) printf("\n");
+ * }
+ *
+ * volk_free(rx);
+ * volk_free(constellation);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
+#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
+
+#include <volk/volk_complex.h>
+
+
+static inline void calculate_scaled_distances(float* target,
+ const lv_32fc_t symbol,
+ const lv_32fc_t* points,
+ const float scalar,
+ const unsigned int num_points)
+{
+ lv_32fc_t diff;
+ for (unsigned int i = 0; i < num_points; ++i) {
+ /*
+ * Calculate: |y - x|^2 * SNR_lin
+ * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++);
+ */
+ diff = symbol - *points++;
+ *target++ =
+ scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
+ }
+}
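+
+/*
+ * A minimal usage sketch of the helper above (the function name and the chosen
+ * constellation values are illustrative only):
+ *
+ * \code
+ * static inline void scaled_distance_example(float* dists4)
+ * {
+ *     const lv_32fc_t ref[4] = { lv_cmake(-1.f, -1.f), lv_cmake(-1.f, 1.f),
+ *                                lv_cmake(1.f, -1.f), lv_cmake(1.f, 1.f) };
+ *     // dists4[i] = |(0.5 + 0.25j) - ref[i]|^2 scaled by 1.0f
+ *     calculate_scaled_distances(dists4, lv_cmake(0.5f, 0.25f), ref, 1.f, 4);
+ * }
+ * \endcode
+ */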
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
+ unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points * 8;
+ __m128 xmm9, xmm10;
+ __m256 xmm4, xmm6;
+ __m256 xmm_points0, xmm_points1, xmm_result;
+
+ const unsigned int bound = num_bytes >> 6;
+
+ // load complex value into all parts of the register.
+ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
+ const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
+
+ // Load scalar into all 8 parts of the register
+ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
+ const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
+
+ // Set permutation constant
+ const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ for (unsigned int i = 0; i < bound; ++i) {
+ xmm_points0 = _mm256_load_ps((float*)points);
+ xmm_points1 = _mm256_load_ps((float*)(points + 4));
+ points += 8;
+ __VOLK_PREFETCH(points);
+
+ xmm_result = _mm256_scaled_norm_dist_ps_avx2(
+ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+
+ _mm256_store_ps(target, xmm_result);
+ target += 8;
+ }
+
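+ /*
+ * Tail handling: num_bytes = 8 * num_points, so each shift below tests the next
+ * smaller block. For example, num_points = 7 gives num_bytes = 56: bound = 56 >> 6
+ * = 0 full 8-point iterations, (56 >> 5) & 1 = 1 handles 4 points, (56 >> 4) & 1
+ * = 1 handles 2 points, and (num_bytes >> 3) & 1 = 1 leaves a single point for
+ * calculate_scaled_distances below.
+ */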
+ if (num_bytes >> 5 & 1) {
+ xmm_points0 = _mm256_load_ps((float*)points);
+
+ xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
+
+ points += 4;
+
+ xmm6 = _mm256_mul_ps(xmm4, xmm4);
+
+ xmm4 = _mm256_hadd_ps(xmm6, xmm6);
+ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+
+ xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
+
+ xmm9 = _mm256_extractf128_ps(xmm_result, 1);
+ _mm_store_ps(target, xmm9);
+ target += 4;
+ }
+
+ if (num_bytes >> 4 & 1) {
+ xmm9 = _mm_load_ps((float*)points);
+
+ xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
+
+ points += 2;
+
+ xmm9 = _mm_mul_ps(xmm10, xmm10);
+
+ xmm10 = _mm_hadd_ps(xmm9, xmm9);
+
+ xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
+
+ _mm_storeh_pi((__m64*)target, xmm10);
+ target += 2;
+ }
+
+ calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
+ unsigned int num_points)
+{
+ const int eightsPoints = num_points / 8;
+ const int remainder = num_points - 8 * eightsPoints;
+
+ __m256 xmm_points0, xmm_points1, xmm_result;
+
+ // load complex value into all parts of the register.
+ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
+
+ // Load scalar into all 8 parts of the register
+ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
+
+ for (int i = 0; i < eightsPoints; ++i) {
+ xmm_points0 = _mm256_load_ps((float*)points);
+ xmm_points1 = _mm256_load_ps((float*)(points + 4));
+ points += 8;
+
+ xmm_result = _mm256_scaled_norm_dist_ps(
+ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+
+ _mm256_store_ps(target, xmm_result);
+ target += 8;
+ }
+
+ const lv_32fc_t symbol = *src0;
+ calculate_scaled_distances(target, symbol, points, scalar, remainder);
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
+ unsigned int num_points)
+{
+ __m128 xmm_points0, xmm_points1, xmm_result;
+
+ /*
+ * First do 4 values in every loop iteration.
+ * There may be up to 3 values left.
+ * leftovers0 indicates if at least 2 more are available for SSE execution.
+ * leftovers1 indicates if there is a single element left.
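+ * For example, num_points = 7 gives quarterPoints = 1 (4 values in the main
+ * loop), leftovers0 = 1 (2 more values in one extra SSE step) and
+ * leftovers1 = 1 (a single value left for calculate_scaled_distances).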
+ */
+ const int quarterPoints = num_points / 4;
+ const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
+ const int leftovers1 = num_points % 2;
+
+ // load complex value into both parts of the register.
+ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
+
+ // Load scalar into all 4 parts of the register
+ const __m128 xmm_scalar = _mm_load1_ps(&scalar);
+
+ for (int i = 0; i < quarterPoints; ++i) {
+ xmm_points0 = _mm_load_ps((float*)points);
+ xmm_points1 = _mm_load_ps((float*)(points + 2));
+ points += 4;
+ __VOLK_PREFETCH(points);
+ // calculate distances
+ xmm_result = _mm_scaled_norm_dist_ps_sse3(
+ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+
+ _mm_store_ps(target, xmm_result);
+ target += 4;
+ }
+
+ for (int i = 0; i < leftovers0; ++i) {
+ xmm_points0 = _mm_load_ps((float*)points);
+ points += 2;
+
+ xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
+ xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
+ xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
+ xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
+
+ _mm_storeh_pi((__m64*)target, xmm_result);
+ target += 2;
+ }
+
+ calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_SSE
+#include <volk/volk_sse_intrinsics.h>
+#include <xmmintrin.h>
+static inline void
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
+ unsigned int num_points)
+{
+ const __m128 xmm_scalar = _mm_set1_ps(scalar);
+ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
+
+ for (unsigned i = 0; i < num_points / 4; ++i) {
+ __m128 xmm_points0 = _mm_load_ps((float*)points);
+ __m128 xmm_points1 = _mm_load_ps((float*)(points + 2));
+ points += 4;
+ __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
+ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+ _mm_store_ps((float*)target, xmm_result);
+ target += 4;
+ }
+
+ calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
+}
+#endif // LV_HAVE_SSE
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
+ unsigned int num_points)
+{
+ const lv_32fc_t symbol = *src0;
+ calculate_scaled_distances(target, symbol, points, scalar, num_points);
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/
+
+#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
+#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
+
+#include <volk/volk_complex.h>
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+#include <volk/volk_avx2_intrinsics.h>
+
+static inline void
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
+ unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points * 8;
+ __m128 xmm9, xmm10;
+ __m256 xmm4, xmm6;
+ __m256 xmm_points0, xmm_points1, xmm_result;
+
+ const unsigned int bound = num_bytes >> 6;
+
+ // load complex value into all parts of the register.
+ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
+ const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);
+
+ // Load scalar into all 8 parts of the register
+ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
+ const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);
+
+ // Set permutation constant
+ const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+
+ for (unsigned int i = 0; i < bound; ++i) {
+ xmm_points0 = _mm256_loadu_ps((float*)points);
+ xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
+ points += 8;
+ __VOLK_PREFETCH(points);
+
+ xmm_result = _mm256_scaled_norm_dist_ps_avx2(
+ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+
+ _mm256_storeu_ps(target, xmm_result);
+ target += 8;
+ }
+
+ if (num_bytes >> 5 & 1) {
+ xmm_points0 = _mm256_loadu_ps((float*)points);
+
+ xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);
+
+ points += 4;
+
+ xmm6 = _mm256_mul_ps(xmm4, xmm4);
+
+ xmm4 = _mm256_hadd_ps(xmm6, xmm6);
+ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+
+ xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);
+
+ xmm9 = _mm256_extractf128_ps(xmm_result, 1);
+ _mm_storeu_ps(target, xmm9);
+ target += 4;
+ }
+
+ if (num_bytes >> 4 & 1) {
+ xmm9 = _mm_loadu_ps((float*)points);
+
+ xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);
+
+ points += 2;
+
+ xmm9 = _mm_mul_ps(xmm10, xmm10);
+
+ xmm10 = _mm_hadd_ps(xmm9, xmm9);
+
+ xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);
+
+ _mm_storeh_pi((__m64*)target, xmm10);
+ target += 2;
+ }
+
+ calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
+ unsigned int num_points)
+{
+ const int eightsPoints = num_points / 8;
+ const int remainder = num_points - 8 * eightsPoints;
+
+ __m256 xmm_points0, xmm_points1, xmm_result;
+
+ // load complex value into all parts of the register.
+ const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
+
+ // Load scalar into all 8 parts of the register
+ const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
+
+ for (int i = 0; i < eightsPoints; ++i) {
+ xmm_points0 = _mm256_loadu_ps((float*)points);
+ xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
+ points += 8;
+
+ xmm_result = _mm256_scaled_norm_dist_ps(
+ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+
+ _mm256_storeu_ps(target, xmm_result);
+ target += 8;
+ }
+
+ const lv_32fc_t symbol = *src0;
+ calculate_scaled_distances(target, symbol, points, scalar, remainder);
+}
+
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
+ unsigned int num_points)
+{
+ __m128 xmm_points0, xmm_points1, xmm_result;
+
+ /*
+ * First do 4 values in every loop iteration.
+ * There may be up to 3 values left.
+ * leftovers0 indicates if at least 2 more are available for SSE execution.
+ * leftovers1 indicates if there is a single element left.
+ */
+ const int quarterPoints = num_points / 4;
+ const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
+ const int leftovers1 = num_points % 2;
+
+ // load complex value into both parts of the register.
+ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
+
+ // Load scalar into all 4 parts of the register
+ const __m128 xmm_scalar = _mm_load1_ps(&scalar);
+
+ for (int i = 0; i < quarterPoints; ++i) {
+ xmm_points0 = _mm_loadu_ps((float*)points);
+ xmm_points1 = _mm_loadu_ps((float*)(points + 2));
+ points += 4;
+ __VOLK_PREFETCH(points);
+ // calculate distances
+ xmm_result = _mm_scaled_norm_dist_ps_sse3(
+ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+
+ _mm_storeu_ps(target, xmm_result);
+ target += 4;
+ }
+
+ for (int i = 0; i < leftovers0; ++i) {
+ xmm_points0 = _mm_loadu_ps((float*)points);
+ points += 2;
+
+ xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
+ xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
+ xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
+ xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);
+
+ _mm_storeh_pi((__m64*)target, xmm_result);
+ target += 2;
+ }
+
+ calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#ifdef LV_HAVE_SSE
+#include <volk/volk_sse_intrinsics.h>
+#include <xmmintrin.h>
+static inline void
+volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ float scalar,
+ unsigned int num_points)
+{
+ const __m128 xmm_scalar = _mm_set1_ps(scalar);
+ const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));
+
+ for (unsigned i = 0; i < num_points / 4; ++i) {
+ __m128 xmm_points0 = _mm_loadu_ps((float*)points);
+ __m128 xmm_points1 = _mm_loadu_ps((float*)(points + 2));
+ points += 4;
+ __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
+ xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
+ _mm_storeu_ps((float*)target, xmm_result);
+ target += 4;
+ }
+
+ calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
+}
+#endif // LV_HAVE_SSE
+
+#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2019 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_x2_s32fc_multiply_conjugate_add_32fc
+ *
+ * \b Overview
+ *
+ * Conjugates the input complex vector, multiplies it by a complex scalar,
+ * adds the other input complex vector, and returns the result.
+ *
+ * c[i] = a[i] + conj(b[i]) * scalar
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(lv_32fc_t* cVector, const
+ * lv_32fc_t* aVector, const lv_32fc_t* bVector, const lv_32fc_t scalar, unsigned int
+ * num_points); \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector to be added.
+ * \li bVector: The input vector to be conjugated and multiplied.
+ * \li scalar: The complex scalar to multiply the conjugated bVector by.
+ * \li num_points: The number of complex values in aVector and bVector to be
+ * processed and stored into cVector.
+ *
+ * \b Outputs
+ * \li cVector: The vector where the results will be stored.
+ *
+ * \b Example
+ * Update the coefficients of an adaptive filter.
+ *
+ * \code
+ * int n_filter = 2 * N + 1;
+ * unsigned int alignment = volk_get_alignment();
+ *
+ * // state is a queue of input IQ data.
+ * lv_32fc_t* state = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * n_filter, alignment);
+ * // weight and next one.
+ * lv_32fc_t* weight = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * n_filter, alignment);
+ * lv_32fc_t* next = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * n_filter, alignment);
+ * ...
+ * // push back input IQ data into state.
+ * foo_push_back_queue(state, input);
+ *
+ * // get filtered output.
+ * lv_32fc_t output = lv_cmake(0.f,0.f);
+ * for (int i = 0; i < n_filter; i++) {
+ * output += state[i] * weight[i];
+ * }
+ *
+ * // update weight using output.
+ * float real = lv_creal(output) * (1.0 - std::norm(output)) * MU;
+ * lv_32fc_t factor = lv_cmake(real, 0.f);
+ * volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(next, weight, state, factor, n_filter);
+ * lv_32fc_t *tmp = next;
+ * next = weight;
+ * weight = tmp;
+ * weight[N + 1] = lv_cmake(lv_creal(weight[N + 1]), 0.f);
+ * ...
+ * volk_free(state);
+ * volk_free(weight);
+ * volk_free(next);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
+#define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
+
+#include <float.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
+ const lv_32fc_t* aPtr = aVector;
+ const lv_32fc_t* bPtr = bVector;
+ lv_32fc_t* cPtr = cVector;
+ unsigned int number = num_points;
+
+    // manually unrolled loop: 8 outputs per iteration
+ while (number >= 8) {
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
+ number -= 8;
+ }
+
+ // clean up any remaining
+ while (number-- > 0) {
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int isodd = num_points & 3;
+
+ __m256 x, y, s, z;
+ lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };
+
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+ lv_32fc_t* c = cVector;
+
+ // Set up constant scalar vector
+ s = _mm256_loadu_ps((float*)v_scalar);
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_loadu_ps((float*)b);
+ y = _mm256_loadu_ps((float*)a);
+ z = _mm256_complexconjugatemul_ps(s, x);
+ z = _mm256_add_ps(y, z);
+ _mm256_storeu_ps((float*)c, z);
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ for (i = num_points - isodd; i < num_points; i++) {
+ *c++ = (*a++) + lv_conj(*b++) * scalar;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, y, s, z;
+ lv_32fc_t v_scalar[2] = { scalar, scalar };
+
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+ lv_32fc_t* c = cVector;
+
+ // Set up constant scalar vector
+ s = _mm_loadu_ps((float*)v_scalar);
+
+ for (; number < halfPoints; number++) {
+ x = _mm_loadu_ps((float*)b);
+ y = _mm_loadu_ps((float*)a);
+ z = _mm_complexconjugatemul_ps(s, x);
+ z = _mm_add_ps(y, z);
+ _mm_storeu_ps((float*)c, z);
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+ if ((num_points % 2) != 0) {
+ *c = *a + lv_conj(*b) * scalar;
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+#include <volk/volk_avx_intrinsics.h>
+
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int isodd = num_points & 3;
+
+ __m256 x, y, s, z;
+ lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };
+
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+ lv_32fc_t* c = cVector;
+
+ // Set up constant scalar vector
+ s = _mm256_loadu_ps((float*)v_scalar);
+
+ for (; number < quarterPoints; number++) {
+ x = _mm256_load_ps((float*)b);
+ y = _mm256_load_ps((float*)a);
+ z = _mm256_complexconjugatemul_ps(s, x);
+ z = _mm256_add_ps(y, z);
+ _mm256_store_ps((float*)c, z);
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ for (i = num_points - isodd; i < num_points; i++) {
+ *c++ = (*a++) + lv_conj(*b++) * scalar;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <volk/volk_sse3_intrinsics.h>
+
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ __m128 x, y, s, z;
+ lv_32fc_t v_scalar[2] = { scalar, scalar };
+
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+ lv_32fc_t* c = cVector;
+
+ // Set up constant scalar vector
+ s = _mm_loadu_ps((float*)v_scalar);
+
+ for (; number < halfPoints; number++) {
+ x = _mm_load_ps((float*)b);
+ y = _mm_load_ps((float*)a);
+ z = _mm_complexconjugatemul_ps(s, x);
+ z = _mm_add_ps(y, z);
+ _mm_store_ps((float*)c, z);
+
+ a += 2;
+ b += 2;
+ c += 2;
+ }
+
+ if ((num_points % 2) != 0) {
+ *c = *a + lv_conj(*b) * scalar;
+ }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void
+volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector,
+ const lv_32fc_t* aVector,
+ const lv_32fc_t* bVector,
+ const lv_32fc_t scalar,
+ unsigned int num_points)
+{
+ const lv_32fc_t* bPtr = bVector;
+ const lv_32fc_t* aPtr = aVector;
+ lv_32fc_t* cPtr = cVector;
+ unsigned int number = num_points;
+ unsigned int quarter_points = num_points / 4;
+
+ float32x4x2_t a_val, b_val, c_val, scalar_val;
+ float32x4x2_t tmp_val;
+
+ scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
+ scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);
+
+ for (number = 0; number < quarter_points; ++number) {
+ a_val = vld2q_f32((float*)aPtr);
+ b_val = vld2q_f32((float*)bPtr);
+ b_val.val[1] = vnegq_f32(b_val.val[1]);
+ __VOLK_PREFETCH(aPtr + 8);
+ __VOLK_PREFETCH(bPtr + 8);
+
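+        // b_val is already conjugated (imaginary part negated above), so a
+        // plain complex multiply by the scalar below yields conj(b) * scalar,
+        // which is then added to a_val.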
+ tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]);
+ tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]);
+
+ tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]);
+ tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]);
+
+ c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
+ c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);
+
+ vst2q_f32((float*)cPtr, c_val);
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#endif /* INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32fc_x2_square_dist_32f
+ *
+ * \b Overview
+ *
+ * Calculates the square distance between a single complex input and each
+ * point in a complex vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32fc_x2_square_dist_32f(float* target, lv_32fc_t* src0, lv_32fc_t* points,
+ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li src0: The complex input. Only the first point is used.
+ * \li points: A complex vector of reference points.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li target: A vector of distances between src0 and the vector of points.
+ *
+ * \b Example
+ * Calculate the distance between an input and reference points in a square
+ * 16-qam constellation.
+ * \code
+ * int N = 16;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_32fc_t* constellation = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * lv_32fc_t* rx = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ * float const_vals[] = {-3, -1, 1, 3};
+ *
+ * // Generate 16-QAM constellation points
+ * unsigned int jj = 0;
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * constellation[ii] = lv_cmake(const_vals[ii%4], const_vals[jj]);
+ * if((ii+1)%4 == 0) ++jj;
+ * }
+ *
+ * *rx = lv_cmake(0.5f, 2.f);
+ *
+ * volk_32fc_x2_square_dist_32f(out, rx, constellation, N);
+ *
+ * printf("Distance from each constellation point:\n");
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("%.4f ", out[ii]);
+ * if((ii+1)%4 == 0) printf("\n");
+ * }
+ *
+ * volk_free(rx);
+ * volk_free(constellation);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
+#define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points * 8;
+ __m128 xmm0, xmm9, xmm10;
+ __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+ lv_32fc_t diff;
+ float sq_dist;
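+    // num_bytes is 8 * num_points: bound counts full 8-point AVX2 iterations,
+    // leftovers0 flags 4 remaining points, leftovers1 flags 2 remaining points
+    // and leftovers2 flags a final single point.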
+ int bound = num_bytes >> 6;
+ int leftovers0 = (num_bytes >> 5) & 1;
+ int leftovers1 = (num_bytes >> 4) & 1;
+ int leftovers2 = (num_bytes >> 3) & 1;
+ int i = 0;
+
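+    // _mm256_hadd_ps interleaves the two 128-bit lanes of its inputs; this
+    // permutation index restores the natural ordering of the eight results.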
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+ xmm1 = _mm256_setzero_ps();
+ xmm0 = _mm_load_ps((float*)src0);
+ xmm0 = _mm_permute_ps(xmm0, 0b01000100);
+ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
+ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
+
+ for (; i < bound; ++i) {
+ xmm2 = _mm256_load_ps((float*)&points[0]);
+ xmm3 = _mm256_load_ps((float*)&points[4]);
+ points += 8;
+
+ xmm4 = _mm256_sub_ps(xmm1, xmm2);
+ xmm5 = _mm256_sub_ps(xmm1, xmm3);
+ xmm6 = _mm256_mul_ps(xmm4, xmm4);
+ xmm7 = _mm256_mul_ps(xmm5, xmm5);
+
+ xmm4 = _mm256_hadd_ps(xmm6, xmm7);
+ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+
+ _mm256_store_ps(target, xmm4);
+
+ target += 8;
+ }
+
+ for (i = 0; i < leftovers0; ++i) {
+
+ xmm2 = _mm256_load_ps((float*)&points[0]);
+
+ xmm4 = _mm256_sub_ps(xmm1, xmm2);
+
+ points += 4;
+
+ xmm6 = _mm256_mul_ps(xmm4, xmm4);
+
+ xmm4 = _mm256_hadd_ps(xmm6, xmm6);
+ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+
+ xmm9 = _mm256_extractf128_ps(xmm4, 1);
+ _mm_store_ps(target, xmm9);
+
+ target += 4;
+ }
+
+ for (i = 0; i < leftovers1; ++i) {
+ xmm9 = _mm_load_ps((float*)&points[0]);
+
+ xmm10 = _mm_sub_ps(xmm0, xmm9);
+
+ points += 2;
+
+ xmm9 = _mm_mul_ps(xmm10, xmm10);
+
+ xmm10 = _mm_hadd_ps(xmm9, xmm9);
+
+ _mm_storeh_pi((__m64*)target, xmm10);
+
+ target += 2;
+ }
+
+ for (i = 0; i < leftovers2; ++i) {
+
+ diff = src0[0] - points[0];
+
+ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+
+ target[0] = sq_dist;
+ }
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+#include <xmmintrin.h>
+
+static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points * 8;
+
+ __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+ lv_32fc_t diff;
+ float sq_dist;
+ int bound = num_bytes >> 5;
+ int i = 0;
+
+ xmm1 = _mm_setzero_ps();
+ xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
+ xmm2 = _mm_load_ps((float*)&points[0]);
+ xmm1 = _mm_movelh_ps(xmm1, xmm1);
+ xmm3 = _mm_load_ps((float*)&points[2]);
+
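+    // The loop is software pipelined: the loads for the next iteration are
+    // issued before the current results are stored, so the last full 4-point
+    // block is peeled off and handled after the loop.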
+ for (; i < bound - 1; ++i) {
+ xmm4 = _mm_sub_ps(xmm1, xmm2);
+ xmm5 = _mm_sub_ps(xmm1, xmm3);
+ points += 4;
+ xmm6 = _mm_mul_ps(xmm4, xmm4);
+ xmm7 = _mm_mul_ps(xmm5, xmm5);
+
+ xmm2 = _mm_load_ps((float*)&points[0]);
+
+ xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+ xmm3 = _mm_load_ps((float*)&points[2]);
+
+ _mm_store_ps(target, xmm4);
+
+ target += 4;
+ }
+
+ xmm4 = _mm_sub_ps(xmm1, xmm2);
+ xmm5 = _mm_sub_ps(xmm1, xmm3);
+
+ points += 4;
+ xmm6 = _mm_mul_ps(xmm4, xmm4);
+ xmm7 = _mm_mul_ps(xmm5, xmm5);
+
+ xmm4 = _mm_hadd_ps(xmm6, xmm7);
+
+ _mm_store_ps(target, xmm4);
+
+ target += 4;
+
+ if (num_bytes >> 4 & 1) {
+
+ xmm2 = _mm_load_ps((float*)&points[0]);
+
+ xmm4 = _mm_sub_ps(xmm1, xmm2);
+
+ points += 2;
+
+ xmm6 = _mm_mul_ps(xmm4, xmm4);
+
+ xmm4 = _mm_hadd_ps(xmm6, xmm6);
+
+ _mm_storeh_pi((__m64*)target, xmm4);
+
+ target += 2;
+ }
+
+ if (num_bytes >> 3 & 1) {
+
+ diff = src0[0] - points[0];
+
+ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+
+ target[0] = sq_dist;
+ }
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_32fc_x2_square_dist_32f_neon(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ unsigned int num_points)
+{
+ const unsigned int quarter_points = num_points / 4;
+ unsigned int number;
+
+ float32x4x2_t a_vec, b_vec;
+ float32x4x2_t diff_vec;
+ float32x4_t tmp, tmp1, dist_sq;
+ a_vec.val[0] = vdupq_n_f32(lv_creal(src0[0]));
+ a_vec.val[1] = vdupq_n_f32(lv_cimag(src0[0]));
+ for (number = 0; number < quarter_points; ++number) {
+ b_vec = vld2q_f32((float*)points);
+ diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
+ diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
+ tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
+ tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);
+
+ dist_sq = vaddq_f32(tmp, tmp1);
+ vst1q_f32(target, dist_sq);
+ points += 4;
+ target += 4;
+ }
+ for (number = quarter_points * 4; number < num_points; ++number) {
+ lv_32fc_t diff = src0[0] - *points++;
+ *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32fc_x2_square_dist_32f_generic(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points * 8;
+
+ lv_32fc_t diff;
+ float sq_dist;
+ unsigned int i = 0;
+
+    for (; i < (num_bytes >> 3); ++i) {
+ diff = src0[0] - points[i];
+
+ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+
+ target[i] = sq_dist;
+ }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/
+
+#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H
+#define INCLUDED_volk_32fc_x2_square_dist_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target,
+ lv_32fc_t* src0,
+ lv_32fc_t* points,
+ unsigned int num_points)
+{
+ const unsigned int num_bytes = num_points * 8;
+ __m128 xmm0, xmm9;
+ __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+
+ lv_32fc_t diff;
+ float sq_dist;
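+    // num_bytes is 8 * num_points: bound counts full 8-point AVX2 iterations,
+    // the 4-point block below runs when bit 2 of num_points is set, and
+    // leftovers1 (num_points modulo 4) covers the remaining points one by one.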
+ int bound = num_bytes >> 6;
+ int leftovers1 = (num_bytes >> 3) & 0b11;
+ int i = 0;
+
+ __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
+ xmm1 = _mm256_setzero_ps();
+ xmm0 = _mm_loadu_ps((float*)src0);
+ xmm0 = _mm_permute_ps(xmm0, 0b01000100);
+ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
+ xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);
+
+ for (; i < bound; ++i) {
+ xmm2 = _mm256_loadu_ps((float*)&points[0]);
+ xmm3 = _mm256_loadu_ps((float*)&points[4]);
+ points += 8;
+
+ xmm4 = _mm256_sub_ps(xmm1, xmm2);
+ xmm5 = _mm256_sub_ps(xmm1, xmm3);
+ xmm6 = _mm256_mul_ps(xmm4, xmm4);
+ xmm7 = _mm256_mul_ps(xmm5, xmm5);
+
+ xmm4 = _mm256_hadd_ps(xmm6, xmm7);
+ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+
+ _mm256_storeu_ps(target, xmm4);
+
+ target += 8;
+ }
+
+ if (num_bytes >> 5 & 1) {
+
+ xmm2 = _mm256_loadu_ps((float*)&points[0]);
+
+ xmm4 = _mm256_sub_ps(xmm1, xmm2);
+
+ points += 4;
+
+ xmm6 = _mm256_mul_ps(xmm4, xmm4);
+
+ xmm4 = _mm256_hadd_ps(xmm6, xmm6);
+ xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);
+
+ xmm9 = _mm256_extractf128_ps(xmm4, 1);
+ _mm_storeu_ps(target, xmm9);
+
+ target += 4;
+ }
+
+ for (i = 0; i < leftovers1; ++i) {
+
+ diff = src0[0] - points[0];
+ points += 1;
+
+ sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
+
+ target[0] = sq_dist;
+ target += 1;
+ }
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_u_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32i_s32f_convert_32f
+ *
+ * \b Overview
+ *
+ * Converts the samples in the inputVector from 32-bit integers into
+ * floating point values and then divides them by the input scalar.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32i_s32f_convert_32f(float* outputVector, const int32_t* inputVector, const
+ * float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The vector of 32-bit integers.
+ * \li scalar: The value that the output is divided by after being converted to a float.
+ * \li num_points: The number of values.
+ *
+ * \b Outputs
+ * \li outputVector: The output vector of floats.
+ *
+ * \b Example
+ * Convert the integers 0 through N-1 to floats in the range [0, 1).
+ * \code
+ * int N = 1<<8;
+ * unsigned int alignment = volk_get_alignment();
+ *
+ * int32_t* x = (int32_t*)volk_malloc(N*sizeof(int32_t), alignment);
+ * float* z = (float*)volk_malloc(N*sizeof(float), alignment);
+ * float scale = (float)N;
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * x[ii] = ii;
+ * }
+ *
+ * volk_32i_s32f_convert_32f(z, x, scale, N);
+ *
+ * volk_free(x);
+ * volk_free(z);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32i_s32f_convert_32f_u_H
+#define INCLUDED_volk_32i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_32i_s32f_convert_32f_u_avx512f(float* outputVector,
+ const int32_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int onesixteenthPoints = num_points / 16;
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m512 invScalar = _mm512_set1_ps(iScalar);
+ int32_t* inputPtr = (int32_t*)inputVector;
+ __m512i inputVal;
+ __m512 ret;
+
+ for (; number < onesixteenthPoints; number++) {
+ // Load the values
+ inputVal = _mm512_loadu_si512((__m512i*)inputPtr);
+
+ ret = _mm512_cvtepi32_ps(inputVal);
+ ret = _mm512_mul_ps(ret, invScalar);
+
+ _mm512_storeu_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 16;
+ inputPtr += 16;
+ }
+
+ number = onesixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) * iScalar;
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32i_s32f_convert_32f_u_avx2(float* outputVector,
+ const int32_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int oneEightPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ int32_t* inputPtr = (int32_t*)inputVector;
+ __m256i inputVal;
+ __m256 ret;
+
+ for (; number < oneEightPoints; number++) {
+        // Load the 8 values
+ inputVal = _mm256_loadu_si256((__m256i*)inputPtr);
+
+ ret = _mm256_cvtepi32_ps(inputVal);
+ ret = _mm256_mul_ps(ret, invScalar);
+
+ _mm256_storeu_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 8;
+ inputPtr += 8;
+ }
+
+ number = oneEightPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) * iScalar;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32i_s32f_convert_32f_u_sse2(float* outputVector,
+ const int32_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ int32_t* inputPtr = (int32_t*)inputVector;
+ __m128i inputVal;
+ __m128 ret;
+
+ for (; number < quarterPoints; number++) {
+ // Load the 4 values
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+
+ _mm_storeu_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+ inputPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) * iScalar;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32i_s32f_convert_32f_generic(float* outputVector,
+ const int32_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* outputVectorPtr = outputVector;
+ const int32_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ const float iScalar = 1.0 / scalar;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32i_s32f_convert_32f_u_H */
+
+
+#ifndef INCLUDED_volk_32i_s32f_convert_32f_a_H
+#define INCLUDED_volk_32i_s32f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_32i_s32f_convert_32f_a_avx512f(float* outputVector,
+ const int32_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int onesixteenthPoints = num_points / 16;
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m512 invScalar = _mm512_set1_ps(iScalar);
+ int32_t* inputPtr = (int32_t*)inputVector;
+ __m512i inputVal;
+ __m512 ret;
+
+ for (; number < onesixteenthPoints; number++) {
+ // Load the values
+ inputVal = _mm512_load_si512((__m512i*)inputPtr);
+
+ ret = _mm512_cvtepi32_ps(inputVal);
+ ret = _mm512_mul_ps(ret, invScalar);
+
+ _mm512_store_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 16;
+ inputPtr += 16;
+ }
+
+ number = onesixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) * iScalar;
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32i_s32f_convert_32f_a_avx2(float* outputVector,
+ const int32_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int oneEightPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ int32_t* inputPtr = (int32_t*)inputVector;
+ __m256i inputVal;
+ __m256 ret;
+
+ for (; number < oneEightPoints; number++) {
+        // Load the 8 values
+ inputVal = _mm256_load_si256((__m256i*)inputPtr);
+
+ ret = _mm256_cvtepi32_ps(inputVal);
+ ret = _mm256_mul_ps(ret, invScalar);
+
+ _mm256_store_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 8;
+ inputPtr += 8;
+ }
+
+ number = oneEightPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) * iScalar;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32i_s32f_convert_32f_a_sse2(float* outputVector,
+ const int32_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ int32_t* inputPtr = (int32_t*)inputVector;
+ __m128i inputVal;
+ __m128 ret;
+
+ for (; number < quarterPoints; number++) {
+ // Load the 4 values
+ inputVal = _mm_load_si128((__m128i*)inputPtr);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+
+ _mm_store_ps(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+ inputPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = ((float)(inputVector[number])) * iScalar;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32i_s32f_convert_32f_a_generic(float* outputVector,
+ const int32_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* outputVectorPtr = outputVector;
+ const int32_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ const float iScalar = 1.0 / scalar;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32i_s32f_convert_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32i_x2_and_32i
+ *
+ * \b Overview
+ *
+ * Computes the bitwise AND of two input 32-bit integer vectors.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32i_x2_and_32i(int32_t* cVector, const int32_t* aVector, const int32_t*
+ * bVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: Input vector of samples.
+ * \li bVector: Input vector of samples.
+ * \li num_points: The number of values.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * This example generates a Karnaugh map for the lower two bits of x AND y.
+ * \code
+ * int N = 1<<4;
+ * unsigned int alignment = volk_get_alignment();
+ *
+ * int32_t* x = (int32_t*)volk_malloc(N*sizeof(int32_t), alignment);
+ * int32_t* y = (int32_t*)volk_malloc(N*sizeof(int32_t), alignment);
+ * int32_t* z = (int32_t*)volk_malloc(N*sizeof(int32_t), alignment);
+ * int32_t in_seq[] = {0,1,3,2};
+ * unsigned int jj=0;
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * x[ii] = in_seq[ii%4];
+ * y[ii] = in_seq[jj];
+ * if(((ii+1) % 4) == 0) jj++;
+ * }
+ *
+ * volk_32i_x2_and_32i(z, x, y, N);
+ *
+ * printf("Karnaugh map for x AND y\n");
+ * printf("y\\x|");
+ * for(unsigned int ii=0; ii<4; ++ii){
+ * printf(" %.2x ", in_seq[ii]);
+ * }
+ * printf("\n---|---------------\n");
+ * jj = 0;
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * if(((ii+1) % 4) == 1){
+ * printf("%.2x | ", in_seq[jj++]);
+ * }
+ * printf("%.2x ", z[ii]);
+ * if(!((ii+1) % 4)){
+ * printf("\n");
+ * }
+ * }
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32i_x2_and_32i_a_H
+#define INCLUDED_volk_32i_x2_and_32i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_32i_x2_and_32i_a_avx512f(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ int32_t* cPtr = (int32_t*)cVector;
+ const int32_t* aPtr = (int32_t*)aVector;
+ const int32_t* bPtr = (int32_t*)bVector;
+
+ __m512i aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+
+ aVal = _mm512_load_si512(aPtr);
+ bVal = _mm512_load_si512(bPtr);
+
+ cVal = _mm512_and_si512(aVal, bVal);
+
+ _mm512_store_si512(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] & bVector[number];
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32i_x2_and_32i_a_avx2(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int oneEightPoints = num_points / 8;
+
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr = bVector;
+
+ __m256i aVal, bVal, cVal;
+ for (; number < oneEightPoints; number++) {
+
+ aVal = _mm256_load_si256((__m256i*)aPtr);
+ bVal = _mm256_load_si256((__m256i*)bPtr);
+
+ cVal = _mm256_and_si256(aVal, bVal);
+
+ _mm256_store_si256((__m256i*)cPtr,
+ cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = oneEightPoints * 8;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] & bVector[number];
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32i_x2_and_32i_a_sse(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
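+    // Plain SSE has no integer AND instruction, so the 32-bit integers are
+    // reinterpreted as floats and combined with _mm_and_ps; the operation is
+    // purely bitwise, so the type pun does not alter any values.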
+ float* cPtr = (float*)cVector;
+ const float* aPtr = (float*)aVector;
+ const float* bPtr = (float*)bVector;
+
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_and_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] & bVector[number];
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32i_x2_and_32i_neon(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
+{
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr = bVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ int32x4_t a_val, b_val, c_val;
+
+ for (number = 0; number < quarter_points; number++) {
+ a_val = vld1q_s32(aPtr);
+ b_val = vld1q_s32(bPtr);
+ c_val = vandq_s32(a_val, b_val);
+ vst1q_s32(cPtr, c_val);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) & (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32i_x2_and_32i_generic(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
+{
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) & (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+extern void volk_32i_x2_and_32i_a_orc_impl(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points);
+
+static inline void volk_32i_x2_and_32i_u_orc(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
+{
+ volk_32i_x2_and_32i_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32i_x2_and_32i_a_H */
+
+
+#ifndef INCLUDED_volk_32i_x2_and_32i_u_H
+#define INCLUDED_volk_32i_x2_and_32i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_32i_x2_and_32i_u_avx512f(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ int32_t* cPtr = (int32_t*)cVector;
+ const int32_t* aPtr = (int32_t*)aVector;
+ const int32_t* bPtr = (int32_t*)bVector;
+
+ __m512i aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+
+ aVal = _mm512_loadu_si512(aPtr);
+ bVal = _mm512_loadu_si512(bPtr);
+
+ cVal = _mm512_and_si512(aVal, bVal);
+
+ _mm512_storeu_si512(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] & bVector[number];
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32i_x2_and_32i_u_avx2(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int oneEightPoints = num_points / 8;
+
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr = bVector;
+
+ __m256i aVal, bVal, cVal;
+ for (; number < oneEightPoints; number++) {
+
+ aVal = _mm256_loadu_si256((__m256i*)aPtr);
+ bVal = _mm256_loadu_si256((__m256i*)bPtr);
+
+ cVal = _mm256_and_si256(aVal, bVal);
+
+ _mm256_storeu_si256((__m256i*)cPtr,
+ cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = oneEightPoints * 8;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] & bVector[number];
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#endif /* INCLUDED_volk_32i_x2_and_32i_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32i_x2_or_32i
+ *
+ * \b Overview
+ *
+ * Computes the bitwise OR of two input 32-bit integer vectors.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32i_x2_or_32i(int32_t* cVector, const int32_t* aVector, const int32_t*
+ * bVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: Input vector of samples.
+ * \li bVector: Input vector of samples.
+ * \li num_points: The number of values.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * This example generates a Karnaugh map for the lower two bits of x OR y.
+ * \code
+ * int N = 1<<4;
+ * unsigned int alignment = volk_get_alignment();
+ *
+ * int32_t* x = (int32_t*)volk_malloc(N*sizeof(int32_t), alignment);
+ * int32_t* y = (int32_t*)volk_malloc(N*sizeof(int32_t), alignment);
+ * int32_t* z = (int32_t*)volk_malloc(N*sizeof(int32_t), alignment);
+ * int32_t in_seq[] = {0,1,3,2};
+ * unsigned int jj=0;
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * x[ii] = in_seq[ii%4];
+ * y[ii] = in_seq[jj];
+ * if(((ii+1) % 4) == 0) jj++;
+ * }
+ *
+ * volk_32i_x2_or_32i(z, x, y, N);
+ *
+ * printf("Karnaugh map for x OR y\n");
+ * printf("y\\x|");
+ * for(unsigned int ii=0; ii<4; ++ii){
+ * printf(" %.2x ", in_seq[ii]);
+ * }
+ * printf("\n---|---------------\n");
+ * jj = 0;
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * if(((ii+1) % 4) == 1){
+ * printf("%.2x | ", in_seq[jj++]);
+ * }
+ * printf("%.2x ", z[ii]);
+ * if(!((ii+1) % 4)){
+ * printf("\n");
+ * }
+ * }
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32i_x2_or_32i_a_H
+#define INCLUDED_volk_32i_x2_or_32i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_32i_x2_or_32i_a_avx512f(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ int32_t* cPtr = (int32_t*)cVector;
+ const int32_t* aPtr = (int32_t*)aVector;
+ const int32_t* bPtr = (int32_t*)bVector;
+
+ __m512i aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+
+ aVal = _mm512_load_si512(aPtr);
+ bVal = _mm512_load_si512(bPtr);
+
+ cVal = _mm512_or_si512(aVal, bVal);
+
+ _mm512_store_si512(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] | bVector[number];
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32i_x2_or_32i_a_avx2(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int oneEightPoints = num_points / 8;
+
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr = bVector;
+
+ __m256i aVal, bVal, cVal;
+ for (; number < oneEightPoints; number++) {
+
+ aVal = _mm256_load_si256((__m256i*)aPtr);
+ bVal = _mm256_load_si256((__m256i*)bPtr);
+
+ cVal = _mm256_or_si256(aVal, bVal);
+
+ _mm256_store_si256((__m256i*)cPtr,
+ cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = oneEightPoints * 8;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] | bVector[number];
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ float* cPtr = (float*)cVector;
+ const float* aPtr = (float*)aVector;
+ const float* bPtr = (float*)bVector;
+
+ __m128 aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+ aVal = _mm_load_ps(aPtr);
+ bVal = _mm_load_ps(bPtr);
+
+ cVal = _mm_or_ps(aVal, bVal);
+
+ _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] | bVector[number];
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32i_x2_or_32i_neon(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
+{
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr = bVector;
+ unsigned int number = 0;
+ unsigned int quarter_points = num_points / 4;
+
+ int32x4_t a_val, b_val, c_val;
+
+ for (number = 0; number < quarter_points; number++) {
+ a_val = vld1q_s32(aPtr);
+ b_val = vld1q_s32(bPtr);
+ c_val = vorrq_s32(a_val, b_val);
+ vst1q_s32(cPtr, c_val);
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ for (number = quarter_points * 4; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) | (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32i_x2_or_32i_generic(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
+{
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) | (*bPtr++);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points);
+
+static inline void volk_32i_x2_or_32i_u_orc(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
+{
+ volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_volk_32i_x2_or_32i_a_H */
+
+
+#ifndef INCLUDED_volk_32i_x2_or_32i_u_H
+#define INCLUDED_volk_32i_x2_or_32i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_32i_x2_or_32i_u_avx512f(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ int32_t* cPtr = (int32_t*)cVector;
+ const int32_t* aPtr = (int32_t*)aVector;
+ const int32_t* bPtr = (int32_t*)bVector;
+
+ __m512i aVal, bVal, cVal;
+ for (; number < sixteenthPoints; number++) {
+
+ aVal = _mm512_loadu_si512(aPtr);
+ bVal = _mm512_loadu_si512(bPtr);
+
+ cVal = _mm512_or_si512(aVal, bVal);
+
+ _mm512_storeu_si512(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 16;
+ bPtr += 16;
+ cPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] | bVector[number];
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_32i_x2_or_32i_u_avx2(int32_t* cVector,
+ const int32_t* aVector,
+ const int32_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int oneEightPoints = num_points / 8;
+
+ int32_t* cPtr = cVector;
+ const int32_t* aPtr = aVector;
+ const int32_t* bPtr = bVector;
+
+ __m256i aVal, bVal, cVal;
+ for (; number < oneEightPoints; number++) {
+
+ aVal = _mm256_loadu_si256((__m256i*)aPtr);
+ bVal = _mm256_loadu_si256((__m256i*)bPtr);
+
+ cVal = _mm256_or_si256(aVal, bVal);
+
+ _mm256_storeu_si256((__m256i*)cPtr,
+ cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = oneEightPoints * 8;
+ for (; number < num_points; number++) {
+ cVector[number] = aVector[number] | bVector[number];
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#endif /* INCLUDED_volk_32i_x2_or_32i_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32u_byteswap
+ *
+ * \b Overview
+ *
+ * Byteswaps (in-place) a vector of uint32_t's.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32u_byteswap(uint32_t* intsToSwap, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li intsToSwap: The vector of data to byte swap.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li intsToSwap: The byte-swapped data, returned in place.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ *
+ * uint32_t bitstring[] = {0x0, 0x1, 0xf, 0xffffffff,
+ * 0x5a5a5a5a, 0xa5a5a5a5, 0x2a2a2a2a,
+ * 0xffffffff, 0x32, 0x64};
+ *
+ * printf("byteswap vector =\n");
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * printf(" %.8x\n", bitstring[ii]);
+ * }
+ *
+ * volk_32u_byteswap(bitstring, N);
+ *
+ * printf("byteswapped vector =\n");
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * printf(" %.8x\n", bitstring[ii]);
+ * }
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_32u_byteswap_u_H
+#define INCLUDED_volk_32u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+static inline void volk_32u_byteswap_u_avx2(uint32_t* intsToSwap, unsigned int num_points)
+{
+
+ unsigned int number;
+
+ const unsigned int nPerSet = 8;
+ const uint64_t nSets = num_points / nPerSet;
+
+ uint32_t* inputPtr = intsToSwap;
+
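+    // Shuffle control that reverses the four bytes of every 32-bit word.
+    // _mm256_shuffle_epi8 indexes within each 128-bit lane using only the low
+    // four index bits, so entries 16..31 address bytes 0..15 of the upper lane.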
+ const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9,
+ 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
+ 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
+
+ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector);
+
+ for (number = 0; number < nSets; number++) {
+
+        // Load the 32-bit values; inputPtr is advanced later (in-place swap).
+ const __m256i input = _mm256_loadu_si256((__m256i*)inputPtr);
+ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
+
+ // Store the results
+ _mm256_storeu_si256((__m256i*)inputPtr, output);
+ inputPtr += nPerSet;
+ }
+
+ // Byteswap any remaining points:
+ for (number = nSets * nPerSet; number < num_points; number++) {
+ uint32_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
+ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_32u_byteswap_u_sse2(uint32_t* intsToSwap, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ uint32_t* inputPtr = intsToSwap;
+ __m128i input, byte1, byte2, byte3, byte4, output;
+ __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+ __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+
+ const uint64_t quarterPoints = num_points / 4;
+ for (; number < quarterPoints; number++) {
+        // Load the 32-bit values; inputPtr is advanced later (in-place swap).
+ input = _mm_loadu_si128((__m128i*)inputPtr);
+ // Do the four shifts
+ byte1 = _mm_slli_epi32(input, 24);
+ byte2 = _mm_slli_epi32(input, 8);
+ byte3 = _mm_srli_epi32(input, 8);
+ byte4 = _mm_srli_epi32(input, 24);
+ // Or bytes together
+ output = _mm_or_si128(byte1, byte4);
+ byte2 = _mm_and_si128(byte2, byte2mask);
+ output = _mm_or_si128(output, byte2);
+ byte3 = _mm_and_si128(byte3, byte3mask);
+ output = _mm_or_si128(output, byte3);
+ // Store the results
+ _mm_storeu_si128((__m128i*)inputPtr, output);
+ inputPtr += 4;
+ }
+
+ // Byteswap any remaining points:
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ uint32_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
+ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_32u_byteswap_neon(uint32_t* intsToSwap, unsigned int num_points)
+{
+ uint32_t* inputPtr = intsToSwap;
+ unsigned int number = 0;
+ unsigned int n8points = num_points / 8;
+
+ uint8x8x4_t input_table;
+ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+ /* these magic numbers are used as byte-indices in the LUT.
+ they are pre-computed to save time. A simple C program
+ can calculate them; for example for lookup01:
+ uint8_t chars[8] = {24, 16, 8, 0, 25, 17, 9, 1};
+ for(ii=0; ii < 8; ++ii) {
+ index += ((uint64_t)(*(chars+ii))) << (ii*8);
+ }
+ */
+ int_lookup01 = vcreate_u8(74609667900706840);
+ int_lookup23 = vcreate_u8(219290013576860186);
+ int_lookup45 = vcreate_u8(363970359253013532);
+ int_lookup67 = vcreate_u8(508650704929166878);
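+    // vld4_u8 de-interleaves 32 bytes (eight uint32_t) into four byte planes;
+    // the table lookups then gather those bytes back in byte-reversed order,
+    // two swapped words per 64-bit store.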
+
+ for (number = 0; number < n8points; ++number) {
+ input_table = vld4_u8((uint8_t*)inputPtr);
+ swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+ swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+ swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+ swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+ vst1_u8((uint8_t*)inputPtr, swapped_int01);
+ vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23);
+ vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45);
+ vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67);
+
+ inputPtr += 8;
+ }
+
+ for (number = n8points * 8; number < num_points; ++number) {
+ uint32_t output = *inputPtr;
+ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
+ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+
+ *inputPtr = output;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_NEONV8
+#include <arm_neon.h>
+
+static inline void volk_32u_byteswap_neonv8(uint32_t* intsToSwap, unsigned int num_points)
+{
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ const unsigned int n8points = num_points / 8;
+ uint8x16_t input;
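+    // Table-lookup index that reverses the four bytes of each 32-bit word.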
+ uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+
+ unsigned int number = 0;
+ for (number = 0; number < n8points; ++number) {
+ __VOLK_PREFETCH(inputPtr + 8);
+ input = vld1q_u8((uint8_t*)inputPtr);
+ input = vqtbl1q_u8(input, idx);
+ vst1q_u8((uint8_t*)inputPtr, input);
+ inputPtr += 4;
+
+ input = vld1q_u8((uint8_t*)inputPtr);
+ input = vqtbl1q_u8(input, idx);
+ vst1q_u8((uint8_t*)inputPtr, input);
+ inputPtr += 4;
+ }
+
+ for (number = n8points * 8; number < num_points; ++number) {
+ uint32_t output = *inputPtr;
+
+ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
+ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+
+ *inputPtr++ = output;
+ }
+}
+#endif /* LV_HAVE_NEONV8 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32u_byteswap_generic(uint32_t* intsToSwap,
+ unsigned int num_points)
+{
+ uint32_t* inputPtr = intsToSwap;
+
+ unsigned int point;
+ for (point = 0; point < num_points; point++) {
+ uint32_t output = *inputPtr;
+ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
+ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+
+ *inputPtr = output;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32u_byteswap_u_H */
+#ifndef INCLUDED_volk_32u_byteswap_a_H
+#define INCLUDED_volk_32u_byteswap_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+static inline void volk_32u_byteswap_a_avx2(uint32_t* intsToSwap, unsigned int num_points)
+{
+
+ unsigned int number;
+
+ const unsigned int nPerSet = 8;
+ const uint64_t nSets = num_points / nPerSet;
+
+ uint32_t* inputPtr = intsToSwap;
+
+ const uint8_t shuffleVector[32] = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9,
+ 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
+ 21, 20, 27, 26, 25, 24, 31, 30, 29, 28 };
+
+ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector);
+
+ for (number = 0; number < nSets; number++) {
+
+        // Load the 32-bit values; inputPtr is advanced later (in-place swap).
+ const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
+ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
+
+ // Store the results
+ _mm256_store_si256((__m256i*)inputPtr, output);
+ inputPtr += nPerSet;
+ }
+
+ // Byteswap any remaining points:
+ for (number = nSets * nPerSet; number < num_points; number++) {
+ uint32_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
+ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+
+static inline void volk_32u_byteswap_a_sse2(uint32_t* intsToSwap, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ uint32_t* inputPtr = intsToSwap;
+ __m128i input, byte1, byte2, byte3, byte4, output;
+ __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+ __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+
+ const uint64_t quarterPoints = num_points / 4;
+ for (; number < quarterPoints; number++) {
+        // Load the 32-bit values; inputPtr is advanced later (in-place swap).
+ input = _mm_load_si128((__m128i*)inputPtr);
+ // Do the four shifts
+ byte1 = _mm_slli_epi32(input, 24);
+ byte2 = _mm_slli_epi32(input, 8);
+ byte3 = _mm_srli_epi32(input, 8);
+ byte4 = _mm_srli_epi32(input, 24);
+ // Or bytes together
+ output = _mm_or_si128(byte1, byte4);
+ byte2 = _mm_and_si128(byte2, byte2mask);
+ output = _mm_or_si128(output, byte2);
+ byte3 = _mm_and_si128(byte3, byte3mask);
+ output = _mm_or_si128(output, byte3);
+ // Store the results
+ _mm_store_si128((__m128i*)inputPtr, output);
+ inputPtr += 4;
+ }
+
+ // Byteswap any remaining points:
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ uint32_t outputVal = *inputPtr;
+ outputVal = (((outputVal >> 24) & 0xff) | ((outputVal >> 8) & 0x0000ff00) |
+ ((outputVal << 8) & 0x00ff0000) | ((outputVal << 24) & 0xff000000));
+ *inputPtr = outputVal;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32u_byteswap_a_generic(uint32_t* intsToSwap,
+ unsigned int num_points)
+{
+ uint32_t* inputPtr = intsToSwap;
+
+ unsigned int point;
+ for (point = 0; point < num_points; point++) {
+ uint32_t output = *inputPtr;
+ output = (((output >> 24) & 0xff) | ((output >> 8) & 0x0000ff00) |
+ ((output << 8) & 0x00ff0000) | ((output << 24) & 0xff000000));
+
+ *inputPtr = output;
+ inputPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_32u_byteswap_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014, 2015, 2018, 2020 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_volk_32u_byteswappuppet_32u_H
+#define INCLUDED_volk_32u_byteswappuppet_32u_H
+
+#include <stdint.h>
+#include <string.h>
+#include <volk/volk_32u_byteswap.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32u_byteswappuppet_32u_generic(uint32_t* output,
+ uint32_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_32u_byteswap_generic((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+}
+#endif
+
+#ifdef LV_HAVE_NEON
+static inline void volk_32u_byteswappuppet_32u_neon(uint32_t* output,
+ uint32_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_32u_byteswap_neon((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+}
+#endif
+
+#ifdef LV_HAVE_NEONV8
+static inline void volk_32u_byteswappuppet_32u_neonv8(uint32_t* output,
+ uint32_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_32u_byteswap_neonv8((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_32u_byteswappuppet_32u_u_sse2(uint32_t* output,
+ uint32_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_32u_byteswap_u_sse2((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_32u_byteswappuppet_32u_a_sse2(uint32_t* output,
+ uint32_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_32u_byteswap_a_sse2((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+}
+#endif
+
+#ifdef LV_HAVE_AVX2
+static inline void volk_32u_byteswappuppet_32u_u_avx2(uint32_t* output,
+ uint32_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_32u_byteswap_u_avx2((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+}
+#endif
+
+#ifdef LV_HAVE_AVX2
+static inline void volk_32u_byteswappuppet_32u_a_avx2(uint32_t* output,
+ uint32_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_32u_byteswap_a_avx2((uint32_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint32_t));
+}
+#endif
+
+#endif
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32u_popcnt
+ *
+ * \b Overview
+ *
+ * Computes the population count (popcnt), i.e. the Hamming weight, of a
+ * binary string. This kernel takes in a single unsigned 32-bit value
+ * and returns the count of 1's that the value contains.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32u_popcnt(uint32_t* ret, const uint32_t value)
+ * \endcode
+ *
+ * \b Inputs
+ * \li value: The input value.
+ *
+ * \b Outputs
+ * \li ret: The return value containing the popcnt.
+ *
+ * \b Example
+ * \code
+ int N = 10;
+ unsigned int alignment = volk_get_alignment();
+
+ uint32_t bitstring = 0x55555555;
+ uint32_t hamming_distance = 0;
+
+ volk_32u_popcnt(&hamming_distance, bitstring);
+ printf("hamming distance of %x = %i\n", bitstring, hamming_distance);
+ * \endcode
+ */
+
+#ifndef INCLUDED_VOLK_32u_POPCNT_A16_H
+#define INCLUDED_VOLK_32u_POPCNT_A16_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value)
+{
+ // This is faster than a lookup table
+ uint32_t retVal = value;
+
+ retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555); // 16 sums of bit pairs
+ retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333); // 8 sums per nibble
+ retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;              // 4 sums per byte
+ retVal = (retVal + (retVal >> 8));                           // partial sums per half-word
+ retVal = (retVal + (retVal >> 16)) & 0x0000003F;             // total count (0..32)
+
+ *ret = retVal;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#ifdef LV_HAVE_SSE4_2
+
+#include <nmmintrin.h>
+
+static inline void volk_32u_popcnt_a_sse4_2(uint32_t* ret, const uint32_t value)
+{
+ *ret = _mm_popcnt_u32(value);
+}
+
+#endif /*LV_HAVE_SSE4_2*/
+
+#endif /*INCLUDED_VOLK_32u_POPCNT_A16_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_volk_32u_popcntpuppet_32u_H
+#define INCLUDED_volk_32u_popcntpuppet_32u_H
+
+#include <stdint.h>
+#include <volk/volk_32u_popcnt.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector,
+ const uint32_t* inVector,
+ unsigned int num_points)
+{
+ unsigned int ii;
+ for (ii = 0; ii < num_points; ++ii) {
+ volk_32u_popcnt_generic(outVector + ii, *(inVector + ii));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_SSE4_2
+static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector,
+ const uint32_t* inVector,
+ unsigned int num_points)
+{
+ unsigned int ii;
+ for (ii = 0; ii < num_points; ++ii) {
+ volk_32u_popcnt_a_sse4_2(outVector + ii, *(inVector + ii));
+ }
+}
+#endif /* LV_HAVE_SSE4_2 */
+
+#endif /* INCLUDED_volk_32u_popcntpuppet_32u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2018 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_32u_reverse_32u
+ *
+ * \b Overview
+ *
+ * Bit reversal of the input 32 bit word.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32u_reverse_32u(uint32_t* outputVector, uint32_t* inputVector, unsigned int
+ * num_points);
+ * \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The input vector.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li outputVector: The vector where the results will be stored, which is the
+ * bit-reversed input.
+ *
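+ * \b Example
+ * A minimal usage sketch in the style of the other kernel pages (values are
+ * illustrative):
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * uint32_t* in = (uint32_t*)volk_malloc(sizeof(uint32_t)*N, alignment);
+ * uint32_t* out = (uint32_t*)volk_malloc(sizeof(uint32_t)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ *     in[ii] = 1u << ii; // bit ii set; the reversed word has bit (31 - ii) set
+ * }
+ *
+ * volk_32u_reverse_32u(out, in, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ *     printf("out[%u] = 0x%08x\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode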
+ */
+#ifndef INCLUDED_VOLK_32u_REVERSE_32u_U_H
+#define INCLUDED_VOLK_32u_REVERSE_32u_U_H
+struct dword_split {
+ int b00 : 1;
+ int b01 : 1;
+ int b02 : 1;
+ int b03 : 1;
+ int b04 : 1;
+ int b05 : 1;
+ int b06 : 1;
+ int b07 : 1;
+ int b08 : 1;
+ int b09 : 1;
+ int b10 : 1;
+ int b11 : 1;
+ int b12 : 1;
+ int b13 : 1;
+ int b14 : 1;
+ int b15 : 1;
+ int b16 : 1;
+ int b17 : 1;
+ int b18 : 1;
+ int b19 : 1;
+ int b20 : 1;
+ int b21 : 1;
+ int b22 : 1;
+ int b23 : 1;
+ int b24 : 1;
+ int b25 : 1;
+ int b26 : 1;
+ int b27 : 1;
+ int b28 : 1;
+ int b29 : 1;
+ int b30 : 1;
+ int b31 : 1;
+};
+struct char_split {
+ uint8_t b00 : 1;
+ uint8_t b01 : 1;
+ uint8_t b02 : 1;
+ uint8_t b03 : 1;
+ uint8_t b04 : 1;
+ uint8_t b05 : 1;
+ uint8_t b06 : 1;
+ uint8_t b07 : 1;
+};
+
+// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
+// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
+static const unsigned char BitReverseTable256[] = {
+ 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0,
+ 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8,
+ 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94,
+ 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC,
+ 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2,
+ 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA,
+ 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86,
+ 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
+ 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE,
+ 0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1,
+ 0x31, 0xB1, 0x71, 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99,
+ 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5,
+ 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD,
+ 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43, 0xC3,
+ 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 0x0B, 0x8B,
+ 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
+ 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7,
+ 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF,
+ 0x3F, 0xBF, 0x7F, 0xFF
+};
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out,
+ const uint32_t* in,
+ unsigned int num_points)
+{
+ const struct dword_split* in_ptr = (const struct dword_split*)in;
+ struct dword_split* out_ptr = (struct dword_split*)out;
+ unsigned int number = 0;
+ for (; number < num_points; ++number) {
+ out_ptr->b00 = in_ptr->b31;
+ out_ptr->b01 = in_ptr->b30;
+ out_ptr->b02 = in_ptr->b29;
+ out_ptr->b03 = in_ptr->b28;
+ out_ptr->b04 = in_ptr->b27;
+ out_ptr->b05 = in_ptr->b26;
+ out_ptr->b06 = in_ptr->b25;
+ out_ptr->b07 = in_ptr->b24;
+ out_ptr->b08 = in_ptr->b23;
+ out_ptr->b09 = in_ptr->b22;
+ out_ptr->b10 = in_ptr->b21;
+ out_ptr->b11 = in_ptr->b20;
+ out_ptr->b12 = in_ptr->b19;
+ out_ptr->b13 = in_ptr->b18;
+ out_ptr->b14 = in_ptr->b17;
+ out_ptr->b15 = in_ptr->b16;
+ out_ptr->b16 = in_ptr->b15;
+ out_ptr->b17 = in_ptr->b14;
+ out_ptr->b18 = in_ptr->b13;
+ out_ptr->b19 = in_ptr->b12;
+ out_ptr->b20 = in_ptr->b11;
+ out_ptr->b21 = in_ptr->b10;
+ out_ptr->b22 = in_ptr->b09;
+ out_ptr->b23 = in_ptr->b08;
+ out_ptr->b24 = in_ptr->b07;
+ out_ptr->b25 = in_ptr->b06;
+ out_ptr->b26 = in_ptr->b05;
+ out_ptr->b27 = in_ptr->b04;
+ out_ptr->b28 = in_ptr->b03;
+ out_ptr->b29 = in_ptr->b02;
+ out_ptr->b30 = in_ptr->b01;
+ out_ptr->b31 = in_ptr->b00;
+ ++in_ptr;
+ ++out_ptr;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out,
+ const uint32_t* in,
+ unsigned int num_points)
+{
+ const uint32_t* in_ptr = in;
+ uint32_t* out_ptr = out;
+ unsigned int number = 0;
+ for (; number < num_points; ++number) {
+ const struct char_split* in8 = (const struct char_split*)in_ptr;
+ struct char_split* out8 = (struct char_split*)out_ptr;
+
+ out8[3].b00 = in8[0].b07;
+ out8[3].b01 = in8[0].b06;
+ out8[3].b02 = in8[0].b05;
+ out8[3].b03 = in8[0].b04;
+ out8[3].b04 = in8[0].b03;
+ out8[3].b05 = in8[0].b02;
+ out8[3].b06 = in8[0].b01;
+ out8[3].b07 = in8[0].b00;
+
+ out8[2].b00 = in8[1].b07;
+ out8[2].b01 = in8[1].b06;
+ out8[2].b02 = in8[1].b05;
+ out8[2].b03 = in8[1].b04;
+ out8[2].b04 = in8[1].b03;
+ out8[2].b05 = in8[1].b02;
+ out8[2].b06 = in8[1].b01;
+ out8[2].b07 = in8[1].b00;
+
+ out8[1].b00 = in8[2].b07;
+ out8[1].b01 = in8[2].b06;
+ out8[1].b02 = in8[2].b05;
+ out8[1].b03 = in8[2].b04;
+ out8[1].b04 = in8[2].b03;
+ out8[1].b05 = in8[2].b02;
+ out8[1].b06 = in8[2].b01;
+ out8[1].b07 = in8[2].b00;
+
+ out8[0].b00 = in8[3].b07;
+ out8[0].b01 = in8[3].b06;
+ out8[0].b02 = in8[3].b05;
+ out8[0].b03 = in8[3].b04;
+ out8[0].b04 = in8[3].b03;
+ out8[0].b05 = in8[3].b02;
+ out8[0].b06 = in8[3].b01;
+ out8[0].b07 = in8[3].b00;
+ ++in_ptr;
+ ++out_ptr;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
+// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, unsigned int num_points)
+{
+ const uint32_t* in_ptr = in;
+ uint32_t* out_ptr = out;
+ unsigned int number = 0;
+ for (; number < num_points; ++number) {
+ *out_ptr = (BitReverseTable256[*in_ptr & 0xff] << 24) |
+ (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
+ (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
+ (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
+ ++in_ptr;
+ ++out_ptr;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+// Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public
+// domain http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
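+// Roughly: the first multiply makes shifted copies of the byte, the AND keeps one copy
+// of each bit at a distinct position, and the second multiply plus the >> 32 gathers
+// those bits, now in reversed order, into a single byte (see the link above for the
+// full derivation).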
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
+{
+ const uint32_t* in_ptr = in;
+ uint32_t* out_ptr = out;
+ const uint8_t* in8;
+ uint8_t* out8;
+ unsigned int number = 0;
+ for (; number < num_points; ++number) {
+ in8 = (const uint8_t*)in_ptr;
+ out8 = (uint8_t*)out_ptr;
+ out8[3] = ((in8[0] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
+ out8[2] = ((in8[1] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
+ out8[1] = ((in8[2] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
+ out8[0] = ((in8[3] * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
+ ++in_ptr;
+ ++out_ptr;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_GENERIC
+// Current gr-pager implementation
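+// This is the Bit Twiddling Hacks "reverse a byte in 3 operations (64-bit multiply and
+// modulus division)" trick: the multiply makes shifted copies of the byte, the AND
+// keeps one copy of each bit, and the modulus by 2^10 - 1 folds the copies back
+// together with the bits in reversed order.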
+static inline void
+volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
+{
+ const uint32_t* in_ptr = in;
+ uint32_t* out_ptr = out;
+ const uint8_t* in8;
+ uint8_t* out8;
+ unsigned int number = 0;
+ for (; number < num_points; ++number) {
+ in8 = (const uint8_t*)in_ptr;
+ out8 = (uint8_t*)out_ptr;
+ out8[3] = (in8[0] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+ out8[2] = (in8[1] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+ out8[1] = (in8[2] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+ out8[0] = (in8[3] * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+ ++in_ptr;
+ ++out_ptr;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+// After lengthy thought and quite a bit of whiteboarding:
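+// The idea: treat the 32-bit word as a perfect binary tree of bit groups and swap
+// siblings level by level: 16-bit halves, then bytes, nibbles, 2-bit pairs, and
+// finally single bits. Worked example (illustrative, not from the original source),
+// tracing 0x00000001 through the five stages below:
+// 0x00000001 -> 0x00010000 -> 0x01000000 -> 0x10000000 -> 0x40000000 -> 0x80000000.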
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out,
+ const uint32_t* in,
+ unsigned int num_points)
+{
+ const uint32_t* in_ptr = in;
+ uint32_t* out_ptr = out;
+ unsigned int number = 0;
+ for (; number < num_points; ++number) {
+ uint32_t tmp = *in_ptr;
+ /* permute uint16:
+ The idea is to simply shift the lower 16 bit up, and the upper 16 bit down.
+ */
+ tmp = (tmp << 16) | (tmp >> 16);
+ /* permute bytes:
+ shift up by 1 B first, then only consider even bytes, and OR with the unshifted
+ even bytes
+ */
+ tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
+ /* permute 4bit tuples:
+ Same idea, but the "consideration" mask expression becomes unwieldy
+ */
+ tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) |
+ ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
+ /* permute 2bit tuples:
+ Here, we collapsed the "consideration" mask to a simple hexmask: 0b0011 =
+ 3; we need those every 4b, which coincides with a hex digit!
+ */
+ tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
+ /* permute odd/even:
+ 0x01 = 0x1; we need these every 2b, which works out: 0x01 | (0x01 << 2) =
+ 0x05!
+ */
+ tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
+
+ *out_ptr = tmp;
+ ++in_ptr;
+ ++out_ptr;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+#ifdef LV_HAVE_GENERIC
+static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out,
+ const uint32_t* in,
+ unsigned int num_points)
+{
+ // same stuff as top_down, inverted order (permutation matrices don't care, you know!)
+ const uint32_t* in_ptr = in;
+ uint32_t* out_ptr = out;
+ unsigned int number = 0;
+ for (; number < num_points; ++number) {
+ uint32_t tmp = *in_ptr;
+ tmp = ((tmp & (0x55555555)) << 1) | ((tmp >> 1) & (0x55555555));
+ tmp = ((tmp & (0x33333333)) << 2) | ((tmp >> 2) & (0x33333333));
+ tmp = ((tmp & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24)) << 4) |
+ ((tmp >> 4) & (0xF | 0xF << 8 | 0xF << 16 | 0xF << 24));
+ tmp = ((tmp & (0xFF | 0xFF << 16)) << 8) | ((tmp >> 8) & (0xFF | 0xFF << 16));
+ tmp = (tmp << 16) | (tmp >> 16);
+
+ *out_ptr = tmp;
+ ++in_ptr;
+ ++out_ptr;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_NEONV8
+#include <arm_neon.h>
+
+static inline void
+volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_points)
+{
+ const uint32_t* in_ptr = in;
+ uint32_t* out_ptr = out;
+
+ const uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
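+ // vrbitq_u8 reverses the bits inside each byte; the table lookup with idx then
+ // reverses the byte order within each 32-bit lane, completing the 32-bit reversal.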
+
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int number = 0;
+ for (; number < quarterPoints; ++number) {
+ __VOLK_PREFETCH(in_ptr + 4);
+ uint32x4_t x = vld1q_u32(in_ptr);
+ uint32x4_t z =
+ vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32(x)), idx));
+ vst1q_u32(out_ptr, z);
+ in_ptr += 4;
+ out_ptr += 4;
+ }
+ number = quarterPoints * 4;
+ for (; number < num_points; ++number) {
+ *out_ptr = (BitReverseTable256[*in_ptr & 0xff] << 24) |
+ (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
+ (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
+ (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
+ ++in_ptr;
+ ++out_ptr;
+ }
+}
+
+#else
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
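+// DO_RBIT wraps the ARM 'rbit' instruction, which bit-reverses one 32-bit register.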
+#define DO_RBIT \
+ __VOLK_ASM("rbit %[result], %[value]" \
+ : [result] "=r"(*out_ptr) \
+ : [value] "r"(*in_ptr) \
+ :); \
+ in_ptr++; \
+ out_ptr++;
+
+static inline void
+volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, unsigned int num_points)
+{
+
+ const uint32_t* in_ptr = in;
+ uint32_t* out_ptr = out;
+ const unsigned int eighthPoints = num_points / 8;
+ unsigned int number = 0;
+ for (; number < eighthPoints; ++number) {
+ __VOLK_PREFETCH(in_ptr + 8);
+ DO_RBIT;
+ DO_RBIT;
+ DO_RBIT;
+ DO_RBIT;
+ DO_RBIT;
+ DO_RBIT;
+ DO_RBIT;
+ DO_RBIT;
+ }
+ number = eighthPoints * 8;
+ for (; number < num_points; ++number) {
+ DO_RBIT;
+ }
+}
+#undef DO_RBIT
+#endif /* LV_HAVE_NEON */
+#endif /* LV_HAVE_NEONV8 */
+
+
+#endif /* INCLUDED_VOLK_32u_REVERSE_32u_U_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_64f_convert_32f
+ *
+ * \b Overview
+ *
+ * Converts doubles into floats.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_64f_convert_32f(float* outputVector, const double* inputVector, unsigned int
+ * num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The vector of doubles to convert to floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li outputVector: returns the converted floats.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * double* increasing = (double*)volk_malloc(sizeof(double)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (double)ii;
+ * }
+ *
+ * volk_64f_convert_32f(out, increasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2f\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_64f_convert_32f_u_H
+#define INCLUDED_volk_64f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_64f_convert_32f_u_avx512f(float* outputVector,
+ const double* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int oneSixteenthPoints = num_points / 16;
+
+ const double* inputVectorPtr = (const double*)inputVector;
+ float* outputVectorPtr = outputVector;
+ __m256 ret1, ret2;
+ __m512d inputVal1, inputVal2;
+
+ for (; number < oneSixteenthPoints; number++) {
+ inputVal1 = _mm512_loadu_pd(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal2 = _mm512_loadu_pd(inputVectorPtr);
+ inputVectorPtr += 8;
+
+ ret1 = _mm512_cvtpd_ps(inputVal1);
+ ret2 = _mm512_cvtpd_ps(inputVal2);
+
+ _mm256_storeu_ps(outputVectorPtr, ret1);
+ outputVectorPtr += 8;
+
+ _mm256_storeu_ps(outputVectorPtr, ret2);
+ outputVectorPtr += 8;
+ }
+
+ number = oneSixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_64f_convert_32f_u_avx(float* outputVector,
+ const double* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int oneEightPoints = num_points / 8;
+
+ const double* inputVectorPtr = (const double*)inputVector;
+ float* outputVectorPtr = outputVector;
+ __m128 ret1, ret2;
+ __m256d inputVal1, inputVal2;
+
+ for (; number < oneEightPoints; number++) {
+ inputVal1 = _mm256_loadu_pd(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal2 = _mm256_loadu_pd(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret1 = _mm256_cvtpd_ps(inputVal1);
+ ret2 = _mm256_cvtpd_ps(inputVal2);
+
+ _mm_storeu_ps(outputVectorPtr, ret1);
+ outputVectorPtr += 4;
+
+ _mm_storeu_ps(outputVectorPtr, ret2);
+ outputVectorPtr += 4;
+ }
+
+ number = oneEightPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_64f_convert_32f_u_sse2(float* outputVector,
+ const double* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const double* inputVectorPtr = (const double*)inputVector;
+ float* outputVectorPtr = outputVector;
+ __m128 ret, ret2;
+ __m128d inputVal1, inputVal2;
+
+ for (; number < quarterPoints; number++) {
+ inputVal1 = _mm_loadu_pd(inputVectorPtr);
+ inputVectorPtr += 2;
+ inputVal2 = _mm_loadu_pd(inputVectorPtr);
+ inputVectorPtr += 2;
+
+ ret = _mm_cvtpd_ps(inputVal1);
+ ret2 = _mm_cvtpd_ps(inputVal2);
+
+ ret = _mm_movelh_ps(ret, ret2);
+
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_64f_convert_32f_generic(float* outputVector,
+ const double* inputVector,
+ unsigned int num_points)
+{
+ float* outputVectorPtr = outputVector;
+ const double* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_64f_convert_32f_u_H */
+#ifndef INCLUDED_volk_64f_convert_32f_a_H
+#define INCLUDED_volk_64f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_64f_convert_32f_a_avx512f(float* outputVector,
+ const double* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int oneSixteenthPoints = num_points / 16;
+
+ const double* inputVectorPtr = (const double*)inputVector;
+ float* outputVectorPtr = outputVector;
+ __m256 ret1, ret2;
+ __m512d inputVal1, inputVal2;
+
+ for (; number < oneSixteenthPoints; number++) {
+ inputVal1 = _mm512_load_pd(inputVectorPtr);
+ inputVectorPtr += 8;
+ inputVal2 = _mm512_load_pd(inputVectorPtr);
+ inputVectorPtr += 8;
+
+ ret1 = _mm512_cvtpd_ps(inputVal1);
+ ret2 = _mm512_cvtpd_ps(inputVal2);
+
+ _mm256_store_ps(outputVectorPtr, ret1);
+ outputVectorPtr += 8;
+
+ _mm256_store_ps(outputVectorPtr, ret2);
+ outputVectorPtr += 8;
+ }
+
+ number = oneSixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_64f_convert_32f_a_avx(float* outputVector,
+ const double* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int oneEightPoints = num_points / 8;
+
+ const double* inputVectorPtr = (const double*)inputVector;
+ float* outputVectorPtr = outputVector;
+ __m128 ret1, ret2;
+ __m256d inputVal1, inputVal2;
+
+ for (; number < oneEightPoints; number++) {
+ inputVal1 = _mm256_load_pd(inputVectorPtr);
+ inputVectorPtr += 4;
+ inputVal2 = _mm256_load_pd(inputVectorPtr);
+ inputVectorPtr += 4;
+
+ ret1 = _mm256_cvtpd_ps(inputVal1);
+ ret2 = _mm256_cvtpd_ps(inputVal2);
+
+ _mm_store_ps(outputVectorPtr, ret1);
+ outputVectorPtr += 4;
+
+ _mm_store_ps(outputVectorPtr, ret2);
+ outputVectorPtr += 4;
+ }
+
+ number = oneEightPoints * 8;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_64f_convert_32f_a_sse2(float* outputVector,
+ const double* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const double* inputVectorPtr = (const double*)inputVector;
+ float* outputVectorPtr = outputVector;
+ __m128 ret, ret2;
+ __m128d inputVal1, inputVal2;
+
+ for (; number < quarterPoints; number++) {
+ inputVal1 = _mm_load_pd(inputVectorPtr);
+ inputVectorPtr += 2;
+ inputVal2 = _mm_load_pd(inputVectorPtr);
+ inputVectorPtr += 2;
+
+ ret = _mm_cvtpd_ps(inputVal1);
+ ret2 = _mm_cvtpd_ps(inputVal2);
+
+ ret = _mm_movelh_ps(ret, ret2);
+
+ _mm_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_64f_convert_32f_a_generic(float* outputVector,
+ const double* inputVector,
+ unsigned int num_points)
+{
+ float* outputVectorPtr = outputVector;
+ const double* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++));
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_64f_convert_32f_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2018 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_64f_x2_add_64f
+ *
+ * \b Overview
+ *
+ * Adds two input double-precision floating point vectors together.
+ *
+ * c[i] = a[i] + b[i]
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_64f_x2_add_64f(double* cVector, const double* aVector, const double* bVector,
+ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+ * \li bVector: Second input vector.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * Add elements of an increasing vector to those of a decreasing vector.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * double* increasing = (double*)volk_malloc(sizeof(double)*N, alignment);
+ * double* decreasing = (double*)volk_malloc(sizeof(double)*N, alignment);
+ * double* out = (double*)volk_malloc(sizeof(double)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (double)ii;
+ * decreasing[ii] = 10.0 - (double)ii;
+ * }
+ *
+ * volk_64f_x2_add_64f(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2F\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_64f_x2_add_64f_H
+#define INCLUDED_volk_64f_x2_add_64f_H
+
+#include <inttypes.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_64f_x2_add_64f_generic(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+/*
+ * Unaligned versions
+ */
+
+#ifdef LV_HAVE_SSE2
+
+#include <emmintrin.h>
+
+static inline void volk_64f_x2_add_64f_u_sse2(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int half_points = num_points / 2;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m128d aVal, bVal, cVal;
+ for (; number < half_points; number++) {
+ aVal = _mm_loadu_pd(aPtr);
+ bVal = _mm_loadu_pd(bPtr);
+
+ cVal = _mm_add_pd(aVal, bVal);
+
+ _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 2;
+ bPtr += 2;
+ cPtr += 2;
+ }
+
+ number = half_points * 2;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_64f_x2_add_64f_u_avx(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarter_points = num_points / 4;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m256d aVal, bVal, cVal;
+ for (; number < quarter_points; number++) {
+
+ aVal = _mm256_loadu_pd(aPtr);
+ bVal = _mm256_loadu_pd(bPtr);
+
+ cVal = _mm256_add_pd(aVal, bVal);
+
+ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarter_points * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+/*
+ * Aligned versions
+ */
+
+#ifdef LV_HAVE_SSE2
+
+#include <emmintrin.h>
+
+static inline void volk_64f_x2_add_64f_a_sse2(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int half_points = num_points / 2;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m128d aVal, bVal, cVal;
+ for (; number < half_points; number++) {
+ aVal = _mm_load_pd(aPtr);
+ bVal = _mm_load_pd(bPtr);
+
+ cVal = _mm_add_pd(aVal, bVal);
+
+ _mm_store_pd(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 2;
+ bPtr += 2;
+ cPtr += 2;
+ }
+
+ number = half_points * 2;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_64f_x2_add_64f_a_avx(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarter_points = num_points / 4;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m256d aVal, bVal, cVal;
+ for (; number < quarter_points; number++) {
+
+ aVal = _mm256_load_pd(aPtr);
+ bVal = _mm256_load_pd(bPtr);
+
+ cVal = _mm256_add_pd(aVal, bVal);
+
+ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarter_points * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) + (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_64f_x2_add_64f_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_64f_x2_max_64f
+ *
+ * \b Overview
+ *
+ * Selects the maximum value from each pair of entries in aVector and bVector
+ * and stores the results in cVector.
+ *
+ * c[i] = max(a[i], b[i])
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_64f_x2_max_64f(double* cVector, const double* aVector, const double* bVector,
+ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+ * \li bVector: Second input vector.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * double* increasing = (double*)volk_malloc(sizeof(double)*N, alignment);
+ * double* decreasing = (double*)volk_malloc(sizeof(double)*N, alignment);
+ * double* out = (double*)volk_malloc(sizeof(double)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (double)ii;
+ * decreasing[ii] = 10.0 - (double)ii;
+ * }
+ *
+ * volk_64f_x2_max_64f(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2g\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_64f_x2_max_64f_a_H
+#define INCLUDED_volk_64f_x2_max_64f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_64f_x2_max_64f_a_avx512f(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eigthPoints = num_points / 8;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m512d aVal, bVal, cVal;
+ for (; number < eigthPoints; number++) {
+
+ aVal = _mm512_load_pd(aPtr);
+ bVal = _mm512_load_pd(bPtr);
+
+ cVal = _mm512_max_pd(aVal, bVal);
+
+ _mm512_store_pd(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eigthPoints * 8;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_64f_x2_max_64f_a_avx(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m256d aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm256_load_pd(aPtr);
+ bVal = _mm256_load_pd(bPtr);
+
+ cVal = _mm256_max_pd(aVal, bVal);
+
+ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_64f_x2_max_64f_a_sse2(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m128d aVal, bVal, cVal;
+ for (; number < halfPoints; number++) {
+
+ aVal = _mm_load_pd(aPtr);
+ bVal = _mm_load_pd(bPtr);
+
+ cVal = _mm_max_pd(aVal, bVal);
+
+ _mm_store_pd(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 2;
+ bPtr += 2;
+ cPtr += 2;
+ }
+
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_64f_x2_max_64f_generic(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_64f_x2_max_64f_a_H */
+
+
+#ifndef INCLUDED_volk_64f_x2_max_64f_u_H
+#define INCLUDED_volk_64f_x2_max_64f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_64f_x2_max_64f_u_avx512f(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eigthPoints = num_points / 8;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m512d aVal, bVal, cVal;
+ for (; number < eigthPoints; number++) {
+
+ aVal = _mm512_loadu_pd(aPtr);
+ bVal = _mm512_loadu_pd(bPtr);
+
+ cVal = _mm512_max_pd(aVal, bVal);
+
+ _mm512_storeu_pd(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eigthPoints * 8;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_64f_x2_max_64f_u_avx(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m256d aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm256_loadu_pd(aPtr);
+ bVal = _mm256_loadu_pd(bPtr);
+
+ cVal = _mm256_max_pd(aVal, bVal);
+
+ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a > b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#endif /* INCLUDED_volk_64f_x2_max_64f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_64f_x2_min_64f
+ *
+ * \b Overview
+ *
+ * Selects the minimum value from each pair of entries in aVector and bVector
+ * and stores the results in cVector.
+ *
+ * c[i] = min(a[i], b[i])
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_64f_x2_min_64f(double* cVector, const double* aVector, const double* bVector,
+ unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+ * \li bVector: Second input vector.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * \code
+ int N = 10;
+ unsigned int alignment = volk_get_alignment();
+ double* increasing = (double*)volk_malloc(sizeof(double)*N, alignment);
+ double* decreasing = (double*)volk_malloc(sizeof(double)*N, alignment);
+ double* out = (double*)volk_malloc(sizeof(double)*N, alignment);
+
+ for(unsigned int ii = 0; ii < N; ++ii){
+ increasing[ii] = (double)ii;
+ decreasing[ii] = 10.0 - (double)ii;
+ }
+
+ volk_64f_x2_min_64f(out, increasing, decreasing, N);
+
+ for(unsigned int ii = 0; ii < N; ++ii){
+ printf("out[%u] = %1.2g\n", ii, out[ii]);
+ }
+
+ volk_free(increasing);
+ volk_free(decreasing);
+ volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_64f_x2_min_64f_a_H
+#define INCLUDED_volk_64f_x2_min_64f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_64f_x2_min_64f_a_avx512f(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eigthPoints = num_points / 8;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m512d aVal, bVal, cVal;
+ for (; number < eigthPoints; number++) {
+
+ aVal = _mm512_load_pd(aPtr);
+ bVal = _mm512_load_pd(bPtr);
+
+ cVal = _mm512_min_pd(aVal, bVal);
+
+ _mm512_store_pd(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eigthPoints * 8;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_64f_x2_min_64f_a_avx(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m256d aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm256_load_pd(aPtr);
+ bVal = _mm256_load_pd(bPtr);
+
+ cVal = _mm256_min_pd(aVal, bVal);
+
+ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_64f_x2_min_64f_a_sse2(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int halfPoints = num_points / 2;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m128d aVal, bVal, cVal;
+ for (; number < halfPoints; number++) {
+
+ aVal = _mm_load_pd(aPtr);
+ bVal = _mm_load_pd(bPtr);
+
+ cVal = _mm_min_pd(aVal, bVal);
+
+ _mm_store_pd(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 2;
+ bPtr += 2;
+ cPtr += 2;
+ }
+
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_64f_x2_min_64f_generic(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_64f_x2_min_64f_a_H */
+
+#ifndef INCLUDED_volk_64f_x2_min_64f_u_H
+#define INCLUDED_volk_64f_x2_min_64f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX512F
+#include <immintrin.h>
+
+static inline void volk_64f_x2_min_64f_u_avx512f(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int eigthPoints = num_points / 8;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m512d aVal, bVal, cVal;
+ for (; number < eigthPoints; number++) {
+
+ aVal = _mm512_loadu_pd(aPtr);
+ bVal = _mm512_loadu_pd(bPtr);
+
+ cVal = _mm512_min_pd(aVal, bVal);
+
+ _mm512_storeu_pd(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 8;
+ bPtr += 8;
+ cPtr += 8;
+ }
+
+ number = eigthPoints * 8;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX512F */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_64f_x2_min_64f_u_avx(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m256d aVal, bVal, cVal;
+ for (; number < quarterPoints; number++) {
+
+ aVal = _mm256_loadu_pd(aPtr);
+ bVal = _mm256_loadu_pd(bPtr);
+
+ cVal = _mm256_min_pd(aVal, bVal);
+
+ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ const double a = *aPtr++;
+ const double b = *bPtr++;
+ *cPtr++ = (a < b ? a : b);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#endif /* INCLUDED_volk_64f_x2_min_64f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2018 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_64f_x2_multiply_64f
+ *
+ * \b Overview
+ *
+ * Multiplies two input double-precision floating point vectors together.
+ *
+ * c[i] = a[i] * b[i]
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_64f_x2_multiply_64f(double* cVector, const double* aVector, const double*
+ * bVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li aVector: First input vector.
+ * \li bVector: Second input vector.
+ * \li num_points: The number of values in both input vectors.
+ *
+ * \b Outputs
+ * \li cVector: The output vector.
+ *
+ * \b Example
+ * Multiply elements of an increasing vector by those of a decreasing vector.
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ * double* increasing = (double*)volk_malloc(sizeof(double)*N, alignment);
+ * double* decreasing = (double*)volk_malloc(sizeof(double)*N, alignment);
+ * double* out = (double*)volk_malloc(sizeof(double)*N, alignment);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * increasing[ii] = (double)ii;
+ * decreasing[ii] = 10.0 - (double)ii;
+ * }
+ *
+ * volk_64f_x2_multiply_64f(out, increasing, decreasing, N);
+ *
+ * for(unsigned int ii = 0; ii < N; ++ii){
+ * printf("out[%u] = %1.2F\n", ii, out[ii]);
+ * }
+ *
+ * volk_free(increasing);
+ * volk_free(decreasing);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_64f_x2_multiply_64f_H
+#define INCLUDED_volk_64f_x2_multiply_64f_H
+
+#include <inttypes.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_64f_x2_multiply_64f_generic(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+/*
+ * Unaligned versions
+ */
+
+#ifdef LV_HAVE_SSE2
+
+#include <emmintrin.h>
+
+static inline void volk_64f_x2_multiply_64f_u_sse2(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int half_points = num_points / 2;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m128d aVal, bVal, cVal;
+ for (; number < half_points; number++) {
+ aVal = _mm_loadu_pd(aPtr);
+ bVal = _mm_loadu_pd(bPtr);
+
+ cVal = _mm_mul_pd(aVal, bVal);
+
+ _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 2;
+ bPtr += 2;
+ cPtr += 2;
+ }
+
+ number = half_points * 2;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_64f_x2_multiply_64f_u_avx(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarter_points = num_points / 4;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m256d aVal, bVal, cVal;
+ for (; number < quarter_points; number++) {
+
+ aVal = _mm256_loadu_pd(aPtr);
+ bVal = _mm256_loadu_pd(bPtr);
+
+ cVal = _mm256_mul_pd(aVal, bVal);
+
+ _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarter_points * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+/*
+ * Aligned versions
+ */
+
+#ifdef LV_HAVE_SSE2
+
+#include <emmintrin.h>
+
+static inline void volk_64f_x2_multiply_64f_a_sse2(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int half_points = num_points / 2;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m128d aVal, bVal, cVal;
+ for (; number < half_points; number++) {
+ aVal = _mm_load_pd(aPtr);
+ bVal = _mm_load_pd(bPtr);
+
+ cVal = _mm_mul_pd(aVal, bVal);
+
+ _mm_store_pd(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 2;
+ bPtr += 2;
+ cPtr += 2;
+ }
+
+ number = half_points * 2;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_64f_x2_multiply_64f_a_avx(double* cVector,
+ const double* aVector,
+ const double* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarter_points = num_points / 4;
+
+ double* cPtr = cVector;
+ const double* aPtr = aVector;
+ const double* bPtr = bVector;
+
+ __m256d aVal, bVal, cVal;
+ for (; number < quarter_points; number++) {
+
+ aVal = _mm256_load_pd(aPtr);
+ bVal = _mm256_load_pd(bPtr);
+
+ cVal = _mm256_mul_pd(aVal, bVal);
+
+ _mm256_store_pd(cPtr, cVal); // Store the results back into the C container
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarter_points * 4;
+ for (; number < num_points; number++) {
+ *cPtr++ = (*aPtr++) * (*bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_64f_x2_multiply_64f_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_64u_byteswap
+ *
+ * \b Overview
+ *
+ * Byteswaps (in-place) an aligned vector of uint64_t's.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_64u_byteswap(uint64_t* intsToSwap, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li intsToSwap: The vector of data to byte swap
+ * \li num_points: The number of data points
+ *
+ * \b Outputs
+ * \li intsToSwap: returns as an in-place calculation.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ *
+ * uint64_t bitstring[] = {0x0, 0x1, 0xf, 0xffffffffffffffff,
+ * 0x5a5a5a5a5a5a5a5a, 0xa5a5a5a5a5a5a5a5, 0x2a2a2a2a2a2a2a2a,
+ * 0xffffffff, 0x32, 0x64};
+ * uint64_t hamming_distance = 0;
+ *
+ * printf("byteswap vector =\n");
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * printf(" %.16lx\n", bitstring[ii]);
+ * }
+ *
+ * volk_64u_byteswap(bitstring, N);
+ *
+ * printf("byteswapped vector =\n");
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * printf(" %.16lx\n", bitstring[ii]);
+ * }
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_64u_byteswap_u_H
+#define INCLUDED_volk_64u_byteswap_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points)
+{
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ __m128i input, byte1, byte2, byte3, byte4, output;
+ __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+ __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+ uint64_t number = 0;
+ const unsigned int halfPoints = num_points / 2;
+ for (; number < halfPoints; number++) {
+ // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ input = _mm_loadu_si128((__m128i*)inputPtr);
+
+ // Do the four shifts
+ byte1 = _mm_slli_epi32(input, 24);
+ byte2 = _mm_slli_epi32(input, 8);
+ byte3 = _mm_srli_epi32(input, 8);
+ byte4 = _mm_srli_epi32(input, 24);
+ // Or bytes together
+ output = _mm_or_si128(byte1, byte4);
+ byte2 = _mm_and_si128(byte2, byte2mask);
+ output = _mm_or_si128(output, byte2);
+ byte3 = _mm_and_si128(byte3, byte3mask);
+ output = _mm_or_si128(output, byte3);
+
+ // Reorder the two words
+ output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
+
+ // Store the results
+ _mm_storeu_si128((__m128i*)inputPtr, output);
+ inputPtr += 4;
+ }
+
+ // Byteswap any remaining points:
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
+ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
+ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+ *inputPtr++ = output2;
+ *inputPtr++ = output1;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap,
+ unsigned int num_points)
+{
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ unsigned int point;
+ for (point = 0; point < num_points; point++) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
+ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
+ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+ *inputPtr++ = output2;
+ *inputPtr++ = output1;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int nPerSet = 4;
+ const uint64_t nSets = num_points / nPerSet;
+
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+
+ const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
+ 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
+ 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
+
+ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
+
+ for (; number < nSets; number++) {
+
+ // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
+ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
+
+ // Store the results
+ _mm256_store_si256((__m256i*)inputPtr, output);
+
+ /* inputPtr is 32bit so increment twice */
+ inputPtr += 2 * nPerSet;
+ }
+
+ // Byteswap any remaining points:
+ for (number = nSets * nPerSet; number < num_points; ++number) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+ uint32_t out1 =
+ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
+ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
+
+ uint32_t out2 =
+ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
+ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
+ *inputPtr++ = out2;
+ *inputPtr++ = out1;
+ }
+}
+
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int nPerSet = 2;
+ const uint64_t nSets = num_points / nPerSet;
+
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+
+ uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+
+ const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
+
+ for (; number < nSets; number++) {
+
+ // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ const __m128i input = _mm_load_si128((__m128i*)inputPtr);
+ const __m128i output = _mm_shuffle_epi8(input, myShuffle);
+
+ // Store the results
+ _mm_store_si128((__m128i*)inputPtr, output);
+
+ /* inputPtr is 32bit so increment twice */
+ inputPtr += 2 * nPerSet;
+ }
+
+ // Byteswap any remaining points:
+ for (number = nSets * nPerSet; number < num_points; ++number) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+ uint32_t out1 =
+ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
+ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
+
+ uint32_t out2 =
+ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
+ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
+ *inputPtr++ = out2;
+ *inputPtr++ = out1;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+
+#ifdef LV_HAVE_NEONV8
+#include <arm_neon.h>
+
+static inline void volk_64u_byteswap_neonv8(uint64_t* intsToSwap, unsigned int num_points)
+{
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ const unsigned int n4points = num_points / 4;
+ uint8x16x2_t input;
+ uint8x16_t idx = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+
+ unsigned int number = 0;
+ for (number = 0; number < n4points; ++number) {
+ __VOLK_PREFETCH(inputPtr + 8);
+ input = vld2q_u8((uint8_t*)inputPtr);
+ input.val[0] = vqtbl1q_u8(input.val[0], idx);
+ input.val[1] = vqtbl1q_u8(input.val[1], idx);
+ vst2q_u8((uint8_t*)inputPtr, input);
+
+ inputPtr += 8;
+ }
+
+ for (number = n4points * 4; number < num_points; ++number) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
+ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
+ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+ *inputPtr++ = output2;
+ *inputPtr++ = output1;
+ }
+}
+#else
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_64u_byteswap_neon(uint64_t* intsToSwap, unsigned int num_points)
+{
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ unsigned int number = 0;
+ unsigned int n8points = num_points / 4;
+
+ uint8x8x4_t input_table;
+ uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
+ uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
+
+ /* These magic numbers are used as byte indices into the LUT.
+ They are pre-computed to save time. A simple C program
+ can calculate them; for example, for lookup01:
+ uint8_t chars[8] = { 24, 16, 8, 0, 25, 17, 9, 1 };
+ uint64_t index = 0;
+ for (int ii = 0; ii < 8; ++ii) {
+ index += ((uint64_t)(*(chars + ii))) << (ii * 8);
+ }
+ */
+ int_lookup01 = vcreate_u8(2269495096316185);
+ int_lookup23 = vcreate_u8(146949840772469531);
+ int_lookup45 = vcreate_u8(291630186448622877);
+ int_lookup67 = vcreate_u8(436310532124776223);
+
+ for (number = 0; number < n8points; ++number) {
+ input_table = vld4_u8((uint8_t*)inputPtr);
+ swapped_int01 = vtbl4_u8(input_table, int_lookup01);
+ swapped_int23 = vtbl4_u8(input_table, int_lookup23);
+ swapped_int45 = vtbl4_u8(input_table, int_lookup45);
+ swapped_int67 = vtbl4_u8(input_table, int_lookup67);
+ vst1_u8((uint8_t*)inputPtr, swapped_int01);
+ vst1_u8((uint8_t*)(inputPtr + 2), swapped_int23);
+ vst1_u8((uint8_t*)(inputPtr + 4), swapped_int45);
+ vst1_u8((uint8_t*)(inputPtr + 6), swapped_int67);
+
+ inputPtr += 8;
+ }
+
+ for (number = n8points * 4; number < num_points; ++number) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
+ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
+ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+ *inputPtr++ = output2;
+ *inputPtr++ = output1;
+ }
+}
+#endif /* LV_HAVE_NEON */
+#endif
+
+#endif /* INCLUDED_volk_64u_byteswap_u_H */
+#ifndef INCLUDED_volk_64u_byteswap_a_H
+#define INCLUDED_volk_64u_byteswap_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+
+static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points)
+{
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ __m128i input, byte1, byte2, byte3, byte4, output;
+ __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+ __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+ uint64_t number = 0;
+ const unsigned int halfPoints = num_points / 2;
+ for (; number < halfPoints; number++) {
+ // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ input = _mm_load_si128((__m128i*)inputPtr);
+
+ // Do the four shifts
+ byte1 = _mm_slli_epi32(input, 24);
+ byte2 = _mm_slli_epi32(input, 8);
+ byte3 = _mm_srli_epi32(input, 8);
+ byte4 = _mm_srli_epi32(input, 24);
+ // Or bytes together
+ output = _mm_or_si128(byte1, byte4);
+ byte2 = _mm_and_si128(byte2, byte2mask);
+ output = _mm_or_si128(output, byte2);
+ byte3 = _mm_and_si128(byte3, byte3mask);
+ output = _mm_or_si128(output, byte3);
+
+ // Reorder the two words
+ output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
+
+ // Store the results
+ _mm_store_si128((__m128i*)inputPtr, output);
+ inputPtr += 4;
+ }
+
+ // Byteswap any remaining points:
+ number = halfPoints * 2;
+ for (; number < num_points; number++) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
+ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
+ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+ *inputPtr++ = output2;
+ *inputPtr++ = output1;
+ }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#if LV_HAVE_AVX2
+#include <immintrin.h>
+static inline void volk_64u_byteswap_a_avx2(uint64_t* intsToSwap, unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int nPerSet = 4;
+ const uint64_t nSets = num_points / nPerSet;
+
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+
+ const uint8_t shuffleVector[32] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13,
+ 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18,
+ 17, 16, 31, 30, 29, 28, 27, 26, 25, 24 };
+
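+ // the shuffle reverses the bytes of each 64-bit word, independently per 128-bit lane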
+ const __m256i myShuffle = _mm256_loadu_si256((__m256i*)&shuffleVector[0]);
+
+ for (; number < nSets; number++) {
+ // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ const __m256i input = _mm256_load_si256((__m256i*)inputPtr);
+ const __m256i output = _mm256_shuffle_epi8(input, myShuffle);
+
+ // Store the results
+ _mm256_store_si256((__m256i*)inputPtr, output);
+
+ /* inputPtr is 32bit so increment twice */
+ inputPtr += 2 * nPerSet;
+ }
+
+ // Byteswap any remaining points:
+ for (number = nSets * nPerSet; number < num_points; ++number) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+ uint32_t out1 =
+ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
+ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
+
+ uint32_t out2 =
+ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
+ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
+ *inputPtr++ = out2;
+ *inputPtr++ = out1;
+ }
+}
+
+#endif /* LV_HAVE_AVX2 */
+
+
+#if LV_HAVE_SSSE3
+#include <tmmintrin.h>
+static inline void volk_64u_byteswap_a_ssse3(uint64_t* intsToSwap,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+
+ const unsigned int nPerSet = 2;
+ const uint64_t nSets = num_points / nPerSet;
+
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+
+ uint8_t shuffleVector[16] = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+
+ const __m128i myShuffle = _mm_loadu_si128((__m128i*)&shuffleVector);
+
+ for (; number < nSets; number++) {
+ // Load the 32-bit values, increment inputPtr later since we're doing it in-place.
+ const __m128i input = _mm_load_si128((__m128i*)inputPtr);
+ const __m128i output = _mm_shuffle_epi8(input, myShuffle);
+
+ // Store the results
+ _mm_store_si128((__m128i*)inputPtr, output);
+
+ /* inputPtr is 32bit so increment twice */
+ inputPtr += 2 * nPerSet;
+ }
+
+ // Byteswap any remaining points:
+ for (number = nSets * nPerSet; number < num_points; ++number) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+ uint32_t out1 =
+ ((((output1) >> 24) & 0x000000ff) | (((output1) >> 8) & 0x0000ff00) |
+ (((output1) << 8) & 0x00ff0000) | (((output1) << 24) & 0xff000000));
+
+ uint32_t out2 =
+ ((((output2) >> 24) & 0x000000ff) | (((output2) >> 8) & 0x0000ff00) |
+ (((output2) << 8) & 0x00ff0000) | (((output2) << 24) & 0xff000000));
+ *inputPtr++ = out2;
+ *inputPtr++ = out1;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap,
+ unsigned int num_points)
+{
+ uint32_t* inputPtr = (uint32_t*)intsToSwap;
+ unsigned int point;
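+ // byteswap both 32-bit halves and store them swapped to reverse each 64-bit word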
+ for (point = 0; point < num_points; point++) {
+ uint32_t output1 = *inputPtr;
+ uint32_t output2 = inputPtr[1];
+
+ output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) |
+ ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
+
+ output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) |
+ ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
+
+ *inputPtr++ = output2;
+ *inputPtr++ = output1;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_64u_byteswap_a_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014, 2015, 2018, 2020 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_volk_64u_byteswappuppet_64u_H
+#define INCLUDED_volk_64u_byteswappuppet_64u_H
+
+
+#include <stdint.h>
+#include <string.h>
+#include <volk/volk_64u_byteswap.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_64u_byteswappuppet_64u_generic(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_64u_byteswap_generic((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+}
+#endif
+
+#ifdef LV_HAVE_NEONV8
+static inline void volk_64u_byteswappuppet_64u_neonv8(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_64u_byteswap_neonv8((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+}
+#else
+#ifdef LV_HAVE_NEON
+static inline void volk_64u_byteswappuppet_64u_neon(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_64u_byteswap_neon((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+}
+#endif
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_64u_byteswappuppet_64u_u_sse2(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_64u_byteswap_u_sse2((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+}
+#endif
+
+#ifdef LV_HAVE_SSE2
+static inline void volk_64u_byteswappuppet_64u_a_sse2(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_64u_byteswap_a_sse2((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+}
+#endif
+
+#ifdef LV_HAVE_SSSE3
+static inline void volk_64u_byteswappuppet_64u_u_ssse3(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_64u_byteswap_u_ssse3((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+}
+#endif
+
+#ifdef LV_HAVE_SSSE3
+static inline void volk_64u_byteswappuppet_64u_a_ssse3(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_64u_byteswap_a_ssse3((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+}
+#endif
+
+#ifdef LV_HAVE_AVX2
+static inline void volk_64u_byteswappuppet_64u_u_avx2(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_64u_byteswap_u_avx2((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+}
+#endif
+
+#ifdef LV_HAVE_AVX2
+static inline void volk_64u_byteswappuppet_64u_a_avx2(uint64_t* output,
+ uint64_t* intsToSwap,
+ unsigned int num_points)
+{
+
+ volk_64u_byteswap_a_avx2((uint64_t*)intsToSwap, num_points);
+ memcpy((void*)output, (void*)intsToSwap, num_points * sizeof(uint64_t));
+}
+#endif
+
+#endif
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_64u_popcnt
+ *
+ * \b Overview
+ *
+ * Computes the population count (popcnt), i.e. the Hamming weight of a
+ * binary string. This kernel takes in a single unsigned 64-bit value
+ * and returns the count of 1's that the value contains.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_64u_popcnt(uint64_t* ret, const uint64_t value)
+ * \endcode
+ *
+ * \b Inputs
+ * \li value: The input value.
+ *
+ * \b Outputs
+ * \li ret: The return value containing the popcnt.
+ *
+ * \b Example
+ * \code
+ * int N = 10;
+ * unsigned int alignment = volk_get_alignment();
+ *
+ * uint64_t bitstring[] = {0x0, 0x1, 0xf, 0xffffffffffffffff,
+ * 0x5555555555555555, 0xaaaaaaaaaaaaaaaa, 0x2a2a2a2a2a2a2a2a,
+ * 0xffffffff, 0x32, 0x64};
+ * uint64_t hamming_distance = 0;
+ *
+ * for(unsigned int ii=0; ii<N; ++ii){
+ * volk_64u_popcnt(&hamming_distance, bitstring[ii]);
+ * printf("hamming distance of %lx = %li\n", bitstring[ii], hamming_distance);
+ * }
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_64u_popcnt_a_H
+#define INCLUDED_volk_64u_popcnt_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+
+#ifdef LV_HAVE_GENERIC
+
+
+static inline void volk_64u_popcnt_generic(uint64_t* ret, const uint64_t value)
+{
+ // const uint32_t* valueVector = (const uint32_t*)&value;
+
+ // This is faster than a lookup table
+ // uint32_t retVal = valueVector[0];
+ uint32_t retVal = (uint32_t)(value & 0x00000000FFFFFFFFull);
+
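+ // SWAR reduction: sum bits in pairs, then nibbles, then fold bytes and half-words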
+ retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+ retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+ retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+ retVal = (retVal + (retVal >> 8));
+ retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+ uint64_t retVal64 = retVal;
+
+ // retVal = valueVector[1];
+ retVal = (uint32_t)((value & 0xFFFFFFFF00000000ull) >> 32);
+ retVal = (retVal & 0x55555555) + (retVal >> 1 & 0x55555555);
+ retVal = (retVal & 0x33333333) + (retVal >> 2 & 0x33333333);
+ retVal = (retVal + (retVal >> 4)) & 0x0F0F0F0F;
+ retVal = (retVal + (retVal >> 8));
+ retVal = (retVal + (retVal >> 16)) & 0x0000003F;
+ retVal64 += retVal;
+
+ *ret = retVal64;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
+#if LV_HAVE_SSE4_2 && LV_HAVE_64
+
+#include <nmmintrin.h>
+
+static inline void volk_64u_popcnt_a_sse4_2(uint64_t* ret, const uint64_t value)
+{
+ *ret = _mm_popcnt_u64(value);
+}
+
+#endif /*LV_HAVE_SSE4_2*/
+
+
+#if LV_HAVE_NEON
+#include <arm_neon.h>
+static inline void volk_64u_popcnt_neon(uint64_t* ret, const uint64_t value)
+{
+ uint8x8_t input_val, count8x8_val;
+ uint16x4_t count16x4_val;
+ uint32x2_t count32x2_val;
+ uint64x1_t count64x1_val;
+
+ input_val = vld1_u8((unsigned char*)&value);
+ count8x8_val = vcnt_u8(input_val);
+ count16x4_val = vpaddl_u8(count8x8_val);
+ count32x2_val = vpaddl_u16(count16x4_val);
+ count64x1_val = vpaddl_u32(count32x2_val);
+ vst1_u64(ret, count64x1_val);
+
+ //*ret = _mm_popcnt_u64(value);
+}
+#endif /*LV_HAVE_NEON*/
+
+
+#endif /*INCLUDED_volk_64u_popcnt_a_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_volk_64u_popcntpuppet_64u_H
+#define INCLUDED_volk_64u_popcntpuppet_64u_H
+
+#include <stdint.h>
+#include <string.h>
+#include <volk/volk_64u_popcnt.h>
+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector,
+ const uint64_t* inVector,
+ unsigned int num_points)
+{
+ unsigned int ii;
+ for (ii = 0; ii < num_points; ++ii) {
+ volk_64u_popcnt_generic(outVector + ii, num_points);
+ }
+ memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t));
+}
+#endif /* LV_HAVE_GENERIC */
+
+#if LV_HAVE_SSE4_2 && LV_HAVE_64
+static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector,
+ const uint64_t* inVector,
+ unsigned int num_points)
+{
+ unsigned int ii;
+ for (ii = 0; ii < num_points; ++ii) {
+ volk_64u_popcnt_a_sse4_2(outVector + ii, num_points);
+ }
+ memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t));
+}
+#endif /* LV_HAVE_SSE4_2 */
+
+#ifdef LV_HAVE_NEON
+static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector,
+ const uint64_t* inVector,
+ unsigned int num_points)
+{
+ unsigned int ii;
+ for (ii = 0; ii < num_points; ++ii) {
+ volk_64u_popcnt_neon(outVector + ii, num_points);
+ }
+ memcpy((void*)outVector, (void*)inVector, num_points * sizeof(uint64_t));
+}
+#endif /* LV_HAVE_NEON */
+
+#endif /* INCLUDED_volk_64u_popcntpuppet_64u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_8i_convert_16i
+ *
+ * \b Overview
+ *
+ * Converts the input vector of 8-bit chars to a vector of 16-bit
+ * shorts. Each value is multiplied by 256 so the 8-bit range maps
+ * onto the full 16-bit range.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8i_convert_16i(int16_t* outputVector, const int8_t* inputVector, unsigned int
+ * num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The input vector of 8-bit chars.
+ * \li num_points: The number of values.
+ *
+ * \b Outputs
+ * \li outputVector: The output 16-bit shorts.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * int8_t* in = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment);
+ * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
+ *
+ * // fill 'in' with 8-bit samples, then convert
+ * volk_8i_convert_16i(out, in, N);
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_8i_convert_16i_u_H
+#define INCLUDED_volk_8i_convert_16i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_8i_convert_16i_u_avx2(int16_t* outputVector,
+ const int8_t* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+ __m256i* outputVectorPtr = (__m256i*)outputVector;
+ __m128i inputVal;
+ __m256i ret;
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal = _mm_loadu_si128(inputVectorPtr);
+ ret = _mm256_cvtepi8_epi16(inputVal);
+ ret = _mm256_slli_epi16(ret, 8); // Multiply by 256
+ _mm256_storeu_si256(outputVectorPtr, ret);
+
+ outputVectorPtr++;
+ inputVectorPtr++;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (int16_t)(inputVector[number]) * 256;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector,
+ const int8_t* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+ __m128i* outputVectorPtr = (__m128i*)outputVector;
+ __m128i inputVal;
+ __m128i ret;
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal = _mm_loadu_si128(inputVectorPtr);
+ ret = _mm_cvtepi8_epi16(inputVal);
+ ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+ _mm_storeu_si128(outputVectorPtr, ret);
+
+ outputVectorPtr++;
+
+ inputVal = _mm_srli_si128(inputVal, 8);
+ ret = _mm_cvtepi8_epi16(inputVal);
+ ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+ _mm_storeu_si128(outputVectorPtr, ret);
+
+ outputVectorPtr++;
+
+ inputVectorPtr++;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (int16_t)(inputVector[number]) * 256;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_8i_convert_16i_generic(int16_t* outputVector,
+ const int8_t* inputVector,
+ unsigned int num_points)
+{
+ int16_t* outputVectorPtr = outputVector;
+ const int8_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_VOLK_8s_CONVERT_16s_UNALIGNED8_H */
+
+
+#ifndef INCLUDED_volk_8i_convert_16i_a_H
+#define INCLUDED_volk_8i_convert_16i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_8i_convert_16i_a_avx2(int16_t* outputVector,
+ const int8_t* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+ __m256i* outputVectorPtr = (__m256i*)outputVector;
+ __m128i inputVal;
+ __m256i ret;
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal = _mm_load_si128(inputVectorPtr);
+ ret = _mm256_cvtepi8_epi16(inputVal);
+ ret = _mm256_slli_epi16(ret, 8); // Multiply by 256
+ _mm256_store_si256(outputVectorPtr, ret);
+
+ outputVectorPtr++;
+ inputVectorPtr++;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (int16_t)(inputVector[number]) * 256;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector,
+ const int8_t* inputVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ const __m128i* inputVectorPtr = (const __m128i*)inputVector;
+ __m128i* outputVectorPtr = (__m128i*)outputVector;
+ __m128i inputVal;
+ __m128i ret;
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal = _mm_load_si128(inputVectorPtr);
+ ret = _mm_cvtepi8_epi16(inputVal);
+ ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+ _mm_store_si128(outputVectorPtr, ret);
+
+ outputVectorPtr++;
+
+ inputVal = _mm_srli_si128(inputVal, 8);
+ ret = _mm_cvtepi8_epi16(inputVal);
+ ret = _mm_slli_epi16(ret, 8); // Multiply by 256
+ _mm_store_si128(outputVectorPtr, ret);
+
+ outputVectorPtr++;
+
+ inputVectorPtr++;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (int16_t)(inputVector[number]) * 256;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector,
+ const int8_t* inputVector,
+ unsigned int num_points)
+{
+ int16_t* outputVectorPtr = outputVector;
+ const int8_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_8i_convert_16i_neon(int16_t* outputVector,
+ const int8_t* inputVector,
+ unsigned int num_points)
+{
+ int16_t* outputVectorPtr = outputVector;
+ const int8_t* inputVectorPtr = inputVector;
+ unsigned int number;
+ const unsigned int eighth_points = num_points / 8;
+
+ int8x8_t input_vec;
+ int16x8_t converted_vec;
+
+ // NEON doesn't have a concept of 8 bit registers, so we are really
+ // dealing with the low half of 16-bit registers. Since this requires
+ // a move instruction we likely do better with ASM here.
+ for (number = 0; number < eighth_points; ++number) {
+ input_vec = vld1_s8(inputVectorPtr);
+ converted_vec = vmovl_s8(input_vec);
+ // converted_vec = vmulq_s16(converted_vec, scale_factor);
+ converted_vec = vshlq_n_s16(converted_vec, 8);
+ vst1q_s16(outputVectorPtr, converted_vec);
+
+ inputVectorPtr += 8;
+ outputVectorPtr += 8;
+ }
+
+ for (number = eighth_points * 8; number < num_points; number++) {
+ *outputVectorPtr++ = ((int16_t)(*inputVectorPtr++)) * 256;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#ifdef LV_HAVE_ORC
+extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector,
+ const int8_t* inputVector,
+ unsigned int num_points);
+
+static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector,
+ const int8_t* inputVector,
+ unsigned int num_points)
+{
+ volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_VOLK_8s_CONVERT_16s_ALIGNED8_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_8i_s32f_convert_32f
+ *
+ * \b Overview
+ *
+ * Converts the input vector of 8-bit chars to a vector of floats. The
+ * converted values are then divided by the scalar factor.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8i_s32f_convert_32f(float* outputVector, const int8_t* inputVector, const
+ * float scalar, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li inputVector: The input vector of 8-bit chars.
+ * \li scalar: the scaling factor used to divide the results of the conversion.
+ * \li num_points: The number of values.
+ *
+ * \b Outputs
+ * \li outputVector: The output vector of floats.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * int8_t* in = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment);
+ * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ * // scale the 8-bit samples by 1/128 into the range [-1, 1)
+ * volk_8i_s32f_convert_32f(out, in, 128.f, N);
+ *
+ * volk_free(in);
+ * volk_free(out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
+#define INCLUDED_volk_8i_s32f_convert_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_8i_s32f_convert_32f_u_avx2(float* outputVector,
+ const int8_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ const int8_t* inputVectorPtr = inputVector;
+ __m256 ret;
+ __m128i inputVal128;
+ __m256i interimVal;
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr);
+
+ interimVal = _mm256_cvtepi8_epi32(inputVal128);
+ ret = _mm256_cvtepi32_ps(interimVal);
+ ret = _mm256_mul_ps(ret, invScalar);
+ _mm256_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 8;
+
+ inputVal128 = _mm_srli_si128(inputVal128, 8);
+ interimVal = _mm256_cvtepi8_epi32(inputVal128);
+ ret = _mm256_cvtepi32_ps(interimVal);
+ ret = _mm256_mul_ps(ret, invScalar);
+ _mm256_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 8;
+
+ inputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]) * iScalar;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_8i_s32f_convert_32f_u_sse4_1(float* outputVector,
+ const int8_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ const int8_t* inputVectorPtr = inputVector;
+ __m128 ret;
+ __m128i inputVal;
+ __m128i interimVal;
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
+
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_storeu_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]) * iScalar;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_8i_s32f_convert_32f_generic(float* outputVector,
+ const int8_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* outputVectorPtr = outputVector;
+ const int8_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ const float iScalar = 1.0 / scalar;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_VOLK_8s_CONVERT_32f_UNALIGNED8_H */
+
+#ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
+#define INCLUDED_volk_8i_s32f_convert_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_8i_s32f_convert_32f_a_avx2(float* outputVector,
+ const int8_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ const int8_t* inputVectorPtr = inputVector;
+ __m256 ret;
+ __m128i inputVal128;
+ __m256i interimVal;
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr);
+
+ interimVal = _mm256_cvtepi8_epi32(inputVal128);
+ ret = _mm256_cvtepi32_ps(interimVal);
+ ret = _mm256_mul_ps(ret, invScalar);
+ _mm256_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 8;
+
+ inputVal128 = _mm_srli_si128(inputVal128, 8);
+ interimVal = _mm256_cvtepi8_epi32(inputVal128);
+ ret = _mm256_cvtepi32_ps(interimVal);
+ ret = _mm256_mul_ps(ret, invScalar);
+ _mm256_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 8;
+
+ inputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]) * iScalar;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_8i_s32f_convert_32f_a_sse4_1(float* outputVector,
+ const int8_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+
+ float* outputVectorPtr = outputVector;
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ const int8_t* inputVectorPtr = inputVector;
+ __m128 ret;
+ __m128i inputVal;
+ __m128i interimVal;
+
+ for (; number < sixteenthPoints; number++) {
+ inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
+
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVal = _mm_srli_si128(inputVal, 4);
+ interimVal = _mm_cvtepi8_epi32(inputVal);
+ ret = _mm_cvtepi32_ps(interimVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ _mm_store_ps(outputVectorPtr, ret);
+ outputVectorPtr += 4;
+
+ inputVectorPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ outputVector[number] = (float)(inputVector[number]) * iScalar;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_8i_s32f_convert_32f_neon(float* outputVector,
+ const int8_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* outputVectorPtr = outputVector;
+ const int8_t* inputVectorPtr = inputVector;
+
+ const float iScalar = 1.0 / scalar;
+ const float32x4_t qiScalar = vdupq_n_f32(iScalar);
+
+ int8x16_t inputVal;
+
+ int16x8_t lower;
+ int16x8_t higher;
+
+ float32x4_t outputFloat;
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+ for (; number < sixteenthPoints; number++) {
+ inputVal = vld1q_s8(inputVectorPtr);
+ inputVectorPtr += 16;
+
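+ // widen s8 -> s16 -> s32, convert to float, then scale by 1/scalar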
+ lower = vmovl_s8(vget_low_s8(inputVal));
+ higher = vmovl_s8(vget_high_s8(inputVal));
+
+ outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(lower))), qiScalar);
+ vst1q_f32(outputVectorPtr, outputFloat);
+ outputVectorPtr += 4;
+
+ outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(lower))), qiScalar);
+ vst1q_f32(outputVectorPtr, outputFloat);
+ outputVectorPtr += 4;
+
+ outputFloat = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(higher))), qiScalar);
+ vst1q_f32(outputVectorPtr, outputFloat);
+ outputVectorPtr += 4;
+
+ outputFloat =
+ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(higher))), qiScalar);
+ vst1q_f32(outputVectorPtr, outputFloat);
+ outputVectorPtr += 4;
+ }
+ for (number = sixteenthPoints * 16; number < num_points; number++) {
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+ }
+}
+
+#endif /* LV_HAVE_NEON */
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector,
+ const int8_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* outputVectorPtr = outputVector;
+ const int8_t* inputVectorPtr = inputVector;
+ unsigned int number = 0;
+ const float iScalar = 1.0 / scalar;
+
+ for (number = 0; number < num_points; number++) {
+ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_ORC
+extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector,
+ const int8_t* inputVector,
+ const float scalar,
+ unsigned int num_points);
+
+static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
+ const int8_t* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float invscalar = 1.0 / scalar;
+ volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+
+#endif /* INCLUDED_VOLK_8s_CONVERT_32f_ALIGNED8_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_8ic_deinterleave_16i_x2
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 8-bit char vector into I & Q vector data
+ * and converts them to 16-bit shorts, scaling each component by 256.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8ic_deinterleave_16i_x2(int16_t* iBuffer, int16_t* qBuffer, const lv_8sc_t*
+ * complexVector, unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ * \li qBuffer: The Q buffer output data.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ *
+ * volk_8ic_deinterleave_16i_x2();
+ *
+ * volk_free(x);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
+#define INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ __m256i MoveMask = _mm256_set_epi8(15,
+ 13,
+ 11,
+ 9,
+ 7,
+ 5,
+ 3,
+ 1,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 15,
+ 13,
+ 11,
+ 9,
+ 7,
+ 5,
+ 3,
+ 1,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+ __m256i complexVal, iOutputVal, qOutputVal;
+ __m128i iOutputVal0, qOutputVal0;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
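+ // 0xd8 permutes the 64-bit lanes as {0, 2, 1, 3}, so all I bytes land in the
+ // low 128 bits and all Q bytes in the high 128 bits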
+ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+
+ iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
+ qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
+
+ iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
+ iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
+
+ qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
+ qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
+
+ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
+ _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 16;
+ qBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ =
+ ((int16_t)*complexVectorPtr++) *
+ 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
+ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ __m128i iMoveMask = _mm_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0); // set 16 byte values
+ __m128i qMoveMask = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+ __m128i complexVal, iOutputVal, qOutputVal;
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for (number = 0; number < eighthPoints; number++) {
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16; // aligned load
+
+ iOutputVal = _mm_shuffle_epi8(complexVal,
+ iMoveMask); // shuffle 16 bytes of 128bit complexVal
+ qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
+
+ iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions
+ // of lower 8 bytes of input to output
+ iOutputVal =
+ _mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8
+ // 16-bit integers, shift in with zeros
+
+ qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
+ qOutputVal = _mm_slli_epi16(qOutputVal, 8);
+
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store
+ _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ =
+ ((int16_t)*complexVectorPtr++) *
+ 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
+ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ __m128i iMoveMask = _mm_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0); // set 16 byte values
+ __m128i qMoveMask = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+ __m256i complexVal, iOutputVal, qOutputVal;
+ __m128i complexVal1, complexVal0;
+ __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32; // aligned load
+
+ // Extract from complexVal to iOutputVal and qOutputVal
+ complexVal1 = _mm256_extractf128_si256(complexVal, 1);
+ complexVal0 = _mm256_extractf128_si256(complexVal, 0);
+
+ iOutputVal1 = _mm_shuffle_epi8(
+ complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal
+ iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
+ qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
+ qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);
+
+ iOutputVal1 =
+ _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of
+ // lower 8 bytes of input to output
+ iOutputVal1 =
+ _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8
+ // 16-bit integers, shift in with zeros
+ iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
+ iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);
+
+ qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
+ qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8);
+ qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0);
+ qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8);
+
+ // Pack iOutputVal0,1 to iOutputVal
+ __m256i dummy = _mm256_setzero_si256();
+ iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);
+ iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);
+ qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);
+ qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);
+
+ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store
+ _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 16;
+ qBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ =
+ ((int16_t)*complexVectorPtr++) *
+ 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
+ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ unsigned int number;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
+ *qBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */
+
+#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
+#define INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
+ int16_t* qBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ __m256i MoveMask = _mm256_set_epi8(15,
+ 13,
+ 11,
+ 9,
+ 7,
+ 5,
+ 3,
+ 1,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 15,
+ 13,
+ 11,
+ 9,
+ 7,
+ 5,
+ 3,
+ 1,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+ __m256i complexVal, iOutputVal, qOutputVal;
+ __m128i iOutputVal0, qOutputVal0;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
+ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+
+ iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
+ qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
+
+ iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
+ iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
+
+ qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
+ qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
+
+ _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
+ _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 16;
+ qBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ =
+ ((int16_t)*complexVectorPtr++) *
+ 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
+ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_8ic_deinterleave_real_16i
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 8-bit char vector into just the I (real)
+ * vector and converts it to 16-bit shorts, scaling each value by 128.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8ic_deinterleave_real_16i(int16_t* iBuffer, const lv_8sc_t* complexVector,
+ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_8sc_t* cpx = (lv_8sc_t*)volk_malloc(sizeof(lv_8sc_t)*N, alignment);
+ * int16_t* i_out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
+ *
+ * volk_8ic_deinterleave_real_16i(i_out, cpx, N);
+ *
+ * volk_free(cpx); volk_free(i_out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_8ic_deinterleave_real_16i_a_H
+#define INCLUDED_volk_8ic_deinterleave_real_16i_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_8ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ __m256i moveMask = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+ __m256i complexVal, outputVal;
+ __m128i outputVal0;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
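+ // moveMask keeps only the I bytes of each 128-bit lane; the 0xd8 lane permute
+ // then packs all 16 I bytes into the low 128 bits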
+ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+
+ outputVal0 = _mm256_extractf128_si256(complexVal, 0);
+
+ outputVal = _mm256_cvtepi8_epi16(outputVal0);
+ outputVal = _mm256_slli_epi16(outputVal, 7);
+
+ _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+
+ iBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ __m128i moveMask = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ __m128i complexVal, outputVal;
+
+ unsigned int eighthPoints = num_points / 8;
+
+ for (number = 0; number < eighthPoints; number++) {
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+
+ complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+ outputVal = _mm_cvtepi8_epi16(complexVal);
+ outputVal = _mm_slli_epi16(outputVal, 7);
+
+ _mm_store_si128((__m128i*)iBufferPtr, outputVal);
+ iBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_8ic_deinterleave_real_16i_a_avx(int16_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ __m128i moveMask = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ __m256i complexVal, outputVal;
+ __m128i complexVal1, complexVal0, outputVal1, outputVal0;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal1 = _mm256_extractf128_si256(complexVal, 1);
+ complexVal0 = _mm256_extractf128_si256(complexVal, 0);
+
+ outputVal1 = _mm_shuffle_epi8(complexVal1, moveMask);
+ outputVal0 = _mm_shuffle_epi8(complexVal0, moveMask);
+
+ outputVal1 = _mm_cvtepi8_epi16(outputVal1);
+ outputVal1 = _mm_slli_epi16(outputVal1, 7);
+ outputVal0 = _mm_cvtepi8_epi16(outputVal0);
+ outputVal0 = _mm_slli_epi16(outputVal0, 7);
+
+ __m256i dummy = _mm256_setzero_si256();
+ outputVal = _mm256_insertf128_si256(dummy, outputVal0, 0);
+ outputVal = _mm256_insertf128_si256(outputVal, outputVal1, 1);
+ _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+
+ iBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_8ic_deinterleave_real_16i_generic(int16_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = ((int16_t)(*complexVectorPtr++)) * 128;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_8ic_deinterleave_real_16i_a_H */
+
+#ifndef INCLUDED_volk_8ic_deinterleave_real_16i_u_H
+#define INCLUDED_volk_8ic_deinterleave_real_16i_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_8ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ __m256i moveMask = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+ __m256i complexVal, outputVal;
+ __m128i outputVal0;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+
+ outputVal0 = _mm256_extractf128_si256(complexVal, 0);
+
+ outputVal = _mm256_cvtepi8_epi16(outputVal0);
+ outputVal = _mm256_slli_epi16(outputVal, 7);
+
+ _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
+
+ iBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+#endif /* INCLUDED_volk_8ic_deinterleave_real_16i_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_8ic_deinterleave_real_8i
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 8-bit char vector into just the I (real)
+ * vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8ic_deinterleave_real_8i(int8_t* iBuffer, const lv_8sc_t* complexVector,
+ * unsigned int num_points) \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * unsigned int alignment = volk_get_alignment();
+ * lv_8sc_t* cpx = (lv_8sc_t*)volk_malloc(sizeof(lv_8sc_t)*N, alignment);
+ * int8_t* i_out = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment);
+ *
+ * volk_8ic_deinterleave_real_8i(i_out, cpx, N);
+ *
+ * volk_free(cpx); volk_free(i_out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
+#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m256i moveMask1 = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+ __m256i moveMask2 = _mm256_set_epi8(14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80);
+ __m256i complexVal1, complexVal2, outputVal;
+
+ unsigned int thirtysecondPoints = num_points / 32;
+
+ for (number = 0; number < thirtysecondPoints; number++) {
+
+ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
+ complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
+ outputVal = _mm256_or_si256(complexVal1, complexVal2);
+ outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
+
+ _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+ iBufferPtr += 32;
+ }
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+
+static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m128i moveMask1 = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ __m128i moveMask2 = _mm_set_epi8(
+ 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+ __m128i complexVal1, complexVal2, outputVal;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for (number = 0; number < sixteenthPoints; number++) {
+ complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+
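+ // moveMask1 packs the 8 real bytes of the first vector into the low half and
+ // moveMask2 packs the next 8 into the high half, so OR-ing yields 16 reals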
+ complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
+ complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
+
+ outputVal = _mm_or_si128(complexVal1, complexVal2);
+
+ _mm_store_si128((__m128i*)iBufferPtr, outputVal);
+ iBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m128i moveMaskL = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ __m128i moveMaskH = _mm_set_epi8(
+ 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+ __m256i complexVal1, complexVal2, outputVal;
+ __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1,
+ outputVal2;
+
+ unsigned int thirtysecondPoints = num_points / 32;
+
+ for (number = 0; number < thirtysecondPoints; number++) {
+
+ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal1H = _mm256_extractf128_si256(complexVal1, 1);
+ complexVal1L = _mm256_extractf128_si256(complexVal1, 0);
+ complexVal2H = _mm256_extractf128_si256(complexVal2, 1);
+ complexVal2L = _mm256_extractf128_si256(complexVal2, 0);
+
+ complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH);
+ complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL);
+ outputVal1 = _mm_or_si128(complexVal1H, complexVal1L);
+
+
+ complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH);
+ complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL);
+ outputVal2 = _mm_or_si128(complexVal2H, complexVal2L);
+
+ __m256i dummy = _mm256_setzero_si256();
+ outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0);
+ outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1);
+
+
+ _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+ iBufferPtr += 32;
+ }
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number;
+ unsigned int sixteenth_points = num_points / 16;
+
+ int8x16x2_t input_vector;
+ for (number = 0; number < sixteenth_points; ++number) {
+ input_vector = vld2q_s8((int8_t*)complexVector);
+ vst1q_s8(iBuffer, input_vector.val[0]);
+ iBuffer += 16;
+ complexVector += 16;
+ }
+
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ for (number = sixteenth_points * 16; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
+#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */
+
+#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H
+#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
+ const lv_8sc_t* complexVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m256i moveMask1 = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+ __m256i moveMask2 = _mm256_set_epi8(14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80);
+ __m256i complexVal1, complexVal2, outputVal;
+
+ unsigned int thirtysecondPoints = num_points / 32;
+
+ for (number = 0; number < thirtysecondPoints; number++) {
+
+ complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
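+ // Each mask compacts the real bytes into a different 64-bit quadword of its lane;
+ // after the OR the quadwords sit in order 0, 2, 1, 3, and the 0xd8 permutation
+ // restores sequential order.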
+ complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
+ complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
+ outputVal = _mm256_or_si256(complexVal1, complexVal2);
+ outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
+
+ _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
+ iBufferPtr += 32;
+ }
+
+ number = thirtysecondPoints * 32;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_8ic_s32f_deinterleave_32f_x2
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 8-bit char vector into I & Q vector data,
+ * converts them to floats, and divides the results by the scalar
+ * factor.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8ic_s32f_deinterleave_32f_x2(float* iBuffer, float* qBuffer, const lv_8sc_t*
+ *     complexVector, const float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li scalar: The scalar value used to divide the floating point results.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ * \li qBuffer: The Q buffer output data.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * lv_8sc_t* in = (lv_8sc_t*)volk_malloc(sizeof(lv_8sc_t) * N, volk_get_alignment());
+ * float* i_out = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
+ * float* q_out = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
+ *
+ * // scale the interleaved 8-bit samples in `in` down by 128 (example scalar)
+ * volk_8ic_s32f_deinterleave_32f_x2(i_out, q_out, in, 128.f, N);
+ *
+ * volk_free(in); volk_free(i_out); volk_free(q_out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
+#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer,
+ float* qBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+ __m128 iFloatValue, qFloatValue;
+
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m128i iMoveMask = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ __m128i qMoveMask = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+
+ for (; number < eighthPoints; number++) {
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
+ qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
+
+ iIntVal = _mm_cvtepi8_epi32(iComplexVal);
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+ _mm_store_ps(iBufferPtr, iFloatValue);
+ iBufferPtr += 4;
+
+ iComplexVal = _mm_srli_si128(iComplexVal, 4);
+
+ iIntVal = _mm_cvtepi8_epi32(iComplexVal);
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+ _mm_store_ps(iBufferPtr, iFloatValue);
+ iBufferPtr += 4;
+
+ qIntVal = _mm_cvtepi8_epi32(qComplexVal);
+ qFloatValue = _mm_cvtepi32_ps(qIntVal);
+ qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
+ _mm_store_ps(qBufferPtr, qFloatValue);
+ qBufferPtr += 4;
+
+ qComplexVal = _mm_srli_si128(qComplexVal, 4);
+
+ qIntVal = _mm_cvtepi8_epi32(qComplexVal);
+ qFloatValue = _mm_cvtepi32_ps(qIntVal);
+ qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
+ _mm_store_ps(qBufferPtr, qFloatValue);
+
+ qBufferPtr += 4;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer,
+ float* qBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 cplxValue1, cplxValue2, iValue, qValue;
+
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
+
+ for (; number < quarterPoints; number++) {
+ floatBuffer[0] = (float)(complexVectorPtr[0]);
+ floatBuffer[1] = (float)(complexVectorPtr[1]);
+ floatBuffer[2] = (float)(complexVectorPtr[2]);
+ floatBuffer[3] = (float)(complexVectorPtr[3]);
+
+ floatBuffer[4] = (float)(complexVectorPtr[4]);
+ floatBuffer[5] = (float)(complexVectorPtr[5]);
+ floatBuffer[6] = (float)(complexVectorPtr[6]);
+ floatBuffer[7] = (float)(complexVectorPtr[7]);
+
+ cplxValue1 = _mm_load_ps(&floatBuffer[0]);
+ cplxValue2 = _mm_load_ps(&floatBuffer[4]);
+
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
+ cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+
+ _mm_store_ps(iBufferPtr, iValue);
+ _mm_store_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ complexVectorPtr = (int8_t*)&complexVector[number];
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
+ float* qBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+ __m256 iFloatValue, qFloatValue;
+
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m256i iMoveMask = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+ __m256i qMoveMask = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 15,
+ 13,
+ 11,
+ 9,
+ 7,
+ 5,
+ 3,
+ 1,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 15,
+ 13,
+ 11,
+ 9,
+ 7,
+ 5,
+ 3,
+ 1);
+
+ for (; number < sixteenthPoints; number++) {
+ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask);
+ qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask);
+
+ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+ _mm256_store_ps(iBufferPtr, iFloatValue);
+ iBufferPtr += 8;
+
+ iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110);
+ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+ _mm256_store_ps(iBufferPtr, iFloatValue);
+ iBufferPtr += 8;
+
+ qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
+ qFloatValue = _mm256_cvtepi32_ps(qIntVal);
+ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
+ _mm256_store_ps(qBufferPtr, qFloatValue);
+ qBufferPtr += 8;
+
+ qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110);
+ qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
+ qFloatValue = _mm256_cvtepi32_ps(qIntVal);
+ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
+ _mm256_store_ps(qBufferPtr, qFloatValue);
+ qBufferPtr += 8;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_8ic_s32f_deinterleave_32f_x2_generic(float* iBuffer,
+ float* qBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+ unsigned int number;
+ const float invScalar = 1.0 / scalar;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H */
+
+
+#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
+#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
+ float* qBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+ __m256 iFloatValue, qFloatValue;
+
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ __m256i complexVal, iIntVal, qIntVal;
+ __m128i iComplexVal, qComplexVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m256i MoveMask = _mm256_set_epi8(15,
+ 13,
+ 11,
+ 9,
+ 7,
+ 5,
+ 3,
+ 1,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 15,
+ 13,
+ 11,
+ 9,
+ 7,
+ 5,
+ 3,
+ 1,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+
+ for (; number < sixteenthPoints; number++) {
+ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
+ complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
+ iComplexVal = _mm256_extractf128_si256(complexVal, 0);
+ qComplexVal = _mm256_extractf128_si256(complexVal, 1);
+
+ iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+ _mm256_storeu_ps(iBufferPtr, iFloatValue);
+ iBufferPtr += 8;
+
+ qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
+ qFloatValue = _mm256_cvtepi32_ps(qIntVal);
+ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
+ _mm256_storeu_ps(qBufferPtr, qFloatValue);
+ qBufferPtr += 8;
+
+ complexVal = _mm256_srli_si256(complexVal, 8);
+ iComplexVal = _mm256_extractf128_si256(complexVal, 0);
+ qComplexVal = _mm256_extractf128_si256(complexVal, 1);
+
+ iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+ _mm256_storeu_ps(iBufferPtr, iFloatValue);
+ iBufferPtr += 8;
+
+ qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
+ qFloatValue = _mm256_cvtepi32_ps(qIntVal);
+ qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
+ _mm256_storeu_ps(qBufferPtr, qFloatValue);
+ qBufferPtr += 8;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_8ic_s32f_deinterleave_real_32f
+ *
+ * \b Overview
+ *
+ * Deinterleaves the complex 8-bit char vector into just the real (I)
+ * vector, converts the samples to floats, and divides the results by
+ * the scalar factor.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8ic_s32f_deinterleave_real_32f(float* iBuffer, const lv_8sc_t* complexVector,
+ *     const float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li complexVector: The complex input vector.
+ * \li scalar: The scalar value used to divide the floating point results.
+ * \li num_points: The number of complex data values to be deinterleaved.
+ *
+ * \b Outputs
+ * \li iBuffer: The I buffer output data.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * lv_8sc_t* in = (lv_8sc_t*)volk_malloc(sizeof(lv_8sc_t) * N, volk_get_alignment());
+ * float* i_out = (float*)volk_malloc(sizeof(float) * N, volk_get_alignment());
+ *
+ * // keep only the real (I) samples, scaled down by 128 (example scalar)
+ * volk_8ic_s32f_deinterleave_real_32f(i_out, in, 128.f, N);
+ *
+ * volk_free(in); volk_free(i_out);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H
+#define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_8ic_s32f_deinterleave_real_32f_a_avx2(float* iBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+ __m256 iFloatValue;
+
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ __m256i complexVal, iIntVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m256i moveMask = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+ for (; number < sixteenthPoints; number++) {
+ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+
+ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal));
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+ _mm256_store_ps(iBufferPtr, iFloatValue);
+ iBufferPtr += 8;
+
+ complexVal = _mm256_permute4x64_epi64(complexVal, 0b11000110);
+ iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(complexVal));
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+ _mm256_store_ps(iBufferPtr, iFloatValue);
+ iBufferPtr += 8;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_8ic_s32f_deinterleave_real_32f_a_sse4_1(float* iBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+ __m128 iFloatValue;
+
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ __m128i complexVal, iIntVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m128i moveMask = _mm_set_epi8(
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+
+ for (; number < eighthPoints; number++) {
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
+ complexVectorPtr += 16;
+ complexVal = _mm_shuffle_epi8(complexVal, moveMask);
+
+ iIntVal = _mm_cvtepi8_epi32(complexVal);
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+ _mm_store_ps(iBufferPtr, iFloatValue);
+
+ iBufferPtr += 4;
+
+ complexVal = _mm_srli_si128(complexVal, 4);
+ iIntVal = _mm_cvtepi8_epi32(complexVal);
+ iFloatValue = _mm_cvtepi32_ps(iIntVal);
+
+ iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
+
+ _mm_store_ps(iBufferPtr, iFloatValue);
+
+ iBufferPtr += 4;
+ }
+
+ number = eighthPoints * 8;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+
+static inline void
+volk_8ic_s32f_deinterleave_real_32f_a_sse(float* iBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ __m128 iValue;
+
+ const float iScalar = 1.0 / scalar;
+ __m128 invScalar = _mm_set_ps1(iScalar);
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
+
+ for (; number < quarterPoints; number++) {
+ floatBuffer[0] = (float)(*complexVectorPtr);
+ complexVectorPtr += 2;
+ floatBuffer[1] = (float)(*complexVectorPtr);
+ complexVectorPtr += 2;
+ floatBuffer[2] = (float)(*complexVectorPtr);
+ complexVectorPtr += 2;
+ floatBuffer[3] = (float)(*complexVectorPtr);
+ complexVectorPtr += 2;
+
+ iValue = _mm_load_ps(floatBuffer);
+
+ iValue = _mm_mul_ps(iValue, invScalar);
+
+ _mm_store_ps(iBufferPtr, iValue);
+
+ iBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_8ic_s32f_deinterleave_real_32f_generic(float* iBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (const int8_t*)complexVector;
+ float* iBufferPtr = iBuffer;
+ const float invScalar = 1.0 / scalar;
+ for (number = 0; number < num_points; number++) {
+ *iBufferPtr++ = ((float)(*complexVectorPtr++)) * invScalar;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_a_H */
+
+#ifndef INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H
+#define INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_common.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_8ic_s32f_deinterleave_real_32f_u_avx2(float* iBuffer,
+ const lv_8sc_t* complexVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ float* iBufferPtr = iBuffer;
+
+ unsigned int number = 0;
+ const unsigned int sixteenthPoints = num_points / 16;
+ __m256 iFloatValue;
+
+ const float iScalar = 1.0 / scalar;
+ __m256 invScalar = _mm256_set1_ps(iScalar);
+ __m256i complexVal, iIntVal;
+ __m128i hcomplexVal;
+ int8_t* complexVectorPtr = (int8_t*)complexVector;
+
+ __m256i moveMask = _mm256_set_epi8(0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 0x80,
+ 14,
+ 12,
+ 10,
+ 8,
+ 6,
+ 4,
+ 2,
+ 0);
+
+ for (; number < sixteenthPoints; number++) {
+ complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal = _mm256_shuffle_epi8(complexVal, moveMask);
+
+ hcomplexVal = _mm256_extracti128_si256(complexVal, 0);
+ iIntVal = _mm256_cvtepi8_epi32(hcomplexVal);
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+
+ _mm256_storeu_ps(iBufferPtr, iFloatValue);
+
+ iBufferPtr += 8;
+
+ hcomplexVal = _mm256_extracti128_si256(complexVal, 1);
+ iIntVal = _mm256_cvtepi8_epi32(hcomplexVal);
+ iFloatValue = _mm256_cvtepi32_ps(iIntVal);
+
+ iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
+
+ _mm256_storeu_ps(iBufferPtr, iFloatValue);
+
+ iBufferPtr += 8;
+ }
+
+ number = sixteenthPoints * 16;
+ for (; number < num_points; number++) {
+ *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#endif /* INCLUDED_volk_8ic_s32f_deinterleave_real_32f_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
+#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
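+/*
+ * Minimal usage sketch (assuming the standard VOLK dispatcher name for this kernel,
+ * volk_8ic_x2_multiply_conjugate_16ic; buffer names and N below are placeholders):
+ *
+ * \code
+ * unsigned int N = 1024;
+ * lv_8sc_t* a = (lv_8sc_t*)volk_malloc(sizeof(lv_8sc_t) * N, volk_get_alignment());
+ * lv_8sc_t* b = (lv_8sc_t*)volk_malloc(sizeof(lv_8sc_t) * N, volk_get_alignment());
+ * lv_16sc_t* c = (lv_16sc_t*)volk_malloc(sizeof(lv_16sc_t) * N, volk_get_alignment());
+ * // ... fill a and b with 8-bit complex samples ...
+ * volk_8ic_x2_multiply_conjugate_16ic(c, a, b, N);
+ * volk_free(a); volk_free(b); volk_free(c);
+ * \endcode
+ */
+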
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+/*!
+ \brief Multiplies one complex vector by the complex conjugate of a second complex
+ vector and stores the results in a third vector
+ \param cVector The complex vector where the results will be stored
+ \param aVector One of the complex vectors to be multiplied
+ \param bVector The complex vector which will be conjugated and multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied
+ together and stored into cVector
+*/
+static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 8;
+
+ __m256i x, y, realz, imagz;
+ lv_16sc_t* c = cVector;
+ const lv_8sc_t* a = aVector;
+ const lv_8sc_t* b = bVector;
+ __m256i conjugateSign =
+ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+
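+ // Worked example of the arithmetic below: a = 3+4j, b = 1-2j gives
+ // realz = 3*1 + 4*(-2) = -5 and, after the conjugate sign flip and real/imag swap,
+ // imagz = 3*2 + 4*1 = 10, i.e. a * conj(b) = -5 + 10j.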
+ for (; number < quarterPoints; number++) {
+ // Convert 8 bit values into 16 bit values
+ x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
+ y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
+
+ // Calculate the ar*cr - ai*(-ci) portions
+ realz = _mm256_madd_epi16(x, y);
+
+ // Calculate the complex conjugate of the cr + ci j values
+ y = _mm256_sign_epi16(y, conjugateSign);
+
+ // Shift the order of the cr and ci values
+ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
+ _MM_SHUFFLE(2, 3, 0, 1));
+
+ // Calculate the ar*(-ci) + cr*(ai)
+ imagz = _mm256_madd_epi16(x, y);
+
+ // Interleave the real and imaginary results, saturate to 16 bits, and store
+
+ _mm256_store_si256((__m256i*)c,
+ _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
+ _mm256_unpackhi_epi32(realz, imagz)));
+
+ a += 8;
+ b += 8;
+ c += 8;
+ }
+
+ number = quarterPoints * 8;
+ int16_t* c16Ptr = (int16_t*)&cVector[number];
+ int8_t* a8Ptr = (int8_t*)&aVector[number];
+ int8_t* b8Ptr = (int8_t*)&bVector[number];
+ for (; number < num_points; number++) {
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag);
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+ lv_32fc_t temp = aVal * bVal;
+
+ *c16Ptr++ = (int16_t)lv_creal(temp);
+ *c16Ptr++ = (int16_t)lv_cimag(temp);
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+ \brief Multiplies one complex vector by the complex conjugate of a second complex
+ vector and stores the results in a third vector
+ \param cVector The complex vector where the results will be stored
+ \param aVector One of the complex vectors to be multiplied
+ \param bVector The complex vector which will be conjugated and multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied
+ together and stored into cVector
+*/
+static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128i x, y, realz, imagz;
+ lv_16sc_t* c = cVector;
+ const lv_8sc_t* a = aVector;
+ const lv_8sc_t* b = bVector;
+ __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+
+ for (; number < quarterPoints; number++) {
+ // Convert 8 bit values into 16 bit values
+ x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
+ y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
+
+ // Calculate the ar*cr - ai*(-ci) portions
+ realz = _mm_madd_epi16(x, y);
+
+ // Calculate the complex conjugate of the cr + ci j values
+ y = _mm_sign_epi16(y, conjugateSign);
+
+ // Shift the order of the cr and ci values
+ y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
+ _MM_SHUFFLE(2, 3, 0, 1));
+
+ // Calculate the ar*(-ci) + cr*(ai)
+ imagz = _mm_madd_epi16(x, y);
+
+ _mm_store_si128((__m128i*)c,
+ _mm_packs_epi32(_mm_unpacklo_epi32(realz, imagz),
+ _mm_unpackhi_epi32(realz, imagz)));
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+ int16_t* c16Ptr = (int16_t*)&cVector[number];
+ int8_t* a8Ptr = (int8_t*)&aVector[number];
+ int8_t* b8Ptr = (int8_t*)&bVector[number];
+ for (; number < num_points; number++) {
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag);
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+ lv_32fc_t temp = aVal * bVal;
+
+ *c16Ptr++ = (int16_t)lv_creal(temp);
+ *c16Ptr++ = (int16_t)lv_cimag(temp);
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Multiplies one complex vector by the complex conjugate of a second complex
+ vector and stores the results in a third vector
+ \param cVector The complex vector where the results will be stored
+ \param aVector One of the complex vectors to be multiplied
+ \param bVector The complex vector which will be conjugated and multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied
+ together and stored into cVector
+*/
+static inline void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ int16_t* c16Ptr = (int16_t*)cVector;
+ int8_t* a8Ptr = (int8_t*)aVector;
+ int8_t* b8Ptr = (int8_t*)bVector;
+ for (number = 0; number < num_points; number++) {
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag);
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+ lv_32fc_t temp = aVal * bVal;
+
+ *c16Ptr++ = (int16_t)lv_creal(temp);
+ *c16Ptr++ = (int16_t)lv_cimag(temp);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H */
+
+#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H
+#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+/*!
+ \brief Multiplies one complex vector by the complex conjugate of a second complex
+ vector and stores the results in a third vector
+ \param cVector The complex vector where the results will be stored
+ \param aVector One of the complex vectors to be multiplied
+ \param bVector The complex vector which will be conjugated and multiplied
+ \param num_points The number of complex values in aVector and bVector to be multiplied
+ together and stored into cVector
+*/
+static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int oneEigthPoints = num_points / 8;
+
+ __m256i x, y, realz, imagz;
+ lv_16sc_t* c = cVector;
+ const lv_8sc_t* a = aVector;
+ const lv_8sc_t* b = bVector;
+ __m256i conjugateSign =
+ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+
+ for (; number < oneEigthPoints; number++) {
+ // Convert 8 bit values into 16 bit values
+ x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
+ y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
+
+ // Calculate the ar*cr - ai*(-ci) portions
+ realz = _mm256_madd_epi16(x, y);
+
+ // Calculate the complex conjugate of the cr + ci j values
+ y = _mm256_sign_epi16(y, conjugateSign);
+
+ // Shift the order of the cr and ci values
+ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
+ _MM_SHUFFLE(2, 3, 0, 1));
+
+ // Calculate the ar*(-ci) + cr*(ai)
+ imagz = _mm256_madd_epi16(x, y);
+
+ // Interleave the real and imaginary results, saturate to 16 bits, and store
+
+ _mm256_storeu_si256((__m256i*)c,
+ _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
+ _mm256_unpackhi_epi32(realz, imagz)));
+
+ a += 8;
+ b += 8;
+ c += 8;
+ }
+
+ number = oneEigthPoints * 8;
+ int16_t* c16Ptr = (int16_t*)&cVector[number];
+ int8_t* a8Ptr = (int8_t*)&aVector[number];
+ int8_t* b8Ptr = (int8_t*)&bVector[number];
+ for (; number < num_points; number++) {
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag);
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+ lv_32fc_t temp = aVal * bVal;
+
+ *c16Ptr++ = (int16_t)lv_creal(temp);
+ *c16Ptr++ = (int16_t)lv_cimag(temp);
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_8ic_x2_s32f_multiply_conjugate_32fc
+ *
+ * \b Overview
+ *
+ * Multiplies one complex vector by the complex conjugate of a second complex
+ * vector, scales each result by 1/scalar, and stores the results in a third vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8ic_x2_s32f_multiply_conjugate_32fc(lv_32fc_t* cVector, const lv_8sc_t*
+ *     aVector, const lv_8sc_t* bVector, const float scalar, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: One of the complex vectors to be multiplied.
+ * \li bVector: The complex vector which will be converted to complex conjugate and
+ *     multiplied.
+ * \li scalar: Each output value is scaled by 1/scalar.
+ * \li num_points: The number of complex values in aVector and bVector to be multiplied
+ *     together and stored into cVector.
+ *
+ * \b Outputs
+ * \li cVector: The complex vector where the results will be stored.
+ *
+ * \b Example
+ * \code
+ * int N = 10000;
+ * lv_8sc_t* a = (lv_8sc_t*)volk_malloc(sizeof(lv_8sc_t) * N, volk_get_alignment());
+ * lv_8sc_t* b = (lv_8sc_t*)volk_malloc(sizeof(lv_8sc_t) * N, volk_get_alignment());
+ * lv_32fc_t* c = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, volk_get_alignment());
+ *
+ * // c[i] = a[i] * conj(b[i]) / 128 (example scalar)
+ * volk_8ic_x2_s32f_multiply_conjugate_32fc(c, a, b, 128.f, N);
+ *
+ * volk_free(a); volk_free(b); volk_free(c);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
+#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(lv_32fc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int oneEigthPoints = num_points / 8;
+
+ __m256i x, y, realz, imagz;
+ __m256 ret, retlo, rethi;
+ lv_32fc_t* c = cVector;
+ const lv_8sc_t* a = aVector;
+ const lv_8sc_t* b = bVector;
+ __m256i conjugateSign =
+ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+
+ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+
+ for (; number < oneEigthPoints; number++) {
+ // Convert 8 bit values into 16 bit values
+ x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
+ y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
+
+ // Calculate the ar*cr - ai*(-ci) portions
+ realz = _mm256_madd_epi16(x, y);
+
+ // Calculate the complex conjugate of the cr + ci j values
+ y = _mm256_sign_epi16(y, conjugateSign);
+
+ // Shift the order of the cr and ci values
+ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
+ _MM_SHUFFLE(2, 3, 0, 1));
+
+ // Calculate the ar*(-ci) + cr*(ai)
+ imagz = _mm256_madd_epi16(x, y);
+
+ // Interleave real and imaginary and then convert to float values
+ retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
+
+ // Normalize the floating point values
+ retlo = _mm256_mul_ps(retlo, invScalar);
+
+ // Interleave real and imaginary and then convert to float values
+ rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
+
+ // Normalize the floating point values
+ rethi = _mm256_mul_ps(rethi, invScalar);
+
+ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
+ _mm256_store_ps((float*)c, ret);
+ c += 4;
+
+ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
+ _mm256_store_ps((float*)c, ret);
+ c += 4;
+
+ a += 8;
+ b += 8;
+ }
+
+ number = oneEigthPoints * 8;
+ float* cFloatPtr = (float*)&cVector[number];
+ int8_t* a8Ptr = (int8_t*)&aVector[number];
+ int8_t* b8Ptr = (int8_t*)&bVector[number];
+ for (; number < num_points; number++) {
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag);
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+ lv_32fc_t temp = aVal * bVal;
+
+ *cFloatPtr++ = lv_creal(temp) / scalar;
+ *cFloatPtr++ = lv_cimag(temp) / scalar;
+ }
+}
+#endif /* LV_HAVE_AVX2*/
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(lv_32fc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128i x, y, realz, imagz;
+ __m128 ret;
+ lv_32fc_t* c = cVector;
+ const lv_8sc_t* a = aVector;
+ const lv_8sc_t* b = bVector;
+ __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
+
+ __m128 invScalar = _mm_set_ps1(1.0 / scalar);
+
+ for (; number < quarterPoints; number++) {
+ // Convert 8 bit values into 16 bit values
+ x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
+ y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
+
+ // Calculate the ar*cr - ai*(-ci) portions
+ realz = _mm_madd_epi16(x, y);
+
+ // Calculate the complex conjugate of the cr + ci j values
+ y = _mm_sign_epi16(y, conjugateSign);
+
+ // Shift the order of the cr and ci values
+ y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
+ _MM_SHUFFLE(2, 3, 0, 1));
+
+ // Calculate the ar*(-ci) + cr*(ai)
+ imagz = _mm_madd_epi16(x, y);
+
+ // Interleave real and imaginary and then convert to float values
+ ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
+
+ // Normalize the floating point values
+ ret = _mm_mul_ps(ret, invScalar);
+
+ // Store the floating point values
+ _mm_store_ps((float*)c, ret);
+ c += 2;
+
+ // Interleave real and imaginary and then convert to float values
+ ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
+
+ // Normalize the floating point values
+ ret = _mm_mul_ps(ret, invScalar);
+
+ // Store the floating point values
+ _mm_store_ps((float*)c, ret);
+ c += 2;
+
+ a += 4;
+ b += 4;
+ }
+
+ number = quarterPoints * 4;
+ float* cFloatPtr = (float*)&cVector[number];
+ int8_t* a8Ptr = (int8_t*)&aVector[number];
+ int8_t* b8Ptr = (int8_t*)&bVector[number];
+ for (; number < num_points; number++) {
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag);
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+ lv_32fc_t temp = aVal * bVal;
+
+ *cFloatPtr++ = lv_creal(temp) / scalar;
+ *cFloatPtr++ = lv_cimag(temp) / scalar;
+ }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ float* cPtr = (float*)cVector;
+ const float invScalar = 1.0 / scalar;
+ int8_t* a8Ptr = (int8_t*)aVector;
+ int8_t* b8Ptr = (int8_t*)bVector;
+ for (number = 0; number < num_points; number++) {
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag);
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+ lv_32fc_t temp = aVal * bVal;
+
+ *cPtr++ = (lv_creal(temp) * invScalar);
+ *cPtr++ = (lv_cimag(temp) * invScalar);
+ }
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H */
+
+#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
+#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk/volk_complex.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(lv_32fc_t* cVector,
+ const lv_8sc_t* aVector,
+ const lv_8sc_t* bVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ unsigned int number = 0;
+ const unsigned int oneEigthPoints = num_points / 8;
+
+ __m256i x, y, realz, imagz;
+ __m256 ret, retlo, rethi;
+ lv_32fc_t* c = cVector;
+ const lv_8sc_t* a = aVector;
+ const lv_8sc_t* b = bVector;
+ __m256i conjugateSign =
+ _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+
+ __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
+
+ for (; number < oneEigthPoints; number++) {
+ // Convert 8 bit values into 16 bit values
+ x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
+ y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
+
+ // Calculate the ar*cr - ai*(-ci) portions
+ realz = _mm256_madd_epi16(x, y);
+
+ // Calculate the complex conjugate of the cr + ci j values
+ y = _mm256_sign_epi16(y, conjugateSign);
+
+ // Shift the order of the cr and ci values
+ y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
+ _MM_SHUFFLE(2, 3, 0, 1));
+
+ // Calculate the ar*(-ci) + cr*(ai)
+ imagz = _mm256_madd_epi16(x, y);
+
+ // Interleave real and imaginary and then convert to float values
+ retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
+
+ // Normalize the floating point values
+ retlo = _mm256_mul_ps(retlo, invScalar);
+
+ // Interleave real and imaginary and then convert to float values
+ rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
+
+ // Normalize the floating point values
+ rethi = _mm256_mul_ps(rethi, invScalar);
+
+ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
+ _mm256_storeu_ps((float*)c, ret);
+ c += 4;
+
+ ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
+ _mm256_storeu_ps((float*)c, ret);
+ c += 4;
+
+ a += 8;
+ b += 8;
+ }
+
+ number = oneEigthPoints * 8;
+ float* cFloatPtr = (float*)&cVector[number];
+ int8_t* a8Ptr = (int8_t*)&aVector[number];
+ int8_t* b8Ptr = (int8_t*)&bVector[number];
+ for (; number < num_points; number++) {
+ float aReal = (float)*a8Ptr++;
+ float aImag = (float)*a8Ptr++;
+ lv_32fc_t aVal = lv_cmake(aReal, aImag);
+ float bReal = (float)*b8Ptr++;
+ float bImag = (float)*b8Ptr++;
+ lv_32fc_t bVal = lv_cmake(bReal, -bImag);
+ lv_32fc_t temp = aVal * bVal;
+
+ *cFloatPtr++ = lv_creal(temp) / scalar;
+ *cFloatPtr++ = lv_cimag(temp) / scalar;
+ }
+}
+#endif /* LV_HAVE_AVX2*/
+
+
+#endif /* INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_volk_8u_conv_k7_r2puppet_8u_H
+#define INCLUDED_volk_8u_conv_k7_r2puppet_8u_H
+
+#include <string.h>
+#include <volk/volk.h>
+#include <volk/volk_8u_x4_conv_k7_r2_8u.h>
+
+typedef union {
+ // decision_t is a BIT vector
+ unsigned char* t;
+ unsigned int* w;
+} p_decision_t;
+
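+/* XOR-folds the parity of all four bytes of x into its low byte and indexes the
+ * 256-entry table with the (unmasked) result; callers here only pass 7-bit values,
+ * so the index stays in range. */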
+static inline int parity(int x, unsigned char* Partab)
+{
+ x ^= (x >> 16);
+ x ^= (x >> 8);
+ return Partab[x];
+}
+
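+/* Traces back through the stored decision words, starting from endstate, writing one
+ * decoded bit per trellis step into data[]; the initial pointer offset skips past the
+ * tailsize tail decisions. */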
+static inline int chainback_viterbi(unsigned char* data,
+ unsigned int nbits,
+ unsigned int endstate,
+ unsigned int tailsize,
+ unsigned char* decisions)
+{
+ unsigned char* d;
+ int d_ADDSHIFT = 0;
+ int d_numstates = (1 << 6);
+ int d_decision_t_size = d_numstates / 8;
+ unsigned int d_k = 7;
+ int d_framebits = nbits;
+ /* ADDSHIFT and SUBSHIFT make sure that the thing returned is a byte. */
+ d = decisions;
+ /* Make room beyond the end of the encoder register so we can
+ * accumulate a full byte of decoded data
+ */
+
+ endstate = (endstate % d_numstates) << d_ADDSHIFT;
+
+ /* The store into data[] only needs to be done every 8 bits.
+ * But this avoids a conditional branch, and the writes will
+ * combine in the cache anyway
+ */
+
+ d += tailsize * d_decision_t_size; /* Look past tail */
+ int retval;
+ int dif = tailsize - (d_k - 1);
+ // printf("break, %d, %d\n", dif, (nbits+dif)%d_framebits);
+ p_decision_t dec;
+ while (nbits-- > d_framebits - (d_k - 1)) {
+ int k;
+ dec.t = &d[nbits * d_decision_t_size];
+ k = (dec.w[(endstate >> d_ADDSHIFT) / 32] >> ((endstate >> d_ADDSHIFT) % 32)) & 1;
+
+ endstate = (endstate >> 1) | (k << (d_k - 2 + d_ADDSHIFT));
+ // data[((nbits+dif)%nbits)>>3] = endstate>>d_SUBSHIFT;
+ // printf("%d, %d\n", k, (nbits+dif)%d_framebits);
+ data[((nbits + dif) % d_framebits)] = k;
+
+ retval = endstate;
+ }
+ nbits += 1;
+
+ while (nbits-- != 0) {
+ int k;
+
+ dec.t = &d[nbits * d_decision_t_size];
+
+ k = (dec.w[(endstate >> d_ADDSHIFT) / 32] >> ((endstate >> d_ADDSHIFT) % 32)) & 1;
+
+ endstate = (endstate >> 1) | (k << (d_k - 2 + d_ADDSHIFT));
+ data[((nbits + dif) % d_framebits)] = k;
+ }
+ // printf("%d, %d, %d, %d, %d, %d, %d, %d\n",
+ // data[4095],data[4094],data[4093],data[4092],data[4091],data[4090],data[4089],data[4088]);
+
+
+ return retval >> d_ADDSHIFT;
+}
+
+
+#if LV_HAVE_SSE3
+
+#include <emmintrin.h>
+#include <mmintrin.h>
+#include <pmmintrin.h>
+#include <stdio.h>
+#include <xmmintrin.h>
+
+static inline void volk_8u_conv_k7_r2puppet_8u_spiral(unsigned char* syms,
+ unsigned char* dec,
+ unsigned int framebits)
+{
+
+
+ static int once = 1;
+ int d_numstates = (1 << 6);
+ int rate = 2;
+ static unsigned char* D;
+ static unsigned char* Y;
+ static unsigned char* X;
+ static unsigned int excess = 6;
+ static unsigned char* Branchtab;
+ static unsigned char Partab[256];
+
+ int d_polys[2] = { 79, 109 };
+
+
+ if (once) {
+
+ X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
+ Y = X + d_numstates;
+ Branchtab =
+ (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
+ D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
+ volk_get_alignment());
+ int state, i;
+ int cnt, ti;
+
+ /* Initialize parity lookup table */
+ for (i = 0; i < 256; i++) {
+ cnt = 0;
+ ti = i;
+ while (ti) {
+ if (ti & 1)
+ cnt++;
+ ti >>= 1;
+ }
+ Partab[i] = cnt & 1;
+ }
+ /* Initialize the branch table */
+ for (state = 0; state < d_numstates / 2; state++) {
+ for (i = 0; i < rate; i++) {
+ Branchtab[i * d_numstates / 2 + state] =
+ parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
+ }
+ }
+
+ once = 0;
+ }
+
+ // unbias the old_metrics
+ memset(X, 31, d_numstates);
+
+ // initialize decisions
+ memset(D, 0, (d_numstates / 8) * (framebits + 6));
+
+ volk_8u_x4_conv_k7_r2_8u_spiral(
+ Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);
+
+ unsigned int min = X[0];
+ int i = 0, state = 0;
+ for (i = 0; i < (d_numstates); ++i) {
+ if (X[i] < min) {
+ min = X[i];
+ state = i;
+ }
+ }
+
+ chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);
+
+ return;
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+
+#if LV_HAVE_NEON
+
+#include "volk/sse2neon.h"
+
+static inline void volk_8u_conv_k7_r2puppet_8u_neonspiral(unsigned char* syms,
+ unsigned char* dec,
+ unsigned int framebits)
+{
+
+
+ static int once = 1;
+ int d_numstates = (1 << 6);
+ int rate = 2;
+ static unsigned char* D;
+ static unsigned char* Y;
+ static unsigned char* X;
+ static unsigned int excess = 6;
+ static unsigned char* Branchtab;
+ static unsigned char Partab[256];
+
+ int d_polys[2] = { 79, 109 };
+
+
+ if (once) {
+
+ X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
+ Y = X + d_numstates;
+ Branchtab =
+ (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
+ D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
+ volk_get_alignment());
+ int state, i;
+ int cnt, ti;
+
+ /* Initialize parity lookup table */
+ for (i = 0; i < 256; i++) {
+ cnt = 0;
+ ti = i;
+ while (ti) {
+ if (ti & 1)
+ cnt++;
+ ti >>= 1;
+ }
+ Partab[i] = cnt & 1;
+ }
+ /* Initialize the branch table */
+ for (state = 0; state < d_numstates / 2; state++) {
+ for (i = 0; i < rate; i++) {
+ Branchtab[i * d_numstates / 2 + state] =
+ parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
+ }
+ }
+
+ once = 0;
+ }
+
+ // unbias the old_metrics
+ memset(X, 31, d_numstates);
+
+ // initialize decisions
+ memset(D, 0, (d_numstates / 8) * (framebits + 6));
+
+ volk_8u_x4_conv_k7_r2_8u_neonspiral(
+ Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);
+
+ unsigned int min = X[0];
+ int i = 0, state = 0;
+ for (i = 0; i < (d_numstates); ++i) {
+ if (X[i] < min) {
+ min = X[i];
+ state = i;
+ }
+ }
+
+ chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);
+
+ return;
+}
+
+#endif /*LV_HAVE_NEON*/
+
+
+//#if LV_HAVE_AVX2
+//
+//#include <immintrin.h>
+//#include <stdio.h>
+//
+// static inline void volk_8u_conv_k7_r2puppet_8u_avx2(unsigned char* syms,
+// unsigned char* dec,
+// unsigned int framebits)
+//{
+//
+//
+// static int once = 1;
+// int d_numstates = (1 << 6);
+// int rate = 2;
+// static unsigned char* D;
+// static unsigned char* Y;
+// static unsigned char* X;
+// static unsigned int excess = 6;
+// static unsigned char* Branchtab;
+// static unsigned char Partab[256];
+//
+// int d_polys[2] = { 79, 109 };
+//
+//
+// if (once) {
+//
+// X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
+// Y = X + d_numstates;
+// Branchtab =
+// (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
+// D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
+// volk_get_alignment());
+// int state, i;
+// int cnt, ti;
+//
+// /* Initialize parity lookup table */
+// for (i = 0; i < 256; i++) {
+// cnt = 0;
+// ti = i;
+// while (ti) {
+// if (ti & 1)
+// cnt++;
+// ti >>= 1;
+// }
+// Partab[i] = cnt & 1;
+// }
+// /* Initialize the branch table */
+// for (state = 0; state < d_numstates / 2; state++) {
+// for (i = 0; i < rate; i++) {
+// Branchtab[i * d_numstates / 2 + state] =
+// parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
+// }
+// }
+//
+// once = 0;
+// }
+//
+// // unbias the old_metrics
+// memset(X, 31, d_numstates);
+//
+// // initialize decisions
+// memset(D, 0, (d_numstates / 8) * (framebits + 6));
+//
+// volk_8u_x4_conv_k7_r2_8u_avx2(
+// Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);
+//
+// unsigned int min = X[0];
+// int i = 0, state = 0;
+// for (i = 0; i < (d_numstates); ++i) {
+// if (X[i] < min) {
+// min = X[i];
+// state = i;
+// }
+// }
+//
+// chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);
+//
+// return;
+//}
+//
+//#endif /*LV_HAVE_AVX2*/
+
+
+#if LV_HAVE_GENERIC
+
+
+static inline void volk_8u_conv_k7_r2puppet_8u_generic(unsigned char* syms,
+ unsigned char* dec,
+ unsigned int framebits)
+{
+
+
+ static int once = 1;
+ int d_numstates = (1 << 6);
+ int rate = 2;
+ static unsigned char* Y;
+ static unsigned char* X;
+ static unsigned char* D;
+ static unsigned int excess = 6;
+ static unsigned char* Branchtab;
+ static unsigned char Partab[256];
+
+ int d_polys[2] = { 79, 109 };
+
+
+ if (once) {
+
+ X = (unsigned char*)volk_malloc(2 * d_numstates, volk_get_alignment());
+ Y = X + d_numstates;
+ Branchtab =
+ (unsigned char*)volk_malloc(d_numstates / 2 * rate, volk_get_alignment());
+ D = (unsigned char*)volk_malloc((d_numstates / 8) * (framebits + 6),
+ volk_get_alignment());
+
+ int state, i;
+ int cnt, ti;
+
+ /* Initialize parity lookup table */
+ for (i = 0; i < 256; i++) {
+ cnt = 0;
+ ti = i;
+ while (ti) {
+ if (ti & 1)
+ cnt++;
+ ti >>= 1;
+ }
+ Partab[i] = cnt & 1;
+ }
+ /* Initialize the branch table */
+ for (state = 0; state < d_numstates / 2; state++) {
+ for (i = 0; i < rate; i++) {
+ Branchtab[i * d_numstates / 2 + state] =
+ parity((2 * state) & d_polys[i], Partab) ? 255 : 0;
+ }
+ }
+
+ once = 0;
+ }
+
+ // unbias the old_metrics
+ memset(X, 31, d_numstates);
+
+ // initialize decisions
+ memset(D, 0, (d_numstates / 8) * (framebits + 6));
+
+ volk_8u_x4_conv_k7_r2_8u_generic(
+ Y, X, syms, D, framebits / 2 - excess, excess, Branchtab);
+
+ unsigned int min = X[0];
+ int i = 0, state = 0;
+ for (i = 0; i < (d_numstates); ++i) {
+ if (X[i] < min) {
+ min = X[i];
+ state = i;
+ }
+ }
+
+ chainback_viterbi(dec, framebits / 2 - excess, state, excess, D);
+
+ return;
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /*INCLUDED_volk_8u_conv_k7_r2puppet_8u_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*
+ * for documentation see 'volk_8u_x3_encodepolar_8u_x2.h'
+ */
+
+#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
+#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
+#include <string.h>
+
+static inline unsigned int log2_of_power_of_2(unsigned int val)
+{
+ // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
+ static const unsigned int b[] = {
+ 0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
+ };
+
+ unsigned int res = (val & b[0]) != 0;
+ res |= ((val & b[4]) != 0) << 4;
+ res |= ((val & b[3]) != 0) << 3;
+ res |= ((val & b[2]) != 0) << 2;
+ res |= ((val & b[1]) != 0) << 1;
+ return res;
+}
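+
+/* Only exact powers of two are expected here (frame sizes), e.g.
+ * log2_of_power_of_2(256) == 8; other inputs are not meaningful. */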
+
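+/* One polar encoding stage: for each input pair (u, v) read from temp, write u ^ v
+ * into the first half of the current branch and v into the second half. */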
+static inline void encodepolar_single_stage(unsigned char* frame_ptr,
+ const unsigned char* temp_ptr,
+ const unsigned int num_branches,
+ const unsigned int frame_half)
+{
+ unsigned int branch, bit;
+ for (branch = 0; branch < num_branches; ++branch) {
+ for (bit = 0; bit < frame_half; ++bit) {
+ *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
+ *(frame_ptr + frame_half) = *(temp_ptr + 1);
+ ++frame_ptr;
+ temp_ptr += 2;
+ }
+ frame_ptr += frame_half;
+ }
+}
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
+ unsigned char* temp,
+ unsigned int frame_size)
+{
+ unsigned int stage = log2_of_power_of_2(frame_size);
+ unsigned int frame_half = frame_size >> 1;
+ unsigned int num_branches = 1;
+
+ while (stage) {
+ // encode stage
+ encodepolar_single_stage(frame, temp, num_branches, frame_half);
+ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+ // update all the parameters.
+ num_branches = num_branches << 1;
+ frame_half = frame_half >> 1;
+ --stage;
+ }
+}
+#endif /* LV_HAVE_GENERIC */
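+
+/*
+ * Minimal usage sketch (assuming the generated dispatcher name
+ * volk_8u_x2_encodeframepolar_8u; frame_size must be a power of two and `temp` holds
+ * the unpacked input bits, which are overwritten while encoding into `frame`):
+ *
+ * \code
+ * unsigned int frame_size = 1024;
+ * unsigned char* temp = (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
+ * unsigned char* frame = (unsigned char*)volk_malloc(frame_size, volk_get_alignment());
+ * // ... fill temp with frame_size input bits ...
+ * volk_8u_x2_encodeframepolar_8u(frame, temp, frame_size);
+ * volk_free(temp); volk_free(frame);
+ * \endcode
+ */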
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+
+static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
+ unsigned char* temp,
+ unsigned int frame_size)
+{
+ const unsigned int po2 = log2_of_power_of_2(frame_size);
+
+ unsigned int stage = po2;
+ unsigned char* frame_ptr = frame;
+ unsigned char* temp_ptr = temp;
+
+ unsigned int frame_half = frame_size >> 1;
+ unsigned int num_branches = 1;
+ unsigned int branch;
+ unsigned int bit;
+
+ // prepare constants
+ const __m128i mask_stage1 = _mm_set_epi8(0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF);
+
+ // get some SIMD registers to play with.
+ __m128i r_frame0, r_temp0, shifted;
+
+ {
+ __m128i r_frame1, r_temp1;
+ const __m128i shuffle_separate =
+ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+
+ while (stage > 4) {
+ frame_ptr = frame;
+ temp_ptr = temp;
+
+ // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
+ for (branch = 0; branch < num_branches; ++branch) {
+ for (bit = 0; bit < frame_half; bit += 16) {
+ r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
+ temp_ptr += 16;
+ r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr);
+ temp_ptr += 16;
+
+ shifted = _mm_srli_si128(r_temp0, 1);
+ shifted = _mm_and_si128(shifted, mask_stage1);
+ r_temp0 = _mm_xor_si128(shifted, r_temp0);
+ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
+
+ shifted = _mm_srli_si128(r_temp1, 1);
+ shifted = _mm_and_si128(shifted, mask_stage1);
+ r_temp1 = _mm_xor_si128(shifted, r_temp1);
+ r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
+
+ r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
+ _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
+
+ r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
+ _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
+ frame_ptr += 16;
+ }
+
+ frame_ptr += frame_half;
+ }
+ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+ num_branches = num_branches << 1;
+ frame_half = frame_half >> 1;
+ stage--;
+ }
+ }
+
+ // This last part requires at least 16-bit frames.
+ // Smaller frames are useless for SIMD optimization anyways. Just choose GENERIC!
+
+ // reset pointers to correct positions.
+ frame_ptr = frame;
+ temp_ptr = temp;
+
+ // prefetch first chunk
+ __VOLK_PREFETCH(temp_ptr);
+
+ const __m128i shuffle_stage4 =
+ _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
+ const __m128i mask_stage4 = _mm_set_epi8(0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF);
+ const __m128i mask_stage3 = _mm_set_epi8(0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF);
+ const __m128i mask_stage2 = _mm_set_epi8(0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF);
+
+ for (branch = 0; branch < num_branches; ++branch) {
+ r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
+
+ // prefetch next chunk
+ temp_ptr += 16;
+ __VOLK_PREFETCH(temp_ptr);
+
+ // shuffle once for bit-reversal.
+ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
+
+ shifted = _mm_srli_si128(r_temp0, 8);
+ shifted = _mm_and_si128(shifted, mask_stage4);
+ r_frame0 = _mm_xor_si128(shifted, r_temp0);
+
+ shifted = _mm_srli_si128(r_frame0, 4);
+ shifted = _mm_and_si128(shifted, mask_stage3);
+ r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+ shifted = _mm_srli_si128(r_frame0, 2);
+ shifted = _mm_and_si128(shifted, mask_stage2);
+ r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+ shifted = _mm_srli_si128(r_frame0, 1);
+ shifted = _mm_and_si128(shifted, mask_stage1);
+ r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+ // store result of chunk.
+ _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
+ frame_ptr += 16;
+ }
+}
+
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
+ unsigned char* temp,
+ unsigned int frame_size)
+{
+ const unsigned int po2 = log2_of_power_of_2(frame_size);
+
+ unsigned int stage = po2;
+ unsigned char* frame_ptr = frame;
+ unsigned char* temp_ptr = temp;
+
+ unsigned int frame_half = frame_size >> 1;
+ unsigned int num_branches = 1;
+ unsigned int branch;
+ unsigned int bit;
+
+ // prepare constants
+ const __m256i mask_stage1 = _mm256_set_epi8(0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF);
+
+ const __m128i mask_stage0 = _mm_set_epi8(0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF);
+ // get some SIMD registers to play with.
+ __m256i r_frame0, r_temp0, shifted;
+ __m128i r_temp2, r_frame2, shifted2;
+ {
+ __m256i r_frame1, r_temp1;
+ __m128i r_frame3, r_temp3;
+ const __m256i shuffle_separate = _mm256_setr_epi8(0,
+ 2,
+ 4,
+ 6,
+ 8,
+ 10,
+ 12,
+ 14,
+ 1,
+ 3,
+ 5,
+ 7,
+ 9,
+ 11,
+ 13,
+ 15,
+ 0,
+ 2,
+ 4,
+ 6,
+ 8,
+ 10,
+ 12,
+ 14,
+ 1,
+ 3,
+ 5,
+ 7,
+ 9,
+ 11,
+ 13,
+ 15);
+ const __m128i shuffle_separate128 =
+ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+
+ while (stage > 4) {
+ frame_ptr = frame;
+ temp_ptr = temp;
+
+ // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
+ for (branch = 0; branch < num_branches; ++branch) {
+ for (bit = 0; bit < frame_half; bit += 32) {
+ // fewer than 32 bits remain in this branch half; handle the final 16
+ // with 128-bit loads instead of 256-bit ones.
+ if ((frame_half - bit) < 32) {
+ r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr);
+ temp_ptr += 16;
+ r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr);
+ temp_ptr += 16;
+
+ shifted2 = _mm_srli_si128(r_temp2, 1);
+ shifted2 = _mm_and_si128(shifted2, mask_stage0);
+ r_temp2 = _mm_xor_si128(shifted2, r_temp2);
+ r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
+
+ shifted2 = _mm_srli_si128(r_temp3, 1);
+ shifted2 = _mm_and_si128(shifted2, mask_stage0);
+ r_temp3 = _mm_xor_si128(shifted2, r_temp3);
+ r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
+
+ r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
+ _mm_storeu_si128((__m128i*)frame_ptr, r_frame2);
+
+ r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
+ _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
+ frame_ptr += 16;
+ break;
+ }
+ r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
+ temp_ptr += 32;
+ r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
+ temp_ptr += 32;
+
+ shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
+ shifted = _mm256_and_si256(shifted, mask_stage1);
+ r_temp0 = _mm256_xor_si256(shifted, r_temp0);
+ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
+
+ shifted = _mm256_srli_si256(r_temp1, 1);
+ shifted = _mm256_and_si256(shifted, mask_stage1);
+ r_temp1 = _mm256_xor_si256(shifted, r_temp1);
+ r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
+
+ r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
+ r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
+ r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
+ r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
+
+ _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
+
+ _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
+ frame_ptr += 32;
+ }
+
+ frame_ptr += frame_half;
+ }
+ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+ num_branches = num_branches << 1;
+ frame_half = frame_half >> 1;
+ stage--;
+ }
+ }
+
+ // This last part requires frames of at least 32 bits.
+ // Smaller frames gain nothing from SIMD anyway; use the GENERIC kernel instead.
+
+ // reset pointers to correct positions.
+ frame_ptr = frame;
+ temp_ptr = temp;
+
+ // prefetch first chunk
+ __VOLK_PREFETCH(temp_ptr);
+
+ const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
+ 8,
+ 4,
+ 12,
+ 2,
+ 10,
+ 6,
+ 14,
+ 1,
+ 9,
+ 5,
+ 13,
+ 3,
+ 11,
+ 7,
+ 15,
+ 0,
+ 8,
+ 4,
+ 12,
+ 2,
+ 10,
+ 6,
+ 14,
+ 1,
+ 9,
+ 5,
+ 13,
+ 3,
+ 11,
+ 7,
+ 15);
+ const __m256i mask_stage4 = _mm256_set_epi8(0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF);
+ const __m256i mask_stage3 = _mm256_set_epi8(0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF);
+ const __m256i mask_stage2 = _mm256_set_epi8(0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF);
+
+ for (branch = 0; branch < num_branches / 2; ++branch) {
+ r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
+
+ // prefetch next chunk
+ temp_ptr += 32;
+ __VOLK_PREFETCH(temp_ptr);
+
+ // shuffle once for bit-reversal.
+ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
+
+ shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
+ shifted = _mm256_and_si256(shifted, mask_stage4);
+ r_frame0 = _mm256_xor_si256(shifted, r_temp0);
+
+
+ shifted = _mm256_srli_si256(r_frame0, 4);
+ shifted = _mm256_and_si256(shifted, mask_stage3);
+ r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+
+ shifted = _mm256_srli_si256(r_frame0, 2);
+ shifted = _mm256_and_si256(shifted, mask_stage2);
+ r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+
+ shifted = _mm256_srli_si256(r_frame0, 1);
+ shifted = _mm256_and_si256(shifted, mask_stage1);
+ r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+
+ // store result of chunk.
+ _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
+ frame_ptr += 32;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */
+
+#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
+#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+
+static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
+ unsigned char* temp,
+ unsigned int frame_size)
+{
+ const unsigned int po2 = log2_of_power_of_2(frame_size);
+
+ unsigned int stage = po2;
+ unsigned char* frame_ptr = frame;
+ unsigned char* temp_ptr = temp;
+
+ unsigned int frame_half = frame_size >> 1;
+ unsigned int num_branches = 1;
+ unsigned int branch;
+ unsigned int bit;
+
+ // prepare constants
+ const __m128i mask_stage1 = _mm_set_epi8(0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF);
+
+ // get some SIMD registers to play with.
+ __m128i r_frame0, r_temp0, shifted;
+
+ {
+ __m128i r_frame1, r_temp1;
+ const __m128i shuffle_separate =
+ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+
+ while (stage > 4) {
+ frame_ptr = frame;
+ temp_ptr = temp;
+
+ // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
+ for (branch = 0; branch < num_branches; ++branch) {
+ for (bit = 0; bit < frame_half; bit += 16) {
+ r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
+ temp_ptr += 16;
+ r_temp1 = _mm_load_si128((__m128i*)temp_ptr);
+ temp_ptr += 16;
+
+ shifted = _mm_srli_si128(r_temp0, 1);
+ shifted = _mm_and_si128(shifted, mask_stage1);
+ r_temp0 = _mm_xor_si128(shifted, r_temp0);
+ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);
+
+ shifted = _mm_srli_si128(r_temp1, 1);
+ shifted = _mm_and_si128(shifted, mask_stage1);
+ r_temp1 = _mm_xor_si128(shifted, r_temp1);
+ r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);
+
+ r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
+ _mm_store_si128((__m128i*)frame_ptr, r_frame0);
+
+ r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
+ _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
+ frame_ptr += 16;
+ }
+
+ frame_ptr += frame_half;
+ }
+ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+ num_branches = num_branches << 1;
+ frame_half = frame_half >> 1;
+ stage--;
+ }
+ }
+
+ // This last part requires frames of at least 16 bits.
+ // Smaller frames gain nothing from SIMD anyway; use the GENERIC kernel instead.
+
+ // reset pointers to correct positions.
+ frame_ptr = frame;
+ temp_ptr = temp;
+
+ // prefetch first chunk
+ __VOLK_PREFETCH(temp_ptr);
+
+ const __m128i shuffle_stage4 =
+ _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
+ const __m128i mask_stage4 = _mm_set_epi8(0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF);
+ const __m128i mask_stage3 = _mm_set_epi8(0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF);
+ const __m128i mask_stage2 = _mm_set_epi8(0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF);
+
+ for (branch = 0; branch < num_branches; ++branch) {
+ r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
+
+ // prefetch next chunk
+ temp_ptr += 16;
+ __VOLK_PREFETCH(temp_ptr);
+
+ // shuffle once for bit-reversal.
+ r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);
+
+ shifted = _mm_srli_si128(r_temp0, 8);
+ shifted = _mm_and_si128(shifted, mask_stage4);
+ r_frame0 = _mm_xor_si128(shifted, r_temp0);
+
+ shifted = _mm_srli_si128(r_frame0, 4);
+ shifted = _mm_and_si128(shifted, mask_stage3);
+ r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+ shifted = _mm_srli_si128(r_frame0, 2);
+ shifted = _mm_and_si128(shifted, mask_stage2);
+ r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+ shifted = _mm_srli_si128(r_frame0, 1);
+ shifted = _mm_and_si128(shifted, mask_stage1);
+ r_frame0 = _mm_xor_si128(shifted, r_frame0);
+
+ // store result of chunk.
+ _mm_store_si128((__m128i*)frame_ptr, r_frame0);
+ frame_ptr += 16;
+ }
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
+ unsigned char* temp,
+ unsigned int frame_size)
+{
+ const unsigned int po2 = log2_of_power_of_2(frame_size);
+
+ unsigned int stage = po2;
+ unsigned char* frame_ptr = frame;
+ unsigned char* temp_ptr = temp;
+
+ unsigned int frame_half = frame_size >> 1;
+ unsigned int num_branches = 1;
+ unsigned int branch;
+ unsigned int bit;
+
+ // prepare constants
+ const __m256i mask_stage1 = _mm256_set_epi8(0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF);
+
+ const __m128i mask_stage0 = _mm_set_epi8(0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF,
+ 0x0,
+ 0xFF);
+ // get some SIMD registers to play with.
+ __m256i r_frame0, r_temp0, shifted;
+ __m128i r_temp2, r_frame2, shifted2;
+ {
+ __m256i r_frame1, r_temp1;
+ __m128i r_frame3, r_temp3;
+ const __m256i shuffle_separate = _mm256_setr_epi8(0,
+ 2,
+ 4,
+ 6,
+ 8,
+ 10,
+ 12,
+ 14,
+ 1,
+ 3,
+ 5,
+ 7,
+ 9,
+ 11,
+ 13,
+ 15,
+ 0,
+ 2,
+ 4,
+ 6,
+ 8,
+ 10,
+ 12,
+ 14,
+ 1,
+ 3,
+ 5,
+ 7,
+ 9,
+ 11,
+ 13,
+ 15);
+ const __m128i shuffle_separate128 =
+ _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+
+ while (stage > 4) {
+ frame_ptr = frame;
+ temp_ptr = temp;
+
+ // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
+ for (branch = 0; branch < num_branches; ++branch) {
+ for (bit = 0; bit < frame_half; bit += 32) {
+ // fewer than 32 bits remain in this branch half; handle the final 16
+ // with 128-bit loads instead of 256-bit ones.
+ if ((frame_half - bit) < 32) {
+ r_temp2 = _mm_load_si128((__m128i*)temp_ptr);
+ temp_ptr += 16;
+ r_temp3 = _mm_load_si128((__m128i*)temp_ptr);
+ temp_ptr += 16;
+
+ shifted2 = _mm_srli_si128(r_temp2, 1);
+ shifted2 = _mm_and_si128(shifted2, mask_stage0);
+ r_temp2 = _mm_xor_si128(shifted2, r_temp2);
+ r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);
+
+ shifted2 = _mm_srli_si128(r_temp3, 1);
+ shifted2 = _mm_and_si128(shifted2, mask_stage0);
+ r_temp3 = _mm_xor_si128(shifted2, r_temp3);
+ r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);
+
+ r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
+ _mm_store_si128((__m128i*)frame_ptr, r_frame2);
+
+ r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
+ _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
+ frame_ptr += 16;
+ break;
+ }
+ r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
+ temp_ptr += 32;
+ r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
+ temp_ptr += 32;
+
+ shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
+ shifted = _mm256_and_si256(shifted, mask_stage1);
+ r_temp0 = _mm256_xor_si256(shifted, r_temp0);
+ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);
+
+ shifted = _mm256_srli_si256(r_temp1, 1);
+ shifted = _mm256_and_si256(shifted, mask_stage1);
+ r_temp1 = _mm256_xor_si256(shifted, r_temp1);
+ r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);
+
+ r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
+ r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
+ r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
+ r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);
+
+ _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
+
+ _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
+ frame_ptr += 32;
+ }
+
+ frame_ptr += frame_half;
+ }
+ memcpy(temp, frame, sizeof(unsigned char) * frame_size);
+
+ num_branches = num_branches << 1;
+ frame_half = frame_half >> 1;
+ stage--;
+ }
+ }
+
+ // This last part requires frames of at least 32 bits.
+ // Smaller frames gain nothing from SIMD anyway; use the GENERIC kernel instead.
+
+ // reset pointers to correct positions.
+ frame_ptr = frame;
+ temp_ptr = temp;
+
+ // prefetch first chunk.
+ __VOLK_PREFETCH(temp_ptr);
+
+ const __m256i shuffle_stage4 = _mm256_setr_epi8(0,
+ 8,
+ 4,
+ 12,
+ 2,
+ 10,
+ 6,
+ 14,
+ 1,
+ 9,
+ 5,
+ 13,
+ 3,
+ 11,
+ 7,
+ 15,
+ 0,
+ 8,
+ 4,
+ 12,
+ 2,
+ 10,
+ 6,
+ 14,
+ 1,
+ 9,
+ 5,
+ 13,
+ 3,
+ 11,
+ 7,
+ 15);
+ const __m256i mask_stage4 = _mm256_set_epi8(0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF);
+ const __m256i mask_stage3 = _mm256_set_epi8(0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0xFF,
+ 0xFF);
+ const __m256i mask_stage2 = _mm256_set_epi8(0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF,
+ 0x0,
+ 0x0,
+ 0xFF,
+ 0xFF);
+
+ for (branch = 0; branch < num_branches / 2; ++branch) {
+ r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
+
+ // prefetch next chunk
+ temp_ptr += 32;
+ __VOLK_PREFETCH(temp_ptr);
+
+ // shuffle once for bit-reversal.
+ r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);
+
+ shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
+ shifted = _mm256_and_si256(shifted, mask_stage4);
+ r_frame0 = _mm256_xor_si256(shifted, r_temp0);
+
+ shifted = _mm256_srli_si256(r_frame0, 4);
+ shifted = _mm256_and_si256(shifted, mask_stage3);
+ r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+
+ shifted = _mm256_srli_si256(r_frame0, 2);
+ shifted = _mm256_and_si256(shifted, mask_stage2);
+ r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+
+ shifted = _mm256_srli_si256(r_frame0, 1);
+ shifted = _mm256_and_si256(shifted, mask_stage1);
+ r_frame0 = _mm256_xor_si256(shifted, r_frame0);
+
+ // store result of chunk.
+ _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
+ frame_ptr += 32;
+ }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_8u_x3_encodepolar_8u
+ *
+ * \b Overview
+ *
+ * Encode the given data with a polar code.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8u_x3_encodepolar_8u(unsigned char* frame,
+ *                                const unsigned char* frozen_bit_mask,
+ *                                const unsigned char* frozen_bits,
+ *                                const unsigned char* info_bits,
+ *                                unsigned int frame_size,
+ *                                unsigned int info_bit_size)
+ * \endcode
+ *
+ * \b Inputs
+ * \li frame: buffer for encoded frame
+ * \li frozen_bit_mask: bytes set to 0xFF at frozen bit positions and 0x00 otherwise.
+ * \li frozen_bits: values of the frozen bits, 1 bit per byte
+ * \li info_bits: info bit values, 1 bit per byte
+ * \li frame_size: frame size; must be a power of 2.
+ * \li info_bit_size: number of info bits in a frame
+ *
+ * \b Outputs
+ * \li frame: polar encoded frame.
+ *
+ * \b Example
+ * \code
+ * int frame_exp = 10;
+ * int frame_size = 0x01 << frame_exp;
+ * int num_info_bits = frame_size;
+ * int num_frozen_bits = frame_size - num_info_bits;
+ *
+ * // the function sets frozen bit positions to 0xFF and all others to 0x00.
+ * unsigned char* frozen_bit_mask = get_frozen_bit_mask(frame_size, num_frozen_bits);
+ *
+ * // set elements to desired values. Typically all zero.
+ * unsigned char* frozen_bits = (unsigned char*)volk_malloc(
+ *     sizeof(unsigned char) * num_frozen_bits, volk_get_alignment());
+ *
+ * unsigned char* frame = (unsigned char*)volk_malloc(
+ *     sizeof(unsigned char) * frame_size, volk_get_alignment());
+ * unsigned char* temp = (unsigned char*)volk_malloc(
+ *     sizeof(unsigned char) * frame_size, volk_get_alignment());
+ *
+ * unsigned char* info_bits = get_info_bits_to_encode(num_info_bits);
+ *
+ * volk_8u_x3_encodepolar_8u_x2_generic(frame, temp, frozen_bit_mask, frozen_bits,
+ * info_bits, frame_size);
+ *
+ * volk_free(frozen_bit_mask);
+ * volk_free(frozen_bits);
+ * volk_free(frame);
+ * volk_free(temp);
+ * volk_free(info_bits);
+ * \endcode
+ */
+
+#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLAR_8U_X2_U_H_
+#define VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLAR_8U_X2_U_H_
+#include <stdio.h>
+#include <volk/volk_8u_x2_encodeframepolar_8u.h>
+
+static inline void interleave_frozen_and_info_bits(unsigned char* target,
+ const unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ const unsigned int frame_size)
+{
+ unsigned int bit;
+ for (bit = 0; bit < frame_size; ++bit) {
+ *target++ = *frozen_bit_mask++ ? *frozen_bits++ : *info_bits++;
+ }
+}
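+
+/*
+ * Illustrative sketch (not part of the original kernel): for a hypothetical
+ * 4-bit frame with frozen positions 0 and 2, the interleaver behaves like
+ *
+ *   unsigned char mask[4] = { 0xFF, 0x00, 0xFF, 0x00 };
+ *   unsigned char frozen[2] = { 0, 0 };
+ *   unsigned char info[2] = { 1, 1 };
+ *   unsigned char target[4];
+ *   interleave_frozen_and_info_bits(target, mask, frozen, info, 4);
+ *   // target == { 0, 1, 0, 1 }: frozen values fill masked positions and
+ *   // info bits fill the rest, each source consumed in order.
+ */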
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_8u_x3_encodepolar_8u_x2_generic(unsigned char* frame,
+ unsigned char* temp,
+ const unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ // interleave
+ interleave_frozen_and_info_bits(
+ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_8u_x2_encodeframepolar_8u_generic(frame, temp, frame_size);
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+
+static inline void
+volk_8u_x3_encodepolar_8u_x2_u_ssse3(unsigned char* frame,
+ unsigned char* temp,
+ const unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ // interleave
+ interleave_frozen_and_info_bits(
+ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_8u_x2_encodeframepolar_8u_u_ssse3(frame, temp, frame_size);
+}
+
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+static inline void
+volk_8u_x3_encodepolar_8u_x2_u_avx2(unsigned char* frame,
+ unsigned char* temp,
+ const unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ interleave_frozen_and_info_bits(
+ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_8u_x2_encodeframepolar_8u_u_avx2(frame, temp, frame_size);
+}
+#endif /* LV_HAVE_AVX2 */
+
+#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLAR_8U_X2_U_H_ */
+
+#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLAR_8U_X2_A_H_
+#define VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLAR_8U_X2_A_H_
+
+#ifdef LV_HAVE_SSSE3
+#include <tmmintrin.h>
+static inline void
+volk_8u_x3_encodepolar_8u_x2_a_ssse3(unsigned char* frame,
+ unsigned char* temp,
+ const unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ interleave_frozen_and_info_bits(
+ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_8u_x2_encodeframepolar_8u_a_ssse3(frame, temp, frame_size);
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+static inline void
+volk_8u_x3_encodepolar_8u_x2_a_avx2(unsigned char* frame,
+ unsigned char* temp,
+ const unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ interleave_frozen_and_info_bits(
+ temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_8u_x2_encodeframepolar_8u_a_avx2(frame, temp, frame_size);
+}
+#endif /* LV_HAVE_AVX2 */
+
+#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLAR_8U_X2_A_H_ */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2015 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/* For documentation see 'kernels/volk/volk_8u_x3_encodepolar_8u_x2.h'
+ * This file exists for test purposes only and should not be used directly.
+ */
+
+#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLARPUPPET_8U_H_
+#define VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLARPUPPET_8U_H_
+#include <volk/volk.h>
+#include <volk/volk_8u_x3_encodepolar_8u_x2.h>
+
+static inline unsigned int next_lower_power_of_two(const unsigned int val)
+{
+ // algorithm adapted from:
+ // http://acius2.blogspot.de/2007/11/calculating-next-power-of-2.html
+ unsigned int res = val;
+ res = (res >> 1) | res;
+ res = (res >> 2) | res;
+ res = (res >> 4) | res;
+ res = (res >> 8) | res;
+ res = (res >> 16) | res;
+ res += 1;
+ return res >> 1;
+}
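+
+/*
+ * Illustrative values (sketch, not part of the original code):
+ *   next_lower_power_of_two(1000) == 512   // largest power of two <= val
+ *   next_lower_power_of_two(1024) == 1024  // powers of two map to themselves
+ */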
+
+static inline void adjust_frozen_mask(unsigned char* mask, const unsigned int frame_size)
+{
+ // Like the rest of this puppet kernel, this function exists for test purposes only.
+ unsigned int i;
+ for (i = 0; i < frame_size; ++i) {
+ *mask = (*mask & 0x80) ? 0xFF : 0x00;
+ mask++;
+ }
+}
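+
+/*
+ * Illustrative behavior (sketch): every byte with its most significant bit set
+ * becomes 0xFF and every other byte becomes 0x00, e.g. { 0x80, 0x7F } maps to
+ * { 0xFF, 0x00 }.
+ */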
+
+#ifdef LV_HAVE_GENERIC
+static inline void
+volk_8u_x3_encodepolarpuppet_8u_generic(unsigned char* frame,
+ unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ frame_size = next_lower_power_of_two(frame_size);
+ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
+ volk_get_alignment());
+ adjust_frozen_mask(frozen_bit_mask, frame_size);
+ volk_8u_x3_encodepolar_8u_x2_generic(
+ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_free(temp);
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSSE3
+static inline void
+volk_8u_x3_encodepolarpuppet_8u_u_ssse3(unsigned char* frame,
+ unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ frame_size = next_lower_power_of_two(frame_size);
+ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
+ volk_get_alignment());
+ adjust_frozen_mask(frozen_bit_mask, frame_size);
+ volk_8u_x3_encodepolar_8u_x2_u_ssse3(
+ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_free(temp);
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_AVX2
+static inline void
+volk_8u_x3_encodepolarpuppet_8u_u_avx2(unsigned char* frame,
+ unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ frame_size = next_lower_power_of_two(frame_size);
+ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
+ volk_get_alignment());
+ adjust_frozen_mask(frozen_bit_mask, frame_size);
+ volk_8u_x3_encodepolar_8u_x2_u_avx2(
+ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_free(temp);
+}
+#endif /* LV_HAVE_AVX2 */
+
+#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLARPUPPET_8U_H_ */
+
+#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLARPUPPET_8U_A_H_
+#define VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLARPUPPET_8U_A_H_
+
+#ifdef LV_HAVE_SSSE3
+static inline void
+volk_8u_x3_encodepolarpuppet_8u_a_ssse3(unsigned char* frame,
+ unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ frame_size = next_lower_power_of_two(frame_size);
+ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
+ volk_get_alignment());
+ adjust_frozen_mask(frozen_bit_mask, frame_size);
+ volk_8u_x3_encodepolar_8u_x2_a_ssse3(
+ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_free(temp);
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_AVX2
+static inline void
+volk_8u_x3_encodepolarpuppet_8u_a_avx2(unsigned char* frame,
+ unsigned char* frozen_bit_mask,
+ const unsigned char* frozen_bits,
+ const unsigned char* info_bits,
+ unsigned int frame_size)
+{
+ frame_size = next_lower_power_of_two(frame_size);
+ unsigned char* temp = (unsigned char*)volk_malloc(sizeof(unsigned char) * frame_size,
+ volk_get_alignment());
+ adjust_frozen_mask(frozen_bit_mask, frame_size);
+ volk_8u_x3_encodepolar_8u_x2_a_avx2(
+ frame, temp, frozen_bit_mask, frozen_bits, info_bits, frame_size);
+ volk_free(temp);
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X3_ENCODEPOLARPUPPET_8U_A_H_ */
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+/*!
+ * \page volk_8u_x4_conv_k7_r2_8u
+ *
+ * \b Overview
+ *
+ * Performs convolutional decoding for a K=7, rate 1/2 convolutional
+ * code. The polynomials are user-defined.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_8u_x4_conv_k7_r2_8u(unsigned char* Y,
+ *                               unsigned char* X,
+ *                               unsigned char* syms,
+ *                               unsigned char* dec,
+ *                               unsigned int framebits,
+ *                               unsigned int excess,
+ *                               unsigned char* Branchtab)
+ * \endcode
+ *
+ * \b Inputs
+ * \li X: <FIXME>
+ * \li syms: <FIXME>
+ * \li dec: <FIXME>
+ * \li framebits: size of the frame to decode in bits.
+ * \li excess: <FIXME>
+ * \li Branchtab: <FIXME>
+ *
+ * \b Outputs
+ * \li Y: The decoded output bits.
+ *
+ * \b Example
+ * \code
+ * // FIXME: complete example. The call below only shows the parameter order;
+ * // see the generic implementation in this header for the expected buffer sizes.
+ * volk_8u_x4_conv_k7_r2_8u(Y, X, syms, dec, framebits, excess, Branchtab);
+ * \endcode
+ */
+
+#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
+#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
+
+typedef union {
+ unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/];
+ unsigned int w[64 /*NUMSTATES*/ / 32];
+ unsigned short s[64 /*NUMSTATES*/ / 16];
+ unsigned char c[64 /*NUMSTATES*/ / 8];
+#ifdef _MSC_VER
+} decision_t;
+#else
+} decision_t __attribute__((aligned(16)));
+#endif
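+
+/*
+ * Note (added for clarity): the union above packs the 64 decision bits of one
+ * trellis step into 8 bytes and exposes them as bytes, shorts, or ints so the
+ * SIMD kernels can write their movemask results into it directly.
+ */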
+
+
+static inline void renormalize(unsigned char* X, unsigned char threshold)
+{
+ int NUMSTATES = 64;
+ int i;
+
+ unsigned char min = X[0];
+ // if(min > threshold) {
+ for (i = 0; i < NUMSTATES; i++)
+ if (min > X[i])
+ min = X[i];
+ for (i = 0; i < NUMSTATES; i++)
+ X[i] -= min;
+ //}
+}
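+
+/*
+ * Usage sketch (illustrative only): renormalize() subtracts the smallest of the
+ * 64 path metrics from all of them so the unsigned char metrics cannot
+ * saturate; the threshold parameter is currently unused because the check
+ * above is commented out.
+ *
+ *   unsigned char metrics[64];
+ *   // ... accumulate branch metrics ...
+ *   renormalize(metrics, 210);
+ */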
+
+
+// helper BFLY for GENERIC version
+static inline void BFLY(int i,
+ int s,
+ unsigned char* syms,
+ unsigned char* Y,
+ unsigned char* X,
+ decision_t* d,
+ unsigned char* Branchtab)
+{
+ int j, decision0, decision1;
+ unsigned char metric, m0, m1, m2, m3;
+
+ int NUMSTATES = 64;
+ int RATE = 2;
+ int METRICSHIFT = 2;
+ int PRECISIONSHIFT = 2;
+
+ metric = 0;
+ for (j = 0; j < RATE; j++)
+ metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT;
+ metric = metric >> PRECISIONSHIFT;
+
+ unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);
+
+ m0 = X[i] + metric;
+ m1 = X[i + NUMSTATES / 2] + (max - metric);
+ m2 = X[i] + (max - metric);
+ m3 = X[i + NUMSTATES / 2] + metric;
+
+ decision0 = (signed int)(m0 - m1) > 0;
+ decision1 = (signed int)(m2 - m3) > 0;
+
+ Y[2 * i] = decision0 ? m1 : m0;
+ Y[2 * i + 1] = decision1 ? m3 : m2;
+
+ d->w[i / (sizeof(unsigned int) * 8 / 2) +
+ s * (sizeof(decision_t) / sizeof(unsigned int))] |=
+ (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
+}
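+
+/*
+ * Sketch of one full trellis step built from the helper above (it mirrors the
+ * GENERIC kernel further down; the buffer sizes are assumptions derived from
+ * how BFLY indexes its arguments, not taken from the original documentation):
+ *
+ *   unsigned char X[64], Y[64];   // old/new path metrics, NUMSTATES = 64
+ *   unsigned char Branchtab[64];  // RATE * NUMSTATES / 2 branch-metric entries
+ *   unsigned char syms[2];        // RATE received symbols for trellis step s = 0
+ *   decision_t dec;               // 64 decision bits for this step
+ *   int i;
+ *   memset(&dec, 0, sizeof(dec)); // BFLY ORs decision bits into dec.w[]
+ *   for (i = 0; i < 64 / 2; i++)
+ *       BFLY(i, 0, syms, Y, X, &dec, Branchtab);
+ *   renormalize(Y, 210);
+ */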
+
+
+//#if LV_HAVE_AVX2
+//
+//#include <immintrin.h>
+//#include <stdio.h>
+//
+// static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
+// unsigned char* X,
+// unsigned char* syms,
+// unsigned char* dec,
+// unsigned int framebits,
+// unsigned int excess,
+// unsigned char* Branchtab)
+//{
+// unsigned int i9;
+// for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
+// unsigned char a75, a81;
+// int a73, a92;
+// int s20, s21;
+// unsigned char *a80, *b6;
+// int *a110, *a91, *a93;
+// __m256i *a112, *a71, *a72, *a77, *a83, *a95;
+// __m256i a86, a87;
+// __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25,
+// m26,
+// s18, s19, s22, s23, s24, s25, t13, t14, t15;
+// a71 = ((__m256i*)X);
+// s18 = *(a71);
+// a72 = (a71 + 1);
+// s19 = *(a72);
+// s22 = _mm256_permute2x128_si256(s18, s19, 0x20);
+// s19 = _mm256_permute2x128_si256(s18, s19, 0x31);
+// s18 = s22;
+// a73 = (4 * i9);
+// b6 = (syms + a73);
+// a75 = *(b6);
+// a76 = _mm256_set1_epi8(a75);
+// a77 = ((__m256i*)Branchtab);
+// a78 = *(a77);
+// a79 = _mm256_xor_si256(a76, a78);
+// a80 = (b6 + 1);
+// a81 = *(a80);
+// a82 = _mm256_set1_epi8(a81);
+// a83 = (a77 + 1);
+// a84 = *(a83);
+// a85 = _mm256_xor_si256(a82, a84);
+// t13 = _mm256_avg_epu8(a79, a85);
+// a86 = ((__m256i)t13);
+// a87 = _mm256_srli_epi16(a86, 2);
+// a88 = ((__m256i)a87);
+// t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
+// t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
+// m23 = _mm256_adds_epu8(s18, t14);
+// m24 = _mm256_adds_epu8(s19, t15);
+// m25 = _mm256_adds_epu8(s18, t15);
+// m26 = _mm256_adds_epu8(s19, t14);
+// a89 = _mm256_min_epu8(m24, m23);
+// d9 = _mm256_cmpeq_epi8(a89, m24);
+// a90 = _mm256_min_epu8(m26, m25);
+// d10 = _mm256_cmpeq_epi8(a90, m26);
+// s22 = _mm256_unpacklo_epi8(d9, d10);
+// s23 = _mm256_unpackhi_epi8(d9, d10);
+// s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
+// a91 = ((int*)dec);
+// a92 = (4 * i9);
+// a93 = (a91 + a92);
+// *(a93) = s20;
+// s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
+// a110 = (a93 + 1);
+// *(a110) = s21;
+// s22 = _mm256_unpacklo_epi8(a89, a90);
+// s23 = _mm256_unpackhi_epi8(a89, a90);
+// a95 = ((__m256i*)Y);
+// s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
+// *(a95) = s24;
+// s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
+// a112 = (a95 + 1);
+// *(a112) = s23;
+// if ((((unsigned char*)Y)[0] > 210)) {
+// __m256i m5, m6;
+// m5 = ((__m256i*)Y)[0];
+// m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
+// __m256i m7;
+// m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
+// m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)),
+// ((__m256i)m7)));
+// m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)),
+// ((__m256i)m7)));
+// m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)),
+// ((__m256i)m7)));
+// m7 = _mm256_unpacklo_epi8(m7, m7);
+// m7 = _mm256_shufflelo_epi16(m7, 0);
+// m6 = _mm256_unpacklo_epi64(m7, m7);
+// m6 = _mm256_permute2x128_si256(
+// m6, m6, 0); // copy lower half of m6 to upper half, since above ops
+// // operate on 128 bit lanes
+// ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
+// ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);
+// }
+// unsigned char a188, a194;
+// int a205;
+// int s48, s54;
+// unsigned char *a187, *a193;
+// int *a204, *a206, *a223, *b16;
+// __m256i *a184, *a185, *a190, *a196, *a208, *a225;
+// __m256i a199, a200;
+// __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39,
+// m40,
+// m41, m42, s46, s47, s50, s51, t25, t26, t27;
+// a184 = ((__m256i*)Y);
+// s46 = *(a184);
+// a185 = (a184 + 1);
+// s47 = *(a185);
+// s50 = _mm256_permute2x128_si256(s46, s47, 0x20);
+// s47 = _mm256_permute2x128_si256(s46, s47, 0x31);
+// s46 = s50;
+// a187 = (b6 + 2);
+// a188 = *(a187);
+// a189 = _mm256_set1_epi8(a188);
+// a190 = ((__m256i*)Branchtab);
+// a191 = *(a190);
+// a192 = _mm256_xor_si256(a189, a191);
+// a193 = (b6 + 3);
+// a194 = *(a193);
+// a195 = _mm256_set1_epi8(a194);
+// a196 = (a190 + 1);
+// a197 = *(a196);
+// a198 = _mm256_xor_si256(a195, a197);
+// t25 = _mm256_avg_epu8(a192, a198);
+// a199 = ((__m256i)t25);
+// a200 = _mm256_srli_epi16(a199, 2);
+// a201 = ((__m256i)a200);
+// t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
+// t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
+// m39 = _mm256_adds_epu8(s46, t26);
+// m40 = _mm256_adds_epu8(s47, t27);
+// m41 = _mm256_adds_epu8(s46, t27);
+// m42 = _mm256_adds_epu8(s47, t26);
+// a202 = _mm256_min_epu8(m40, m39);
+// d17 = _mm256_cmpeq_epi8(a202, m40);
+// a203 = _mm256_min_epu8(m42, m41);
+// d18 = _mm256_cmpeq_epi8(a203, m42);
+// s24 = _mm256_unpacklo_epi8(d17, d18);
+// s25 = _mm256_unpackhi_epi8(d17, d18);
+// s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
+// a204 = ((int*)dec);
+// a205 = (4 * i9);
+// b16 = (a204 + a205);
+// a206 = (b16 + 2);
+// *(a206) = s48;
+// s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
+// a223 = (b16 + 3);
+// *(a223) = s54;
+// s50 = _mm256_unpacklo_epi8(a202, a203);
+// s51 = _mm256_unpackhi_epi8(a202, a203);
+// s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
+// s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
+// a208 = ((__m256i*)X);
+// *(a208) = s25;
+// a225 = (a208 + 1);
+// *(a225) = s51;
+//
+// if ((((unsigned char*)X)[0] > 210)) {
+// __m256i m12, m13;
+// m12 = ((__m256i*)X)[0];
+// m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]);
+// __m256i m14;
+// m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
+// m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)),
+// ((__m256i)m14)));
+// m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)),
+// ((__m256i)m14)));
+// m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)),
+// ((__m256i)m14)));
+// m14 = _mm256_unpacklo_epi8(m14, m14);
+// m14 = _mm256_shufflelo_epi16(m14, 0);
+// m13 = _mm256_unpacklo_epi64(m14, m14);
+// m13 = _mm256_permute2x128_si256(m13, m13, 0);
+// ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13);
+// ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13);
+// }
+// }
+//
+// renormalize(X, 210);
+//
+// unsigned int j;
+// for (j = 0; j < (framebits + excess) % 2; ++j) {
+// int i;
+// for (i = 0; i < 64 / 2; i++) {
+// BFLY(i,
+// (((framebits + excess) >> 1) << 1) + j,
+// syms,
+// Y,
+// X,
+// (decision_t*)dec,
+// Branchtab);
+// }
+//
+// renormalize(Y, 210);
+// }
+// /*skip*/
+//}
+//
+//#endif /*LV_HAVE_AVX2*/
+
+
+#if LV_HAVE_SSE3
+
+#include <emmintrin.h>
+#include <mmintrin.h>
+#include <pmmintrin.h>
+#include <stdio.h>
+#include <xmmintrin.h>
+
+static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
+ unsigned char* X,
+ unsigned char* syms,
+ unsigned char* dec,
+ unsigned int framebits,
+ unsigned int excess,
+ unsigned char* Branchtab)
+{
+ unsigned int i9;
+ for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
+ unsigned char a75, a81;
+ int a73, a92;
+ short int s20, s21, s26, s27;
+ unsigned char *a74, *a80, *b6;
+ short int *a110, *a111, *a91, *a93, *a94;
+ __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
+ __m128i a105, a106, a86, a87;
+ __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
+ a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
+ s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
+ a71 = ((__m128i*)X);
+ s18 = *(a71);
+ a72 = (a71 + 2);
+ s19 = *(a72);
+ a73 = (4 * i9);
+ a74 = (syms + a73);
+ a75 = *(a74);
+ a76 = _mm_set1_epi8(a75);
+ a77 = ((__m128i*)Branchtab);
+ a78 = *(a77);
+ a79 = _mm_xor_si128(a76, a78);
+ b6 = (a73 + syms);
+ a80 = (b6 + 1);
+ a81 = *(a80);
+ a82 = _mm_set1_epi8(a81);
+ a83 = (a77 + 2);
+ a84 = *(a83);
+ a85 = _mm_xor_si128(a82, a84);
+ t13 = _mm_avg_epu8(a79, a85);
+ a86 = ((__m128i)t13);
+ a87 = _mm_srli_epi16(a86, 2);
+ a88 = ((__m128i)a87);
+ t14 = _mm_and_si128(
+ a88,
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
+ t15 = _mm_subs_epu8(
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
+ t14);
+ m23 = _mm_adds_epu8(s18, t14);
+ m24 = _mm_adds_epu8(s19, t15);
+ m25 = _mm_adds_epu8(s18, t15);
+ m26 = _mm_adds_epu8(s19, t14);
+ a89 = _mm_min_epu8(m24, m23);
+ d9 = _mm_cmpeq_epi8(a89, m24);
+ a90 = _mm_min_epu8(m26, m25);
+ d10 = _mm_cmpeq_epi8(a90, m26);
+ s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
+ a91 = ((short int*)dec);
+ a92 = (8 * i9);
+ a93 = (a91 + a92);
+ *(a93) = s20;
+ s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
+ a94 = (a93 + 1);
+ *(a94) = s21;
+ s22 = _mm_unpacklo_epi8(a89, a90);
+ s23 = _mm_unpackhi_epi8(a89, a90);
+ a95 = ((__m128i*)Y);
+ *(a95) = s22;
+ a96 = (a95 + 1);
+ *(a96) = s23;
+ a97 = (a71 + 1);
+ s24 = *(a97);
+ a98 = (a71 + 3);
+ s25 = *(a98);
+ a99 = (a77 + 1);
+ a100 = *(a99);
+ a101 = _mm_xor_si128(a76, a100);
+ a102 = (a77 + 3);
+ a103 = *(a102);
+ a104 = _mm_xor_si128(a82, a103);
+ t16 = _mm_avg_epu8(a101, a104);
+ a105 = ((__m128i)t16);
+ a106 = _mm_srli_epi16(a105, 2);
+ a107 = ((__m128i)a106);
+ t17 = _mm_and_si128(
+ a107,
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
+ t18 = _mm_subs_epu8(
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
+ t17);
+ m27 = _mm_adds_epu8(s24, t17);
+ m28 = _mm_adds_epu8(s25, t18);
+ m29 = _mm_adds_epu8(s24, t18);
+ m30 = _mm_adds_epu8(s25, t17);
+ a108 = _mm_min_epu8(m28, m27);
+ d11 = _mm_cmpeq_epi8(a108, m28);
+ a109 = _mm_min_epu8(m30, m29);
+ d12 = _mm_cmpeq_epi8(a109, m30);
+ s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
+ a110 = (a93 + 2);
+ *(a110) = s26;
+ s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
+ a111 = (a93 + 3);
+ *(a111) = s27;
+ s28 = _mm_unpacklo_epi8(a108, a109);
+ s29 = _mm_unpackhi_epi8(a108, a109);
+ a112 = (a95 + 2);
+ *(a112) = s28;
+ a113 = (a95 + 3);
+ *(a113) = s29;
+ if ((((unsigned char*)Y)[0] > 210)) {
+ __m128i m5, m6;
+ m5 = ((__m128i*)Y)[0];
+ m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
+ m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
+ m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
+ __m128i m7;
+ m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
+ m7 =
+ ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
+ m7 =
+ ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
+ m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
+ m7 = _mm_unpacklo_epi8(m7, m7);
+ m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
+ m6 = _mm_unpacklo_epi64(m7, m7);
+ ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
+ ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
+ ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
+ ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
+ }
+ unsigned char a188, a194;
+ int a186, a205;
+ short int s48, s49, s54, s55;
+ unsigned char *a187, *a193, *b15;
+ short int *a204, *a206, *a207, *a223, *a224, *b16;
+ __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
+ *a225, *a226;
+ __m128i a199, a200, a218, a219;
+ __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
+ a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
+ m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
+ a184 = ((__m128i*)Y);
+ s46 = *(a184);
+ a185 = (a184 + 2);
+ s47 = *(a185);
+ a186 = (4 * i9);
+ b15 = (a186 + syms);
+ a187 = (b15 + 2);
+ a188 = *(a187);
+ a189 = _mm_set1_epi8(a188);
+ a190 = ((__m128i*)Branchtab);
+ a191 = *(a190);
+ a192 = _mm_xor_si128(a189, a191);
+ a193 = (b15 + 3);
+ a194 = *(a193);
+ a195 = _mm_set1_epi8(a194);
+ a196 = (a190 + 2);
+ a197 = *(a196);
+ a198 = _mm_xor_si128(a195, a197);
+ t25 = _mm_avg_epu8(a192, a198);
+ a199 = ((__m128i)t25);
+ a200 = _mm_srli_epi16(a199, 2);
+ a201 = ((__m128i)a200);
+ t26 = _mm_and_si128(
+ a201,
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
+ t27 = _mm_subs_epu8(
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
+ t26);
+ m39 = _mm_adds_epu8(s46, t26);
+ m40 = _mm_adds_epu8(s47, t27);
+ m41 = _mm_adds_epu8(s46, t27);
+ m42 = _mm_adds_epu8(s47, t26);
+ a202 = _mm_min_epu8(m40, m39);
+ d17 = _mm_cmpeq_epi8(a202, m40);
+ a203 = _mm_min_epu8(m42, m41);
+ d18 = _mm_cmpeq_epi8(a203, m42);
+ s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
+ a204 = ((short int*)dec);
+ a205 = (8 * i9);
+ b16 = (a204 + a205);
+ a206 = (b16 + 4);
+ *(a206) = s48;
+ s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
+ a207 = (b16 + 5);
+ *(a207) = s49;
+ s50 = _mm_unpacklo_epi8(a202, a203);
+ s51 = _mm_unpackhi_epi8(a202, a203);
+ a208 = ((__m128i*)X);
+ *(a208) = s50;
+ a209 = (a208 + 1);
+ *(a209) = s51;
+ a210 = (a184 + 1);
+ s52 = *(a210);
+ a211 = (a184 + 3);
+ s53 = *(a211);
+ a212 = (a190 + 1);
+ a213 = *(a212);
+ a214 = _mm_xor_si128(a189, a213);
+ a215 = (a190 + 3);
+ a216 = *(a215);
+ a217 = _mm_xor_si128(a195, a216);
+ t28 = _mm_avg_epu8(a214, a217);
+ a218 = ((__m128i)t28);
+ a219 = _mm_srli_epi16(a218, 2);
+ a220 = ((__m128i)a219);
+ t29 = _mm_and_si128(
+ a220,
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
+ t30 = _mm_subs_epu8(
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
+ t29);
+ m43 = _mm_adds_epu8(s52, t29);
+ m44 = _mm_adds_epu8(s53, t30);
+ m45 = _mm_adds_epu8(s52, t30);
+ m46 = _mm_adds_epu8(s53, t29);
+ a221 = _mm_min_epu8(m44, m43);
+ d19 = _mm_cmpeq_epi8(a221, m44);
+ a222 = _mm_min_epu8(m46, m45);
+ d20 = _mm_cmpeq_epi8(a222, m46);
+ s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
+ a223 = (b16 + 6);
+ *(a223) = s54;
+ s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
+ a224 = (b16 + 7);
+ *(a224) = s55;
+ s56 = _mm_unpacklo_epi8(a221, a222);
+ s57 = _mm_unpackhi_epi8(a221, a222);
+ a225 = (a208 + 2);
+ *(a225) = s56;
+ a226 = (a208 + 3);
+ *(a226) = s57;
+ if ((((unsigned char*)X)[0] > 210)) {
+ __m128i m12, m13;
+ m12 = ((__m128i*)X)[0];
+ m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
+ m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
+ m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
+ __m128i m14;
+ m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
+ m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)),
+ ((__m128i)m14)));
+ m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)),
+ ((__m128i)m14)));
+ m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)),
+ ((__m128i)m14)));
+ m14 = _mm_unpacklo_epi8(m14, m14);
+ m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
+ m13 = _mm_unpacklo_epi64(m14, m14);
+ ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
+ ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
+ ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
+ ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
+ }
+ }
+
+ renormalize(X, 210);
+
+ /*int ch;
+ for(ch = 0; ch < 64; ch++) {
+ printf("%d,", X[ch]);
+ }
+ printf("\n");*/
+
+ unsigned int j;
+ for (j = 0; j < (framebits + excess) % 2; ++j) {
+ int i;
+ for (i = 0; i < 64 / 2; i++) {
+ BFLY(i,
+ (((framebits + excess) >> 1) << 1) + j,
+ syms,
+ Y,
+ X,
+ (decision_t*)dec,
+ Branchtab);
+ }
+
+
+ renormalize(Y, 210);
+
+ /*printf("\n");
+ for(ch = 0; ch < 64; ch++) {
+ printf("%d,", Y[ch]);
+ }
+ printf("\n");*/
+ }
+ /*skip*/
+}
+
+#endif /*LV_HAVE_SSE3*/
+
+#if LV_HAVE_NEON
+
+#include "volk/sse2neon.h"
+
+static inline void volk_8u_x4_conv_k7_r2_8u_neonspiral(unsigned char* Y,
+ unsigned char* X,
+ unsigned char* syms,
+ unsigned char* dec,
+ unsigned int framebits,
+ unsigned int excess,
+ unsigned char* Branchtab)
+{
+ unsigned int i9;
+ for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
+ unsigned char a75, a81;
+ int a73, a92;
+ short int s20, s21, s26, s27;
+ unsigned char *a74, *a80, *b6;
+ short int *a110, *a111, *a91, *a93, *a94;
+ __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
+ __m128i a105, a106, a86, a87;
+ __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
+ a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
+ s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
+ a71 = ((__m128i*)X);
+ s18 = *(a71);
+ a72 = (a71 + 2);
+ s19 = *(a72);
+ a73 = (4 * i9);
+ a74 = (syms + a73);
+ a75 = *(a74);
+ a76 = _mm_set1_epi8(a75);
+ a77 = ((__m128i*)Branchtab);
+ a78 = *(a77);
+ a79 = _mm_xor_si128(a76, a78);
+ b6 = (a73 + syms);
+ a80 = (b6 + 1);
+ a81 = *(a80);
+ a82 = _mm_set1_epi8(a81);
+ a83 = (a77 + 2);
+ a84 = *(a83);
+ a85 = _mm_xor_si128(a82, a84);
+ t13 = _mm_avg_epu8(a79, a85);
+ a86 = ((__m128i)t13);
+ a87 = _mm_srli_epi16(a86, 2);
+ a88 = ((__m128i)a87);
+ t14 = _mm_and_si128(
+ a88,
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
+ t15 = _mm_subs_epu8(
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
+ t14);
+ m23 = _mm_adds_epu8(s18, t14);
+ m24 = _mm_adds_epu8(s19, t15);
+ m25 = _mm_adds_epu8(s18, t15);
+ m26 = _mm_adds_epu8(s19, t14);
+ a89 = _mm_min_epu8(m24, m23);
+ d9 = _mm_cmpeq_epi8(a89, m24);
+ a90 = _mm_min_epu8(m26, m25);
+ d10 = _mm_cmpeq_epi8(a90, m26);
+ s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
+ a91 = ((short int*)dec);
+ a92 = (8 * i9);
+ a93 = (a91 + a92);
+ *(a93) = s20;
+ s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
+ a94 = (a93 + 1);
+ *(a94) = s21;
+ s22 = _mm_unpacklo_epi8(a89, a90);
+ s23 = _mm_unpackhi_epi8(a89, a90);
+ a95 = ((__m128i*)Y);
+ *(a95) = s22;
+ a96 = (a95 + 1);
+ *(a96) = s23;
+ a97 = (a71 + 1);
+ s24 = *(a97);
+ a98 = (a71 + 3);
+ s25 = *(a98);
+ a99 = (a77 + 1);
+ a100 = *(a99);
+ a101 = _mm_xor_si128(a76, a100);
+ a102 = (a77 + 3);
+ a103 = *(a102);
+ a104 = _mm_xor_si128(a82, a103);
+ t16 = _mm_avg_epu8(a101, a104);
+ a105 = ((__m128i)t16);
+ a106 = _mm_srli_epi16(a105, 2);
+ a107 = ((__m128i)a106);
+ t17 = _mm_and_si128(
+ a107,
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
+ t18 = _mm_subs_epu8(
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
+ t17);
+ m27 = _mm_adds_epu8(s24, t17);
+ m28 = _mm_adds_epu8(s25, t18);
+ m29 = _mm_adds_epu8(s24, t18);
+ m30 = _mm_adds_epu8(s25, t17);
+ a108 = _mm_min_epu8(m28, m27);
+ d11 = _mm_cmpeq_epi8(a108, m28);
+ a109 = _mm_min_epu8(m30, m29);
+ d12 = _mm_cmpeq_epi8(a109, m30);
+ s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
+ a110 = (a93 + 2);
+ *(a110) = s26;
+ s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
+ a111 = (a93 + 3);
+ *(a111) = s27;
+ s28 = _mm_unpacklo_epi8(a108, a109);
+ s29 = _mm_unpackhi_epi8(a108, a109);
+ a112 = (a95 + 2);
+ *(a112) = s28;
+ a113 = (a95 + 3);
+ *(a113) = s29;
+ if ((((unsigned char*)Y)[0] > 210)) {
+ __m128i m5, m6;
+ m5 = ((__m128i*)Y)[0];
+ m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
+ m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
+ m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
+ __m128i m7;
+ m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
+ m7 =
+ ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
+ m7 =
+ ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
+ m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
+ m7 = _mm_unpacklo_epi8(m7, m7);
+ m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
+ m6 = _mm_unpacklo_epi64(m7, m7);
+ ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
+ ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
+ ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
+ ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
+ }
+ unsigned char a188, a194;
+ int a186, a205;
+ short int s48, s49, s54, s55;
+ unsigned char *a187, *a193, *b15;
+ short int *a204, *a206, *a207, *a223, *a224, *b16;
+ __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
+ *a225, *a226;
+ __m128i a199, a200, a218, a219;
+ __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
+ a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
+ m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
+ a184 = ((__m128i*)Y);
+ s46 = *(a184);
+ a185 = (a184 + 2);
+ s47 = *(a185);
+ a186 = (4 * i9);
+ b15 = (a186 + syms);
+ a187 = (b15 + 2);
+ a188 = *(a187);
+ a189 = _mm_set1_epi8(a188);
+ a190 = ((__m128i*)Branchtab);
+ a191 = *(a190);
+ a192 = _mm_xor_si128(a189, a191);
+ a193 = (b15 + 3);
+ a194 = *(a193);
+ a195 = _mm_set1_epi8(a194);
+ a196 = (a190 + 2);
+ a197 = *(a196);
+ a198 = _mm_xor_si128(a195, a197);
+ t25 = _mm_avg_epu8(a192, a198);
+ a199 = ((__m128i)t25);
+ a200 = _mm_srli_epi16(a199, 2);
+ a201 = ((__m128i)a200);
+ t26 = _mm_and_si128(
+ a201,
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
+ t27 = _mm_subs_epu8(
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
+ t26);
+ m39 = _mm_adds_epu8(s46, t26);
+ m40 = _mm_adds_epu8(s47, t27);
+ m41 = _mm_adds_epu8(s46, t27);
+ m42 = _mm_adds_epu8(s47, t26);
+ a202 = _mm_min_epu8(m40, m39);
+ d17 = _mm_cmpeq_epi8(a202, m40);
+ a203 = _mm_min_epu8(m42, m41);
+ d18 = _mm_cmpeq_epi8(a203, m42);
+ s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
+ a204 = ((short int*)dec);
+ a205 = (8 * i9);
+ b16 = (a204 + a205);
+ a206 = (b16 + 4);
+ *(a206) = s48;
+ s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
+ a207 = (b16 + 5);
+ *(a207) = s49;
+ s50 = _mm_unpacklo_epi8(a202, a203);
+ s51 = _mm_unpackhi_epi8(a202, a203);
+ a208 = ((__m128i*)X);
+ *(a208) = s50;
+ a209 = (a208 + 1);
+ *(a209) = s51;
+ a210 = (a184 + 1);
+ s52 = *(a210);
+ a211 = (a184 + 3);
+ s53 = *(a211);
+ a212 = (a190 + 1);
+ a213 = *(a212);
+ a214 = _mm_xor_si128(a189, a213);
+ a215 = (a190 + 3);
+ a216 = *(a215);
+ a217 = _mm_xor_si128(a195, a216);
+ t28 = _mm_avg_epu8(a214, a217);
+ a218 = ((__m128i)t28);
+ a219 = _mm_srli_epi16(a218, 2);
+ a220 = ((__m128i)a219);
+ t29 = _mm_and_si128(
+ a220,
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
+ t30 = _mm_subs_epu8(
+ _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
+ t29);
+ m43 = _mm_adds_epu8(s52, t29);
+ m44 = _mm_adds_epu8(s53, t30);
+ m45 = _mm_adds_epu8(s52, t30);
+ m46 = _mm_adds_epu8(s53, t29);
+ a221 = _mm_min_epu8(m44, m43);
+ d19 = _mm_cmpeq_epi8(a221, m44);
+ a222 = _mm_min_epu8(m46, m45);
+ d20 = _mm_cmpeq_epi8(a222, m46);
+ s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
+ a223 = (b16 + 6);
+ *(a223) = s54;
+ s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
+ a224 = (b16 + 7);
+ *(a224) = s55;
+ s56 = _mm_unpacklo_epi8(a221, a222);
+ s57 = _mm_unpackhi_epi8(a221, a222);
+ a225 = (a208 + 2);
+ *(a225) = s56;
+ a226 = (a208 + 3);
+ *(a226) = s57;
+ if ((((unsigned char*)X)[0] > 210)) {
+ __m128i m12, m13;
+ m12 = ((__m128i*)X)[0];
+ m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
+ m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
+ m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
+ __m128i m14;
+ m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
+ m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)),
+ ((__m128i)m14)));
+ m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)),
+ ((__m128i)m14)));
+ m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)),
+ ((__m128i)m14)));
+ m14 = _mm_unpacklo_epi8(m14, m14);
+ m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
+ m13 = _mm_unpacklo_epi64(m14, m14);
+ ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
+ ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
+ ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
+ ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
+ }
+ }
+
+ renormalize(X, 210);
+
+ /*int ch;
+ for(ch = 0; ch < 64; ch++) {
+ printf("%d,", X[ch]);
+ }
+ printf("\n");*/
+
+ unsigned int j;
+ for (j = 0; j < (framebits + excess) % 2; ++j) {
+ int i;
+ for (i = 0; i < 64 / 2; i++) {
+ BFLY(i,
+ (((framebits + excess) >> 1) << 1) + j,
+ syms,
+ Y,
+ X,
+ (decision_t*)dec,
+ Branchtab);
+ }
+
+
+ renormalize(Y, 210);
+
+ /*printf("\n");
+ for(ch = 0; ch < 64; ch++) {
+ printf("%d,", Y[ch]);
+ }
+ printf("\n");*/
+ }
+ /*skip*/
+}
+
+#endif /*LV_HAVE_NEON*/
+
+#if LV_HAVE_GENERIC
+
+static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
+ unsigned char* X,
+ unsigned char* syms,
+ unsigned char* dec,
+ unsigned int framebits,
+ unsigned int excess,
+ unsigned char* Branchtab)
+{
+ int nbits = framebits + excess;
+ int NUMSTATES = 64;
+ int RENORMALIZE_THRESHOLD = 210;
+
+ int s, i;
+ for (s = 0; s < nbits; s++) {
+ void* tmp;
+ for (i = 0; i < NUMSTATES / 2; i++) {
+ BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
+ }
+
+ renormalize(Y, RENORMALIZE_THRESHOLD);
+
+ /// Swap pointers to old and new metrics
+ tmp = (void*)X;
+ X = Y;
+ Y = (unsigned char*)tmp;
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/
--- /dev/null
+#
+# Copyright 2011-2012,2014,2018 Free Software Foundation, Inc.
+#
+# This file is part of VOLK.
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+
+########################################################################
+# header file detection
+########################################################################
+include(CheckIncludeFile)
+CHECK_INCLUDE_FILE(cpuid.h HAVE_CPUID_H)
+if(HAVE_CPUID_H)
+ add_definitions(-DHAVE_CPUID_H)
+endif()
+
+CHECK_INCLUDE_FILE(intrin.h HAVE_INTRIN_H)
+if(HAVE_INTRIN_H)
+ add_definitions(-DHAVE_INTRIN_H)
+endif()
+
+CHECK_INCLUDE_FILE(fenv.h HAVE_FENV_H)
+if(HAVE_FENV_H)
+ add_definitions(-DHAVE_FENV_H)
+endif()
+
+CHECK_INCLUDE_FILE(dlfcn.h HAVE_DLFCN_H)
+if(HAVE_DLFCN_H)
+ add_definitions(-DHAVE_DLFCN_H)
+ list(APPEND volk_libraries ${CMAKE_DL_LIBS})
+endif()
+
+########################################################################
+# Setup the compiler name
+########################################################################
+set(COMPILER_NAME ${CMAKE_C_COMPILER_ID})
+if(MSVC) # it's not set otherwise
+ set(COMPILER_NAME MSVC)
+endif()
+
+# Assume "AppleClang == Clang".
+string(TOLOWER ${COMPILER_NAME} COMPILER_NAME_LOWER)
+string(REGEX MATCH "clang" COMPILER_NAME_LOWER ${COMPILER_NAME_LOWER})
+if(${COMPILER_NAME_LOWER} MATCHES "clang")
+ set(COMPILER_NAME "Clang")
+endif()
+
+message(STATUS "Compiler name: ${COMPILER_NAME}")
+
+if(NOT DEFINED COMPILER_NAME)
+ message(FATAL_ERROR "COMPILER_NAME undefined. Volk build may not support this compiler.")
+endif()
+
+########################################################################
+# Special clang flag so flag checks can fail
+########################################################################
+if(COMPILER_NAME MATCHES "GNU")
+ include(CheckCXXCompilerFlag)
+ CHECK_CXX_COMPILER_FLAG("-Werror=unused-command-line-argument" HAVE_WERROR_UNUSED_CMD_LINE_ARG)
+ if(HAVE_WERROR_UNUSED_CMD_LINE_ARG)
+ set(VOLK_FLAG_CHECK_FLAGS "-Werror=unused-command-line-argument")
+ endif()
+endif()
+
+########################################################################
+# POSIX_MEMALIGN: If we have to fall back to `posix_memalign`,
+# make it known to the compiler.
+########################################################################
+if(HAVE_POSIX_MEMALIGN)
+ message(STATUS "Use `posix_memalign` for aligned malloc!")
+ add_definitions(-DHAVE_POSIX_MEMALIGN)
+endif(HAVE_POSIX_MEMALIGN)
+
+########################################################################
+# detect x86 flavor of CPU
+########################################################################
+if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(i.86|x86|x86_64|amd64|AMD64)$")
+ message(STATUS "x86* CPU detected")
+ set(CPU_IS_x86 TRUE)
+endif()
+
+########################################################################
+# determine passing architectures based on compile flag tests
+########################################################################
+execute_process(
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${PROJECT_SOURCE_DIR}/gen/volk_compile_utils.py
+ --mode "arch_flags" --compiler "${COMPILER_NAME}"
+ OUTPUT_VARIABLE arch_flag_lines OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+macro(check_arch arch_name)
+ set(flags ${ARGN})
+ set(have_${arch_name} TRUE)
+ foreach(flag ${flags})
+ if (MSVC AND (${flag} STREQUAL "/arch:SSE2" OR ${flag} STREQUAL "/arch:SSE" ))
+ # SSE/SSE2 has been supported by MSVC since VS 2005, but the flag is not available for 64-bit builds, so skip the check
+ else()
+ include(CheckCXXCompilerFlag)
+ set(have_flag have${flag})
+ # make have_flag contain only alphanumeric characters (cosmetic only)
+ execute_process(
+ COMMAND ${PYTHON_EXECUTABLE} -c "import re; print(re.sub('\\W', '_', '${have_flag}'))"
+ OUTPUT_VARIABLE have_flag OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+ if(VOLK_FLAG_CHECK_FLAGS)
+ set(CMAKE_REQUIRED_FLAGS ${VOLK_FLAG_CHECK_FLAGS})
+ endif()
+ CHECK_CXX_COMPILER_FLAG(${flag} ${have_flag})
+ unset(CMAKE_REQUIRED_FLAGS)
+ if (NOT ${have_flag})
+ set(have_${arch_name} FALSE)
+ endif()
+ endif()
+ endforeach()
+ if (have_${arch_name})
+ list(APPEND available_archs ${arch_name})
+ endif()
+endmacro(check_arch)
+
+foreach(line ${arch_flag_lines})
+ string(REGEX REPLACE "," ";" arch_flags ${line})
+ check_arch(${arch_flags})
+endforeach(line)
+
+macro(OVERRULE_ARCH arch reason)
+ message(STATUS "${reason}, Overruled arch ${arch}")
+ list(REMOVE_ITEM available_archs ${arch})
+endmacro(OVERRULE_ARCH)
+
+macro(FORCE_ARCH arch reason)
+ message(STATUS "${reason}, Forced arch ${arch}")
+ list(APPEND available_archs ${arch})
+endmacro(FORCE_ARCH)
+
+########################################################################
+# eliminate AVX if not on x86, if the compiler does not accept
+# the xgetbv instruction, or (when not cross-compiling) if the xgetbv
+# test executable does not function correctly.
+########################################################################
+set(HAVE_AVX_CVTPI32_PS 0)
+if(CPU_IS_x86)
+ #########################################################################
+ # eliminate AVX if cvtpi32_ps intrinsic fails like some versions of clang
+ #########################################################################
+
+ # check to see if the compiler/linker works with the cvtpi32_ps intrinsic when using AVX
+ if (CMAKE_SIZEOF_VOID_P EQUAL 4)
+ file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps.c "#include <immintrin.h>\nint main (void) {__m128 __a; __m64 __b; __m128 foo = _mm_cvtpi32_ps(__a, __b); return (0); }")
+ execute_process(COMMAND ${CMAKE_C_COMPILER} -mavx -o
+ ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps
+ ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps.c
+ OUTPUT_QUIET ERROR_QUIET
+ RESULT_VARIABLE avx_compile_result)
+ if(NOT ${avx_compile_result} EQUAL 0)
+ OVERRULE_ARCH(avx "Compiler missing cvtpi32_ps intrinsic")
+ else()
+ set(HAVE_AVX_CVTPI32_PS 1)
+ endif()
+ file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps
+ ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps.c)
+ else(CMAKE_SIZEOF_VOID_P EQUAL 4)
+ # 64-bit builds do not need this check, so don't overrule AVX
+ set(HAVE_AVX_CVTPI32_PS 0)
+ endif(CMAKE_SIZEOF_VOID_P EQUAL 4)
+
+ # Disable SSE4a if Clang is less than version 3.2
+ if("${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
+ # Figure out the version of Clang
+
+ if(CMAKE_C_COMPILER_VERSION VERSION_LESS "3.2")
+ OVERRULE_ARCH(sse4_a "Clang >= 3.2 required for SSE4a")
+ endif(CMAKE_C_COMPILER_VERSION VERSION_LESS "3.2")
+ endif("${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
+
+endif(CPU_IS_x86)
+
+if(${HAVE_AVX_CVTPI32_PS})
+ add_definitions(-DHAVE_AVX_CVTPI32_PS)
+endif()
+
+########################################################################
+# if the CPU is not x86, eliminate all Intel SIMD
+########################################################################
+
+if(NOT CPU_IS_x86)
+ OVERRULE_ARCH(3dnow "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(mmx "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(sse "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(sse2 "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(sse3 "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(ssse3 "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(sse4_a "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(sse4_1 "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(sse4_2 "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(avx "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(avx512f "Architecture is not x86 or x86_64")
+ OVERRULE_ARCH(avx512cd "Architecture is not x86 or x86_64")
+endif(NOT CPU_IS_x86)
+
+########################################################################
+# Select neon based on ARM ISA version
+########################################################################
+
+# First, compile a test program to see if compiler supports neon.
+
+include(CheckCSourceCompiles)
+
+check_c_source_compiles("#include <arm_neon.h>\nint main(){ uint8_t *dest; uint8x8_t res; vst1_u8(dest, res); }"
+ neon_compile_result)
+
+if(neon_compile_result)
+ set(CMAKE_REQUIRED_INCLUDES ${PROJECT_SOURCE_DIR}/include)
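+ # "vrev32.8 q0, q0" only assembles for 32-bit ARM (NEONv7), while the
+ # "sub v1.4s, ..." form only assembles for AArch64 (NEONv8), so these two
+ # probes distinguish the instruction sets.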
+ check_c_source_compiles("#include <volk/volk_common.h>\n int main(){__VOLK_ASM __VOLK_VOLATILE(\"vrev32.8 q0, q0\");}"
+ have_neonv7_result )
+ check_c_source_compiles("#include <volk/volk_common.h>\n int main(){__VOLK_ASM __VOLK_VOLATILE(\"sub v1.4s,v1.4s,v1.4s\");}"
+ have_neonv8_result )
+
+ if (NOT have_neonv7_result)
+ OVERRULE_ARCH(neonv7 "Compiler doesn't support neonv7")
+ endif()
+
+ if (NOT have_neonv8_result)
+ OVERRULE_ARCH(neonv8 "Compiler doesn't support neonv8")
+ endif()
+else(neon_compile_result)
+ OVERRULE_ARCH(neon "Compiler doesn't support NEON")
+ OVERRULE_ARCH(neonv7 "Compiler doesn't support NEON")
+ OVERRULE_ARCH(neonv8 "Compiler doesn't support NEON")
+endif(neon_compile_result)
+
+########################################################################
+# implement overruling in the ORC case,
+# since ORC always passes flag detection
+########################################################################
+if(NOT ORC_FOUND)
+ OVERRULE_ARCH(orc "ORC support not found")
+endif()
+
+########################################################################
+# implement overruling in the non-multilib case
+# this makes things work when both -m32 and -m64 pass
+########################################################################
+if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86)
+ include(CheckTypeSize)
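+ # sizeof(void*[8]) is 64 bytes on a 64-bit target and 32 bytes on a 32-bit target,
+ # which maps directly onto the "64" and "32" pseudo-archs overruled below.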
+ check_type_size("void*[8]" SIZEOF_CPU BUILTIN_TYPES_ONLY)
+ if (${SIZEOF_CPU} EQUAL 64)
+ OVERRULE_ARCH(32 "CPU width is 64 bits")
+ endif()
+ if (${SIZEOF_CPU} EQUAL 32)
+ OVERRULE_ARCH(64 "CPU width is 32 bits")
+ endif()
+
+ #MSVC 64 bit does not have MMX, overrule it
+ if (MSVC)
+ if (${SIZEOF_CPU} EQUAL 64)
+ OVERRULE_ARCH(mmx "No MMX for Win64")
+ endif()
+ FORCE_ARCH(sse "Built-in for MSVC > 2013")
+ FORCE_ARCH(sse2 "Built-in for MSVC > 2013")
+ endif()
+
+
+endif()
+
+########################################################################
+# done overrules! print the result
+########################################################################
+message(STATUS "Available architectures: ${available_archs}")
+
+########################################################################
+# determine available machines given the available architectures
+########################################################################
+execute_process(
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${PROJECT_SOURCE_DIR}/gen/volk_compile_utils.py
+ --mode "machines" --archs "${available_archs}"
+ OUTPUT_VARIABLE available_machines OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+########################################################################
+# Implement machine overruling for redundant machines:
+# A machine is redundant when expansion rules occur,
+# and the arch superset passes configuration checks.
+# When this occurs, eliminate the redundant machines
+# to avoid unnecessary compilation of subset machines.
+########################################################################
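+# For example, if both "generic" and "generic_orc" survive the checks above,
+# the plain "generic" machine is dropped because "generic_orc" already covers it.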
+foreach(arch mmx orc 64 32)
+ foreach(machine_name ${available_machines})
+ string(REPLACE "_${arch}" "" machine_name_no_arch ${machine_name})
+ if (${machine_name} STREQUAL ${machine_name_no_arch})
+ else()
+ list(REMOVE_ITEM available_machines ${machine_name_no_arch})
+ endif()
+ endforeach(machine_name)
+endforeach(arch)
+
+########################################################################
+# done overrules! print the result
+########################################################################
+message(STATUS "Available machines: ${available_machines}")
+
+########################################################################
+# Create rules to run the volk generator
+########################################################################
+
+#dependencies are all python, xml, and header implementation files
+file(GLOB xml_files ${PROJECT_SOURCE_DIR}/gen/*.xml)
+file(GLOB py_files ${PROJECT_SOURCE_DIR}/gen/*.py)
+file(GLOB h_files ${PROJECT_SOURCE_DIR}/kernels/volk/*.h)
+
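+# gen_template(<tmpl> <output> [args...]) registers a custom command that runs
+# volk_tmpl_utils.py to render <tmpl> into <output>, and appends <output> to
+# volk_gen_sources, which is later added to the volk_sources list.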
+macro(gen_template tmpl output)
+ list(APPEND volk_gen_sources ${output})
+ add_custom_command(
+ OUTPUT ${output}
+ DEPENDS ${xml_files} ${py_files} ${h_files} ${tmpl}
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${PROJECT_SOURCE_DIR}/gen/volk_tmpl_utils.py
+ --input ${tmpl} --output ${output} ${ARGN}
+ )
+endmacro(gen_template)
+
+make_directory(${PROJECT_BINARY_DIR}/include/volk)
+
+gen_template(${PROJECT_SOURCE_DIR}/tmpl/volk.tmpl.h ${PROJECT_BINARY_DIR}/include/volk/volk.h)
+gen_template(${PROJECT_SOURCE_DIR}/tmpl/volk.tmpl.c ${PROJECT_BINARY_DIR}/lib/volk.c)
+gen_template(${PROJECT_SOURCE_DIR}/tmpl/volk_typedefs.tmpl.h ${PROJECT_BINARY_DIR}/include/volk/volk_typedefs.h)
+gen_template(${PROJECT_SOURCE_DIR}/tmpl/volk_cpu.tmpl.h ${PROJECT_BINARY_DIR}/include/volk/volk_cpu.h)
+gen_template(${PROJECT_SOURCE_DIR}/tmpl/volk_cpu.tmpl.c ${PROJECT_BINARY_DIR}/lib/volk_cpu.c)
+gen_template(${PROJECT_SOURCE_DIR}/tmpl/volk_config_fixed.tmpl.h ${PROJECT_BINARY_DIR}/include/volk/volk_config_fixed.h)
+gen_template(${PROJECT_SOURCE_DIR}/tmpl/volk_machines.tmpl.h ${PROJECT_BINARY_DIR}/lib/volk_machines.h)
+gen_template(${PROJECT_SOURCE_DIR}/tmpl/volk_machines.tmpl.c ${PROJECT_BINARY_DIR}/lib/volk_machines.c)
+
+set(BASE_CFLAGS NONE)
+string(TOUPPER ${CMAKE_BUILD_TYPE} CBTU)
+message(STATUS "BUILD TYPE = ${CBTU}")
+message(STATUS "Base cflags = ${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS}")
+set(COMPILER_INFO "")
+if(MSVC)
+ if(MSVC90) #Visual Studio 9
+ set(cmake_c_compiler_version "Microsoft Visual Studio 9.0")
+ elseif(MSVC10) #Visual Studio 10
+ set(cmake_c_compiler_version "Microsoft Visual Studio 10.0")
+ elseif(MSVC11) #Visual Studio 11
+ set(cmake_c_compiler_version "Microsoft Visual Studio 11.0")
+ elseif(MSVC12) #Visual Studio 12
+ set(cmake_c_compiler_version "Microsoft Visual Studio 12.0")
+ elseif(MSVC14) #Visual Studio 14
+ set(cmake_c_compiler_version "Microsoft Visual Studio 14.0")
+ endif()
+else()
+ execute_process(COMMAND ${CMAKE_C_COMPILER} --version
+ OUTPUT_VARIABLE cmake_c_compiler_version)
+endif(MSVC)
+set(COMPILER_INFO "${CMAKE_C_COMPILER}:::${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS}\n${CMAKE_CXX_COMPILER}:::${CMAKE_CXX_FLAGS_${CBTU}} ${CMAKE_CXX_FLAGS}\n" )
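+# COMPILER_INFO accumulates one ":::"-separated record per machine in the loop below
+# and is embedded into constants.c via configure_file(), where volk_compiler_flags()
+# returns it at run time.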
+
+foreach(machine_name ${available_machines})
+ #generate machine source
+ set(machine_source ${CMAKE_CURRENT_BINARY_DIR}/volk_machine_${machine_name}.c)
+ gen_template(${PROJECT_SOURCE_DIR}/tmpl/volk_machine_xxx.tmpl.c ${machine_source} ${machine_name})
+
+ #determine machine flags
+ execute_process(
+ COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
+ ${PROJECT_SOURCE_DIR}/gen/volk_compile_utils.py
+ --mode "machine_flags" --machine "${machine_name}" --compiler "${COMPILER_NAME}"
+ OUTPUT_VARIABLE ${machine_name}_flags OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+
+ message(STATUS "BUILD INFO ::: ${machine_name} ::: ${COMPILER_NAME} ::: ${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS} ${${machine_name}_flags}")
+ set(COMPILER_INFO "${COMPILER_INFO}${machine_name}:::${COMPILER_NAME}:::${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS} ${${machine_name}_flags}\n" )
+ if(${machine_name}_flags AND NOT MSVC)
+ set_source_files_properties(${machine_source} PROPERTIES COMPILE_FLAGS "${${machine_name}_flags}")
+ endif()
+
+ #add to available machine defs
+ string(TOUPPER LV_MACHINE_${machine_name} machine_def)
+ list(APPEND machine_defs ${machine_def})
+endforeach(machine_name)
+
+# Convert to a C string to compile and display properly
+string(STRIP "${cmake_c_compiler_version}" cmake_c_compiler_version)
+string(STRIP ${COMPILER_INFO} COMPILER_INFO)
+message(STATUS "Compiler Version: ${cmake_c_compiler_version}")
+string(REPLACE "\n" " \\n" cmake_c_compiler_version ${cmake_c_compiler_version})
+string(REPLACE "\n" " \\n" COMPILER_INFO ${COMPILER_INFO})
+
+
+########################################################################
+# Handle ASM support
+# on by default, but let users turn it off
+########################################################################
+set(ASM_ARCHS_AVAILABLE "neonv7" "neonv8")
+
+set(FULL_C_FLAGS "${CMAKE_C_FLAGS}" "${CMAKE_CXX_COMPILER_ARG1}")
+
+# sort through a list of all architectures we have ASM for
+# if we find one that matches our current system architecture
+# set up the assembler flags and include the source files
+foreach(ARCH ${ASM_ARCHS_AVAILABLE})
+ string(REGEX MATCH "${ARCH}" ASM_ARCH "${available_archs}")
+if( ASM_ARCH STREQUAL "neonv7" )
+ message(STATUS "---- Adding ASM files") # we always use ATT syntax
+ message(STATUS "-- Detected neon architecture; enabling ASM")
+ # architecture specific assembler flags are now set in the cmake toolchain file
+ # then add the files
+ include_directories(${PROJECT_SOURCE_DIR}/kernels/volk/asm/neon)
+ file(GLOB asm_files ${PROJECT_SOURCE_DIR}/kernels/volk/asm/neon/*.s)
+ foreach(asm_file ${asm_files})
+ list(APPEND volk_sources ${asm_file})
+ message(STATUS "Adding source file: ${asm_file}")
+ endforeach(asm_file)
+endif()
+enable_language(ASM)
+message(STATUS "c flags: ${FULL_C_FLAGS}")
+message(STATUS "asm flags: ${CMAKE_ASM_FLAGS}")
+endforeach(ARCH)
+
+
+########################################################################
+# Handle orc support
+########################################################################
+if(ORC_FOUND)
+ #setup orc library usage
+ include_directories(${ORC_INCLUDE_DIRS})
+ link_directories(${ORC_LIBRARY_DIRS})
+
+ #setup orc functions
+ file(GLOB orc_files ${PROJECT_SOURCE_DIR}/kernels/volk/asm/orc/*.orc)
+ foreach(orc_file ${orc_files})
+
+ #extract the name for the generated c source from the orc file
+ get_filename_component(orc_file_name_we ${orc_file} NAME_WE)
+ set(orcc_gen ${CMAKE_CURRENT_BINARY_DIR}/${orc_file_name_we}.c)
+
+ #create a rule to generate the source and add to the list of sources
+ add_custom_command(
+ COMMAND ${ORCC_EXECUTABLE} --include math.h --implementation -o ${orcc_gen} ${orc_file}
+ DEPENDS ${orc_file} OUTPUT ${orcc_gen}
+ )
+ list(APPEND volk_sources ${orcc_gen})
+
+ endforeach(orc_file)
+else()
+ message(STATUS "Did not find liborc and orcc, disabling orc support...")
+endif()
+
+########################################################################
+# Handle the generated constants
+########################################################################
+
+message(STATUS "Loading version ${VERSION} into constants...")
+
+#double escape for windows backslash path separators
+string(REPLACE "\\" "\\\\" prefix "${prefix}")
+
+configure_file(
+ ${CMAKE_CURRENT_SOURCE_DIR}/constants.c.in
+ ${CMAKE_CURRENT_BINARY_DIR}/constants.c
+@ONLY)
+
+list(APPEND volk_sources ${CMAKE_CURRENT_BINARY_DIR}/constants.c)
+
+########################################################################
+# Setup the volk sources list and library
+########################################################################
+if(NOT WIN32)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden -Wno-deprecated-declarations")
+endif()
+
+list(APPEND volk_sources
+ ${CMAKE_CURRENT_SOURCE_DIR}/volk_prefs.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/volk_rank_archs.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/volk_malloc.c
+ ${volk_gen_sources}
+)
+
+#set the machine definitions where applicable
+set_source_files_properties(
+ ${CMAKE_CURRENT_BINARY_DIR}/volk.c
+ ${CMAKE_CURRENT_BINARY_DIR}/volk_machines.c
+PROPERTIES COMPILE_DEFINITIONS "${machine_defs}")
+
+if(MSVC)
+ #add compatibility includes for stdint types
+ include_directories(${PROJECT_SOURCE_DIR}/cmake/msvc)
+ add_definitions(-DHAVE_CONFIG_H)
+ #compile the sources as C++ due to the lack of complex.h under MSVC
+ set_source_files_properties(${volk_sources} PROPERTIES LANGUAGE CXX)
+endif()
+
+#Create an object to reference Volk source and object files.
+#
+#NOTE: This object cannot be used to propagate include directories or
+#library linkage, but we have to define them here for compiling to
+#work. There are options starting with CMake 3.13 for using the OBJECT
+#to propagate this information.
+add_library(volk_obj OBJECT ${volk_sources})
+target_include_directories(volk_obj
+ PRIVATE $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>
+ PRIVATE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
+ PRIVATE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/kernels>
+ PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
+ PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+)
+if(VOLK_CPU_FEATURES)
+ set_source_files_properties(volk_cpu.c PROPERTIES COMPILE_DEFINITIONS "VOLK_CPU_FEATURES=1")
+ if(CpuFeatures_FOUND)
+ target_include_directories(volk_obj
+ PRIVATE $<TARGET_PROPERTY:CpuFeatures::cpu_features,INTERFACE_INCLUDE_DIRECTORIES>
+ )
+ else()
+ target_include_directories(volk_obj
+ PRIVATE $<TARGET_PROPERTY:cpu_features,INTERFACE_INCLUDE_DIRECTORIES>
+ )
+ endif()
+endif()
+
+#Configure object target properties
+if(NOT MSVC)
+ set_target_properties(volk_obj PROPERTIES COMPILE_FLAGS "-fPIC")
+endif()
+
+#Add dynamic library
+#
+#NOTE: The PUBLIC include directories and library linkage will be
+#propagated when this target is used to build other targets. Also, the
+#PUBLIC library linkage and include directories INSTALL_INTERFACE will
+#be used to create the target import information. Ordering of the
+#include directories is taken as provided; it -might- matter, but
+#probably doesn't.
+add_library(volk SHARED $<TARGET_OBJECTS:volk_obj>)
+target_link_libraries(volk PUBLIC ${volk_libraries})
+if(VOLK_CPU_FEATURES)
+ target_link_libraries(volk PRIVATE cpu_features)
+endif()
+target_include_directories(volk
+ PUBLIC $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>
+ PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
+ PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/kernels>
+ PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
+ PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+ PUBLIC $<INSTALL_INTERFACE:include>
+)
+
+
+#Configure target properties
+if(ORC_FOUND)
+ target_link_libraries(volk PRIVATE ${ORC_LIBRARIES})
+endif()
+if(NOT MSVC)
+ target_link_libraries(volk PUBLIC m)
+endif()
+set_target_properties(volk PROPERTIES VERSION ${VERSION})
+set_target_properties(volk PROPERTIES SOVERSION ${SOVERSION})
+set_target_properties(volk PROPERTIES DEFINE_SYMBOL "volk_EXPORTS")
+
+#Install locations
+install(TARGETS volk
+ EXPORT VOLK-export
+ LIBRARY DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_runtime" # .so file
+ ARCHIVE DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_devel" # .lib file
+ RUNTIME DESTINATION bin COMPONENT "volk_runtime" # .dll file
+ )
+
+#Configure static library
+#
+#NOTE: The PUBLIC include directories and library linkage will be
+#propagated when this target is used to build other targets. Also, the
+#PUBLIC library linkage and include directories INSTALL_INTERFACE will
+#be used to create the target import information. Ordering of the
+#include directories is taken as provided; it -might- matter, but
+#probably doesn't.
+if(ENABLE_STATIC_LIBS)
+ add_library(volk_static STATIC $<TARGET_OBJECTS:volk_obj>)
+ target_link_libraries(volk_static PUBLIC ${volk_libraries})
+ if(VOLK_CPU_FEATURES)
+ target_link_libraries(volk_static PRIVATE cpu_features)
+ endif()
+ if(ORC_FOUND)
+ target_link_libraries(volk_static PUBLIC ${ORC_LIBRARIES_STATIC})
+ endif()
+ if(NOT ANDROID)
+ target_link_libraries(volk_static PRIVATE pthread)
+ endif()
+ if(NOT MSVC)
+ target_link_libraries(volk_static PUBLIC m)
+ endif()
+ target_include_directories(volk_static
+ PUBLIC $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>
+ PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
+ PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/kernels>
+ PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
+ PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+ PUBLIC $<INSTALL_INTERFACE:include>
+ )
+
+ set_target_properties(volk_static PROPERTIES OUTPUT_NAME volk)
+
+ install(TARGETS volk_static
+ EXPORT VOLK-export
+ ARCHIVE DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_devel"
+ )
+endif(ENABLE_STATIC_LIBS)
+
+########################################################################
+# Build the QA test application
+########################################################################
+if(ENABLE_TESTING)
+
+ make_directory(${CMAKE_CURRENT_BINARY_DIR}/.unittest)
+ include(VolkAddTest)
+ if(ENABLE_STATIC_LIBS)
+ VOLK_GEN_TEST(volk_test_all
+ SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/testqa.cc
+ ${CMAKE_CURRENT_SOURCE_DIR}/qa_utils.cc
+ TARGET_DEPS volk_static
+ )
+ else()
+ VOLK_GEN_TEST(volk_test_all
+ SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/testqa.cc
+ ${CMAKE_CURRENT_SOURCE_DIR}/qa_utils.cc
+ TARGET_DEPS volk
+ )
+ endif()
+ foreach(kernel ${h_files})
+ get_filename_component(kernel ${kernel} NAME)
+ string(REPLACE ".h" "" kernel ${kernel})
+ VOLK_ADD_TEST(${kernel} volk_test_all)
+ endforeach()
+
+endif(ENABLE_TESTING)
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2013, 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: GPL-3.0-or-later
+ */
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <volk/constants.h>
+
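+/* The @...@ placeholders below are substituted by CMake's configure_file()
+ * when this template is rendered into constants.c at build time. */
+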
+const char*
+volk_prefix()
+{
+ const char *prefix = getenv("VOLK_PREFIX");
+ if (prefix != NULL) return prefix;
+ return "@prefix@";
+}
+
+const char*
+volk_version()
+{
+ return "@VERSION@";
+}
+
+const char*
+volk_c_compiler()
+{
+ return "@cmake_c_compiler_version@";
+}
+
+const char*
+volk_compiler_flags()
+{
+ return "@COMPILER_INFO@";
+}
+
+const char*
+volk_available_machines()
+{
+ return "@available_machines@";
+}
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2014 - 2021 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#include "qa_utils.h"
+
+#include <volk/volk.h>
+#include <vector>
+
+// Macros for initializing volk_test_case_t. Macros are needed to generate
+// function names of the pattern kernel_name_*
+
+// for puppets we need to get all the func_variants for the puppet and just
+// keep track of the actual function name to write to results
+#define VOLK_INIT_PUPP(func, puppet_master_func, test_params) \
+ volk_test_case_t(func##_get_func_desc(), \
+ (void (*)())func##_manual, \
+ std::string(#func), \
+ std::string(#puppet_master_func), \
+ test_params)
+
+#define VOLK_INIT_TEST(func, test_params) \
+ volk_test_case_t(func##_get_func_desc(), \
+ (void (*)())func##_manual, \
+ std::string(#func), \
+ test_params)
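+// For example, VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params) expands to a
+// volk_test_case_t built from volk_32f_x2_add_32f_get_func_desc() and
+// volk_32f_x2_add_32f_manual.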
+
+#define QA(test) test_cases.push_back(test);
+std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
+{
+
+ // Some kernels need a lower tolerance
+ volk_test_params_t test_params_inacc = test_params.make_tol(1e-2);
+ volk_test_params_t test_params_inacc_tenth = test_params.make_tol(1e-1);
+
+ volk_test_params_t test_params_power(test_params);
+ test_params_power.set_scalar(2.5);
+
+ volk_test_params_t test_params_rotator(test_params);
+ test_params_rotator.set_scalar(std::polar(1.0f, 0.1f));
+ test_params_rotator.set_tol(1e-3);
+
+ std::vector<volk_test_case_t> test_cases;
+ QA(VOLK_INIT_PUPP(volk_64u_popcntpuppet_64u, volk_64u_popcnt, test_params))
+ QA(VOLK_INIT_PUPP(volk_16u_byteswappuppet_16u, volk_16u_byteswap, test_params))
+ QA(VOLK_INIT_PUPP(volk_32u_byteswappuppet_32u, volk_32u_byteswap, test_params))
+ QA(VOLK_INIT_PUPP(volk_32u_popcntpuppet_32u, volk_32u_popcnt_32u, test_params))
+ QA(VOLK_INIT_PUPP(volk_64u_byteswappuppet_64u, volk_64u_byteswap, test_params))
+ QA(VOLK_INIT_PUPP(volk_32fc_s32fc_rotatorpuppet_32fc,
+ volk_32fc_s32fc_x2_rotator_32fc,
+ test_params_rotator))
+ QA(VOLK_INIT_PUPP(
+ volk_8u_conv_k7_r2puppet_8u, volk_8u_x4_conv_k7_r2_8u, test_params.make_tol(0)))
+ QA(VOLK_INIT_PUPP(
+ volk_32f_x2_fm_detectpuppet_32f, volk_32f_s32f_32f_fm_detect_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_real_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_8i, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_deinterleave_16i_x2, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_s32f_deinterleave_32f_x2, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_deinterleave_real_16i, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_magnitude_16i, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_s32f_magnitude_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_convert_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_x2_multiply_16ic, test_params))
+ QA(VOLK_INIT_TEST(volk_16ic_x2_dot_prod_16ic, test_params))
+ QA(VOLK_INIT_TEST(volk_16i_s32f_convert_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_16i_convert_8i, test_params))
+ QA(VOLK_INIT_TEST(volk_16i_32fc_dot_prod_32fc, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_accumulator_s32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_index_max_16u, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_index_max_32u, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_index_min_16u, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_index_min_32u, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_32f_add_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_log2_32f, test_params.make_absolute(1e-5)))
+ QA(VOLK_INIT_TEST(volk_32f_expfast_32f, test_params_inacc_tenth))
+ QA(VOLK_INIT_TEST(volk_32f_x2_pow_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_sin_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_cos_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_tan_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_atan_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_asin_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_acos_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32fc_s32f_power_32fc, test_params_power))
+ QA(VOLK_INIT_TEST(volk_32f_s32f_calc_spectral_noise_floor_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32fc_s32f_atan2_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_x2_conjugate_dot_prod_32fc, test_params_inacc_tenth))
+ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_32f_x2, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_accumulator_s32fc, test_params.make_tol(1e-3)))
+ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_64f_x2, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_s32f_deinterleave_real_16i, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_imag_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_deinterleave_real_64f, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32fc_index_max_16u, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_index_max_32u, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_index_min_16u, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_index_min_32u, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc_tenth))
+ QA(VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_x2_add_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_x2_multiply_conjugate_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_x2_divide_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_conjugate_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_s32f_convert_16i, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_s32f_convert_32i, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_convert_64f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_s32f_convert_8i, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_convert_16ic, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_s32f_power_spectrum_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_x2_square_dist_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_x2_s32f_square_dist_scalar_mult_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_x2_divide_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_x2_s32f_interleave_16ic, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_x2_interleave_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_x2_max_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_x2_min_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_x2_multiply_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_64f_multiply_64f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_64f_add_64f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params.make_tol(1e-3)))
+ QA(VOLK_INIT_TEST(volk_32f_x2_subtract_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_x3_sum_of_poly_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32i_x2_and_32i, test_params))
+ QA(VOLK_INIT_TEST(volk_32i_s32f_convert_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32i_x2_or_32i, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_x2_dot_prod_16i, test_params))
+ QA(VOLK_INIT_TEST(volk_64f_convert_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_64f_x2_max_64f, test_params))
+ QA(VOLK_INIT_TEST(volk_64f_x2_min_64f, test_params))
+ QA(VOLK_INIT_TEST(volk_64f_x2_multiply_64f, test_params))
+ QA(VOLK_INIT_TEST(volk_64f_x2_add_64f, test_params))
+ QA(VOLK_INIT_TEST(volk_8ic_deinterleave_16i_x2, test_params))
+ QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_32f_x2, test_params))
+ QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_16i, test_params))
+ QA(VOLK_INIT_TEST(volk_8ic_s32f_deinterleave_real_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_8ic_deinterleave_real_8i, test_params))
+ QA(VOLK_INIT_TEST(volk_8ic_x2_multiply_conjugate_16ic, test_params))
+ QA(VOLK_INIT_TEST(volk_8ic_x2_s32f_multiply_conjugate_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_8i_convert_16i, test_params))
+ QA(VOLK_INIT_TEST(volk_8i_s32f_convert_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32fc_s32fc_multiply_32fc, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_s32f_multiply_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_s32f_add_32f, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_binary_slicer_32i, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_binary_slicer_8i, test_params))
+ QA(VOLK_INIT_TEST(volk_32u_reverse_32u, test_params))
+ QA(VOLK_INIT_TEST(volk_32f_tanh_32f, test_params_inacc))
+ QA(VOLK_INIT_TEST(volk_32fc_x2_s32fc_multiply_conjugate_add_32fc, test_params))
+ QA(VOLK_INIT_PUPP(
+ volk_32f_s32f_mod_rangepuppet_32f, volk_32f_s32f_s32f_mod_range_32f, test_params))
+ QA(VOLK_INIT_PUPP(
+ volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params))
+ QA(VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f,
+ volk_32f_8u_polarbutterfly_32f,
+ test_params))
+ QA(VOLK_INIT_PUPP(volk_32fc_s32f_power_spectral_densitypuppet_32f,
+ volk_32fc_s32f_x2_power_spectral_density_32f,
+ test_params))
+ // no one uses these, so don't test them
+ // VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results,
+ // benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046,
+ // 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_max_star_16i,
+ // 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+ // VOLK_PROFILE(volk_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results,
+ // benchmark_mode, kernel_regex); VOLK_PROFILE(volk_16i_permute_and_scalar_add, 1e-4,
+ // 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
+ // VOLK_PROFILE(volk_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results,
+ // benchmark_mode, kernel_regex);
+
+ return test_cases;
+}
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2011 - 2020, 2022 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#include "qa_utils.h"
+
+#include <volk/volk.h> // for volk_func_desc_t
+#include <volk/volk_malloc.h> // for volk_free, volk_m...
+
+#include <assert.h> // for assert
+#include <stdint.h> // for uint16_t, uint64_t
+#include <sys/time.h> // for CLOCKS_PER_SEC
+#include <sys/types.h> // for int16_t, int32_t
+#include <chrono>
+#include <cmath> // for sqrt, fabs, abs
+#include <cstring> // for memcpy, memset
+#include <ctime> // for clock
+#include <fstream> // for operator<<, basic...
+#include <iostream> // for cout, cerr
+#include <limits> // for numeric_limits
+#include <map> // for map, map<>::mappe...
+#include <random>
+#include <sstream> // for istringstream
+#include <vector> // for vector, _Bit_refe...
+
+template <typename T>
+void random_floats(void* buf, unsigned int n, std::default_random_engine& rnd_engine)
+{
+ T* array = static_cast<T*>(buf);
+ std::uniform_real_distribution<T> uniform_dist(T(-1), T(1));
+ for (unsigned int i = 0; i < n; i++) {
+ array[i] = uniform_dist(rnd_engine);
+ }
+}
+
+void load_random_data(void* data, volk_type_t type, unsigned int n)
+{
+ std::random_device rnd_device;
+ std::default_random_engine rnd_engine(rnd_device());
+ if (type.is_complex)
+ n *= 2;
+ if (type.is_float) {
+ if (type.size == 8) {
+ random_floats<double>(data, n, rnd_engine);
+ } else {
+ random_floats<float>(data, n, rnd_engine);
+ }
+ } else {
+ float int_max = float(uint64_t(2) << (type.size * 8));
+ if (type.is_signed)
+ int_max /= 2.0;
+ std::uniform_real_distribution<float> uniform_dist(-int_max, int_max);
+ for (unsigned int i = 0; i < n; i++) {
+ float scaled_rand = uniform_dist(rnd_engine);
+ // casting the float value down to the target integer width is unavoidable here
+ switch (type.size) {
+ case 8:
+ if (type.is_signed)
+ ((int64_t*)data)[i] = (int64_t)scaled_rand;
+ else
+ ((uint64_t*)data)[i] = (uint64_t)scaled_rand;
+ break;
+ case 4:
+ if (type.is_signed)
+ ((int32_t*)data)[i] = (int32_t)scaled_rand;
+ else
+ ((uint32_t*)data)[i] = (uint32_t)scaled_rand;
+ break;
+ case 2:
+ if (type.is_signed)
+ ((int16_t*)data)[i] = (int16_t)((int16_t)scaled_rand % 8);
+ else
+ ((uint16_t*)data)[i] = (uint16_t)((int16_t)scaled_rand % 8);
+ break;
+ case 1:
+ if (type.is_signed)
+ ((int8_t*)data)[i] = (int8_t)scaled_rand;
+ else
+ ((uint8_t*)data)[i] = (uint8_t)scaled_rand;
+ break;
+ default:
+ // no shenanigans here
+ throw "load_random_data: no support for data size > 8 or < 1";
+ }
+ }
+ }
+}
+
+static std::vector<std::string> get_arch_list(volk_func_desc_t desc)
+{
+ std::vector<std::string> archlist;
+
+ for (size_t i = 0; i < desc.n_impls; i++) {
+ archlist.push_back(std::string(desc.impl_names[i]));
+ }
+
+ return archlist;
+}
+
+template <typename T>
+T volk_lexical_cast(const std::string& str)
+{
+ for (unsigned int c_index = 0; c_index < str.size(); ++c_index) {
+ if (str.at(c_index) < '0' || str.at(c_index) > '9') {
+ throw "not all numbers!";
+ }
+ }
+ T var;
+ std::istringstream iss;
+ iss.str(str);
+ iss >> var;
+ // no stream error handling needed here; non-digit input was rejected above
+ return var;
+}
+
+volk_type_t volk_type_from_string(std::string name)
+{
+ volk_type_t type;
+ type.is_float = false;
+ type.is_scalar = false;
+ type.is_complex = false;
+ type.is_signed = false;
+ type.size = 0;
+ type.str = name;
+
+ if (name.size() < 2) {
+ throw std::string("name too short to be a datatype");
+ }
+
+ // is it a scalar?
+ if (name[0] == 's') {
+ type.is_scalar = true;
+ name = name.substr(1, name.size() - 1);
+ }
+
+ // get the data size
+ size_t last_size_pos = name.find_last_of("0123456789");
+ if (last_size_pos == std::string::npos) {
+ throw std::string("no size spec in type ").append(name);
+ }
+ // will throw if malformed
+ int size = volk_lexical_cast<int>(name.substr(0, last_size_pos + 1));
+
+ assert(((size % 8) == 0) && (size <= 64) && (size != 0));
+ type.size = size / 8; // in bytes
+
+ for (size_t i = last_size_pos + 1; i < name.size(); i++) {
+ switch (name[i]) {
+ case 'f':
+ type.is_float = true;
+ break;
+ case 'i':
+ type.is_signed = true;
+ break;
+ case 'c':
+ type.is_complex = true;
+ break;
+ case 'u':
+ type.is_signed = false;
+ break;
+ default:
+ throw std::string("Error: no such type: '") + name[i] + "'";
+ }
+ }
+
+ return type;
+}
+
+std::vector<std::string> split_signature(const std::string& protokernel_signature)
+{
+ std::vector<std::string> signature_tokens;
+ std::string token;
+ for (unsigned int loc = 0; loc < protokernel_signature.size(); ++loc) {
+ if (protokernel_signature.at(loc) == '_') {
+ // this is a break
+ signature_tokens.push_back(token);
+ token = "";
+ } else {
+ token.push_back(protokernel_signature.at(loc));
+ }
+ }
+ // Get the last one to the end of the string
+ signature_tokens.push_back(token);
+ return signature_tokens;
+}
+
+static void get_signatures_from_name(std::vector<volk_type_t>& inputsig,
+ std::vector<volk_type_t>& outputsig,
+ std::string name)
+{
+
+ std::vector<std::string> toked = split_signature(name);
+
+ assert(toked[0] == "volk");
+ toked.erase(toked.begin());
+
+ // ok. we're assuming a string in the form
+ //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment)
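+ // e.g. "volk_32f_x2_add_32f" yields inputsig = {32f, 32f}, fn_name = "_add",
+ // outputsig = {32f}.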
+
+ enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT;
+ std::string fn_name;
+ volk_type_t type;
+ for (unsigned int token_index = 0; token_index < toked.size(); ++token_index) {
+ std::string token = toked[token_index];
+ try {
+ type = volk_type_from_string(token);
+ if (side == SIDE_NAME)
+ side = SIDE_OUTPUT; // if this is the first one after the name...
+
+ if (side == SIDE_INPUT)
+ inputsig.push_back(type);
+ else
+ outputsig.push_back(type);
+ } catch (...) {
+ if (token[0] == 'x' && (token.size() > 1) &&
+ (token[1] > '0' && token[1] < '9')) { // it's a multiplier
+ if (side == SIDE_INPUT)
+ assert(inputsig.size() > 0);
+ else
+ assert(outputsig.size() > 0);
+ int multiplier = volk_lexical_cast<int>(
+ token.substr(1, token.size() - 1)); // will throw if invalid
+ for (int i = 1; i < multiplier; i++) {
+ if (side == SIDE_INPUT)
+ inputsig.push_back(inputsig.back());
+ else
+ outputsig.push_back(outputsig.back());
+ }
+ } else if (side ==
+ SIDE_INPUT) { // it's the function name, at least it better be
+ side = SIDE_NAME;
+ fn_name.append("_");
+ fn_name.append(token);
+ } else if (side == SIDE_OUTPUT) {
+ if (token != toked.back())
+ throw; // the last token in the name is the alignment
+ }
+ }
+ }
+ // we don't need an output signature (some fn's operate on the input data, "in
+ // place"), but we do need at least one input!
+ assert(inputsig.size() != 0);
+}
+
+inline void run_cast_test1(volk_fn_1arg func,
+ std::vector<void*>& buffs,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], vlen, arch.c_str());
+}
+
+inline void run_cast_test2(volk_fn_2arg func,
+ std::vector<void*>& buffs,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], buffs[1], vlen, arch.c_str());
+}
+
+inline void run_cast_test3(volk_fn_3arg func,
+ std::vector<void*>& buffs,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str());
+}
+
+inline void run_cast_test4(volk_fn_4arg func,
+ std::vector<void*>& buffs,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str());
+}
+
+inline void run_cast_test1_s32f(volk_fn_1arg_s32f func,
+ std::vector<void*>& buffs,
+ float scalar,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test2_s32f(volk_fn_2arg_s32f func,
+ std::vector<void*>& buffs,
+ float scalar,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test3_s32f(volk_fn_3arg_s32f func,
+ std::vector<void*>& buffs,
+ float scalar,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test1_s32fc(volk_fn_1arg_s32fc func,
+ std::vector<void*>& buffs,
+ lv_32fc_t scalar,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test2_s32fc(volk_fn_2arg_s32fc func,
+ std::vector<void*>& buffs,
+ lv_32fc_t scalar,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
+}
+
+inline void run_cast_test3_s32fc(volk_fn_3arg_s32fc func,
+ std::vector<void*>& buffs,
+ lv_32fc_t scalar,
+ unsigned int vlen,
+ unsigned int iter,
+ std::string arch)
+{
+ while (iter--)
+ func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
+}
+
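+// Element-wise comparison of two real buffers. In absolute mode the raw difference
+// is checked against tol; otherwise the relative error |in1 - in2| / |in1| is used,
+// with a special near-zero case to sidestep round-off noise.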
+template <class t>
+bool fcompare(t* in1, t* in2, unsigned int vlen, float tol, bool absolute_mode)
+{
+ bool fail = false;
+ int print_max_errs = 10;
+ for (unsigned int i = 0; i < vlen; i++) {
+ if (absolute_mode) {
+ if (fabs(((t*)(in1))[i] - ((t*)(in2))[i]) > tol) {
+ fail = true;
+ if (print_max_errs-- > 0) {
+ std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i])
+ << " in2: " << t(((t*)(in2))[i]);
+ std::cout << " tolerance was: " << tol << std::endl;
+ }
+ }
+ } else {
+ // for very small numbers we'll see round off errors due to limited
+ // precision. So a special test case...
+ if (fabs(((t*)(in1))[i]) < 1e-30) {
+ if (fabs(((t*)(in2))[i]) > tol) {
+ fail = true;
+ if (print_max_errs-- > 0) {
+ std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i])
+ << " in2: " << t(((t*)(in2))[i]);
+ std::cout << " tolerance was: " << tol << std::endl;
+ }
+ }
+ }
+ // the primary test is the percent different greater than given tol
+ else if (fabs(((t*)(in1))[i] - ((t*)(in2))[i]) / fabs(((t*)in1)[i]) > tol) {
+ fail = true;
+ if (print_max_errs-- > 0) {
+ std::cout << "offset " << i << " in1: " << t(((t*)(in1))[i])
+ << " in2: " << t(((t*)(in2))[i]);
+ std::cout << " tolerance was: " << tol << std::endl;
+ }
+ }
+ }
+ }
+
+ return fail;
+}
+
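+// Complex variant: compares interleaved (re, im) pairs using the norm of the
+// difference relative to the magnitude of the reference value; any NaN or Inf in
+// either buffer counts as a failure.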
+template <class t>
+bool ccompare(t* in1, t* in2, unsigned int vlen, float tol, bool absolute_mode)
+{
+ if (absolute_mode) {
+ std::cout << "ccompare does not support absolute mode" << std::endl;
+ return true;
+ }
+ bool fail = false;
+ int print_max_errs = 10;
+ for (unsigned int i = 0; i < 2 * vlen; i += 2) {
+ if (std::isnan(in1[i]) || std::isnan(in1[i + 1]) || std::isnan(in2[i]) ||
+ std::isnan(in2[i + 1]) || std::isinf(in1[i]) || std::isinf(in1[i + 1]) ||
+ std::isinf(in2[i]) || std::isinf(in2[i + 1])) {
+ fail = true;
+ if (print_max_errs-- > 0) {
+ std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + "
+ << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1]
+ << "j";
+ std::cout << " tolerance was: " << tol << std::endl;
+ }
+ }
+ t diff[2] = { in1[i] - in2[i], in1[i + 1] - in2[i + 1] };
+ t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]);
+ t norm = std::sqrt(in1[i] * in1[i] + in1[i + 1] * in1[i + 1]);
+
+ // for very small numbers we'll see round off errors due to limited
+ // precision. So a special test case...
+ if (norm < 1e-30) {
+ if (err > tol) {
+ fail = true;
+ if (print_max_errs-- > 0) {
+ std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + "
+ << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1]
+ << "j";
+ std::cout << " tolerance was: " << tol << std::endl;
+ }
+ }
+ }
+ // the primary test is the percent different greater than given tol
+ else if ((err / norm) > tol) {
+ fail = true;
+ if (print_max_errs-- > 0) {
+ std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + "
+ << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1]
+ << "j";
+ std::cout << " tolerance was: " << tol << std::endl;
+ }
+ }
+ }
+
+ return fail;
+}
+
+template <class t>
+bool icompare(t* in1, t* in2, unsigned int vlen, unsigned int tol, bool absolute_mode)
+{
+ if (absolute_mode) {
+ std::cout << "icompare does not support absolute mode" << std::endl;
+ return true;
+ }
+ bool fail = false;
+ int print_max_errs = 10;
+ for (unsigned int i = 0; i < vlen; i++) {
+ if (((unsigned int)abs(int(((t*)(in1))[i]) - int(((t*)(in2))[i]))) > tol) {
+ fail = true;
+ if (print_max_errs-- > 0) {
+ std::cout << "offset " << i
+ << " in1: " << static_cast<int>(t(((t*)(in1))[i]))
+ << " in2: " << static_cast<int>(t(((t*)(in2))[i]));
+ std::cout << " tolerance was: " << tol << std::endl;
+ }
+ }
+ }
+
+ return fail;
+}
+
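+// Small RAII pool: get_new() hands out zero-initialized, volk_malloc-aligned
+// buffers and the destructor frees them all when the test run ends.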
+class volk_qa_aligned_mem_pool
+{
+public:
+ void* get_new(size_t size)
+ {
+ size_t alignment = volk_get_alignment();
+ void* ptr = volk_malloc(size, alignment);
+ memset(ptr, 0x00, size);
+ _mems.push_back(ptr);
+ return ptr;
+ }
+ ~volk_qa_aligned_mem_pool()
+ {
+ for (unsigned int ii = 0; ii < _mems.size(); ++ii) {
+ volk_free(_mems[ii]);
+ }
+ }
+
+private:
+ std::vector<void*> _mems;
+};
+
+bool run_volk_tests(volk_func_desc_t desc,
+ void (*manual_func)(),
+ std::string name,
+ volk_test_params_t test_params,
+ std::vector<volk_test_results_t>* results,
+ std::string puppet_master_name)
+{
+ return run_volk_tests(desc,
+ manual_func,
+ name,
+ test_params.tol(),
+ test_params.scalar(),
+ test_params.vlen(),
+ test_params.iter(),
+ results,
+ puppet_master_name,
+ test_params.absolute_mode(),
+ test_params.benchmark_mode());
+}
+
+bool run_volk_tests(volk_func_desc_t desc,
+ void (*manual_func)(),
+ std::string name,
+ float tol,
+ lv_32fc_t scalar,
+ unsigned int vlen,
+ unsigned int iter,
+ std::vector<volk_test_results_t>* results,
+ std::string puppet_master_name,
+ bool absolute_mode,
+ bool benchmark_mode)
+{
+ // Initialize this entry in results vector
+ results->push_back(volk_test_results_t());
+ results->back().name = name;
+ results->back().vlen = vlen;
+ results->back().iter = iter;
+ std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")"
+ << std::endl;
+
+ // vlen_twiddle will increase vlen for malloc and data generation
+ // but kernels will still be called with the user provided vlen.
+ // This is useful for causing errors in kernels that do bad reads
+ const unsigned int vlen_twiddle = 5;
+ vlen = vlen + vlen_twiddle;
+
+ const float tol_f = tol;
+ const unsigned int tol_i = static_cast<const unsigned int>(tol);
+
+ // first let's get a list of available architectures for the test
+ std::vector<std::string> arch_list = get_arch_list(desc);
+
+ if ((!benchmark_mode) && (arch_list.size() < 2)) {
+ std::cout << "no architectures to test" << std::endl;
+ return false;
+ }
+
+ // something that can hang onto memory and cleanup when this function exits
+ volk_qa_aligned_mem_pool mem_pool;
+
+ // now we have to get a function signature by parsing the name
+ std::vector<volk_type_t> inputsig, outputsig;
+ try {
+ get_signatures_from_name(inputsig, outputsig, name);
+ } catch (std::exception& error) {
+ std::cerr << "Error: unable to get function signature from kernel name"
+ << std::endl;
+ std::cerr << " - " << name << std::endl;
+ return false;
+ }
+
+ // pull the input scalars into their own vector
+ std::vector<volk_type_t> inputsc;
+ for (size_t i = 0; i < inputsig.size(); i++) {
+ if (inputsig[i].is_scalar) {
+ inputsc.push_back(inputsig[i]);
+ inputsig.erase(inputsig.begin() + i);
+ i -= 1;
+ }
+ }
+ std::vector<void*> inbuffs;
+ for (unsigned int inputsig_index = 0; inputsig_index < inputsig.size();
+ ++inputsig_index) {
+ volk_type_t sig = inputsig[inputsig_index];
+ if (!sig.is_scalar) // we don't make buffers for scalars
+ inbuffs.push_back(
+ mem_pool.get_new(vlen * sig.size * (sig.is_complex ? 2 : 1)));
+ }
+ for (size_t i = 0; i < inbuffs.size(); i++) {
+ load_random_data(inbuffs[i], inputsig[i], vlen);
+ }
+
+ // ok let's make a vector of vector of void buffers, which holds the input/output
+ // vectors for each arch
+ std::vector<std::vector<void*>> test_data;
+ for (size_t i = 0; i < arch_list.size(); i++) {
+ std::vector<void*> arch_buffs;
+ for (size_t j = 0; j < outputsig.size(); j++) {
+ arch_buffs.push_back(mem_pool.get_new(vlen * outputsig[j].size *
+ (outputsig[j].is_complex ? 2 : 1)));
+ }
+ for (size_t j = 0; j < inputsig.size(); j++) {
+ void* arch_inbuff = mem_pool.get_new(vlen * inputsig[j].size *
+ (inputsig[j].is_complex ? 2 : 1));
+ memcpy(arch_inbuff,
+ inbuffs[j],
+ vlen * inputsig[j].size * (inputsig[j].is_complex ? 2 : 1));
+ arch_buffs.push_back(arch_inbuff);
+ }
+ test_data.push_back(arch_buffs);
+ }
+
+ std::vector<volk_type_t> both_sigs;
+ both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end());
+ both_sigs.insert(both_sigs.end(), inputsig.begin(), inputsig.end());
+
+ // now run the test
+ vlen = vlen - vlen_twiddle;
+ std::chrono::time_point<std::chrono::system_clock> start, end;
+ std::vector<double> profile_times;
+ for (size_t i = 0; i < arch_list.size(); i++) {
+ start = std::chrono::system_clock::now();
+
+ switch (both_sigs.size()) {
+ case 1:
+ if (inputsc.size() == 0) {
+ run_cast_test1(
+ (volk_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+ } else if (inputsc.size() == 1 && inputsc[0].is_float) {
+ if (inputsc[0].is_complex) {
+ run_cast_test1_s32fc((volk_fn_1arg_s32fc)(manual_func),
+ test_data[i],
+ scalar,
+ vlen,
+ iter,
+ arch_list[i]);
+ } else {
+ run_cast_test1_s32f((volk_fn_1arg_s32f)(manual_func),
+ test_data[i],
+ scalar.real(),
+ vlen,
+ iter,
+ arch_list[i]);
+ }
+ } else
+ throw "unsupported 1 arg function >1 scalars";
+ break;
+ case 2:
+ if (inputsc.size() == 0) {
+ run_cast_test2(
+ (volk_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+ } else if (inputsc.size() == 1 && inputsc[0].is_float) {
+ if (inputsc[0].is_complex) {
+ run_cast_test2_s32fc((volk_fn_2arg_s32fc)(manual_func),
+ test_data[i],
+ scalar,
+ vlen,
+ iter,
+ arch_list[i]);
+ } else {
+ run_cast_test2_s32f((volk_fn_2arg_s32f)(manual_func),
+ test_data[i],
+ scalar.real(),
+ vlen,
+ iter,
+ arch_list[i]);
+ }
+ } else
+ throw "unsupported 2 arg function >1 scalars";
+ break;
+ case 3:
+ if (inputsc.size() == 0) {
+ run_cast_test3(
+ (volk_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+ } else if (inputsc.size() == 1 && inputsc[0].is_float) {
+ if (inputsc[0].is_complex) {
+ run_cast_test3_s32fc((volk_fn_3arg_s32fc)(manual_func),
+ test_data[i],
+ scalar,
+ vlen,
+ iter,
+ arch_list[i]);
+ } else {
+ run_cast_test3_s32f((volk_fn_3arg_s32f)(manual_func),
+ test_data[i],
+ scalar.real(),
+ vlen,
+ iter,
+ arch_list[i]);
+ }
+ } else
+ throw "unsupported 3 arg function >1 scalars";
+ break;
+ case 4:
+ run_cast_test4(
+ (volk_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
+ break;
+ default:
+ throw "no function handler for this signature";
+ break;
+ }
+
+ end = std::chrono::system_clock::now();
+ std::chrono::duration<double> elapsed_seconds = end - start;
+ double arch_time = 1000.0 * elapsed_seconds.count();
+ std::cout << arch_list[i] << " completed in " << arch_time << " ms" << std::endl;
+ volk_test_time_t result;
+ result.name = arch_list[i];
+ result.time = arch_time;
+ result.units = "ms";
+ result.pass = true;
+ results->back().results[result.name] = result;
+
+ profile_times.push_back(arch_time);
+ }
+
+ // and now compare each output to the generic output
+ // first we have to know which output is the generic one, they aren't in order...
+ size_t generic_offset = 0;
+ for (size_t i = 0; i < arch_list.size(); i++) {
+ if (arch_list[i] == "generic") {
+ generic_offset = i;
+ }
+ }
+
+ // Just in case a kernel wrote to OOB memory, use the twiddled vlen
+ vlen = vlen + vlen_twiddle;
+ bool fail;
+ bool fail_global = false;
+ std::vector<bool> arch_results;
+ for (size_t i = 0; i < arch_list.size(); i++) {
+ fail = false;
+ if (i != generic_offset) {
+ for (size_t j = 0; j < both_sigs.size(); j++) {
+ if (both_sigs[j].is_float) {
+ if (both_sigs[j].size == 8) {
+ if (both_sigs[j].is_complex) {
+ fail = ccompare((double*)test_data[generic_offset][j],
+ (double*)test_data[i][j],
+ vlen,
+ tol_f,
+ absolute_mode);
+ } else {
+ fail = fcompare((double*)test_data[generic_offset][j],
+ (double*)test_data[i][j],
+ vlen,
+ tol_f,
+ absolute_mode);
+ }
+ } else {
+ if (both_sigs[j].is_complex) {
+ fail = ccompare((float*)test_data[generic_offset][j],
+ (float*)test_data[i][j],
+ vlen,
+ tol_f,
+ absolute_mode);
+ } else {
+ fail = fcompare((float*)test_data[generic_offset][j],
+ (float*)test_data[i][j],
+ vlen,
+ tol_f,
+ absolute_mode);
+ }
+ }
+ } else {
+ // i could replace this whole switch statement with a memcmp if i
+ // wasn't interested in printing the outputs where they differ
+ switch (both_sigs[j].size) {
+ case 8:
+ if (both_sigs[j].is_signed) {
+ fail = icompare((int64_t*)test_data[generic_offset][j],
+ (int64_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
+ } else {
+ fail = icompare((uint64_t*)test_data[generic_offset][j],
+ (uint64_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
+ }
+ break;
+ case 4:
+ if (both_sigs[j].is_complex) {
+ if (both_sigs[j].is_signed) {
+ fail = icompare((int16_t*)test_data[generic_offset][j],
+ (int16_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
+ } else {
+ fail = icompare((uint16_t*)test_data[generic_offset][j],
+ (uint16_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
+ }
+ } else {
+ if (both_sigs[j].is_signed) {
+ fail = icompare((int32_t*)test_data[generic_offset][j],
+ (int32_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
+ } else {
+ fail = icompare((uint32_t*)test_data[generic_offset][j],
+ (uint32_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
+ }
+ }
+ break;
+ case 2:
+ if (both_sigs[j].is_signed) {
+ fail = icompare((int16_t*)test_data[generic_offset][j],
+ (int16_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
+ } else {
+ fail = icompare((uint16_t*)test_data[generic_offset][j],
+ (uint16_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
+ }
+ break;
+ case 1:
+ if (both_sigs[j].is_signed) {
+ fail = icompare((int8_t*)test_data[generic_offset][j],
+ (int8_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
+ } else {
+ fail = icompare((uint8_t*)test_data[generic_offset][j],
+ (uint8_t*)test_data[i][j],
+ vlen * (both_sigs[j].is_complex ? 2 : 1),
+ tol_i,
+ absolute_mode);
+ }
+ break;
+ default:
+ fail = 1;
+ }
+ }
+ if (fail) {
+ volk_test_time_t* result = &results->back().results[arch_list[i]];
+ result->pass = false;
+ fail_global = true;
+ std::cout << name << ": fail on arch " << arch_list[i] << std::endl;
+ }
+ }
+ }
+ arch_results.push_back(!fail);
+ }
+
+ double best_time_a = std::numeric_limits<double>::max();
+ double best_time_u = std::numeric_limits<double>::max();
+ std::string best_arch_a = "generic";
+ std::string best_arch_u = "generic";
+ for (size_t i = 0; i < arch_list.size(); i++) {
+ if ((profile_times[i] < best_time_u) && arch_results[i] &&
+ desc.impl_alignment[i] == 0) {
+ best_time_u = profile_times[i];
+ best_arch_u = arch_list[i];
+ }
+ if ((profile_times[i] < best_time_a) && arch_results[i]) {
+ best_time_a = profile_times[i];
+ best_arch_a = arch_list[i];
+ }
+ }
+
+ std::cout << "Best aligned arch: " << best_arch_a << std::endl;
+ std::cout << "Best unaligned arch: " << best_arch_u << std::endl;
+
+ if (puppet_master_name == "NULL") {
+ results->back().config_name = name;
+ } else {
+ results->back().config_name = puppet_master_name;
+ }
+ results->back().best_arch_a = best_arch_a;
+ results->back().best_arch_u = best_arch_u;
+
+ return fail_global;
+}
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2011 - 2020, 2022 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef VOLK_QA_UTILS_H
+#define VOLK_QA_UTILS_H
+
+#include <stdbool.h> // for bool, false
+#include <volk/volk.h> // for volk_func_desc_t
+#include <cstdlib> // for NULL
+#include <map> // for map
+#include <string> // for string, basic_string
+#include <vector> // for vector
+
+#include "volk/volk_complex.h" // for lv_32fc_t
+
+/************************************************
+ * VOLK QA type definitions *
+ ************************************************/
+struct volk_type_t {
+ bool is_float;
+ bool is_scalar;
+ bool is_signed;
+ bool is_complex;
+ int size;
+ std::string str;
+};
+
+class volk_test_time_t
+{
+public:
+ std::string name;
+ double time;
+ std::string units;
+ bool pass;
+};
+
+class volk_test_results_t
+{
+public:
+ std::string name;
+ std::string config_name;
+ unsigned int vlen;
+ unsigned int iter;
+ std::map<std::string, volk_test_time_t> results;
+ std::string best_arch_a;
+ std::string best_arch_u;
+};
+
+class volk_test_params_t
+{
+private:
+ float _tol;
+ lv_32fc_t _scalar;
+ unsigned int _vlen;
+ unsigned int _iter;
+ bool _benchmark_mode;
+ bool _absolute_mode;
+ std::string _kernel_regex;
+
+public:
+ // ctor
+ volk_test_params_t(float tol,
+ lv_32fc_t scalar,
+ unsigned int vlen,
+ unsigned int iter,
+ bool benchmark_mode,
+ std::string kernel_regex)
+ : _tol(tol),
+ _scalar(scalar),
+ _vlen(vlen),
+ _iter(iter),
+ _benchmark_mode(benchmark_mode),
+ _absolute_mode(false),
+ _kernel_regex(kernel_regex){};
+ // setters
+ void set_tol(float tol) { _tol = tol; };
+ void set_scalar(lv_32fc_t scalar) { _scalar = scalar; };
+ void set_vlen(unsigned int vlen) { _vlen = vlen; };
+ void set_iter(unsigned int iter) { _iter = iter; };
+ void set_benchmark(bool benchmark) { _benchmark_mode = benchmark; };
+ void set_regex(std::string regex) { _kernel_regex = regex; };
+ // getters
+ float tol() { return _tol; };
+ lv_32fc_t scalar() { return _scalar; };
+ unsigned int vlen() { return _vlen; };
+ unsigned int iter() { return _iter; };
+ bool benchmark_mode() { return _benchmark_mode; };
+ bool absolute_mode() { return _absolute_mode; };
+ std::string kernel_regex() { return _kernel_regex; };
+ volk_test_params_t make_absolute(float tol)
+ {
+ volk_test_params_t t(*this);
+ t._tol = tol;
+ t._absolute_mode = true;
+ return t;
+ }
+ volk_test_params_t make_tol(float tol)
+ {
+ volk_test_params_t t(*this);
+ t._tol = tol;
+ return t;
+ }
+};
+
+class volk_test_case_t
+{
+private:
+ volk_func_desc_t _desc;
+ void (*_kernel_ptr)();
+ std::string _name;
+ volk_test_params_t _test_parameters;
+ std::string _puppet_master_name;
+
+public:
+ volk_func_desc_t desc() { return _desc; };
+ void (*kernel_ptr())() { return _kernel_ptr; };
+ std::string name() { return _name; };
+ std::string puppet_master_name() { return _puppet_master_name; };
+ volk_test_params_t test_parameters() { return _test_parameters; };
+ // normal ctor
+ volk_test_case_t(volk_func_desc_t desc,
+ void (*t_kernel_ptr)(),
+ std::string name,
+ volk_test_params_t test_parameters)
+ : _desc(desc),
+ _kernel_ptr(t_kernel_ptr),
+ _name(name),
+ _test_parameters(test_parameters),
+ _puppet_master_name("NULL"){};
+ // ctor for puppets
+ volk_test_case_t(volk_func_desc_t desc,
+ void (*t_kernel_ptr)(),
+ std::string name,
+ std::string puppet_master_name,
+ volk_test_params_t test_parameters)
+ : _desc(desc),
+ _kernel_ptr(t_kernel_ptr),
+ _name(name),
+ _test_parameters(test_parameters),
+ _puppet_master_name(puppet_master_name){};
+};
+
+/************************************************
+ * VOLK QA functions *
+ ************************************************/
+volk_type_t volk_type_from_string(std::string);
+
+float uniform(void);
+void random_floats(float* buf, unsigned n);
+
+bool run_volk_tests(volk_func_desc_t,
+ void (*)(),
+ std::string,
+ volk_test_params_t,
+ std::vector<volk_test_results_t>* results = NULL,
+ std::string puppet_master_name = "NULL");
+
+bool run_volk_tests(volk_func_desc_t,
+ void (*)(),
+ std::string,
+ float,
+ lv_32fc_t,
+ unsigned int,
+ unsigned int,
+ std::vector<volk_test_results_t>* results = NULL,
+ std::string puppet_master_name = "NULL",
+ bool absolute_mode = false,
+ bool benchmark_mode = false);
+
+#define VOLK_PROFILE(func, test_params, results) \
+ run_volk_tests(func##_get_func_desc(), \
+ (void (*)())func##_manual, \
+ std::string(#func), \
+ test_params, \
+ results, \
+ "NULL")
+#define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) \
+ run_volk_tests(func##_get_func_desc(), \
+ (void (*)())func##_manual, \
+ std::string(#func), \
+ test_params, \
+ results, \
+ std::string(#puppet_master_func))
+typedef void (*volk_fn_1arg)(void*,
+ unsigned int,
+ const char*); // one input, operate in place
+typedef void (*volk_fn_2arg)(void*, void*, unsigned int, const char*);
+typedef void (*volk_fn_3arg)(void*, void*, void*, unsigned int, const char*);
+typedef void (*volk_fn_4arg)(void*, void*, void*, void*, unsigned int, const char*);
+typedef void (*volk_fn_1arg_s32f)(
+ void*, float, unsigned int, const char*); // one input vector, one scalar float input
+typedef void (*volk_fn_2arg_s32f)(void*, void*, float, unsigned int, const char*);
+typedef void (*volk_fn_3arg_s32f)(void*, void*, void*, float, unsigned int, const char*);
+typedef void (*volk_fn_1arg_s32fc)(
+ void*,
+ lv_32fc_t,
+ unsigned int,
+ const char*); // one input vector, one complex scalar input
+typedef void (*volk_fn_2arg_s32fc)(void*, void*, lv_32fc_t, unsigned int, const char*);
+typedef void (*volk_fn_3arg_s32fc)(
+ void*, void*, void*, lv_32fc_t, unsigned int, const char*);
+
+#endif // VOLK_QA_UTILS_H
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2012-2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#include <stdbool.h> // for bool, false, true
+#include <fstream> // IWYU pragma: keep
+#include <iostream> // for operator<<, basic_ostream, endl, char...
+#include <map> // for map, map<>::iterator, _Rb_tree_iterator
+#include <string> // for string, operator<<
+#include <utility> // for pair
+#include <vector> // for vector
+
+#include "kernel_tests.h" // for init_test_list
+#include "qa_utils.h" // for volk_test_case_t, volk_test_results_t
+#include "volk/volk_complex.h" // for lv_32fc_t
+#include <volk/volk.h>
+
+void print_qa_xml(std::vector<volk_test_results_t> results, unsigned int nfails);
+
+int main(int argc, char* argv[])
+{
+ bool qa_ret_val = false;
+
+ float def_tol = 1e-6;
+ lv_32fc_t def_scalar = 327.0;
+ int def_iter = 1;
+ int def_vlen = 131071;
+ bool def_benchmark_mode = true;
+ std::string def_kernel_regex = "";
+
+ volk_test_params_t test_params(
+ def_tol, def_scalar, def_vlen, def_iter, def_benchmark_mode, def_kernel_regex);
+ std::vector<volk_test_case_t> test_cases = init_test_list(test_params);
+ std::vector<volk_test_results_t> results;
+
+ if (argc > 1) {
+ for (unsigned int ii = 0; ii < test_cases.size(); ++ii) {
+ if (std::string(argv[1]) == test_cases[ii].name()) {
+ volk_test_case_t test_case = test_cases[ii];
+ if (run_volk_tests(test_case.desc(),
+ test_case.kernel_ptr(),
+ test_case.name(),
+ test_case.test_parameters(),
+ &results,
+ test_case.puppet_master_name())) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+ }
+ std::cerr << "Did not run a test for kernel: " << std::string(argv[1]) << " !"
+ << std::endl;
+ return 0;
+
+ } else {
+ std::vector<std::string> qa_failures;
+ // Test every kernel reporting failures when they occur
+ for (unsigned int ii = 0; ii < test_cases.size(); ++ii) {
+ bool qa_result = false;
+ volk_test_case_t test_case = test_cases[ii];
+ try {
+ qa_result = run_volk_tests(test_case.desc(),
+ test_case.kernel_ptr(),
+ test_case.name(),
+ test_case.test_parameters(),
+ &results,
+ test_case.puppet_master_name());
+ } catch (...) {
+ // TODO: what exceptions might we need to catch and how do we handle them?
+ std::cerr << "Exception found on kernel: " << test_case.name()
+ << std::endl;
+ qa_result = false;
+ }
+
+ if (qa_result) {
+ std::cerr << "Failure on " << test_case.name() << std::endl;
+ qa_failures.push_back(test_case.name());
+ }
+ }
+
+ // Generate XML results
+ print_qa_xml(results, qa_failures.size());
+
+ // Summarize QA results
+ std::cerr << "Kernel QA finished: " << qa_failures.size() << " failures out of "
+ << test_cases.size() << " tests." << std::endl;
+ if (qa_failures.size() > 0) {
+ std::cerr << "The following kernels failed QA:" << std::endl;
+ for (unsigned int ii = 0; ii < qa_failures.size(); ++ii) {
+ std::cerr << " " << qa_failures[ii] << std::endl;
+ }
+ qa_ret_val = true;
+ }
+ }
+
+ return qa_ret_val;
+}
+
+/*
+ * This function prints QA results as XML output similar to the output
+ * from JUnit. For reference output see http://llg.cubic.org/docs/junit/
+ */
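+/*
+ * Illustrative shape of the emitted XML; the kernel and arch names below are
+ * placeholders, not actual output:
+ *
+ *   <testsuites name="kernels" tests="N" failures="M" id="1">
+ *     <testsuite name="volk_32f_x2_add_32f">
+ *       <testcase name="generic" classname="volk_32f_x2_add_32f" time="1.23">
+ *       </testcase>
+ *     </testsuite>
+ *   </testsuites>
+ */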
+void print_qa_xml(std::vector<volk_test_results_t> results, unsigned int nfails)
+{
+ std::ofstream qa_file;
+ qa_file.open(".unittest/kernels.xml");
+
+ qa_file << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" << std::endl;
+ qa_file << "<testsuites name=\"kernels\" "
+ << "tests=\"" << results.size() << "\" "
+ << "failures=\"" << nfails << "\" id=\"1\">" << std::endl;
+
+ // Results are in a vector by kernel. Each element has a result
+ // map containing time and arch name with test result
+ for (unsigned int ii = 0; ii < results.size(); ++ii) {
+ volk_test_results_t result = results[ii];
+ qa_file << " <testsuite name=\"" << result.name << "\">" << std::endl;
+
+ std::map<std::string, volk_test_time_t>::iterator kernel_time_pair;
+ for (kernel_time_pair = result.results.begin();
+ kernel_time_pair != result.results.end();
+ ++kernel_time_pair) {
+ volk_test_time_t test_time = kernel_time_pair->second;
+ qa_file << " <testcase name=\"" << test_time.name << "\" "
+ << "classname=\"" << result.name << "\" "
+ << "time=\"" << test_time.time << "\">" << std::endl;
+ if (!test_time.pass)
+ qa_file << " <failure "
+ << "message=\"fail on arch " << test_time.name << "\">"
+ << "</failure>" << std::endl;
+ qa_file << " </testcase>" << std::endl;
+ }
+ qa_file << " </testsuite>" << std::endl;
+ }
+
+
+ qa_file << "</testsuites>" << std::endl;
+ qa_file.close();
+}
--- /dev/null
+/* -*- c -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <volk/volk_malloc.h>
+
+/*
+ * C11 features:
+ * see: https://en.cppreference.com/w/c/memory/aligned_alloc
+ *
+ * MSVC is broken
+ * see:
+ * https://docs.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=vs-2019
+ * This section:
+ * C11 The Universal CRT implemented the parts of the
+ * C11 Standard Library that are required by C++17,
+ * with the exception of C99 strftime() E/O alternative
+ * conversion specifiers, C11 fopen() exclusive mode,
+ * and C11 aligned_alloc(). The latter is unlikely to
+ * be implemented, because C11 specified aligned_alloc()
+ * in a way that's incompatible with the Microsoft
+ * implementation of free():
+ * namely, that free() must be able to handle highly aligned allocations.
+ *
+ * We must work around this problem because MSVC is non-compliant!
+ */
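+
+/*
+ * Typical usage sketch (buffer length and element type are illustrative only):
+ *
+ *   float* buf = (float*)volk_malloc(1024 * sizeof(float), volk_get_alignment());
+ *   // ... hand buf to VOLK kernels ...
+ *   volk_free(buf);
+ */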
+
+
+void* volk_malloc(size_t size, size_t alignment)
+{
+ if ((size == 0) || (alignment == 0)) {
+ fprintf(stderr, "VOLK: Error allocating memory: either size or alignment is 0\n");
+ return NULL;
+ }
+ // Tweak size to satisfy ASAN (the GCC address sanitizer).
+ // Calling 'volk_malloc' might therefore result in the allocation of more memory
+ // than requested for correct alignment. Any allocation size change here will in
+ // general not impact the end result, since initial size alignment is required
+ // either way.
+ if (size % alignment) {
+ size += alignment - (size % alignment);
+ }
+#if HAVE_POSIX_MEMALIGN
+ // quoting posix_memalign() man page:
+ // "alignment must be a power of two and a multiple of sizeof(void *)"
+ // volk_get_alignment() could return 1 for some machines (e.g. generic_orc)
+ if (alignment == 1) {
+ return malloc(size);
+ }
+ void* ptr;
+ int err = posix_memalign(&ptr, alignment, size);
+ if (err != 0) {
+ ptr = NULL;
+ fprintf(stderr,
+ "VOLK: Error allocating memory "
+ "(posix_memalign: error %d: %s)\n",
+ err,
+ strerror(err));
+ }
+#elif defined(_MSC_VER) || defined(__MINGW32__)
+ void* ptr = _aligned_malloc(size, alignment);
+#else
+ void* ptr = aligned_alloc(alignment, size);
+#endif
+ if (ptr == NULL) {
+ fprintf(stderr,
+ "VOLK: Error allocating memory (aligned_alloc/_aligned_malloc)\n");
+ }
+ return ptr;
+}
+
+void volk_free(void* ptr)
+{
+#if defined(_MSC_VER) || defined(__MINGW32__)
+ _aligned_free(ptr);
+#else
+ free(ptr);
+#endif
+}
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2011, 2012, 2015, 2016, 2019, 2020 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#if defined(_MSC_VER)
+#include <io.h>
+#define access _access
+#define F_OK 0
+#else
+#include <unistd.h>
+#endif
+#include <volk/volk_prefs.h>
+
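+// Search order when 'read' is true (the first existing file wins):
+//   $VOLK_CONFIGPATH/volk/volk_config
+//   $HOME/.volk/volk_config
+//   %APPDATA%/.volk/volk_config
+//   /etc/volk/volk_config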
+void volk_get_config_path(char* path, bool read)
+{
+ if (!path)
+ return;
+ const char* suffix = "/.volk/volk_config";
+ const char* suffix2 = "/volk/volk_config"; // non-hidden
+ char* home = NULL;
+
+ // allows config redirection via env variable
+ home = getenv("VOLK_CONFIGPATH");
+ if (home != NULL) {
+ strncpy(path, home, 512);
+ strcat(path, suffix2);
+ if (!read || access(path, F_OK) != -1) {
+ return;
+ }
+ }
+
+ // check for user-local config file
+ home = getenv("HOME");
+ if (home != NULL) {
+ strncpy(path, home, 512);
+ strcat(path, suffix);
+ if (!read || (access(path, F_OK) != -1)) {
+ return;
+ }
+ }
+
+ // check for config file in APPDATA (Windows)
+ home = getenv("APPDATA");
+ if (home != NULL) {
+ strncpy(path, home, 512);
+ strcat(path, suffix);
+ if (!read || (access(path, F_OK) != -1)) {
+ return;
+ }
+ }
+
+ // check for system-wide config file
+ if (access("/etc/volk/volk_config", F_OK) != -1) {
+ strncpy(path, "/etc", 512);
+ strcat(path, suffix2);
+ if (!read || (access(path, F_OK) != -1)) {
+ return;
+ }
+ }
+
+ // If no config path was found, set path to the empty string and return
+ path[0] = 0;
+ return;
+}
+
+size_t volk_load_preferences(volk_arch_pref_t** prefs_res)
+{
+ FILE* config_file;
+ char path[512], line[512];
+ size_t n_arch_prefs = 0;
+ volk_arch_pref_t* prefs = NULL;
+
+ // get the config path
+ volk_get_config_path(path, true);
+ if (!path[0])
+ return n_arch_prefs; // no prefs found
+ config_file = fopen(path, "r");
+ if (!config_file)
+ return n_arch_prefs; // no prefs found
+
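+ // A preference line has the form "<kernel name> <aligned impl> <unaligned impl>",
+ // e.g. (implementation names here are hypothetical): volk_32f_x2_add_32f a_avx u_avx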
+ // read the config file line by line and store each kernel preference
+ while (fgets(line, sizeof(line), config_file) != NULL) {
+ void* new_prefs = realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs));
+ if (!new_prefs) {
+ printf("volk_load_preferences: bad malloc\n");
+ break;
+ }
+ prefs = (volk_arch_pref_t*)new_prefs;
+ volk_arch_pref_t* p = prefs + n_arch_prefs;
+ if (sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 &&
+ !strncmp(p->name, "volk_", 5)) {
+ n_arch_prefs++;
+ }
+ }
+ fclose(config_file);
+ *prefs_res = prefs;
+ return n_arch_prefs;
+}
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <volk/volk_prefs.h>
+#include <volk_rank_archs.h>
+
+int volk_get_index(const char* impl_names[], // list of implementations by name
+ const size_t n_impls, // number of implementations available
+ const char* impl_name // the implementation name to find
+)
+{
+ unsigned int i;
+ for (i = 0; i < n_impls; i++) {
+ if (!strncmp(impl_names[i], impl_name, 20)) {
+ return i;
+ }
+ }
+ // TODO return -1;
+ // something terrible should happen here
+ fprintf(stderr, "Volk warning: no arch found, returning generic impl\n");
+ return volk_get_index(impl_names, n_impls, "generic"); // but we'll fake it for now
+}
+
+int volk_rank_archs(const char* kern_name, // name of the kernel to rank
+ const char* impl_names[], // list of implementations by name
+ const int* impl_deps, // requirement mask per implementation
+ const bool* alignment, // alignment status of each implementation
+ size_t n_impls, // number of implementations available
+ const bool align // if false, filter aligned implementations
+)
+{
+ size_t i;
+ static volk_arch_pref_t* volk_arch_prefs;
+ static size_t n_arch_prefs = 0;
+ static int prefs_loaded = 0;
+ if (!prefs_loaded) {
+ n_arch_prefs = volk_load_preferences(&volk_arch_prefs);
+ prefs_loaded = 1;
+ }
+
+ // If we've defined VOLK_GENERIC to be anything, always return the
+ // 'generic' kernel. Used in GR's QA code.
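+ // (e.g. run the application with the environment variable VOLK_GENERIC=1 set)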
+ char* gen_env = getenv("VOLK_GENERIC");
+ if (gen_env) {
+ return volk_get_index(impl_names, n_impls, "generic");
+ }
+
+ // now look for the function name in the prefs list
+ for (i = 0; i < n_arch_prefs; i++) {
+ if (!strncmp(kern_name,
+ volk_arch_prefs[i].name,
+ sizeof(volk_arch_prefs[i].name))) // found it
+ {
+ const char* impl_name =
+ align ? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u;
+ return volk_get_index(impl_names, n_impls, impl_name);
+ }
+ }
+
+ // return the best index with the largest deps
+ size_t best_index_a = 0;
+ size_t best_index_u = 0;
+ int best_value_a = -1;
+ int best_value_u = -1;
+ for (i = 0; i < n_impls; i++) {
+ const signed val = impl_deps[i];
+ if (alignment[i] && val > best_value_a) {
+ best_index_a = i;
+ best_value_a = val;
+ }
+ if (!alignment[i] && val > best_value_u) {
+ best_index_u = i;
+ best_value_u = val;
+ }
+ }
+
+ // when align and we found a best aligned, use it
+ if (align && best_value_a != -1)
+ return best_index_a;
+
+ // otherwise return the best unaligned
+ return best_index_u;
+}
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_VOLK_RANK_ARCHS_H
+#define INCLUDED_VOLK_RANK_ARCHS_H
+
+#include <stdbool.h>
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int volk_get_index(const char* impl_names[], // list of implementations by name
+ const size_t n_impls, // number of implementations available
+ const char* impl_name // the implementation name to find
+);
+
+int volk_rank_archs(const char* kern_name, // name of the kernel to rank
+ const char* impl_names[], // list of implementations by name
+ const int* impl_deps, // requirement mask per implementation
+ const bool* alignment, // alignment status of each implementation
+ size_t n_impls, // number of implementations available
+ const bool align // if false, filter aligned implementations
+);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /*INCLUDED_VOLK_RANK_ARCHS_H*/
--- /dev/null
+#
+# Copyright 2013, 2014 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+
+########################################################################
+# Install python files and apps
+########################################################################
+include(VolkPython)
+
+VOLK_PYTHON_INSTALL(
+ FILES
+ __init__.py
+ cfg.py
+ volk_modtool_generate.py
+ DESTINATION ${VOLK_PYTHON_DIR}/volk_modtool
+ COMPONENT "volk"
+)
+
+VOLK_PYTHON_INSTALL(
+ PROGRAMS
+ volk_modtool
+ DESTINATION ${VOLK_RUNTIME_DIR}
+ COMPONENT "volk"
+)
--- /dev/null
+The volk_modtool tool is installed along with VOLK as a way of helping
+to construct, add to, and interrogate the VOLK library or companion
+libraries.
+
+volk_modtool is installed into $prefix/bin.
+
+VOLK modtool enables creating standalone (out-of-tree) VOLK modules
+and provides a few tools for sharing VOLK kernels between VOLK
+modules. If you need to design or work with VOLK kernels away from
+the canonical VOLK library, this is the tool. If you need to tailor
+your own VOLK library for whatever reason, this is the tool.
+
+The canonical VOLK library installs a volk.h and a libvolk.so. Your
+own library will install volk_$name.h and libvolk_$name.so. Ya Gronk?
+Good.
+
+There isn't a substantial difference between the canonical VOLK
+module and any other VOLK module. They're all peers. Any module
+created via VOLK modtool will come complete with a default
+volk_modtool.cfg file associating the module with the base from which
+it came, its distinctive $name and its destination (or path). These
+values (created from user input if VOLK modtool runs without a
+user-supplied config file or a default config file) serve as default
+values for some VOLK modtool actions. It's more or less intended for
+the user to change directories to the top level of a created VOLK
+module and then run volk_modtool to take advantage of the values
+stored in the default volk_modtool.cfg file.
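+
+For reference, volk_modtool.cfg is a small INI-style file with a single
+[config] section holding those three values. A hypothetical example (the
+values below are placeholders):
+
+ [config]
+ name = foo
+ destination = /home/user/src
+ base = /home/user/src/volk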
+
+Apart from creating new VOLK modules, VOLK modtool allows you to list
+the names of kernels in other modules, list the names of kernels in
+the current module, add kernels from another module into the current
+module, and remove kernels from the current module. When moving
+kernels between modules, VOLK modtool does its best to keep the qa
+and profiling code for those kernels intact. If the base has a test
+or a profiling call for some kernel, those calls will follow the
+kernel when VOLK modtool adds that kernel. If QA or profiling
+requires a puppet kernel, the puppet kernel will follow the original
+kernel when VOLK modtool adds that original kernel. VOLK modtool
+respects puppets.
+
+======================================================================
+
+Installing a new VOLK Library:
+
+Run the command "volk_modtool -i". This will ask you three questions:
+
+ name: // the name to give your VOLK library: volk_<name>
+ destination: // directory the new source tree is built under -- must exist.
+ // It will create <directory>/volk_<name>
+ base: // the directory containing the original VOLK source code
+
+The name provided must be alphanumeric (and cannot start with a
+number). No special characters, including dashes and underscores, are
+allowed.
+
+This will build a new skeleton directory in the destination provided
+with the name volk_<name>. It will contain the necessary structure to
+build:
+
+ mkdir build
+ cd build
+ cmake -DCMAKE_INSTALL_PREFIX=/opt/volk ../
+ make
+ sudo make install
+
+Right now, the library is empty and contains no kernels. Kernels can
+be added from another VOLK library using the '-a' option. If no other
+library is specified, the kernel is extracted from the base VOLK
+directory. Using the '-b' option allows you to specify another VOLK
+library to use for this purpose.
+
+ volk_modtool -a -n 32fc_x2_conjugate_dot_prod_32fc
+
+This will put the code for the new kernel into
+<destination>/volk_<name>/kernels/volk_<name>/
+
+Other kernels must be added by hand. See the following webpages for
+more information about creating VOLK kernels:
+ https://www.gnuradio.org/doc/doxygen/volk_guide.html
+ https://wiki.gnuradio.org/index.php/Volk
+
+
+======================================================================
+
+OPTIONS
+
+Options for Adding and Removing Kernels:
+ -a, --add_kernel
+ Add kernel from existing VOLK module. Uses the base VOLK module
+ unless -b is used. Use -n to specify the kernel name.
+ Requires: -n.
+ Optional: -b
+
+ -A, --add_all_kernels
+ Add all kernels from existing VOLK module. Uses the base VOLK
+ module unless -b is used.
+ Optional: -b
+
+ -x, --remove_kernel
+ Remove kernel from module.
+ Required: -n.
+ Optional: -b
+
+Options for Listing Kernels:
+ -l, --list
+ Lists all kernels available in the base VOLK module.
+
+ -k, --kernels
+ Lists all kernels in this VOLK module.
+
+ -r, --remote_list
+ Lists all kernels in another VOLK module that is specified
+ using the -b option.
--- /dev/null
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Copyright 2013, 2014 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+from .cfg import volk_modtool_config
+from .volk_modtool_generate import volk_modtool
--- /dev/null
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Copyright 2013, 2014 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+import configparser
+import sys
+import os
+import re
+
+
+class volk_modtool_config(object):
+ def key_val_sub(self, num, stuff, section):
+ return re.sub(r'\$' + 'k' + str(num), stuff[num][0], (re.sub(r'\$' + str(num), stuff[num][1], section[1][num])));
+
+ def verify(self):
+ for i in self.verification:
+ self.verify_section(i)
+ def remap(self):
+ for i in self.remapification:
+ self.verify_section(i)
+
+ def verify_section(self, section):
+ stuff = self.cfg.items(section[0])
+ for i in range(len(section[1])):
+ try:
+ val = eval(self.key_val_sub(i, stuff, section))
+ if val == False:
+ raise ValueError
+ except ValueError:
+ raise ValueError('Verification function returns False... key:%s, val:%s'%(stuff[i][0], stuff[i][1]))
+ except:
+ raise IOError('bad configuration... key:%s, val:%s'%(stuff[i][0], stuff[i][1]))
+
+
+ def __init__(self, cfg=None):
+ self.config_name = 'config'
+ self.config_defaults = ['name', 'destination', 'base']
+ self.config_defaults_remap = ['1',
+ 'self.cfg.set(self.config_name, \'$k1\', os.path.realpath(os.path.expanduser(\'$1\')))',
+ 'self.cfg.set(self.config_name, \'$k2\', os.path.realpath(os.path.expanduser(\'$2\')))']
+
+ self.config_defaults_verify = ['re.match(\'[a-zA-Z0-9]+$\', \'$0\')',
+ 'os.path.exists(\'$1\')',
+ 'os.path.exists(\'$2\')']
+ self.remapification = [(self.config_name, self.config_defaults_remap)]
+ self.verification = [(self.config_name, self.config_defaults_verify)]
+ default = os.path.join(os.getcwd(), 'volk_modtool.cfg')
+ icfg = configparser.RawConfigParser()
+ if cfg:
+ icfg.read(cfg)
+ elif os.path.exists(default):
+ icfg.read(default)
+ else:
+ print("Initializing config file...")
+ icfg.add_section(self.config_name)
+ for kn in self.config_defaults:
+ rv = input("%s: "%(kn))
+ icfg.set(self.config_name, kn, rv)
+ self.cfg = icfg
+ self.remap()
+ self.verify()
+
+
+
+ def read_map(self, name, inp):
+ if self.cfg.has_section(name):
+ self.cfg.remove_section(name)
+ self.cfg.add_section(name)
+ for i in inp:
+ self.cfg.set(name, i, inp[i])
+
+ def get_map(self, name):
+ retval = {}
+ stuff = self.cfg.items(name)
+ for i in stuff:
+ retval[i[0]] = i[1]
+ return retval
--- /dev/null
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Copyright 2013, 2014 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+
+from volk_modtool import volk_modtool, volk_modtool_config
+from optparse import OptionParser, OptionGroup
+
+import os
+import sys
+
+if __name__ == '__main__':
+ parser = OptionParser();
+ actions = OptionGroup(parser, 'Actions');
+ actions.add_option('-i', '--install', action='store_true',
+ help='Create a new volk module.')
+ parser.add_option('-b', '--base_path', action='store', default=None,
+ help='Base path for action. By default, volk_modtool.cfg loads this value.')
+ parser.add_option('-n', '--kernel_name', action='store', default=None,
+ help='Kernel name for action. No default')
+ parser.add_option('-c', '--config', action='store', dest='config_file', default=None,
+ help='Config file for volk_modtool. By default, volk_modtool.cfg in the local directory will be used/created.')
+ actions.add_option('-a', '--add_kernel', action='store_true',
+ help='Add kernel from existing volk module. Requires: -n. Optional: -b')
+ actions.add_option('-A', '--add_all_kernels', action='store_true',
+ help='Add all kernels from existing volk module. Optional: -b')
+ actions.add_option('-x', '--remove_kernel', action='store_true',
+ help='Remove kernel from module. Required: -n. Optional: -b')
+ actions.add_option('-l', '--list', action='store_true',
+ help='List all kernels in the base.')
+ actions.add_option('-k', '--kernels', action='store_true',
+ help='List all kernels in the module.')
+ actions.add_option('-r', '--remote_list', action='store_true',
+ help='List all available kernels in remote volk module. Requires: -b.')
+ actions.add_option('-m', '--moo', action='store_true',
+ help='Have you mooed today?')
+ parser.add_option_group(actions)
+
+ (options, args) = parser.parse_args();
+ if len(sys.argv) < 2:
+ parser.print_help()
+
+ elif options.moo:
+ print(" (__) ")
+ print(" (oo) ")
+ print(" /------\/ ")
+ print(" / | || ")
+ print(" * /\---/\ ")
+ print(" ~~ ~~ ")
+
+ else:
+ my_cfg = volk_modtool_config(options.config_file);
+
+ my_modtool = volk_modtool(my_cfg.get_map(my_cfg.config_name));
+
+
+ if options.install:
+ my_modtool.make_module_skeleton();
+ my_modtool.write_default_cfg(my_cfg.cfg);
+
+
+ if options.add_kernel:
+ if not options.kernel_name:
+ raise IOError("This action requires the -n option.");
+ else:
+ name = options.kernel_name;
+ if options.base_path:
+ base = options.base_path;
+ else:
+ base = my_cfg.cfg.get(my_cfg.config_name, 'base');
+ my_modtool.import_kernel(name, base);
+
+ if options.remove_kernel:
+ if not options.kernel_name:
+ raise IOError("This action requires the -n option.");
+ else:
+ name = options.kernel_name;
+ my_modtool.remove_kernel(name);
+
+ if options.add_all_kernels:
+
+ if options.base_path:
+ base = options.base_path;
+ else:
+ base = my_cfg.cfg.get(my_cfg.config_name, 'base');
+ kernelset = my_modtool.get_current_kernels(base);
+ for i in kernelset:
+ my_modtool.import_kernel(i, base);
+
+ if options.remote_list:
+ if not options.base_path:
+ raise IOError("This action requires the -b option. Try -l or -k for listing kernels in the base or the module.")
+ else:
+ base = options.base_path;
+ kernelset = my_modtool.get_current_kernels(base);
+ for i in kernelset:
+ print(i);
+
+ if options.list:
+ kernelset = my_modtool.get_current_kernels();
+ for i in kernelset:
+ print(i);
+
+ if options.kernels:
+ dest = my_cfg.cfg.get(my_cfg.config_name, 'destination');
+ name = my_cfg.cfg.get(my_cfg.config_name, 'name');
+ base = os.path.join(dest, 'volk_' + name);
+ kernelset = my_modtool.get_current_kernels(base);
+ for i in kernelset:
+ print(i);
--- /dev/null
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Copyright 2013, 2014 Free Software Foundation, Inc.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+import os
+import re
+import glob
+
+
+class volk_modtool(object):
+ def __init__(self, cfg):
+ self.volk = re.compile('volk')
+ self.remove_after_underscore = re.compile("_.*")
+ self.volk_included = re.compile('INCLUDED_VOLK')
+ self.volk_profile = re.compile(r'^\s*(VOLK_PROFILE|VOLK_PUPPET_PROFILE).*\n', re.MULTILINE)
+ self.volk_kernel_tests = re.compile(r'^\s*\((VOLK_INIT_TEST|VOLK_INIT_PUPP).*\n', re.MULTILINE)
+ self.volk_null_kernel = re.compile(r'^\s*;\n', re.MULTILINE)
+ self.my_dict = cfg
+ self.lastline = re.compile(r'\s*char path\[1024\];.*')
+ self.badassert = re.compile(r'^\s*assert\(toked\[0\] == "volk_.*\n', re.MULTILINE)
+ self.goodassert = ' assert(toked[0] == "volk");\n'
+ self.baderase = re.compile(r'^\s*toked.erase\(toked.begin\(\)\);.*\n', re.MULTILINE)
+ self.gooderase = ' toked.erase(toked.begin());\n toked.erase(toked.begin());\n'
+
+ def get_basename(self, base=None):
+ if not base:
+ base = self.my_dict['base']
+ candidate = base.split('/')[-1]
+ if len(candidate.split('_')) == 1:
+ return ''
+ else:
+ return candidate.split('_')[-1]
+
+ def get_current_kernels(self, base=None):
+ if not base:
+ base = self.my_dict['base']
+ name = self.get_basename()
+ else:
+ name = self.get_basename(base)
+ if name == '':
+ hdr_files = sorted(glob.glob(os.path.join(base, "kernels/volk/*.h")))
+ begins = re.compile("(?<=volk_).*")
+ else:
+ hdr_files = sorted(glob.glob(os.path.join(base, "kernels/volk_" + name + "/*.h")))
+ begins = re.compile("(?<=volk_" + name + "_).*")
+
+ datatypes = []
+ functions = []
+
+ for line in hdr_files:
+
+ subline = re.search(r".*\.h.*", os.path.basename(line))
+ if subline:
+ subsubline = begins.search(subline.group(0))
+ if subsubline:
+ dtype = self.remove_after_underscore.sub("", subsubline.group(0))
+ subdtype = re.search("[0-9]+[A-z]+", dtype)
+ if subdtype:
+ datatypes.append(subdtype.group(0))
+
+ datatypes = set(datatypes)
+
+ for line in hdr_files:
+ for dt in datatypes:
+ if dt in line:
+ subline = re.search(begins.pattern[:-2] + dt + r".*(?=\.h)", line)
+ if subline:
+ functions.append(subline.group(0))
+
+ return set(functions)
+
+ def make_module_skeleton(self):
+ dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'])
+ if os.path.exists(dest):
+ raise IOError("Destination %s already exits!" % dest)
+
+ if not os.path.exists(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'kernels/volk_' + self.my_dict['name'])):
+ os.makedirs(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'kernels/volk_' + self.my_dict['name']))
+
+ current_kernel_names = self.get_current_kernels()
+ need_ifdef_updates = ["constant.h", "volk_complex.h", "volk_malloc.h", "volk_prefs.h",
+ "volk_common.h", "volk_cpu.tmpl.h", "volk_config_fixed.tmpl.h",
+ "volk_typedefs.h", "volk.tmpl.h"]
+ for root, dirnames, filenames in os.walk(self.my_dict['base']):
+ for name in filenames:
+ t_table = [re.search(a, name) for a in current_kernel_names]
+ t_table = set(t_table)
+ if (t_table == set([None])) or (name == "volk_32f_null_32f.h"):
+ infile = os.path.join(root, name)
+ instring = open(infile, 'r').read()
+ outstring = re.sub(self.volk, 'volk_' + self.my_dict['name'], instring)
+ # Update the header ifdef guards only where needed
+ if name in need_ifdef_updates:
+ outstring = re.sub(self.volk_included, 'INCLUDED_VOLK_' + self.my_dict['name'].upper(), outstring)
+ newname = re.sub(self.volk, 'volk_' + self.my_dict['name'], name)
+ if name == 'VolkConfig.cmake.in':
+ outstring = re.sub("VOLK", 'VOLK_' + self.my_dict['name'].upper(), outstring)
+ newname = "Volk%sConfig.cmake.in" % self.my_dict['name']
+
+ relpath = os.path.relpath(infile, self.my_dict['base'])
+ newrelpath = re.sub(self.volk, 'volk_' + self.my_dict['name'], relpath)
+ dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], os.path.dirname(newrelpath), newname)
+
+ if not os.path.exists(os.path.dirname(dest)):
+ os.makedirs(os.path.dirname(dest))
+ open(dest, 'w+').write(outstring)
+
+ infile = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'lib/kernel_tests.h')
+ instring = open(infile, 'r').read()
+ outstring = re.sub(self.volk_kernel_tests, '', instring)
+ outstring = re.sub(self.volk_null_kernel,' (VOLK_INIT_TEST(volk_' + self.my_dict['name'] + '_32f_null_32f, test_params))\n ;', outstring)
+ open(infile, 'w+').write(outstring)
+
+ infile = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'lib/qa_utils.cc')
+ instring = open(infile, 'r').read()
+ outstring = re.sub(self.badassert, self.goodassert, instring)
+ outstring = re.sub(self.baderase, self.gooderase, outstring)
+ open(infile, 'w+').write(outstring)
+
+ def write_default_cfg(self, cfg):
+ outfile = open(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'volk_modtool.cfg'), 'w')
+ cfg.write(outfile)
+ outfile.close()
+
+ def convert_kernel(self, oldvolk, name, base, inpath, top):
+ infile = os.path.join(inpath, 'kernels/' + top[:-1] + '/' + top + name + '.h')
+ instring = open(infile, 'r').read()
+ outstring = re.sub(oldvolk, 'volk_' + self.my_dict['name'], instring)
+ newname = 'volk_' + self.my_dict['name'] + '_' + name + '.h'
+ relpath = os.path.relpath(infile, base)
+ newrelpath = re.sub(oldvolk, 'volk_' + self.my_dict['name'], relpath)
+ dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], os.path.dirname(newrelpath), newname)
+
+ if not os.path.exists(os.path.dirname(dest)):
+ os.makedirs(os.path.dirname(dest))
+ open(dest, 'w+').write(outstring)
+
+ # copy orc proto-kernels if they exist
+ for orcfile in sorted(glob.glob(inpath + '/kernels/volk/asm/orc/' + top + name + '*.orc')):
+ if os.path.isfile(orcfile):
+ instring = open(orcfile, 'r').read()
+ outstring = re.sub(oldvolk, 'volk_' + self.my_dict['name'], instring)
+ newname = 'volk_' + self.my_dict['name'] + '_' + name + '.orc'
+ relpath = os.path.relpath(orcfile, base)
+ newrelpath = re.sub(oldvolk, 'volk_' + self.my_dict['name'], relpath)
+ dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], os.path.dirname(newrelpath), newname)
+ if not os.path.exists(os.path.dirname(dest)):
+ os.makedirs(os.path.dirname(dest))
+ open(dest, 'w+').write(outstring)
+
+ def remove_kernel(self, name):
+ basename = self.my_dict['name']
+ if len(basename) > 0:
+ top = 'volk_' + basename + '_'
+ else:
+ top = 'volk_'
+ base = os.path.join(self.my_dict['destination'], top[:-1])
+
+ if not name in self.get_current_kernels():
+ raise IOError("Requested kernel %s is not in module %s" % (name, base))
+
+ inpath = os.path.abspath(base)
+ kernel = re.compile(name)
+ search_kernels = set([kernel])
+ profile = re.compile(r'^\s*VOLK_PROFILE')
+ puppet = re.compile(r'^\s*VOLK_PUPPET')
+ src_dest = os.path.join(inpath, 'apps/', top[:-1] + '_profile.cc')
+ infile = open(src_dest)
+ otherlines = infile.readlines()
+ open(src_dest, 'w+').write('')
+
+ for otherline in otherlines:
+ write_okay = True
+ if kernel.search(otherline):
+ write_okay = False
+ if puppet.match(otherline):
+ args = re.search("(?<=VOLK_PUPPET_PROFILE).*", otherline)
+ m_func = args.group(0).split(',')[0]
+ func = re.search('(?<=' + top + ').*', m_func)
+ search_kernels.add(re.compile(func.group(0)))
+ if write_okay:
+ open(src_dest, 'a').write(otherline)
+
+ src_dest = os.path.join(inpath, 'lib/testqa.cc')
+ infile = open(src_dest)
+ otherlines = infile.readlines()
+ open(src_dest, 'w+').write('')
+
+ for otherline in otherlines:
+ write_okay = True
+
+ for kernel in search_kernels:
+ if kernel.search(otherline):
+ write_okay = False
+
+ if write_okay:
+ open(src_dest, 'a').write(otherline)
+
+ for kernel in search_kernels:
+ infile = os.path.join(inpath, 'kernels/' + top[:-1] + '/' + top + kernel.pattern + '.h')
+ print("Removing kernel %s" % kernel.pattern)
+ if os.path.exists(infile):
+ os.remove(infile)
+ # remove the orc proto-kernels if they exist. There are no puppets here
+ # so just need to glob for files matching kernel name
+ print(glob.glob(inpath + '/kernels/volk/asm/orc/' + top + name + '*.orc'))
+ for orcfile in glob.glob(inpath + '/kernels/volk/asm/orc/' + top + name + '*.orc'):
+ print(orcfile)
+ if os.path.exists(orcfile):
+ os.remove(orcfile)
+
+ def import_kernel(self, name, base):
+ if not base:
+ base = self.my_dict['base']
+ basename = self.get_basename()
+ else:
+ basename = self.get_basename(base)
+ if not name in self.get_current_kernels(base):
+ raise IOError("Requested kernel %s is not in module %s" % (name, base))
+
+ inpath = os.path.abspath(base)
+ if len(basename) > 0:
+ top = 'volk_' + basename + '_'
+ else:
+ top = 'volk_'
+ oldvolk = re.compile(top[:-1])
+
+ self.convert_kernel(oldvolk, name, base, inpath, top)
+
+ kernel = re.compile(name)
+ search_kernels = set([kernel])
+
+ profile = re.compile(r'^\s*VOLK_PROFILE')
+ puppet = re.compile(r'^\s*VOLK_PUPPET')
+ infile = open(os.path.join(inpath, 'apps/', oldvolk.pattern + '_profile.cc'))
+ otherinfile = open(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'apps/volk_' + self.my_dict['name'] + '_profile.cc'))
+ dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'apps/volk_' + self.my_dict['name'] + '_profile.cc')
+ lines = infile.readlines()
+ otherlines = otherinfile.readlines()
+ open(dest, 'w+').write('')
+ insert = False
+ inserted = False
+ for otherline in otherlines:
+
+ if self.lastline.match(otherline):
+ insert = True
+ if insert and not inserted:
+ inserted = True
+ for line in lines:
+ if kernel.search(line):
+ if profile.match(line):
+ outline = re.sub(oldvolk, 'volk_' + self.my_dict['name'], line)
+ open(dest, 'a').write(outline)
+ elif puppet.match(line):
+ outline = re.sub(oldvolk, 'volk_' + self.my_dict['name'], line)
+ open(dest, 'a').write(outline)
+ args = re.search("(?<=VOLK_PUPPET_PROFILE).*", line)
+ m_func = args.group(0).split(',')[0]
+ func = re.search('(?<=' + top + ').*', m_func)
+ search_kernels.add(re.compile(func.group(0)))
+ self.convert_kernel(oldvolk, func.group(0), base, inpath, top)
+ write_okay = True
+ for kernel in search_kernels:
+ if kernel.search(otherline):
+ write_okay = False
+ if write_okay:
+ open(dest, 'a').write(otherline)
+
+ for kernel in search_kernels:
+ print("Adding kernel %s from module %s" % (kernel.pattern, base))
+
+ infile = open(os.path.join(inpath, 'lib/testqa.cc'))
+ otherinfile = open(os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'lib/testqa.cc'))
+ dest = os.path.join(self.my_dict['destination'], 'volk_' + self.my_dict['name'], 'lib/testqa.cc')
+ lines = infile.readlines()
+ otherlines = otherinfile.readlines()
+ open(dest, 'w+').write('')
+ inserted = False
+ insert = False
+ for otherline in otherlines:
+ if re.match(r'\s*', otherline) is None or re.match(r'\s*#.*', otherline) is None:
+ insert = True
+ if insert and not inserted:
+ inserted = True
+ for line in lines:
+ for kernel in search_kernels:
+ if kernel.search(line):
+ if self.volk_run_tests.match(line):
+ outline = re.sub(oldvolk, 'volk_' + self.my_dict['name'], line)
+ open(dest, 'a').write(outline)
+ write_okay = True
+ for kernel in search_kernels:
+ if kernel.search(otherline):
+ write_okay = False
+ if write_okay:
+ open(dest, 'a').write(otherline)
--- /dev/null
+#!/bin/bash
+#
+# Copyright 2019, 2020 Christoph Mayer
+#
+# This script is part of VOLK.
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+
+set -e
+set -x
+
+function test_sde
+{
+ if ! [ -f ${SDE} ]; then
+ echo "1"
+ else
+ ${SDE} -- ls > /dev/null
+ echo $?
+ fi
+}
+
+mkdir -p cache
+cd cache
+
+[ -z "${SDE_VERSION}" ] && SDE_VERSION=sde-external-8.50.0-2020-03-26-lin
+[ -z "${SDE_URL}" ] && SDE_URL=http://software.intel.com/content/dam/develop/external/us/en/protected/
+[ -z "${SDE}" ] && SDE=${SDE_VERSION}/sde64
+
+
+if [ _$(test_sde) == _0 ]; then
+ MSG="found working version: ${SDE_VERSION}"
+else
+ MSG="downloading: ${SDE_VERSION}"
+ wget ${SDE_URL}/${SDE_VERSION}.tar.bz2
+ tar xvf ${SDE_VERSION}.tar.bz2
+fi
+
+echo $SDE
--- /dev/null
+#!/bin/bash
+# Script to check the list of git submitters against the table of re-submitting
+# users from the AUTHORFILE. Requires the authors to be listed in
+# | ... | ... | email@address.com |
+# format.
+#
+# We can add another table of "git committers who are exempt from the need to
+# relicense due to their contributions being under an acceptable license
+# already" if we need; no changes to this script would be necessary.
+#
+# This script is part of VOLK.
+#
+# Copyright 2021 Marcus Müller
+# SPDX-License-Identifier: MPL-2.0
+
+rootdir=`git rev-parse --show-toplevel`
+if [[ "$#" -lt 1 ]]
+then
+ authorfile=$rootdir/AUTHORS_RESUBMITTING_UNDER_LGPL_LICENSE.md
+else
+ authorfile=$1
+fi
+if [[ ! -r $authorfile ]]
+then
+ echo "$authorfile: file not readable"
+ exit -1
+fi
+
+allfiles=`git ls-files $rootdir`
+lgplers="$(sed -ne 's/^|[^|]*|[^|]*| \([^|]*\)|/\1/ip' $authorfile)"
+lgplers="$lgplers 32478819+fritterhoff@users.noreply.github.com douggeiger@users.noreply.github.com"
+authorcounts="$(echo "$allfiles" | while read f; do git blame --line-porcelain --ignore-rev 092a59997a1e1d5f421a0a5f87ee655ad173b93f $f 2>/dev/null | sed -ne 's/^author-mail <\([^>]*\)>/\1/p'; done | sort -f | uniq -ic | sort -n)"
+
+total_loc=0
+missingloc=0
+
+while read -r line
+do
+ authoremail=$(echo "$line" | sed 's/^ *\([[:digit:]]*\) *\([^, ]*\)$/\2/g')
+ authorlines=$(echo "$line" | sed 's/^ *\([[:digit:]]*\) *\([^, ]*\)$/\1/g')
+ total_loc=$(( $authorlines + $total_loc ))
+ if ! ( echo "$lgplers" | grep -i "$authoremail" ) > /dev/null
+ then
+ echo "missing: \"$authoremail\" (${authorlines} LOC)"
+ missingloc=$(($missingloc + $authorlines))
+ fi
+done < <(echo "$authorcounts")
+
+percentage=$(echo "scale=2; 100.0 * $missingloc/$total_loc" | bc)
+echo "Missing $missingloc of $total_loc LOC in total ($percentage%)"
+
+if [[ "$missingloc" -gt 0 ]]
+then
+ exit -2
+fi
+exit 0
--- /dev/null
+#!/usr/bin/env bash
+#
+# Copyright 2020 Marcus Müller, Johannes Demel, Ryan Volz
+#
+# This script is part of VOLK.
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+
+set -e #exit on error
+#Project name
+project=volk
+#What to prefix in release tags in front of the version number
+releaseprefix="v"
+#Remote to push to
+remote="origin"
+#Name of the Changelog file
+changelog="docs/CHANGELOG.md"
+#Name of the file where the last release tag is stored
+lastreleasefile=".lastrelease"
+
+#use your $EDITOR, unless unset, in which case: do the obvious
+EDITOR="${EDITOR:=vim}"
+
+tempdir="$(mktemp -d)"
+deltafile="${tempdir}/delta.md"
+annotationfile="${tempdir}/tag.rst"
+
+# if using BSD signify:
+pubkey="${HOME}/.signify/${project}-signing-001.pub"
+seckey="${HOME}/.signify/${project}-signing-001.sec"
+
+#use parallel pigz if available, else gzip
+gz=$(which pigz 2> /dev/null || which gzip)
+
+# uncomment the following lines if using GPG
+# echo "Will use the following key for signing…"
+# signingkey=$(git config user.signingkey)
+# gpg2 --list-keys "${signingkey}" || echo "Can't get info about key ${signingkey}. Did you forget to do 'git config --local user.signingkey=0xDEADBEEF'?'"
+# echo "… end of key info."
+
+# 1. Get the version number from CMake file
+version_major=$(grep -i 'set(version_info_major' CMakeLists.txt |\
+ sed 's/.*VERSION_INFO_MAJOR_VERSION[[:space:]]*\([[:digit:]a-zA-z-]*\))/\1/i')
+version_minor=$(grep -i 'set(version_info_minor' CMakeLists.txt |\
+ sed 's/.*VERSION_INFO_MINOR_VERSION[[:space:]]*\([[:digit:]a-zA-z-]*\))/\1/i')
+version_maint=$(grep -i 'set(version_info_maint' CMakeLists.txt |\
+ sed 's/.*VERSION_INFO_MAINT_VERSION[[:space:]]*\([[:digit:]a-zA-z-]*\))/\1/i')
+version="${version_major}.${version_minor}.${version_maint}"
+last_release="$(cat ${lastreleasefile})"
+echo "Releasing version ${version}"
+
+# 2. Prepare Changelog
+echo "appending git shortlog to CHANGELOG:"
+shortlog="
+
+## [${version}] - $(date +'%Y-%m-%d')
+
+$(git shortlog -e ${last_release}..HEAD)
+"
+echo "${shortlog}"
+
+echo "${shortlog}" > ${deltafile}
+
+${EDITOR} ${deltafile}
+
+echo "$(cat ${deltafile})" >> ${changelog}
+echo "${releaseprefix}${version}" > ${lastreleasefile}
+
+# 3. Commit changelog
+git commit -m "Release ${version}" "${changelog}" "${lastreleasefile}" CMakeLists.txt
+
+# 4. prepare tag
+cat "${deltafile}" > ${annotationfile}
+# Append the HEAD commit hash to the annotation
+echo "git-describes-hash: $(git rev-parse --verify HEAD)" >> "${annotationfile}"
+
+echo "Signing git tag..."
+if type 'signify-openbsd' > /dev/null; then
+ signaturefile="${tempdir}/annotationfile.sig"
+ signify-openbsd -S -x "${signaturefile}" -s "${seckey}" -m "${annotationfile}"
+ echo "-----BEGIN SIGNIFY SIGNATURE-----" >> "${annotationfile}"
+ cat "${signaturefile}" >> "${annotationfile}"
+ echo "-----END SIGNIFY SIGNATURE-----" >> "${annotationfile}"
+fi
+
+# finally tag the release and sign it
+## add --sign to sign using GPG
+git tag --annotate --cleanup=verbatim -F "${annotationfile}" "${releaseprefix}${version}"
+
+# 5. Create archive
+tarprefix="${project}-${version}"
+outfile="${tempdir}/${tarprefix}.tar"
+git archive "--output=${outfile}" "--prefix=${tarprefix}/" HEAD
+# Append submodule archives
+git submodule foreach --recursive "git archive --output ${tempdir}/${tarprefix}-sub-\${sha1}.tar --prefix=${tarprefix}/\${sm_path}/ HEAD"
+if [[ $(ls ${tempdir}/${tarprefix}-sub*.tar | wc -l) != 0 ]]; then
+ # combine all archives into one tar
+ tar --concatenate --file ${outfile} ${tempdir}/${tarprefix}-sub*.tar
+fi
+echo "Created tape archive '${outfile}' of size $(du -h ${outfile})."
+
+# 6. compress
+echo "compressing:"
+echo "gzip…"
+${gz} --keep --best "${outfile}"
+#--threads=0: guess number of CPU cores
+echo "xz…"
+xz --keep -9 --threads=0 "${outfile}"
+# echo "zstd…"
+# zstd --threads=0 -18 "${outfile}"
+echo "…compressed."
+
+# 7. sign
+
+# 7.1 with openbsd-signify
+echo "signing file list…"
+filelist="${tempdir}/${version}.sha256"
+pushd "${tempdir}"
+sha256sum --tag *.tar.* > "${filelist}"
+signify-openbsd -S -e -s "${seckey}" -m "${filelist}"
+echo "…signed. Check with 'signify -C -p \"${pubkey}\" -x \"${filelist}\"'."
+signify-openbsd -C -p "${pubkey}" -x "${filelist}.sig"
+popd
+echo "checked."
+
+# 7.2 with GPG
+echo "signing tarballs with GPG ..."
+gpg --armor --detach-sign "${outfile}".gz
+gpg --armor --detach-sign "${outfile}".xz
+
+#8. bundle archives
+mkdir -p archives
+cp "${tempdir}"/*.tar.* "${filelist}.sig" "${pubkey}" archives/
+echo "Results can be found under $(pwd)/archives"
+
+#9. Push to origin
+echo "Finished release!"
+echo "Remember to push release commit AND release tag!"
+echo "Release commit: 'git push ${remote} HEAD'"
+echo "Release tag: 'git push ${remote} v${releaseprefix}${version}'"
+#read -q "push?Do you want to push to origin? (y/n)" || echo "not pushing"
+#if [ "${push}" = "y" ]; then
+# git push "${remote}" HEAD
+# git push "${remote}" "v${releaseprefix}${version}"
+#fi
+
--- /dev/null
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Copyright 2022 Johannes Demel.
+#
+# This file is part of VOLK
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+
+import argparse
+from pprint import pprint
+import regex
+import json
+import pathlib
+
+
+def parse_name(contributor):
+ name_pattern = "(.*?) <"
+ name = regex.search(name_pattern, contributor).group().replace(" <", "")
+ email_pattern = "<(.*?)>"
+ email = regex.search(email_pattern, contributor).group().replace(
+ "<", "").replace(">", "")
+ return name, email
+
+
+def parse_contributors(contributors):
+ result = []
+ for c in contributors:
+ name, email = parse_name(c)
+ result.append({"name": name, "email": email})
+ return result
+
+
+name_aliases = {
+ 'alesha72003': "Alexey Slokva",
+ 'dernasherbrezon': "Andrey Rodionov",
+ 'Doug': "Douglas Geiger",
+ 'Doug Geiger': "Douglas Geiger",
+ "Federico 'Larroca' La Rocca": "Federico Larroca",
+ 'git-artes': "Federico 'Larroca' La Rocca",
+ 'ghostop14': "Mike Piscopo",
+ 'gnieboer': "Geof Nieboer",
+ 'Jam Quiceno': "Jam M. Hernandez Quiceno",
+ 'jdemel': "Johannes Demel",
+ 'Marc L': "Marc Lichtman",
+ 'Marcus Mueller': "Marcus Müller",
+ 'Michael L Dickens': "Michael Dickens",
+ 'Micheal Dickens': "Michael Dickens",
+ 'namccart': "Nicholas McCarthy",
+ 'hcab14': "Christoph Mayer",
+ 'cmayer': "Christoph Mayer",
+ 'root': "Philip Balister", }
+
+
+def fix_known_names(contributors):
+ for c in contributors:
+ c['name'] = name_aliases.get(c['name'], c['name'])
+ return contributors
+
+
+def merge_names(contributors):
+ results = []
+ names = sorted(list(set([c['name'] for c in contributors])))
+ for name in names:
+ emails = []
+ for c in contributors:
+ if name == c['name']:
+ emails.append(c['email'])
+ results.append({'name': name, 'email': emails})
+ return results
+
+
+def normalize_names(contributors):
+ for c in contributors:
+ name = c['name'].split(' ')
+ if len(name) > 1:
+ name = f'{name[-1]}, {" ".join(name[0:-1])}'
+ else:
+ name = name[0]
+ c['name'] = name
+
+ return contributors
+
+
+def find_citation_file(filename='.zenodo.json'):
+ # careful! This function makes quite specific folder structure assumptions!
+ file_loc = pathlib.Path(__file__).absolute()
+ file_loc = file_loc.parent
+ citation_file = next(file_loc.glob(f'../../{filename}'))
+ return citation_file.resolve()
+
+
+def load_citation_file(filename):
+ with open(filename, 'r') as file:
+ citation_file = json.load(file)
+ return citation_file
+
+
+def update_citation_file(filename, citation_data):
+ with open(filename, 'w') as file:
+ json.dump(citation_data, file, indent=4)
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Update citation file')
+ parser.add_argument('contributors', metavar='N', type=str, nargs='+',
+ help='contributors with emails: Name <email>')
+ args = parser.parse_args()
+
+ contributors = args.contributors[0].split("\n")
+ contributors = parse_contributors(contributors)
+ contributors = fix_known_names(contributors)
+ contributors = merge_names(contributors)
+ contributors = normalize_names(contributors)
+
+ citation_file_name = find_citation_file()
+ citation_file = load_citation_file(citation_file_name)
+
+ creators = citation_file['creators']
+
+ git_names = sorted([c['name'] for c in contributors])
+ cite_names = sorted([c['name'] for c in creators])
+ git_only_names = []
+ for n in git_names:
+ if n not in cite_names:
+ git_only_names.append(n)
+
+ for name in git_only_names:
+ creators.append({'name': name})
+
+ # make sure all contributors are sorted alphabetically by their family name.
+ creators = sorted(creators, key=lambda x: x['name'])
+ maintainers = ["Demel, Johannes", "Dickens, Michael"]
+ maintainer_list = list(
+ filter(lambda x: x['name'] in maintainers, creators))
+ creators = list(filter(lambda x: x['name'] not in maintainers, creators))
+ nick_list = list(filter(lambda x: ', ' not in x['name'], creators))
+ fullname_list = list(filter(lambda x: ', ' in x['name'], creators))
+
+ creators = maintainer_list + fullname_list + nick_list
+
+ citation_file['creators'] = creators
+ update_citation_file(citation_file_name, citation_file)
+
+
+if __name__ == "__main__":
+ main()
--- /dev/null
+#!/usr/bin/env bash
+#
+# Copyright 2022 Johannes Demel
+#
+# This script is part of VOLK.
+#
+# SPDX-License-Identifier: LGPL-3.0-or-later
+#
+# Find all contributors according to git and update `.zenodo.json` accordingly.
+
+script_name=$0
+script_full_path=$(dirname "$0")
+python_script=$"$script_full_path/run_citations_update.py"
+
+contributors_list="$(git log --pretty="%an <%ae>" | sort | uniq)"
+
+# Run a Python script to make things easier.
+python3 $python_script "$contributors_list"
--- /dev/null
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+LV_CXXFLAGS=@LV_CXXFLAGS@
+
+
+Name: volk
+Description: VOLK: Vector Optimized Library of Kernels
+Requires:
+Version: @SOVERSION@
+Libs: -L${libdir} -lvolk
+Cflags: -I${includedir} ${LV_CXXFLAGS}
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#include <volk/volk_common.h>
+#include "volk_machines.h"
+#include <volk/volk_typedefs.h>
+#include <volk/volk_cpu.h>
+#include "volk_rank_archs.h"
+#include <volk/volk.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+static size_t __alignment = 0;
+static intptr_t __alignment_mask = 0;
+
+struct volk_machine *get_machine(void)
+{
+ extern struct volk_machine *volk_machines[];
+ extern unsigned int n_volk_machines;
+ static struct volk_machine *machine = NULL;
+
+ if(machine != NULL)
+ return machine;
+ else {
+ unsigned int max_score = 0;
+ unsigned int i;
+ struct volk_machine *max_machine = NULL;
+ for(i=0; i<n_volk_machines; i++) {
+ if(!(volk_machines[i]->caps & (~volk_get_lvarch()))) {
+ if(volk_machines[i]->caps > max_score) {
+ max_score = volk_machines[i]->caps;
+ max_machine = volk_machines[i];
+ }
+ }
+ }
+ machine = max_machine;
+ //printf("Using Volk machine: %s\n", machine->name);
+ __alignment = machine->alignment;
+ __alignment_mask = (intptr_t)(__alignment-1);
+ return machine;
+ }
+}
+
+void volk_list_machines(void)
+{
+ extern struct volk_machine *volk_machines[];
+ extern unsigned int n_volk_machines;
+
+ unsigned int i;
+ for(i=0; i<n_volk_machines; i++) {
+ if(!(volk_machines[i]->caps & (~volk_get_lvarch()))) {
+ printf("%s;", volk_machines[i]->name);
+ }
+ }
+ printf("\n");
+}
+
+const char* volk_get_machine(void)
+{
+ extern struct volk_machine *volk_machines[];
+ extern unsigned int n_volk_machines;
+ static struct volk_machine *machine = NULL;
+
+ if(machine != NULL)
+ return machine->name;
+ else {
+ unsigned int max_score = 0;
+ unsigned int i;
+ struct volk_machine *max_machine = NULL;
+ for(i=0; i<n_volk_machines; i++) {
+ if(!(volk_machines[i]->caps & (~volk_get_lvarch()))) {
+ if(volk_machines[i]->caps > max_score) {
+ max_score = volk_machines[i]->caps;
+ max_machine = volk_machines[i];
+ }
+ }
+ }
+ machine = max_machine;
+ return machine->name;
+ }
+}
+
+size_t volk_get_alignment(void)
+{
+ get_machine(); //ensures alignment is set
+ return __alignment;
+}
+
+bool volk_is_aligned(const void *ptr)
+{
+ return ((intptr_t)(ptr) & __alignment_mask) == 0;
+}
+
+#define LV_HAVE_GENERIC
+#define LV_HAVE_DISPATCHER
+
+%for kern in kernels:
+
+%if kern.has_dispatcher:
+#include <volk/${kern.name}.h> //pulls in the dispatcher
+%endif
+
+static inline void __${kern.name}_d(${kern.arglist_full})
+{
+ %if kern.has_dispatcher:
+ ${kern.name}_dispatcher(${kern.arglist_names});
+ return;
+ %endif
+
+ if (volk_is_aligned(<% num_open_parens = 0 %>
+ %for arg_type, arg_name in kern.args:
+ %if '*' in arg_type:
+ VOLK_OR_PTR(${arg_name},<% num_open_parens += 1 %>
+ %endif
+ %endfor
+ 0<% end_open_parens = ')'*num_open_parens %>${end_open_parens}
+ )){
+ ${kern.name}_a(${kern.arglist_names});
+ }
+ else{
+ ${kern.name}_u(${kern.arglist_names});
+ }
+}
+
+static inline void __init_${kern.name}(void)
+{
+ const char *name = get_machine()->${kern.name}_name;
+ const char **impl_names = get_machine()->${kern.name}_impl_names;
+ const int *impl_deps = get_machine()->${kern.name}_impl_deps;
+ const bool *alignment = get_machine()->${kern.name}_impl_alignment;
+ const size_t n_impls = get_machine()->${kern.name}_n_impls;
+ const size_t index_a = volk_rank_archs(name, impl_names, impl_deps, alignment, n_impls, true/*aligned*/);
+ const size_t index_u = volk_rank_archs(name, impl_names, impl_deps, alignment, n_impls, false/*unaligned*/);
+ ${kern.name}_a = get_machine()->${kern.name}_impls[index_a];
+ ${kern.name}_u = get_machine()->${kern.name}_impls[index_u];
+
+ assert(${kern.name}_a);
+ assert(${kern.name}_u);
+
+ ${kern.name} = &__${kern.name}_d;
+}
+
+static inline void __${kern.name}_a(${kern.arglist_full})
+{
+ __init_${kern.name}();
+ ${kern.name}_a(${kern.arglist_names});
+}
+
+static inline void __${kern.name}_u(${kern.arglist_full})
+{
+ __init_${kern.name}();
+ ${kern.name}_u(${kern.arglist_names});
+}
+
+static inline void __${kern.name}(${kern.arglist_full})
+{
+ __init_${kern.name}();
+ ${kern.name}(${kern.arglist_names});
+}
+
+${kern.pname} ${kern.name}_a = &__${kern.name}_a;
+${kern.pname} ${kern.name}_u = &__${kern.name}_u;
+${kern.pname} ${kern.name} = &__${kern.name};
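+##// Lazy-binding note: the three public pointers initially target the __..._a,
+##// __..._u and __... wrappers above; the first call runs __init_<kernel>(),
+##// which ranks the implementations available on this machine, rebinds the
+##// aligned/unaligned pointers to the fastest candidates, and points the plain
+##// kernel pointer at the alignment-checking dispatcher __..._d, so subsequent
+##// calls skip the setup entirely.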
+
+void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name)
+{
+ const int index = volk_get_index(
+ get_machine()->${kern.name}_impl_names,
+ get_machine()->${kern.name}_n_impls,
+ impl_name
+ );
+ get_machine()->${kern.name}_impls[index](
+ ${kern.arglist_names}
+ );
+}
+
+volk_func_desc_t ${kern.name}_get_func_desc(void) {
+ const char **impl_names = get_machine()->${kern.name}_impl_names;
+ const int *impl_deps = get_machine()->${kern.name}_impl_deps;
+ const bool *alignment = get_machine()->${kern.name}_impl_alignment;
+ const size_t n_impls = get_machine()->${kern.name}_n_impls;
+ volk_func_desc_t desc = {
+ impl_names,
+ impl_deps,
+ alignment,
+ n_impls
+ };
+ return desc;
+}
+
+%endfor
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2011-2020 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_VOLK_RUNTIME
+#define INCLUDED_VOLK_RUNTIME
+
+#include <volk/volk_typedefs.h>
+#include <volk/volk_config_fixed.h>
+#include <volk/volk_common.h>
+#include <volk/volk_complex.h>
+#include <volk/volk_malloc.h>
+#include <volk/volk_version.h>
+
+#include <stdlib.h>
+#include <stdbool.h>
+
+__VOLK_DECL_BEGIN
+
+typedef struct volk_func_desc
+{
+ const char **impl_names;
+ const int *impl_deps;
+ const bool *impl_alignment;
+ size_t n_impls;
+} volk_func_desc_t;
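+
+// Usage sketch (illustrative kernel name): enumerating the implementations of a kernel
+//   volk_func_desc_t d = volk_32f_x2_add_32f_get_func_desc();
+//   for (size_t i = 0; i < d.n_impls; i++)
+//       printf("%s%s\n", d.impl_names[i], d.impl_alignment[i] ? " (aligned)" : "");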
+
+//! Prints a list of machines available
+VOLK_API void volk_list_machines(void);
+
+//! Returns the name of the machine this instance will use
+VOLK_API const char* volk_get_machine(void);
+
+//! Get the machine alignment in bytes
+VOLK_API size_t volk_get_alignment(void);
+
+/*!
+ * The VOLK_OR_PTR macro is a convenience macro
+ * for checking the alignment of a set of pointers.
+ * Example usage:
+ * volk_is_aligned(VOLK_OR_PTR(VOLK_OR_PTR(p0, p1), p2))
+ */
+#define VOLK_OR_PTR(ptr0, ptr1) \
+ (const void *)(((intptr_t)(ptr0)) | ((intptr_t)(ptr1)))
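+
+/*
+ * Why OR-ing works (sketch, hypothetical 16-byte alignment): the bitwise OR
+ * keeps every low-order bit that is set in any pointer, so the combined value
+ * passes volk_is_aligned() only when all of the pointers do, e.g.
+ *     0x1000 | 0x1010 == 0x1010  (still aligned)
+ *     0x1000 | 0x1008 == 0x1008  (not aligned)
+ */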
+
+/*!
+ * Is the pointer on a machine alignment boundary?
+ *
+ * Note: for performance reasons, this function is not
+ * usable until another VOLK API call has been made that
+ * performs the required initialization (e.g. volk_get_alignment()).
+ *
+ * \param ptr the pointer to some memory buffer
+ * \return true if the pointer is on a machine alignment boundary, else false
+ */
+VOLK_API bool volk_is_aligned(const void *ptr);
+
+// Drop the deprecated attribute when generating for Windows: MSVC does not support
+// `__attribute__`, while Clang and GCC do. As far as these Mako templates are concerned,
+// we assume the compiler and the target system go together.
+<%
+deprecated_kernels = ('volk_16i_x5_add_quad_16i_x4', 'volk_16i_branch_4_state_8',
+ 'volk_16i_max_star_16i', 'volk_16i_max_star_horizontal_16i',
+ 'volk_16i_permute_and_scalar_add', 'volk_16i_x4_quad_max_star_16i')
+from platform import system
+if system() == 'Windows':
+ deprecated_kernels = ()
+%>
+%for kern in kernels:
+
+% if kern.name in deprecated_kernels:
+//! A function pointer to the dispatcher implementation
+extern VOLK_API ${kern.pname} ${kern.name} __attribute__((deprecated));
+
+//! A function pointer to the fastest aligned implementation
+extern VOLK_API ${kern.pname} ${kern.name}_a __attribute__((deprecated));
+
+//! A function pointer to the fastest unaligned implementation
+extern VOLK_API ${kern.pname} ${kern.name}_u __attribute__((deprecated));
+
+//! Call into a specific implementation given by name
+extern VOLK_API void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name) __attribute__((deprecated));
+
+//! Get description parameters for this kernel
+extern VOLK_API volk_func_desc_t ${kern.name}_get_func_desc(void) __attribute__((deprecated));
+% else:
+//! A function pointer to the dispatcher implementation
+extern VOLK_API ${kern.pname} ${kern.name};
+
+//! A function pointer to the fastest aligned implementation
+extern VOLK_API ${kern.pname} ${kern.name}_a;
+
+//! A function pointer to the fastest unaligned implementation
+extern VOLK_API ${kern.pname} ${kern.name}_u;
+
+//! Call into a specific implementation given by name
+extern VOLK_API void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name);
+
+//! Get description parameters for this kernel
+extern VOLK_API volk_func_desc_t ${kern.name}_get_func_desc(void);
+% endif
+
+%endfor
+
+__VOLK_DECL_END
+
+#endif /*INCLUDED_VOLK_RUNTIME*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_VOLK_CONFIG_FIXED_H
+#define INCLUDED_VOLK_CONFIG_FIXED_H
+
+%for i, arch in enumerate(archs):
+#define LV_${arch.name.upper()} ${i}
+%endfor
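+##// Expansion sketch (hypothetical arch list): for archs generic, sse, avx the
+##// loop above emits
+##//   #define LV_GENERIC 0
+##//   #define LV_SSE 1
+##//   #define LV_AVX 2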
+
+#endif /*INCLUDED_VOLK_CONFIG_FIXED_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#include <volk/volk_cpu.h>
+#include <volk/volk_config_fixed.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+#if defined(VOLK_CPU_FEATURES)
+#include "cpu_features_macros.h"
+#if defined(CPU_FEATURES_ARCH_X86)
+#include "cpuinfo_x86.h"
+#elif defined(CPU_FEATURES_ARCH_ARM)
+#include "cpuinfo_arm.h"
+#elif defined(CPU_FEATURES_ARCH_AARCH64)
+#include "cpuinfo_aarch64.h"
+#elif defined(CPU_FEATURES_ARCH_MIPS)
+#include "cpuinfo_mips.h"
+#elif defined(CPU_FEATURES_ARCH_PPC)
+#include "cpuinfo_ppc.h"
+#endif
+
+// This is required for MSVC
+#if defined(__cplusplus)
+using namespace cpu_features;
+#endif
+#endif
+
+
+struct VOLK_CPU volk_cpu;
+
+%for arch in archs:
+static int i_can_has_${arch.name} (void) {
+ %for check, params in arch.checks:
+ %if "neon" in arch.name:
+#if defined(CPU_FEATURES_ARCH_ARM)
+ if (GetArmInfo().features.${check} == 0){ return 0; }
+#endif
+ %elif "mips" in arch.name:
+#if defined(CPU_FEATURES_ARCH_MIPS)
+ if (GetMipsInfo().features.${check} == 0){ return 0; }
+#endif
+ %else:
+#if defined(CPU_FEATURES_ARCH_X86)
+ if (GetX86Info().features.${check} == 0){ return 0; }
+#endif
+ %endif
+ %endfor
+ return 1;
+}
+
+%endfor
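+##// Expansion sketch (hypothetical arch): an x86 arch "avx" whose checks list
+##// contains "avx" expands to
+##//   static int i_can_has_avx (void) {
+##//   #if defined(CPU_FEATURES_ARCH_X86)
+##//       if (GetX86Info().features.avx == 0){ return 0; }
+##//   #endif
+##//       return 1;
+##//   }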
+
+#if defined(HAVE_FENV_H)
+ #if defined(FE_TONEAREST)
+ #include <fenv.h>
+ static inline void set_float_rounding(void){
+ fesetround(FE_TONEAREST);
+ }
+ #else
+ static inline void set_float_rounding(void){
+ //do nothing
+ }
+ #endif
+#elif defined(_MSC_VER)
+ #include <float.h>
+ static inline void set_float_rounding(void){
+ unsigned int cwrd;
+ _controlfp_s(&cwrd, 0, 0);
+ _controlfp_s(&cwrd, _RC_NEAR, _MCW_RC);
+ }
+#else
+ static inline void set_float_rounding(void){
+ //do nothing
+ }
+#endif
+
+
+void volk_cpu_init() {
+ %for arch in archs:
+ volk_cpu.has_${arch.name} = &i_can_has_${arch.name};
+ %endfor
+ set_float_rounding();
+}
+
+unsigned int volk_get_lvarch() {
+ unsigned int retval = 0;
+ volk_cpu_init();
+ %for arch in archs:
+ retval += volk_cpu.has_${arch.name}() << LV_${arch.name.upper()};
+ %endfor
+ return retval;
+}
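+
+/*
+ * Bitmask sketch (hypothetical indices): with LV_GENERIC == 0 and LV_SSE == 1,
+ * a CPU that passes both checks produces
+ *     retval = (1 << LV_GENERIC) + (1 << LV_SSE) = 0x3
+ * which is the same encoding used for volk_machine.caps, so the two masks can
+ * be compared directly in get_machine().
+ */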
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_VOLK_CPU_H
+#define INCLUDED_VOLK_CPU_H
+
+#include <volk/volk_common.h>
+
+__VOLK_DECL_BEGIN
+
+struct VOLK_CPU {
+ %for arch in archs:
+ int (*has_${arch.name}) ();
+ %endfor
+};
+
+extern struct VOLK_CPU volk_cpu;
+
+void volk_cpu_init ();
+unsigned int volk_get_lvarch ();
+
+__VOLK_DECL_END
+
+#endif /*INCLUDED_VOLK_CPU_H*/
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+<% this_machine = machine_dict[args[0]] %>
+<% arch_names = this_machine.arch_names %>
+
+%for arch in this_machine.archs:
+#define LV_HAVE_${arch.name.upper()} 1
+%endfor
+
+#include <volk/volk_common.h>
+#include "volk_machines.h"
+#include <volk/volk_config_fixed.h>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+%for kern in kernels:
+#include <volk/${kern.name}.h>
+%endfor
+
+struct volk_machine volk_machine_${this_machine.name} = {
+<% make_arch_have_list = (' | '.join(['(1 << LV_%s)'%a.name.upper() for a in this_machine.archs])) %> ${make_arch_have_list},
+<% this_machine_name = "\""+this_machine.name+"\"" %> ${this_machine_name},
+ ${this_machine.alignment},
+##//list all kernels
+ %for kern in kernels:
+<% impls = kern.get_impls(arch_names) %>
+##//kernel name
+<% kern_name = "\""+kern.name+"\"" %> ${kern_name},
+##//list of kernel implementations by name
+<% make_impl_name_list = "{"+', '.join(['"%s"'%i.name for i in impls])+"}" %> ${make_impl_name_list},
+##//list of arch dependencies per implementation
+<% make_impl_deps_list = "{"+', '.join([' | '.join(['(1 << LV_%s)'%d.upper() for d in i.deps]) for i in impls])+"}" %> ${make_impl_deps_list},
+##//alignment required? for each implementation
+<% make_impl_align_list = "{"+', '.join(['true' if i.is_aligned else 'false' for i in impls])+"}" %> ${make_impl_align_list},
+##//pointer to each implementation
+<% make_impl_fcn_list = "{"+', '.join(['%s_%s'%(kern.name, i.name) for i in impls])+"}" %> ${make_impl_fcn_list},
+##//number of implementations listed here
+<% len_impls = len(impls) %> ${len_impls},
+ %endfor
+};
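+##// Row sketch (hypothetical kernel with generic and AVX implementations): each
+##// kernel contributes one group of initializers like
+##//   "volk_32f_x2_add_32f",
+##//   {"generic", "a_avx", "u_avx"},
+##//   {(1 << LV_GENERIC), (1 << LV_AVX), (1 << LV_AVX)},
+##//   {false, true, false},
+##//   {volk_32f_x2_add_32f_generic, volk_32f_x2_add_32f_a_avx, volk_32f_x2_add_32f_u_avx},
+##//   3,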
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#include <volk/volk_common.h>
+#include <volk/volk_typedefs.h>
+#include "volk_machines.h"
+
+struct volk_machine *volk_machines[] = {
+%for machine in machines:
+#ifdef LV_MACHINE_${machine.name.upper()}
+&volk_machine_${machine.name},
+#endif
+%endfor
+};
+
+unsigned int n_volk_machines = sizeof(volk_machines)/sizeof(*volk_machines);
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_LIBVOLK_MACHINES_H
+#define INCLUDED_LIBVOLK_MACHINES_H
+
+#include <volk/volk_common.h>
+#include <volk/volk_typedefs.h>
+
+#include <stdbool.h>
+#include <stdlib.h>
+
+__VOLK_DECL_BEGIN
+
+struct volk_machine {
+ const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_get_lvarch format)
+ const char *name;
+ const size_t alignment; //the maximum byte alignment required for functions in this library
+ %for kern in kernels:
+ const char *${kern.name}_name;
+ const char *${kern.name}_impl_names[<%len_archs=len(archs)%>${len_archs}];
+ const int ${kern.name}_impl_deps[${len_archs}];
+ const bool ${kern.name}_impl_alignment[${len_archs}];
+ const ${kern.pname} ${kern.name}_impls[${len_archs}];
+ const size_t ${kern.name}_n_impls;
+ %endfor
+};
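+
+// Note: each per-kernel array is sized by the total number of archs known at
+// code-generation time; for a given machine only the first <kernel>_n_impls
+// entries of each array are populated and meaningful.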
+
+%for machine in machines:
+#ifdef LV_MACHINE_${machine.name.upper()}
+extern struct volk_machine volk_machine_${machine.name};
+#endif
+%endfor
+
+__VOLK_DECL_END
+
+#endif //INCLUDED_LIBVOLK_MACHINES_H
--- /dev/null
+/* -*- c++ -*- */
+/*
+ * Copyright 2011-2012 Free Software Foundation, Inc.
+ *
+ * This file is part of VOLK
+ *
+ * SPDX-License-Identifier: LGPL-3.0-or-later
+ */
+
+#ifndef INCLUDED_VOLK_TYPEDEFS
+#define INCLUDED_VOLK_TYPEDEFS
+
+#include <inttypes.h>
+#include <volk/volk_complex.h>
+
+%for kern in kernels:
+typedef void (*${kern.pname})(${kern.arglist_types});
+%endfor
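+##// Expansion sketch (pname left symbolic): a kernel whose argument types are
+##// (float*, const float*, const float*, unsigned int) contributes roughly
+##//   typedef void (*<pname>)(float*, const float*, const float*, unsigned int);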
+
+#endif /*INCLUDED_VOLK_TYPEDEFS*/