From 4b08e126025f7325bcf20322a998d83ccf3758aa Mon Sep 17 00:00:00 2001 From: "A. Maitland Bottoms" Date: Fri, 22 Oct 2021 04:30:05 +0100 Subject: [PATCH] Import volk_2.5.0-2.debian.tar.xz [dgit import tarball volk 2.5.0-2 volk_2.5.0-2.debian.tar.xz] --- changelog | 530 ++++ control | 83 + copyright | 195 ++ libvolk2-bin.install | 2 + libvolk2-bin.manpages | 3 + libvolk2-dev.acc | 50 + libvolk2-dev.install | 5 + libvolk2-doc.doc-base | 19 + libvolk2-doc.docs | 1 + libvolk2.5.install | 1 + not-installed | 6 + ...001-Add-volk_32f-c-_index_min_16-32u.patch | 2160 +++++++++++++++++ ...002-Fix-volk_32fc_index_min_32u_neon.patch | 26 + ...003-Fix-volk_32fc_index_min_32u_neon.patch | 26 + patches/0004-Code-cleanup.patch | 1278 ++++++++++ patches/0005-Fix-clang-format-errors.patch | 112 + ...w-generic-implementation-fixed-typos.patch | 100 + ...contributors-agreeing-to-LGPL-licens.patch | 313 +++ patches/0009-Code-cleanup.patch | 924 +++++++ patches/0010-Fix-clang-format-errors.patch | 57 + patches/0011-Code-cleanup.patch | 139 ++ patches/0012-Fix-clang-format-errors.patch | 82 + ...5-asan-Fix-volk_malloc-alignment-bug.patch | 44 + patches/0056-format-Fix-code-format.patch | 31 + patches/make-acc-happy | 60 + patches/optional-static-apps | 20 + patches/remove-external-HTML-resources | 20 + patches/series | 18 + patches/skip-cpu_features-on-kfreebsd | 20 + patches/use-system-cpu-features-package.patch | 37 + rules | 19 + source/format | 1 + volk-config-info.1 | 45 + volk_modtool.1 | 112 + volk_profile.1 | 5 + watch | 4 + 36 files changed, 6548 insertions(+) create mode 100644 changelog create mode 100644 control create mode 100644 copyright create mode 100644 libvolk2-bin.install create mode 100644 libvolk2-bin.manpages create mode 100644 libvolk2-dev.acc create mode 100644 libvolk2-dev.install create mode 100644 libvolk2-doc.doc-base create mode 100644 libvolk2-doc.docs create mode 100644 libvolk2.5.install create mode 100644 not-installed create mode 100644 patches/0001-Add-volk_32f-c-_index_min_16-32u.patch create mode 100644 patches/0002-Fix-volk_32fc_index_min_32u_neon.patch create mode 100644 patches/0003-Fix-volk_32fc_index_min_32u_neon.patch create mode 100644 patches/0004-Code-cleanup.patch create mode 100644 patches/0005-Fix-clang-format-errors.patch create mode 100644 patches/0006-New-generic-implementation-fixed-typos.patch create mode 100644 patches/0007-Add-the-list-of-contributors-agreeing-to-LGPL-licens.patch create mode 100644 patches/0009-Code-cleanup.patch create mode 100644 patches/0010-Fix-clang-format-errors.patch create mode 100644 patches/0011-Code-cleanup.patch create mode 100644 patches/0012-Fix-clang-format-errors.patch create mode 100644 patches/0055-asan-Fix-volk_malloc-alignment-bug.patch create mode 100644 patches/0056-format-Fix-code-format.patch create mode 100644 patches/make-acc-happy create mode 100644 patches/optional-static-apps create mode 100644 patches/remove-external-HTML-resources create mode 100644 patches/series create mode 100644 patches/skip-cpu_features-on-kfreebsd create mode 100644 patches/use-system-cpu-features-package.patch create mode 100755 rules create mode 100644 source/format create mode 100644 volk-config-info.1 create mode 100644 volk_modtool.1 create mode 100644 volk_profile.1 create mode 100644 watch diff --git a/changelog b/changelog new file mode 100644 index 0000000..4d07699 --- /dev/null +++ b/changelog @@ -0,0 +1,530 @@ +volk (2.5.0-2) unstable; urgency=medium + + * upload to unstable + * with some upstream bugfixes + + -- A. Maitland Bottoms Thu, 21 Oct 2021 23:30:05 -0400 + +volk (2.5.0-1) experimental; urgency=medium + + * New upstream release + * Use libcpu-features-dev on powerpc and x32 (Closes: #978602) + * Mention volk-config-info and volk_modtool in description (Closes: #989263) + * Upload to experimental for soversion bump + + -- A. Maitland Bottoms Thu, 10 Jun 2021 18:29:47 -0400 + +volk (2.4.1-2) unstable; urgency=medium + + [ Shengjing Zhu ] + * Use system cpu_features package + + [ A. Maitland Bottoms ] + * Adopt Use system cpu_features package patch (Closes: #978096) + + -- A. Maitland Bottoms Sun, 27 Dec 2020 15:16:07 -0500 + +volk (2.4.1-1) unstable; urgency=medium + + * New upstream release + + -- A. Maitland Bottoms Thu, 17 Dec 2020 23:53:21 -0500 + +volk (2.4.0-4) unstable; urgency=medium + + * skip cpu_features on "Unsupported OS" kFreeBSD + * bump Standards-Version - no other changes. + + -- A. Maitland Bottoms Tue, 15 Dec 2020 19:53:16 -0500 + +volk (2.4.0-3) unstable; urgency=medium + + * Fix binary-indep build (Closes: #976300) + * Upload to unstable + + -- A. Maitland Bottoms Thu, 03 Dec 2020 20:43:29 -0500 + +volk (2.4.0-2) experimental; urgency=medium + + * Make use of cpu_features a CMake option with sensible defaults per arch + + -- A. Maitland Bottoms Mon, 30 Nov 2020 16:19:19 -0500 + +volk (2.4.0-1) experimental; urgency=medium + + * New upstream release + * cpu_features git submodule packaged as cpu-features source component. + * Upload to experimental for soversion bump + + -- A. Maitland Bottoms Sun, 22 Nov 2020 12:35:43 -0500 + +volk (2.3.0-3) unstable; urgency=medium + + * update to v2.3.0-14-g91e5d07 + emit an emms instruction after using the mmx extension + + -- A. Maitland Bottoms Tue, 30 Jun 2020 19:48:20 -0400 + +volk (2.3.0-2) unstable; urgency=medium + + * Upload to unstable + + -- A. Maitland Bottoms Mon, 11 May 2020 07:26:03 -0400 + +volk (2.3.0-1) experimental; urgency=medium + + * New upstream release, to experimental for soversion bump + * Kernels + - volk: accurate exp kernel + - exp: Rename SSE4.1 to SSE2 kernel + - Add 32f_s32f_add_32f kernel + - This kernel adds in vector + scalar functionality + - Fix the broken index max kernels + - Treat the mod_range puppet as such + - Add puppet for power spectral density kernel + - Updated log10 calcs to use faster log2 approach + - fix: Use unaligned load + - divide: Optimize complexmultiplyconjugate + + -- A. Maitland Bottoms Sat, 09 May 2020 15:42:23 -0400 + +volk (2.2.1-3) unstable; urgency=medium + + * update to v2.2.1-34-gd4756c5 + + -- A. Maitland Bottoms Sun, 05 Apr 2020 10:37:46 -0400 + +volk (2.2.1-2) unstable; urgency=medium + + * update to v2.2.1-11-gfaf230e + * cmake: Remove the ORC from the VOLK public link interface + * Fix the broken index max kernels + + -- A. Maitland Bottoms Fri, 27 Mar 2020 21:48:10 -0400 + +volk (2.2.1-1) unstable; urgency=high + + * New upstream bugfix release + reason for high urgency: + - Fix loop bound in AVX rotator (only one fixed in 2.2.0-3) + - Fix out-of-bounds read in AVX2 square dist kernel + - Fix length checks in AVX2 index max kernels + + -- A. Maitland Bottoms Mon, 24 Feb 2020 18:08:05 -0500 + +volk (2.2.0-3) unstable; urgency=high + + * Update to v2.2.0-6-g5701f8f + reason for high urgency: + - Fix loop bound in AVX rotator + + -- A. Maitland Bottoms Sun, 23 Feb 2020 23:49:18 -0500 + +volk (2.2.0-2) unstable; urgency=medium + + * Upload to unstable + + -- A. Maitland Bottoms Tue, 18 Feb 2020 17:56:58 -0500 + +volk (2.2.0-1) experimental; urgency=medium + + * New upstream release + - Remove build dependency on python six + - Fixup VolkConfigVersion + - add volk_version.h + + -- A. Maitland Bottoms Sun, 16 Feb 2020 18:25:20 -0500 + +volk (2.1.0-2) unstable; urgency=medium + + * Upload to unstable + + -- A. Maitland Bottoms Sun, 05 Jan 2020 23:17:57 -0500 + +volk (2.1.0-1) experimental; urgency=medium + + * New upstream release + - The AVX FMA rotator bug is fixed + - VOLK offers `volk::vector<>` for C++ to follow RAII + - Use C++17 `std::filesystem` + - This enables VOLK to be built without Boost if available! + - lots of bugfixes + - more optimized kernels, especially more NEON versions + * Upload to experimental for new ABI library package libvolk2.1 + + -- A. Maitland Bottoms Sun, 22 Dec 2019 10:27:36 -0500 + +volk (2.0.0-3) unstable; urgency=medium + + * update to v2.0.0-4-gf04a46f + + -- A. Maitland Bottoms Thu, 14 Nov 2019 22:47:23 -0500 + +volk (2.0.0-2) unstable; urgency=medium + + * Upload to unstable + + -- A. Maitland Bottoms Mon, 12 Aug 2019 22:49:11 -0400 + +volk (2.0.0-1) experimental; urgency=medium + + * New upstream release + + -- A. Maitland Bottoms Wed, 07 Aug 2019 23:31:20 -0400 + +volk (1.4-4) unstable; urgency=medium + + * working volk_modtool with Python 3 + * build and install libvolk.a + + -- A. Maitland Bottoms Mon, 29 Oct 2018 01:32:05 -0400 + +volk (1.4-3) unstable; urgency=medium + + * update to v1.4-9-g297fefd + Added an AVX protokernel for volk_32fc_x2_32f_square_dist_scalar_mult_32f + fixed a buffer over-read and over-write in + volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx + Fix 32u_reverse_32u for ARM + + -- A. Maitland Bottoms Sat, 12 May 2018 15:25:04 -0400 + +volk (1.4-2) unstable; urgency=medium + + * Upload to unstable, needed by gnuradio (>= 3.7.12.0) + + -- A. Maitland Bottoms Tue, 03 Apr 2018 01:03:19 -0400 + +volk (1.4-1) experimental; urgency=medium + + * New upstream release + upstream changelog http://libvolk.org/release-v14.html + + -- A. Maitland Bottoms Tue, 27 Mar 2018 22:57:42 -0400 + +volk (1.3.1-1) unstable; urgency=medium + + * New upstream bugfix release + * Refresh all debian patches for use with git am + + -- A. Maitland Bottoms Tue, 27 Mar 2018 21:54:29 -0400 + +volk (1.3-3) unstable; urgency=medium + + * update to v1.3-23-g0109b2e + * update debian/libvolk1-dev.abi.tar.gz.amd64 + * Add breaks/replaces gnuradio (<=3.7.2.1) (LP: #1614235) + + -- A. Maitland Bottoms Sun, 04 Feb 2018 13:12:21 -0500 + +volk (1.3-2) unstable; urgency=medium + + * update to v1.3-16-g28b03a9 + apps: fix profile update reading end of lines + qa: lower tolerance for 32fc_mag to fix issue #96 + * include upstream master patch to sort input files + + -- A. Maitland Bottoms Sun, 27 Aug 2017 13:44:55 -0400 + +volk (1.3-1) unstable; urgency=medium + + * New upstream release + * The index_max kernels were named with the wrong output datatype. To + fix this there are new kernels that return a 32u (int32_t) and the + existing kernels had their signatures changed to return 16u (int16_t). + * The output to stdout and stderr has been shuffled around. There is no + longer a message that prints what VOLK machine is being used and the + warning messages go to stderr rather than stdout. + * The 32fc_index_max kernels previously were only accurate to the SSE + register width (4 points). This was a pretty serious and long-lived + bug that's been fixed and the QA updated appropriately. + + -- A. Maitland Bottoms Sat, 02 Jul 2016 16:30:47 -0400 + +volk (1.2.2-2) unstable; urgency=medium + + * update to v1.2.2-11-g78c8bc4 (to follow gnuradio maint branch) + + -- A. Maitland Bottoms Sun, 19 Jun 2016 14:44:15 -0400 + +volk (1.2.2-1) unstable; urgency=medium + + * New upstream release + + -- A. Maitland Bottoms Fri, 08 Apr 2016 00:12:10 -0400 + +volk (1.2.1-2) unstable; urgency=medium + + * Upstream patches: + Fix some CMake complaints + The fix for compilation with cmake 3.5 + + -- A. Maitland Bottoms Wed, 23 Mar 2016 17:47:54 -0400 + +volk (1.2.1-1) unstable; urgency=medium + + * New upstream release + + -- A. Maitland Bottoms Sun, 07 Feb 2016 19:38:32 -0500 + +volk (1.2-1) unstable; urgency=medium + + * New upstream release + + -- A. Maitland Bottoms Thu, 24 Dec 2015 20:28:13 -0500 + +volk (1.1.1-5) experimental; urgency=medium + + * update to v1.1.1-22-gef53547 to support gnuradio 3.7.9 + + -- A. Maitland Bottoms Fri, 11 Dec 2015 13:12:55 -0500 + +volk (1.1.1-4) unstable; urgency=medium + + * more lintian fixes + + -- A. Maitland Bottoms Wed, 25 Nov 2015 21:49:58 -0500 + +volk (1.1.1-3) unstable; urgency=medium + + * Lintian fixes Pre-Depends + + -- A. Maitland Bottoms Thu, 19 Nov 2015 21:24:27 -0500 + +volk (1.1.1-2) unstable; urgency=medium + + * Note that libvolk1-dev replaces files in gnuradio-dev versions <<3.7.8 + (Closes: #802646) again. Thanks Andreas Beckmann. + + -- A. Maitland Bottoms Fri, 13 Nov 2015 18:45:49 -0500 + +volk (1.1.1-1) unstable; urgency=medium + + * New upstream release + * New architectures exist for the AVX2 and FMA ISAs. + * The profiler now generates buffers that are vlen + a tiny amount and + generates random data to fill buffers. This is intended to catch bugs + in protokernels that write beyond num_points. + * Note that libvolk1-dev replaces files in earlier gnuradio-dev versions + (Closes: #802646) + + -- A. Maitland Bottoms Sun, 01 Nov 2015 18:45:43 -0500 + +volk (1.1-4) unstable; urgency=medium + + * update to v1.1-12-g264addc + + -- A. Maitland Bottoms Tue, 29 Sep 2015 23:41:50 -0400 + +volk (1.1-3) unstable; urgency=low + + * drop dh_acc to get reproducible builds + + -- A. Maitland Bottoms Fri, 11 Sep 2015 22:57:06 -0400 + +volk (1.1-2) unstable; urgency=low + + * use dh-acc + + -- A. Maitland Bottoms Mon, 07 Sep 2015 15:45:20 -0400 + +volk (1.1-1) unstable; urgency=medium + + * re-organize package naming convention + * New upstream release tag v1.1 + New architectures exist for the AVX2 and FMA ISAs. Along + with the build-system support the following kernels have + no proto-kernels taking advantage of these architectures: + + * 32f_x2_dot_prod_32f + * 32fc_x2_multiply_32fc + * 64_byteswap + * 32f_binary_slicer_8i + * 16u_byteswap + * 32u_byteswap + + QA/profiler + ----------- + + The profiler now generates buffers that are vlen + a tiny + amount and generates random data to fill buffers. This is + intended to catch bugs in protokernels that write beyond + num_points. + + -- A. Maitland Bottoms Wed, 26 Aug 2015 09:22:48 -0400 + +volk (1.0.2-2) unstable; urgency=low + + * Use SOURCE_DATE_EPOCH from the environment, if defined, + rather than current date and time to implement volk_build_date() + (embedding build date in a library does not help reproducible builds) + * add watch file + + -- A. Maitland Bottoms Sat, 15 Aug 2015 17:43:15 -0400 + +volk (1.0.2-1) unstable; urgency=medium + + * Maintenance release 24 Jul 2015 by Nathan West + * The major change is the CMake logic to add ASM protokernels. Rather + than depending on CFLAGS and ASMFLAGS we use the results of VOLK's + built in has_ARCH tests. All configurations should work the same as + before, but manually specifying CFLAGS and ASMFLAGS on the cmake call + for ARM native builds should no longer be necessary. + * The 32fc_s32fc_x2_rotator_32fc generic protokernel now includes a + previously implied header. + * Finally, there is a fix to return the "best" protokernel to the + dispatcher when no volk_config exists. Thanks to Alexandre Raymond for + pointing this out. + * with maint branch patch: + kernels-add-missing-include-arm_neon.h + * removed unused build-dependency on liboil0.3-dev (closes: #793626) + + -- A. Maitland Bottoms Wed, 05 Aug 2015 00:43:40 -0400 + +volk (1.0.1-1) unstable; urgency=low + + * Maintenance Release v1.0.1 08 Jul 2015 by Nathan West + This is a maintenance release with bug fixes since the initial release of + v1.0 in April. + + * Contributors + + The following authors have contributed code to this release: + + Doug Geiger doug.geiger@bioradiation.net + Elliot Briggs elliot.briggs@gmail.com + Marcus Mueller marcus@hostalia.de + Nathan West nathan.west@okstate.edu + Tom Rondeau tom@trondeau.com + + * Kernels + + Several bug fixes in different kernels. The NEON implementations of the + following kernels have been fixed: + + 32f_x2_add_32f + 32f_x2_dot_prod_32f + 32fc_s32fc_multiply_32fc + 32fc_x2_multiply_32fc + + Additionally the NEON asm based 32f_x2_add_32f protokernels were not being + used and are now included and available for use via the dispatcher. + + The 32f_s32f_x2_fm_detect_32f kernel now has a puppet. This solves QA seg + faults on 32-bit machines and provide a better test for this kernel. + + The 32fc_s32fc_x2_rotator_32fc generic protokernel replaced cabsf with + hypotf for better Android support. + + * Building + + Static builds now trigger the applications (volk_profile and + volk-config-info) to be statically linked. + + The file gcc_x86_cpuid.h has been removed since it was no longer being + used. Previously it provided cpuid functionality for ancient compilers + that we do not support. + + All build types now use -Wall. + + * QA and Testing + + The documentation around the --update option to volk_profile now makes it + clear that the option will only profile kernels without entries in + volk_profile. The signature of run_volk_tests with expanded args changed + signed types to unsigned types to reflect the actual input. + + The remaining changes are all non-functional changes to address issues + from Coverity. + + -- A. Maitland Bottoms Fri, 10 Jul 2015 17:57:42 -0400 + +volk (1.0-5) unstable; urgency=medium + + * native-armv7-build-support skips neon on Debian armel (Closes: #789972) + + -- A. Maitland Bottoms Sat, 04 Jul 2015 12:36:36 -0400 + +volk (1.0-4) unstable; urgency=low + + * update native-armv7-build-support patch from gnuradio volk package + + -- A. Maitland Bottoms Thu, 25 Jun 2015 16:38:49 -0400 + +volk (1.0-3) unstable; urgency=medium + + * Add Breaks/Replaces (Closes: #789893, #789894) + * Allow failing tests + + -- A. Maitland Bottoms Thu, 25 Jun 2015 12:46:06 -0400 + +volk (1.0-2) unstable; urgency=medium + + * kernels-add-missing-math.h-include-to-rotator + + -- A. Maitland Bottoms Wed, 24 Jun 2015 21:09:32 -0400 + +volk (1.0-1) unstable; urgency=low + + * Initial package (Closes: #782417) + Initial Release 11 Apr 2015 by Nathan West + + VOLK 1.0 is available. This is the first release of VOLK as an independently + tracked sub-project of GNU Radio. + + * Contributors + + VOLK has been tracked separately from GNU Radio since 2014 Dec 23. + Contributors between the split and the initial release are + + Albert Holguin aholguin_77@yahoo.com + Doug Geiger doug.geiger@bioradiation.net + Elliot Briggs elliot.briggs@gmail.com + Julien Olivain julien.olivain@lsv.ens-cachan.fr + Michael Dickens michael.dickens@ettus.com + Nathan West nathan.west@okstate.edu + Tom Rondeau tom@trondeau.com + + * QA + + The test and profiler have significantly changed. The profiler supports + run-time changes to vlen and iters to help kernel development and provide + more flexibility on embedded systems. Additionally there is a new option + to update an existing volk_profile results file with only new kernels which + will save time when updating to newer versions of VOLK + + The QA system creates a static list of kernels and test cases. The QA + testing and profiler iterate over this static list rather than each source + file keeping its own list. The QA also emits XML results to + lib/.unittest/kernels.xml which is formatted similarly to JUnit results. + + * Modtool + + Modtool was updated to support the QA and profiler changes. + + * Kernels + + New proto-kernels: + + 16ic_deinterleave_real_8i_neon + 16ic_s32f_deinterleave_32f_neon + fix preprocessor errors for some compilers on byteswap and popcount puppets + + ORC was moved to the asm kernels directory. + volk_malloc + + The posix_memalign implementation of Volk_malloc now falls back to a standard + malloc if alignment is 1. + + * Miscellaneous + + Several build system and cmake changes have made it possible to build VOLK + both independently with proper soname versions and in-tree for projects + such as GNU Radio. + + The static builds take advantage of cmake object libraries to speed up builds. + + Finally, there are a number of changes to satisfy compiler warnings and make + QA work on multiple machines. + + -- A. Maitland Bottoms Sun, 12 Apr 2015 23:20:41 -0400 diff --git a/control b/control new file mode 100644 index 0000000..56a16d2 --- /dev/null +++ b/control @@ -0,0 +1,83 @@ +Source: volk +Section: libdevel +Priority: optional +Maintainer: A. Maitland Bottoms +Build-Depends: cmake, + debhelper-compat (= 13), + dh-python, + liborc-0.4-dev, + libcpu-features-dev [amd64 arm64 armel armhf i386 mips64el mipsel powerpc ppc64 ppc64el x32], + python3-dev, + python3-mako +Build-Depends-Indep: doxygen, graphviz +Standards-Version: 4.5.1 +Rules-Requires-Root: no +Homepage: https://libvolk.org +Vcs-Browser: https://salsa.debian.org/bottoms/pkg-volk +Vcs-Git: https://salsa.debian.org/bottoms/pkg-volk.git + +Package: libvolk2.5 +Section: libs +Architecture: any +Pre-Depends: ${misc:Pre-Depends} +Depends: ${misc:Depends}, ${shlibs:Depends} +Multi-Arch: same +Recommends: libvolk2-bin +Suggests: libvolk2-dev +Description: vector optimized functions + Vector-Optimized Library of Kernels is designed to help + applications work with the processor's SIMD instruction sets. These are + very powerful vector operations that can give signal processing a + huge boost in performance. + +Package: libvolk2-dev +Architecture: any +Pre-Depends: ${misc:Pre-Depends} +Depends: libvolk2.5 (=${binary:Version}), ${misc:Depends} +Breaks: gnuradio-dev (<<3.7.8), libvolk-dev, libvolk1.0-dev, libvolk1-dev +Replaces: gnuradio-dev (<<3.7.8), libvolk-dev, libvolk1.0-dev, libvolk1-dev +Suggests: libvolk2-doc +Multi-Arch: same +Description: vector optimized function headers + Vector-Optimized Library of Kernels is designed to help + applications work with the processor's SIMD instruction sets. These are + very powerful vector operations that can give signal processing a + huge boost in performance. + . + This package contains the header files. + For documentation, see libvolk-doc. + +Package: libvolk2-bin +Section: libs +Architecture: any +Pre-Depends: ${misc:Pre-Depends} +Depends: libvolk2.5 (=${binary:Version}), + ${misc:Depends}, + ${python3:Depends}, + ${shlibs:Depends} +Breaks: libvolk1-bin, libvolk-bin, libvolk1.0-bin, gnuradio (<=3.7.2.1) +Replaces: libvolk1-bin, libvolk-bin, libvolk1.0-bin, gnuradio (<=3.7.2.1) +Description: vector optimized runtime tools + Vector-Optimized Library of Kernels is designed to help + applications work with the processor's SIMD instruction sets. These are + very powerful vector operations that can give signal processing a + huge boost in performance. + . + This package includes: the volk_profile tool to customize settings for + the system; volk_modtool to create new optimized modules; and + volk-config-info to show settings. + +Package: libvolk2-doc +Section: doc +Architecture: all +Multi-Arch: foreign +Depends: ${misc:Depends} +Recommends: www-browser +Description: vector optimized library documentation + Vector-Optimized Library of Kernels is designed to help + applications work with the processor's SIMD instruction sets. These are + very powerful vector operations that can give signal processing a + huge boost in performance. + . + This package includes the Doxygen generated documentation in + /usr/share/doc/libvolk2-dev/html/index.html diff --git a/copyright b/copyright new file mode 100644 index 0000000..92bbc43 --- /dev/null +++ b/copyright @@ -0,0 +1,195 @@ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: volk +Upstream-Contact: http://libvolk.org/ +Source: + https://github.com/gnuradio/volk + https://github.com/google/cpu_features +Comment: + Debian packages by A. Maitland Bottoms + git archive --format=tar --prefix=volk-2.3.0/ v2.3.0 | xz > ../volk_2.3.0.orig.tar.xz + git archive --format=tar --prefix=cpu_features/ v0.6.0 | xz > ../volk_2.4.0.orig-cpu_features.tar.xz + . + Upstream Maintainers: + Johannes Demel + Michael Dickens +Copyright: 2014-2020 Free Software Foundation, Inc. +License: GPL-3+ + +Files: * +Copyright: 2006, 2009-2020, Free Software Foundation, Inc. +License: GPL-3+ + +Files: Doxyfile.in + DoxygenLayout.xml + volk.pc.in +Copyright: 2014-2020 Free Software Foundation, Inc. +License: GPL-3+ + +Files: apps/volk_profile.h +Copyright: 2014-2020 Free Software Foundation, Inc. +License: GPL-3+ + +Files: appveyor.yml +Copyright: 2016 Paul Cercueil +License: GPL-3+ + +Files: cmake/* +Copyright: 2014-2020 Free Software Foundation, Inc. +License: GPL-3+ + +Files: cmake/Modules/* +Copyright: 2006, 2009-2020, Free Software Foundation, Inc. +License: GPL-3+ + +Files: cpu_features/* +Copyright: 2020 Google LLC +License: Apache-2.0 + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + . + http://www.apache.org/licenses/LICENSE-2.0 + . + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + . + On Debian systems, the complete text of the Apache-2.0 License + can be found in "/usr/share/common-licenses/Apache-2.0". + +Files: cmake/Modules/CMakeParseArgumentsCopy.cmake +Copyright: 2010 Alexander Neundorf +License: Kitware-BSD + All rights reserved. + . + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + . + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + . + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + . + * Neither the names of Kitware, Inc., the Insight Software Consortium, + nor the names of their contributors may be used to endorse or promote + products derived from this software without specific prior written + permission. + . + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Files: cmake/Modules/FindORC.cmake + cmake/Modules/VolkConfig.cmake.in +Copyright: 2014-2015 Free Software Foundation, Inc. +License: GPL-3+ + +Files: cmake/msvc/* +Copyright: 2006-2008, Alexander Chemeris +License: BSD-2-clause + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + . + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + . + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + . + 3. The name of the author may be used to endorse or promote products + derived from this software without specific prior written permission. + . + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO + EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Files: debian/* +Copyright: 2015-2020 Free Software Foundation, Inc +License: GPL-3+ +Comment: assigned by A. Maitland Bottoms + +Files: docs/* +Copyright: 2014-2015 Free Software Foundation, Inc. +License: GPL-3+ + +Files: gen/archs.xml + gen/machines.xml +Copyright: 2014-2015 Free Software Foundation, Inc. +License: GPL-3+ + +Files: include/volk/volk_common.h + include/volk/volk_complex.h + include/volk/volk_prefs.h +Copyright: 2014-2015 Free Software Foundation, Inc. +License: GPL-3+ + +Files: kernels/volk/asm/* +Copyright: 2014-2015 Free Software Foundation, Inc. +License: GPL-3+ + +Files: kernels/volk/volk_16u_byteswappuppet_16u.h + kernels/volk/volk_32u_byteswappuppet_32u.h + kernels/volk/volk_64u_byteswappuppet_64u.h +Copyright: 2014-2015 Free Software Foundation, Inc. +License: GPL-3+ + +Files: lib/kernel_tests.h + lib/qa_utils.cc + lib/qa_utils.h + lib/volk_prefs.c +Copyright: 2014-2015 Free Software Foundation, Inc. +License: GPL-3+ + +License: LGPL-2+ + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + . + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + . + You should have received a copy of the GNU Library General Public License + along with this library; see the file COPYING.LIB. If not, write to + the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. + +License: GPL-3+ + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + . + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + . + You should have received a copy of the GNU General Public License + along with this program. If not, see . + . + On Debian systems, the complete text of the GNU General + Public License version 3 can be found in "/usr/share/common-licenses/GPL-3". diff --git a/libvolk2-bin.install b/libvolk2-bin.install new file mode 100644 index 0000000..7221b71 --- /dev/null +++ b/libvolk2-bin.install @@ -0,0 +1,2 @@ +usr/bin/volk* +usr/lib/python3/dist-packages diff --git a/libvolk2-bin.manpages b/libvolk2-bin.manpages new file mode 100644 index 0000000..95bae9e --- /dev/null +++ b/libvolk2-bin.manpages @@ -0,0 +1,3 @@ +debian/volk-config-info.1 +debian/volk_modtool.1 +debian/volk_profile.1 diff --git a/libvolk2-dev.acc b/libvolk2-dev.acc new file mode 100644 index 0000000..37f5a79 --- /dev/null +++ b/libvolk2-dev.acc @@ -0,0 +1,50 @@ + + + + + -DHAVE_CPUID_H + -DHAVE_DLFCN_H + -DHAVE_FENV_H + -DHAVE_POSIX_MEMALIGN + -DHAVE_XGETBV + -D_GLIBCXX_USE_CXX11_ABI=1 + -I/usr/include/orc-0.4 + -DNDEBUG + -std=gnu11 + -m64 + -mmmx + -msse + -msse2 + -msse3 + -mssse3 + -msse4.1 + -msse4.2 + -mpopcnt + -mavx + -mfma + -mavx2 + -mavx512f + -mavx512cd + -fPIC + -g + -O2 + -fstack-protector-strong + -Wformat + -Werror=format-security + -Wdate-time + -D_FORTIFY_SOURCE=2 + -fvisibility=hidden + -Wsign-compare + -Wall + -Wno-uninitialized + + + +debian/libvolk2-dev/usr/include/volk/ + + + +debian/libvolk2.0/usr/lib/ + + + diff --git a/libvolk2-dev.install b/libvolk2-dev.install new file mode 100644 index 0000000..8b14c56 --- /dev/null +++ b/libvolk2-dev.install @@ -0,0 +1,5 @@ +usr/include/* +usr/lib/*/*volk.a +usr/lib/*/*volk*so +usr/lib/*/cmake/volk +usr/lib/*/pkgconfig/*volk* diff --git a/libvolk2-doc.doc-base b/libvolk2-doc.doc-base new file mode 100644 index 0000000..3d5fdc8 --- /dev/null +++ b/libvolk2-doc.doc-base @@ -0,0 +1,19 @@ +Document: libvolk2-doc +Title: Vector-Optimized Library of Kernels Reference Manual +Author: GNU Radio Developers +Abstract: VOLK is the Vector-Optimized Library of Kernels. + It is a library that contains kernels of hand-written SIMD code for + different mathematical operations. Since each SIMD architecture can + be very different and no compiler has yet come along to handle + vectorization properly or highly efficiently, VOLK approaches the + problem differently. For each architecture or platform that a + developer wishes to vectorize for, a new proto-kernel is added to + VOLK. At runtime, VOLK will select the correct proto-kernel. In this + way, the users of VOLK call a kernel for performing the operation + that is platform/architecture agnostic. This allows us to write + portable SIMD code. +Section: Programming/C++ + +Format: HTML +Index: /usr/share/doc/libvolk2-dev/html/index.html +Files: /usr/share/doc/libvolk2-dev/html/*.html diff --git a/libvolk2-doc.docs b/libvolk2-doc.docs new file mode 100644 index 0000000..87dd314 --- /dev/null +++ b/libvolk2-doc.docs @@ -0,0 +1 @@ +obj-*/html diff --git a/libvolk2.5.install b/libvolk2.5.install new file mode 100644 index 0000000..e4252f4 --- /dev/null +++ b/libvolk2.5.install @@ -0,0 +1 @@ +usr/lib/*/libvolk.so.* diff --git a/not-installed b/not-installed new file mode 100644 index 0000000..6f354d0 --- /dev/null +++ b/not-installed @@ -0,0 +1,6 @@ +usr/bin/list_cpu_features +usr/lib/*/cmake/CpuFeatures/CpuFeaturesConfig.cmake +usr/lib/*/cmake/CpuFeatures/CpuFeaturesConfigVersion.cmake +usr/lib/*/cmake/CpuFeatures/CpuFeaturesTargets-relwithdebinfo.cmake +usr/lib/*/cmake/CpuFeatures/CpuFeaturesTargets.cmake +usr/lib/*/libcpu_features.a diff --git a/patches/0001-Add-volk_32f-c-_index_min_16-32u.patch b/patches/0001-Add-volk_32f-c-_index_min_16-32u.patch new file mode 100644 index 0000000..e10de2e --- /dev/null +++ b/patches/0001-Add-volk_32f-c-_index_min_16-32u.patch @@ -0,0 +1,2160 @@ +From 7b5349217768244e646e12c8f53bbed3d66e0761 Mon Sep 17 00:00:00 2001 +From: Zlika +Date: Wed, 9 Jun 2021 22:47:04 +0200 +Subject: [PATCH 01/73] Add volk_32f(c)_index_min_16/32u + +Signed-off-by: Zlika +--- + docs/kernels.dox | 4 + + include/volk/volk_avx2_intrinsics.h | 114 ++++- + kernels/volk/volk_32f_index_min_16u.h | 375 +++++++++++++++++ + kernels/volk/volk_32f_index_min_32u.h | 558 +++++++++++++++++++++++++ + kernels/volk/volk_32fc_index_min_16u.h | 482 +++++++++++++++++++++ + kernels/volk/volk_32fc_index_min_32u.h | 524 +++++++++++++++++++++++ + lib/kernel_tests.h | 4 + + 7 files changed, 2060 insertions(+), 1 deletion(-) + create mode 100644 kernels/volk/volk_32f_index_min_16u.h + create mode 100644 kernels/volk/volk_32f_index_min_32u.h + create mode 100644 kernels/volk/volk_32fc_index_min_16u.h + create mode 100644 kernels/volk/volk_32fc_index_min_32u.h + +diff --git a/docs/kernels.dox b/docs/kernels.dox +index e9898f1..55e567b 100644 +--- a/docs/kernels.dox ++++ b/docs/kernels.dox +@@ -48,6 +48,8 @@ + \li \subpage volk_32fc_deinterleave_real_64f + \li \subpage volk_32fc_index_max_16u + \li \subpage volk_32fc_index_max_32u ++\li \subpage volk_32fc_index_min_16u ++\li \subpage volk_32fc_index_min_32u + \li \subpage volk_32fc_magnitude_32f + \li \subpage volk_32fc_magnitude_squared_32f + \li \subpage volk_32f_cos_32f +@@ -63,6 +65,8 @@ + \li \subpage volk_32f_expfast_32f + \li \subpage volk_32f_index_max_16u + \li \subpage volk_32f_index_max_32u ++\li \subpage volk_32f_index_min_16u ++\li \subpage volk_32f_index_min_32u + \li \subpage volk_32f_invsqrt_32f + \li \subpage volk_32f_log2_32f + \li \subpage volk_32f_s32f_calc_spectral_noise_floor_32f +diff --git a/include/volk/volk_avx2_intrinsics.h b/include/volk/volk_avx2_intrinsics.h +index 2c397d9..21060d6 100644 +--- a/include/volk/volk_avx2_intrinsics.h ++++ b/include/volk/volk_avx2_intrinsics.h +@@ -130,7 +130,7 @@ static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, + * float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src1) + * bool compare = abs_squared > max_values[j]; + * max_values[j] = compare ? abs_squared : max_values[j]; +- * max_indices[j] = compare ? current_indices[j] > max_indices[j] ++ * max_indices[j] = compare ? current_indices[j] : max_indices[j] + * current_indices[j] += 8; // update for next outer loop iteration + * ++src0; + * } +@@ -231,4 +231,116 @@ static inline void vector_32fc_index_max_variant1(__m256 in0, + *current_indices = _mm256_add_epi32(*current_indices, indices_increment); + } + ++/* ++ * The function below vectorizes the inner loop of the following code: ++ * ++ * float min_values[8] = {FLT_MAX}; ++ * unsigned min_indices[8] = {0}; ++ * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7}; ++ * for (unsigned i = 0; i < num_points / 8; ++i) { ++ * for (unsigned j = 0; j < 8; ++j) { ++ * float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src1) ++ * bool compare = abs_squared < min_values[j]; ++ * min_values[j] = compare ? abs_squared : min_values[j]; ++ * min_indices[j] = compare ? current_indices[j] : min_indices[j] ++ * current_indices[j] += 8; // update for next outer loop iteration ++ * ++src0; ++ * } ++ * } ++ */ ++static inline void vector_32fc_index_min_variant0(__m256 in0, ++ __m256 in1, ++ __m256* min_values, ++ __m256i* min_indices, ++ __m256i* current_indices, ++ __m256i indices_increment) ++{ ++ in0 = _mm256_mul_ps(in0, in0); ++ in1 = _mm256_mul_ps(in1, in1); ++ ++ /* ++ * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0) ++ * hadd_ps(a, b) computes ++ * (b_7 + b_6, ++ * b_5 + b_4, ++ * --------- ++ * a_7 + b_6, ++ * a_5 + a_4, ++ * --------- ++ * b_3 + b_2, ++ * b_1 + b_0, ++ * --------- ++ * a_3 + a_2, ++ * a_1 + a_0). ++ * The result is the squared absolute value of complex numbers at index ++ * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of ++ * current_indices! ++ */ ++ __m256 abs_squared = _mm256_hadd_ps(in0, in1); ++ ++ /* ++ * Compare the recently computed squared absolute values with the ++ * previously determined minimum values. cmp_ps(a, b) determines ++ * a < b ? 0xFFFFFFFF for each element in the vectors => ++ * compare_mask = abs_squared < min_values ? 0xFFFFFFFF : 0 ++ * ++ * If either operand is NaN, 0 is returned as an “ordered” comparision is ++ * used => the blend operation will select the value from *min_values. ++ */ ++ __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS); ++ ++ /* Select minimum by blending. This is the only line which differs from variant1 */ ++ *min_values = _mm256_blendv_ps(*min_values, abs_squared, compare_mask); ++ ++ /* ++ * Updates indices: blendv_ps(a, b, mask) determines mask ? b : a for ++ * each element in the vectors => ++ * min_indices = compare_mask ? current_indices : min_indices ++ * ++ * Note: The casting of data types is required to make the compiler happy ++ * and does not change values. ++ */ ++ *min_indices = ++ _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices), ++ _mm256_castsi256_ps(*current_indices), ++ compare_mask)); ++ ++ /* compute indices of complex numbers which will be loaded in the next iteration */ ++ *current_indices = _mm256_add_epi32(*current_indices, indices_increment); ++} ++ ++/* See _variant0 for details */ ++static inline void vector_32fc_index_min_variant1(__m256 in0, ++ __m256 in1, ++ __m256* min_values, ++ __m256i* min_indices, ++ __m256i* current_indices, ++ __m256i indices_increment) ++{ ++ in0 = _mm256_mul_ps(in0, in0); ++ in1 = _mm256_mul_ps(in1, in1); ++ ++ __m256 abs_squared = _mm256_hadd_ps(in0, in1); ++ __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS); ++ ++ /* ++ * This is the only line which differs from variant0. Using maxps instead of ++ * blendvps is faster on Intel CPUs (on the ones tested with). ++ * ++ * Note: The order of arguments matters if a NaN is encountered in which ++ * case the value of the second argument is selected. This is consistent ++ * with the “ordered” comparision and the blend operation: The comparision ++ * returns false if a NaN is encountered and the blend operation ++ * consequently selects the value from min_indices. ++ */ ++ *min_values = _mm256_min_ps(abs_squared, *min_values); ++ ++ *min_indices = ++ _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices), ++ _mm256_castsi256_ps(*current_indices), ++ compare_mask)); ++ ++ *current_indices = _mm256_add_epi32(*current_indices, indices_increment); ++} ++ + #endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */ +diff --git a/kernels/volk/volk_32f_index_min_16u.h b/kernels/volk/volk_32f_index_min_16u.h +new file mode 100644 +index 0000000..848b75c +--- /dev/null ++++ b/kernels/volk/volk_32f_index_min_16u.h +@@ -0,0 +1,375 @@ ++/* -*- c++ -*- */ ++/* ++ * Copyright 2021 Free Software Foundation, Inc. ++ * ++ * This file is part of GNU Radio ++ * ++ * GNU Radio is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 3, or (at your option) ++ * any later version. ++ * ++ * GNU Radio is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNU Radio; see the file COPYING. If not, write to ++ * the Free Software Foundation, Inc., 51 Franklin Street, ++ * Boston, MA 02110-1301, USA. ++ */ ++ ++/*! ++ * \page volk_32f_index_min_16u ++ * ++ * \b Overview ++ * ++ * Returns Argmin_i x[i]. Finds and returns the index which contains ++ * the fist minimum value in the given vector. ++ * ++ * Note that num_points is a uint32_t, but the return value is ++ * uint16_t. Providing a vector larger than the max of a uint16_t ++ * (65536) would miss anything outside of this boundary. The kernel ++ * will check the length of num_points and cap it to this max value, ++ * anyways. ++ * ++ * Dispatcher Prototype ++ * \code ++ * void volk_32f_index_min_16u(uint16_t* target, const float* src0, uint32_t num_points) ++ * \endcode ++ * ++ * \b Inputs ++ * \li src0: The input vector of floats. ++ * \li num_points: The number of data points. ++ * ++ * \b Outputs ++ * \li target: The index of the fist minimum value in the input buffer. ++ * ++ * \b Example ++ * \code ++ * int N = 10; ++ * uint32_t alignment = volk_get_alignment(); ++ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment); ++ * uint16_t* out = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment); ++ * ++ * for(uint32_t ii = 0; ii < N; ++ii){ ++ * float x = (float)ii; ++ * // a parabola with a minimum at x=4 ++ * in[ii] = (x-4) * (x-4) - 5; ++ * } ++ * ++ * volk_32f_index_min_16u(out, in, N); ++ * ++ * printf("minimum is %1.2f at index %u\n", in[*out], *out); ++ * ++ * volk_free(in); ++ * volk_free(out); ++ * \endcode ++ */ ++ ++#ifndef INCLUDED_volk_32f_index_min_16u_a_H ++#define INCLUDED_volk_32f_index_min_16u_a_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_AVX ++#include ++ ++static inline void ++volk_32f_index_min_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points) ++{ ++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++ ++ uint32_t number = 0; ++ const uint32_t eighthPoints = num_points / 8; ++ ++ float* inputPtr = (float*)src0; ++ ++ __m256 indexIncrementValues = _mm256_set1_ps(8); ++ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); ++ ++ float min = src0[0]; ++ float index = 0; ++ __m256 minValues = _mm256_set1_ps(min); ++ __m256 minValuesIndex = _mm256_setzero_ps(); ++ __m256 compareResults; ++ __m256 currentValues; ++ ++ __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8]; ++ __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8]; ++ ++ for (; number < eighthPoints; number++) { ++ ++ currentValues = _mm256_load_ps(inputPtr); ++ inputPtr += 8; ++ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); ++ ++ compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS); ++ ++ minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults); ++ minValues = _mm256_blendv_ps(minValues, currentValues, compareResults); ++ } ++ ++ // Calculate the smallest value from the remaining 4 points ++ _mm256_store_ps(minValuesBuffer, minValues); ++ _mm256_store_ps(minIndexesBuffer, minValuesIndex); ++ ++ for (number = 0; number < 8; number++) { ++ if (minValuesBuffer[number] < min) { ++ index = minIndexesBuffer[number]; ++ min = minValuesBuffer[number]; ++ } else if (minValuesBuffer[number] == min) { ++ if (index > minIndexesBuffer[number]) ++ index = minIndexesBuffer[number]; ++ } ++ } ++ ++ number = eighthPoints * 8; ++ for (; number < num_points; number++) { ++ if (src0[number] < min) { ++ index = number; ++ min = src0[number]; ++ } ++ } ++ target[0] = (uint16_t)index; ++} ++ ++#endif /*LV_HAVE_AVX*/ ++ ++#ifdef LV_HAVE_SSE4_1 ++#include ++ ++static inline void ++volk_32f_index_min_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points) ++{ ++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++ ++ uint32_t number = 0; ++ const uint32_t quarterPoints = num_points / 4; ++ ++ float* inputPtr = (float*)src0; ++ ++ __m128 indexIncrementValues = _mm_set1_ps(4); ++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); ++ ++ float min = src0[0]; ++ float index = 0; ++ __m128 minValues = _mm_set1_ps(min); ++ __m128 minValuesIndex = _mm_setzero_ps(); ++ __m128 compareResults; ++ __m128 currentValues; ++ ++ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; ++ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; ++ ++ for (; number < quarterPoints; number++) { ++ ++ currentValues = _mm_load_ps(inputPtr); ++ inputPtr += 4; ++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ ++ compareResults = _mm_cmplt_ps(currentValues, minValues); ++ ++ minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults); ++ minValues = _mm_blendv_ps(minValues, currentValues, compareResults); ++ } ++ ++ // Calculate the smallest value from the remaining 4 points ++ _mm_store_ps(minValuesBuffer, minValues); ++ _mm_store_ps(minIndexesBuffer, minValuesIndex); ++ ++ for (number = 0; number < 4; number++) { ++ if (minValuesBuffer[number] < min) { ++ index = minIndexesBuffer[number]; ++ min = minValuesBuffer[number]; ++ } else if (minValuesBuffer[number] == min) { ++ if (index > minIndexesBuffer[number]) ++ index = minIndexesBuffer[number]; ++ } ++ } ++ ++ number = quarterPoints * 4; ++ for (; number < num_points; number++) { ++ if (src0[number] < min) { ++ index = number; ++ min = src0[number]; ++ } ++ } ++ target[0] = (uint16_t)index; ++} ++ ++#endif /*LV_HAVE_SSE4_1*/ ++ ++ ++#ifdef LV_HAVE_SSE ++ ++#include ++ ++static inline void ++volk_32f_index_min_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points) ++{ ++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++ ++ uint32_t number = 0; ++ const uint32_t quarterPoints = num_points / 4; ++ ++ float* inputPtr = (float*)src0; ++ ++ __m128 indexIncrementValues = _mm_set1_ps(4); ++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); ++ ++ float min = src0[0]; ++ float index = 0; ++ __m128 minValues = _mm_set1_ps(min); ++ __m128 minValuesIndex = _mm_setzero_ps(); ++ __m128 compareResults; ++ __m128 currentValues; ++ ++ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; ++ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; ++ ++ for (; number < quarterPoints; number++) { ++ ++ currentValues = _mm_load_ps(inputPtr); ++ inputPtr += 4; ++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ ++ compareResults = _mm_cmplt_ps(currentValues, minValues); ++ ++ minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), ++ _mm_andnot_ps(compareResults, minValuesIndex)); ++ minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), ++ _mm_andnot_ps(compareResults, minValues)); ++ } ++ ++ // Calculate the smallest value from the remaining 4 points ++ _mm_store_ps(minValuesBuffer, minValues); ++ _mm_store_ps(minIndexesBuffer, minValuesIndex); ++ ++ for (number = 0; number < 4; number++) { ++ if (minValuesBuffer[number] < min) { ++ index = minIndexesBuffer[number]; ++ min = minValuesBuffer[number]; ++ } else if (minValuesBuffer[number] == min) { ++ if (index > minIndexesBuffer[number]) ++ index = minIndexesBuffer[number]; ++ } ++ } ++ ++ number = quarterPoints * 4; ++ for (; number < num_points; number++) { ++ if (src0[number] < min) { ++ index = number; ++ min = src0[number]; ++ } ++ } ++ target[0] = (uint16_t)index; ++} ++ ++#endif /*LV_HAVE_SSE*/ ++ ++ ++#ifdef LV_HAVE_GENERIC ++ ++static inline void ++volk_32f_index_min_16u_generic(uint16_t* target, const float* src0, uint32_t num_points) ++{ ++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++ ++ float min = src0[0]; ++ uint16_t index = 0; ++ ++ uint32_t i = 1; ++ ++ for (; i < num_points; ++i) { ++ if (src0[i] < min) { ++ index = i; ++ min = src0[i]; ++ } ++ } ++ target[0] = index; ++} ++ ++#endif /*LV_HAVE_GENERIC*/ ++ ++ ++#endif /*INCLUDED_volk_32f_index_min_16u_a_H*/ ++ ++ ++#ifndef INCLUDED_volk_32f_index_min_16u_u_H ++#define INCLUDED_volk_32f_index_min_16u_u_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_AVX ++#include ++ ++static inline void ++volk_32f_index_min_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points) ++{ ++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++ ++ uint32_t number = 0; ++ const uint32_t eighthPoints = num_points / 8; ++ ++ float* inputPtr = (float*)src0; ++ ++ __m256 indexIncrementValues = _mm256_set1_ps(8); ++ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); ++ ++ float min = src0[0]; ++ float index = 0; ++ __m256 minValues = _mm256_set1_ps(min); ++ __m256 minValuesIndex = _mm256_setzero_ps(); ++ __m256 compareResults; ++ __m256 currentValues; ++ ++ __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8]; ++ __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8]; ++ ++ for (; number < eighthPoints; number++) { ++ ++ currentValues = _mm256_loadu_ps(inputPtr); ++ inputPtr += 8; ++ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); ++ ++ compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS); ++ ++ minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults); ++ minValues = _mm256_blendv_ps(minValues, currentValues, compareResults); ++ } ++ ++ // Calculate the smallest value from the remaining 4 points ++ _mm256_storeu_ps(minValuesBuffer, minValues); ++ _mm256_storeu_ps(minIndexesBuffer, minValuesIndex); ++ ++ for (number = 0; number < 8; number++) { ++ if (minValuesBuffer[number] < min) { ++ index = minIndexesBuffer[number]; ++ min = minValuesBuffer[number]; ++ } else if (minValuesBuffer[number] == min) { ++ if (index > minIndexesBuffer[number]) ++ index = minIndexesBuffer[number]; ++ } ++ } ++ ++ number = eighthPoints * 8; ++ for (; number < num_points; number++) { ++ if (src0[number] < min) { ++ index = number; ++ min = src0[number]; ++ } ++ } ++ target[0] = (uint16_t)index; ++} ++ ++#endif /*LV_HAVE_AVX*/ ++ ++#endif /*INCLUDED_volk_32f_index_min_16u_u_H*/ +diff --git a/kernels/volk/volk_32f_index_min_32u.h b/kernels/volk/volk_32f_index_min_32u.h +new file mode 100644 +index 0000000..67ee426 +--- /dev/null ++++ b/kernels/volk/volk_32f_index_min_32u.h +@@ -0,0 +1,558 @@ ++/* -*- c++ -*- */ ++/* ++ * Copyright 2021 Free Software Foundation, Inc. ++ * ++ * This file is part of GNU Radio ++ * ++ * GNU Radio is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 3, or (at your option) ++ * any later version. ++ * ++ * GNU Radio is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNU Radio; see the file COPYING. If not, write to ++ * the Free Software Foundation, Inc., 51 Franklin Street, ++ * Boston, MA 02110-1301, USA. ++ */ ++ ++/*! ++ * \page volk_32f_index_min_32u ++ * ++ * \b Overview ++ * ++ * Returns Argmin_i x[i]. Finds and returns the index which contains the first minimum ++ * value in the given vector. ++ * ++ * Dispatcher Prototype ++ * \code ++ * void volk_32f_index_min_32u(uint32_t* target, const float* src0, uint32_t num_points) ++ * \endcode ++ * ++ * \b Inputs ++ * \li src0: The input vector of floats. ++ * \li num_points: The number of data points. ++ * ++ * \b Outputs ++ * \li target: The index of the first minimum value in the input buffer. ++ * ++ * \b Example ++ * \code ++ * int N = 10; ++ * uint32_t alignment = volk_get_alignment(); ++ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment); ++ * uint32_t* out = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment); ++ * ++ * for(uint32_t ii = 0; ii < N; ++ii){ ++ * float x = (float)ii; ++ * // a parabola with a minimum at x=4 ++ * in[ii] = (x-4) * (x-4) - 5; ++ * } ++ * ++ * volk_32f_index_min_32u(out, in, N); ++ * ++ * printf("minimum is %1.2f at index %u\n", in[*out], *out); ++ * ++ * volk_free(in); ++ * volk_free(out); ++ * \endcode ++ */ ++ ++#ifndef INCLUDED_volk_32f_index_min_32u_a_H ++#define INCLUDED_volk_32f_index_min_32u_a_H ++ ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE4_1 ++#include ++ ++static inline void ++volk_32f_index_min_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) ++{ ++ if (num_points > 0) { ++ uint32_t number = 0; ++ const uint32_t quarterPoints = num_points / 4; ++ ++ float* inputPtr = (float*)src0; ++ ++ __m128 indexIncrementValues = _mm_set1_ps(4); ++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); ++ ++ float min = src0[0]; ++ float index = 0; ++ __m128 minValues = _mm_set1_ps(min); ++ __m128 minValuesIndex = _mm_setzero_ps(); ++ __m128 compareResults; ++ __m128 currentValues; ++ ++ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; ++ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; ++ ++ for (; number < quarterPoints; number++) { ++ ++ currentValues = _mm_load_ps(inputPtr); ++ inputPtr += 4; ++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ ++ compareResults = _mm_cmplt_ps(currentValues, minValues); ++ ++ minValuesIndex = ++ _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults); ++ minValues = _mm_blendv_ps(minValues, currentValues, compareResults); ++ } ++ ++ // Calculate the smallest value from the remaining 4 points ++ _mm_store_ps(minValuesBuffer, minValues); ++ _mm_store_ps(minIndexesBuffer, minValuesIndex); ++ ++ for (number = 0; number < 4; number++) { ++ if (minValuesBuffer[number] < min) { ++ index = minIndexesBuffer[number]; ++ min = minValuesBuffer[number]; ++ } else if (minValuesBuffer[number] == min) { ++ if (index > minIndexesBuffer[number]) ++ index = minIndexesBuffer[number]; ++ } ++ } ++ ++ number = quarterPoints * 4; ++ for (; number < num_points; number++) { ++ if (src0[number] < min) { ++ index = number; ++ min = src0[number]; ++ } ++ } ++ target[0] = (uint32_t)index; ++ } ++} ++ ++#endif /*LV_HAVE_SSE4_1*/ ++ ++ ++#ifdef LV_HAVE_SSE ++ ++#include ++ ++static inline void ++volk_32f_index_min_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points) ++{ ++ if (num_points > 0) { ++ uint32_t number = 0; ++ const uint32_t quarterPoints = num_points / 4; ++ ++ float* inputPtr = (float*)src0; ++ ++ __m128 indexIncrementValues = _mm_set1_ps(4); ++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); ++ ++ float min = src0[0]; ++ float index = 0; ++ __m128 minValues = _mm_set1_ps(min); ++ __m128 minValuesIndex = _mm_setzero_ps(); ++ __m128 compareResults; ++ __m128 currentValues; ++ ++ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; ++ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; ++ ++ for (; number < quarterPoints; number++) { ++ ++ currentValues = _mm_load_ps(inputPtr); ++ inputPtr += 4; ++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ ++ compareResults = _mm_cmplt_ps(currentValues, minValues); ++ ++ minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), ++ _mm_andnot_ps(compareResults, minValuesIndex)); ++ ++ minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), ++ _mm_andnot_ps(compareResults, minValues)); ++ } ++ ++ // Calculate the smallest value from the remaining 4 points ++ _mm_store_ps(minValuesBuffer, minValues); ++ _mm_store_ps(minIndexesBuffer, minValuesIndex); ++ ++ for (number = 0; number < 4; number++) { ++ if (minValuesBuffer[number] < min) { ++ index = minIndexesBuffer[number]; ++ min = minValuesBuffer[number]; ++ } else if (minValuesBuffer[number] == min) { ++ if (index > minIndexesBuffer[number]) ++ index = minIndexesBuffer[number]; ++ } ++ } ++ ++ number = quarterPoints * 4; ++ for (; number < num_points; number++) { ++ if (src0[number] < min) { ++ index = number; ++ min = src0[number]; ++ } ++ } ++ target[0] = (uint32_t)index; ++ } ++} ++ ++#endif /*LV_HAVE_SSE*/ ++ ++ ++#ifdef LV_HAVE_AVX ++#include ++ ++static inline void ++volk_32f_index_min_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points) ++{ ++ if (num_points > 0) { ++ uint32_t number = 0; ++ const uint32_t quarterPoints = num_points / 8; ++ ++ float* inputPtr = (float*)src0; ++ ++ __m256 indexIncrementValues = _mm256_set1_ps(8); ++ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); ++ ++ float min = src0[0]; ++ float index = 0; ++ __m256 minValues = _mm256_set1_ps(min); ++ __m256 minValuesIndex = _mm256_setzero_ps(); ++ __m256 compareResults; ++ __m256 currentValues; ++ ++ __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8]; ++ __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8]; ++ ++ for (; number < quarterPoints; number++) { ++ currentValues = _mm256_load_ps(inputPtr); ++ inputPtr += 8; ++ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); ++ compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS); ++ minValuesIndex = ++ _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults); ++ minValues = _mm256_blendv_ps(minValues, currentValues, compareResults); ++ } ++ ++ // Calculate the smallest value from the remaining 8 points ++ _mm256_store_ps(minValuesBuffer, minValues); ++ _mm256_store_ps(minIndexesBuffer, minValuesIndex); ++ ++ for (number = 0; number < 8; number++) { ++ if (minValuesBuffer[number] < min) { ++ index = minIndexesBuffer[number]; ++ min = minValuesBuffer[number]; ++ } else if (minValuesBuffer[number] == min) { ++ if (index > minIndexesBuffer[number]) ++ index = minIndexesBuffer[number]; ++ } ++ } ++ ++ number = quarterPoints * 8; ++ for (; number < num_points; number++) { ++ if (src0[number] < min) { ++ index = number; ++ min = src0[number]; ++ } ++ } ++ target[0] = (uint32_t)index; ++ } ++} ++ ++#endif /*LV_HAVE_AVX*/ ++ ++ ++#ifdef LV_HAVE_NEON ++#include ++ ++static inline void ++volk_32f_index_min_32u_neon(uint32_t* target, const float* src0, uint32_t num_points) ++{ ++ if (num_points > 0) { ++ uint32_t number = 0; ++ const uint32_t quarterPoints = num_points / 4; ++ ++ float* inputPtr = (float*)src0; ++ float32x4_t indexIncrementValues = vdupq_n_f32(4); ++ __VOLK_ATTR_ALIGNED(16) ++ float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f }; ++ float32x4_t currentIndexes = vld1q_f32(currentIndexes_float); ++ ++ float min = src0[0]; ++ float index = 0; ++ float32x4_t minValues = vdupq_n_f32(min); ++ uint32x4_t minValuesIndex = vmovq_n_u32(0); ++ uint32x4_t compareResults; ++ uint32x4_t currentIndexes_u; ++ float32x4_t currentValues; ++ ++ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; ++ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; ++ ++ for (; number < quarterPoints; number++) { ++ currentValues = vld1q_f32(inputPtr); ++ inputPtr += 4; ++ currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); ++ currentIndexes_u = vcvtq_u32_f32(currentIndexes); ++ compareResults = vcgeq_f32(currentValues, minValues); ++ minValuesIndex = vorrq_u32(vandq_u32(compareResults, minValuesIndex), ++ vbicq_u32(currentIndexes_u, compareResults)); ++ minValues = vminq_f32(currentValues, minValues); ++ } ++ ++ // Calculate the smallest value from the remaining 4 points ++ vst1q_f32(minValuesBuffer, minValues); ++ vst1q_f32(minIndexesBuffer, vcvtq_f32_u32(minValuesIndex)); ++ for (number = 0; number < 4; number++) { ++ if (minValuesBuffer[number] < min) { ++ index = minIndexesBuffer[number]; ++ min = minValuesBuffer[number]; ++ } else if (minValues[number] == min) { ++ if (index > minIndexesBuffer[number]) ++ index = minIndexesBuffer[number]; ++ } ++ } ++ ++ number = quarterPoints * 4; ++ for (; number < num_points; number++) { ++ if (src0[number] < min) { ++ index = number; ++ min = src0[number]; ++ } ++ } ++ target[0] = (uint32_t)index; ++ } ++} ++ ++#endif /*LV_HAVE_NEON*/ ++ ++ ++#ifdef LV_HAVE_GENERIC ++ ++static inline void ++volk_32f_index_min_32u_generic(uint32_t* target, const float* src0, uint32_t num_points) ++{ ++ if (num_points > 0) { ++ float min = src0[0]; ++ uint32_t index = 0; ++ ++ uint32_t i = 1; ++ ++ for (; i < num_points; ++i) { ++ if (src0[i] < min) { ++ index = i; ++ min = src0[i]; ++ } ++ } ++ target[0] = index; ++ } ++} ++ ++#endif /*LV_HAVE_GENERIC*/ ++ ++ ++#endif /*INCLUDED_volk_32f_index_min_32u_a_H*/ ++ ++ ++#ifndef INCLUDED_volk_32f_index_min_32u_u_H ++#define INCLUDED_volk_32f_index_min_32u_u_H ++ ++#include ++#include ++#include ++ ++ ++#ifdef LV_HAVE_AVX ++#include ++ ++static inline void ++volk_32f_index_min_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points) ++{ ++ if (num_points > 0) { ++ uint32_t number = 0; ++ const uint32_t quarterPoints = num_points / 8; ++ ++ float* inputPtr = (float*)src0; ++ ++ __m256 indexIncrementValues = _mm256_set1_ps(8); ++ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); ++ ++ float min = src0[0]; ++ float index = 0; ++ __m256 minValues = _mm256_set1_ps(min); ++ __m256 minValuesIndex = _mm256_setzero_ps(); ++ __m256 compareResults; ++ __m256 currentValues; ++ ++ __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8]; ++ __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8]; ++ ++ for (; number < quarterPoints; number++) { ++ currentValues = _mm256_loadu_ps(inputPtr); ++ inputPtr += 8; ++ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); ++ compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS); ++ minValuesIndex = ++ _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults); ++ minValues = _mm256_blendv_ps(minValues, currentValues, compareResults); ++ } ++ ++ // Calculate the smalles value from the remaining 8 points ++ _mm256_store_ps(minValuesBuffer, minValues); ++ _mm256_store_ps(minIndexesBuffer, minValuesIndex); ++ ++ for (number = 0; number < 8; number++) { ++ if (minValuesBuffer[number] < min) { ++ index = minIndexesBuffer[number]; ++ min = minValuesBuffer[number]; ++ } else if (minValuesBuffer[number] == min) { ++ if (index > minIndexesBuffer[number]) ++ index = minIndexesBuffer[number]; ++ } ++ } ++ ++ number = quarterPoints * 8; ++ for (; number < num_points; number++) { ++ if (src0[number] < min) { ++ index = number; ++ min = src0[number]; ++ } ++ } ++ target[0] = (uint32_t)index; ++ } ++} ++ ++#endif /*LV_HAVE_AVX*/ ++ ++ ++#ifdef LV_HAVE_SSE4_1 ++#include ++ ++static inline void ++volk_32f_index_min_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) ++{ ++ if (num_points > 0) { ++ uint32_t number = 0; ++ const uint32_t quarterPoints = num_points / 4; ++ ++ float* inputPtr = (float*)src0; ++ ++ __m128 indexIncrementValues = _mm_set1_ps(4); ++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); ++ ++ float min = src0[0]; ++ float index = 0; ++ __m128 minValues = _mm_set1_ps(min); ++ __m128 minValuesIndex = _mm_setzero_ps(); ++ __m128 compareResults; ++ __m128 currentValues; ++ ++ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; ++ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; ++ ++ for (; number < quarterPoints; number++) { ++ currentValues = _mm_loadu_ps(inputPtr); ++ inputPtr += 4; ++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ compareResults = _mm_cmplt_ps(currentValues, minValues); ++ minValuesIndex = ++ _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults); ++ minValues = _mm_blendv_ps(minValues, currentValues, compareResults); ++ } ++ ++ // Calculate the smallest value from the remaining 4 points ++ _mm_store_ps(minValuesBuffer, minValues); ++ _mm_store_ps(minIndexesBuffer, minValuesIndex); ++ ++ for (number = 0; number < 4; number++) { ++ if (minValuesBuffer[number] < min) { ++ index = minIndexesBuffer[number]; ++ min = minValuesBuffer[number]; ++ } else if (minValuesBuffer[number] == min) { ++ if (index > minIndexesBuffer[number]) ++ index = minIndexesBuffer[number]; ++ } ++ } ++ ++ number = quarterPoints * 4; ++ for (; number < num_points; number++) { ++ if (src0[number] < min) { ++ index = number; ++ min = src0[number]; ++ } ++ } ++ target[0] = (uint32_t)index; ++ } ++} ++ ++#endif /*LV_HAVE_SSE4_1*/ ++ ++#ifdef LV_HAVE_SSE ++#include ++ ++static inline void ++volk_32f_index_min_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points) ++{ ++ if (num_points > 0) { ++ uint32_t number = 0; ++ const uint32_t quarterPoints = num_points / 4; ++ ++ float* inputPtr = (float*)src0; ++ ++ __m128 indexIncrementValues = _mm_set1_ps(4); ++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); ++ ++ float min = src0[0]; ++ float index = 0; ++ __m128 minValues = _mm_set1_ps(min); ++ __m128 minValuesIndex = _mm_setzero_ps(); ++ __m128 compareResults; ++ __m128 currentValues; ++ ++ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; ++ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; ++ ++ for (; number < quarterPoints; number++) { ++ currentValues = _mm_loadu_ps(inputPtr); ++ inputPtr += 4; ++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ compareResults = _mm_cmplt_ps(currentValues, minValues); ++ minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), ++ _mm_andnot_ps(compareResults, minValuesIndex)); ++ minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), ++ _mm_andnot_ps(compareResults, minValues)); ++ } ++ ++ // Calculate the smallest value from the remaining 4 points ++ _mm_store_ps(minValuesBuffer, minValues); ++ _mm_store_ps(minIndexesBuffer, minValuesIndex); ++ ++ for (number = 0; number < 4; number++) { ++ if (minValuesBuffer[number] < min) { ++ index = minIndexesBuffer[number]; ++ min = minValuesBuffer[number]; ++ } else if (minValuesBuffer[number] == min) { ++ if (index > minIndexesBuffer[number]) ++ index = minIndexesBuffer[number]; ++ } ++ } ++ ++ number = quarterPoints * 4; ++ for (; number < num_points; number++) { ++ if (src0[number] < min) { ++ index = number; ++ min = src0[number]; ++ } ++ } ++ target[0] = (uint32_t)index; ++ } ++} ++ ++#endif /*LV_HAVE_SSE*/ ++ ++#endif /*INCLUDED_volk_32f_index_min_32u_u_H*/ +diff --git a/kernels/volk/volk_32fc_index_min_16u.h b/kernels/volk/volk_32fc_index_min_16u.h +new file mode 100644 +index 0000000..5539ebf +--- /dev/null ++++ b/kernels/volk/volk_32fc_index_min_16u.h +@@ -0,0 +1,482 @@ ++/* -*- c++ -*- */ ++/* ++ * Copyright 2021 Free Software Foundation, Inc. ++ * ++ * This file is part of GNU Radio ++ * ++ * GNU Radio is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 3, or (at your option) ++ * any later version. ++ * ++ * GNU Radio is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNU Radio; see the file COPYING. If not, write to ++ * the Free Software Foundation, Inc., 51 Franklin Street, ++ * Boston, MA 02110-1301, USA. ++ */ ++ ++/*! ++ * \page volk_32fc_index_min_16u ++ * ++ * \b Overview ++ * ++ * Returns Argmin_i mag(x[i]). Finds and returns the index which contains the ++ * minimum magnitude for complex points in the given vector. ++ * ++ * Note that num_points is a uint32_t, but the return value is ++ * uint16_t. Providing a vector larger than the max of a uint16_t ++ * (65536) would miss anything outside of this boundary. The kernel ++ * will check the length of num_points and cap it to this max value, ++ * anyways. ++ * ++ * Dispatcher Prototype ++ * \code ++ * void volk_32fc_index_min_16u(uint16_t* target, lv_32fc_t* src0, uint32_t ++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li src0: The complex input vector. ++ * \li num_points: The number of samples. ++ * ++ * \b Outputs ++ * \li target: The index of the point with minimum magnitude. ++ * ++ * \b Example ++ * Calculate the index of the minimum value of \f$x^2 + x\f$ for points around ++ * the unit circle. ++ * \code ++ * int N = 10; ++ * uint32_t alignment = volk_get_alignment(); ++ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); ++ * uint16_t* min = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment); ++ * ++ * for(uint32_t ii = 0; ii < N/2; ++ii){ ++ * float real = 2.f * ((float)ii / (float)N) - 1.f; ++ * float imag = std::sqrt(1.f - real * real); ++ * in[ii] = lv_cmake(real, imag); ++ * in[ii] = in[ii] * in[ii] + in[ii]; ++ * in[N-ii] = lv_cmake(real, imag); ++ * in[N-ii] = in[N-ii] * in[N-ii] + in[N-ii]; ++ * } ++ * ++ * volk_32fc_index_min_16u(min, in, N); ++ * ++ * printf("index of min value = %u\n", *min); ++ * ++ * volk_free(in); ++ * volk_free(min); ++ * \endcode ++ */ ++ ++#ifndef INCLUDED_volk_32fc_index_min_16u_a_H ++#define INCLUDED_volk_32fc_index_min_16u_a_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_AVX2 ++#include ++#include ++ ++static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target, ++ lv_32fc_t* src0, ++ uint32_t num_points) ++{ ++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++ ++ const __m256i indices_increment = _mm256_set1_epi32(8); ++ /* ++ * At the start of each loop iteration current_indices holds the indices of ++ * the complex numbers loaded from memory. Explanation for odd order is given ++ * in implementation of vector_32fc_index_min_variant0(). ++ */ ++ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++ __m256 min_values = _mm256_set1_ps(FLT_MAX); ++ __m256i min_indices = _mm256_setzero_si256(); ++ ++ for (unsigned i = 0; i < num_points / 8u; ++i) { ++ __m256 in0 = _mm256_load_ps((float*)src0); ++ __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); ++ vector_32fc_index_min_variant0( ++ in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); ++ src0 += 8; ++ } ++ ++ // determine minimum value and index in the result of the vectorized loop ++ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8]; ++ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8]; ++ _mm256_store_ps(min_values_buffer, min_values); ++ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices); ++ ++ float min = FLT_MAX; ++ uint32_t index = 0; ++ for (unsigned i = 0; i < 8; i++) { ++ if (min_values_buffer[i] < min) { ++ min = min_values_buffer[i]; ++ index = min_indices_buffer[i]; ++ } ++ } ++ ++ // handle tail not processed by the vectorized loop ++ for (unsigned i = num_points & (~7u); i < num_points; ++i) { ++ const float abs_squared = ++ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); ++ if (abs_squared < min) { ++ min = abs_squared; ++ index = i; ++ } ++ ++src0; ++ } ++ ++ *target = index; ++} ++ ++#endif /*LV_HAVE_AVX2*/ ++ ++#ifdef LV_HAVE_AVX2 ++#include ++#include ++ ++static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target, ++ lv_32fc_t* src0, ++ uint32_t num_points) ++{ ++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++ ++ const __m256i indices_increment = _mm256_set1_epi32(8); ++ /* ++ * At the start of each loop iteration current_indices holds the indices of ++ * the complex numbers loaded from memory. Explanation for odd order is given ++ * in implementation of vector_32fc_index_min_variant0(). ++ */ ++ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++ __m256 min_values = _mm256_set1_ps(FLT_MAX); ++ __m256i min_indices = _mm256_setzero_si256(); ++ ++ for (unsigned i = 0; i < num_points / 8u; ++i) { ++ __m256 in0 = _mm256_load_ps((float*)src0); ++ __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); ++ vector_32fc_index_min_variant1( ++ in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); ++ src0 += 8; ++ } ++ ++ // determine minimum value and index in the result of the vectorized loop ++ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8]; ++ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8]; ++ _mm256_store_ps(min_values_buffer, min_values); ++ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices); ++ ++ float min = FLT_MAX; ++ uint32_t index = 0; ++ for (unsigned i = 0; i < 8; i++) { ++ if (min_values_buffer[i] < min) { ++ min = min_values_buffer[i]; ++ index = min_indices_buffer[i]; ++ } ++ } ++ ++ // handle tail not processed by the vectorized loop ++ for (unsigned i = num_points & (~7u); i < num_points; ++i) { ++ const float abs_squared = ++ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); ++ if (abs_squared < min) { ++ min = abs_squared; ++ index = i; ++ } ++ ++src0; ++ } ++ ++ *target = index; ++} ++ ++#endif /*LV_HAVE_AVX2*/ ++ ++#ifdef LV_HAVE_SSE3 ++#include ++#include ++ ++static inline void ++volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) ++{ ++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++ const uint32_t num_bytes = num_points * 8; ++ ++ union bit128 holderf; ++ union bit128 holderi; ++ float sq_dist = 0.0; ++ ++ union bit128 xmm5, xmm4; ++ __m128 xmm1, xmm2, xmm3; ++ __m128i xmm8, xmm11, xmm12, xmm9, xmm10; ++ ++ xmm5.int_vec = _mm_setzero_si128(); ++ xmm4.int_vec = _mm_setzero_si128(); ++ holderf.int_vec = _mm_setzero_si128(); ++ holderi.int_vec = _mm_setzero_si128(); ++ ++ int bound = num_bytes >> 5; ++ int i = 0; ++ ++ xmm8 = _mm_setr_epi32(0, 1, 2, 3); ++ xmm9 = _mm_setzero_si128(); ++ xmm10 = _mm_setr_epi32(4, 4, 4, 4); ++ xmm3 = _mm_set_ps1(FLT_MAX); ++ ++ for (; i < bound; ++i) { ++ xmm1 = _mm_load_ps((float*)src0); ++ xmm2 = _mm_load_ps((float*)&src0[2]); ++ ++ src0 += 4; ++ ++ xmm1 = _mm_mul_ps(xmm1, xmm1); ++ xmm2 = _mm_mul_ps(xmm2, xmm2); ++ ++ xmm1 = _mm_hadd_ps(xmm1, xmm2); ++ ++ xmm3 = _mm_min_ps(xmm1, xmm3); ++ ++ xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3); ++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++ ++ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); ++ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); ++ ++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ ++ xmm8 = _mm_add_epi32(xmm8, xmm10); ++ } ++ ++ if (num_bytes >> 4 & 1) { ++ xmm2 = _mm_load_ps((float*)src0); ++ ++ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); ++ xmm8 = bit128_p(&xmm1)->int_vec; ++ ++ xmm2 = _mm_mul_ps(xmm2, xmm2); ++ ++ src0 += 2; ++ ++ xmm1 = _mm_hadd_ps(xmm2, xmm2); ++ ++ xmm3 = _mm_min_ps(xmm1, xmm3); ++ ++ xmm10 = _mm_setr_epi32(2, 2, 2, 2); ++ ++ xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3); ++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++ ++ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); ++ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); ++ ++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ ++ xmm8 = _mm_add_epi32(xmm8, xmm10); ++ } ++ ++ if (num_bytes >> 3 & 1) { ++ sq_dist = ++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ ++ xmm2 = _mm_load1_ps(&sq_dist); ++ ++ xmm1 = xmm3; ++ ++ xmm3 = _mm_min_ss(xmm3, xmm2); ++ ++ xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3); ++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++ ++ xmm8 = _mm_shuffle_epi32(xmm8, 0x00); ++ ++ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); ++ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); ++ ++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ } ++ ++ _mm_store_ps((float*)&(holderf.f), xmm3); ++ _mm_store_si128(&(holderi.int_vec), xmm9); ++ ++ target[0] = holderi.i[0]; ++ sq_dist = holderf.f[0]; ++ target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0]; ++ sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist; ++ target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0]; ++ sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist; ++ target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0]; ++ sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist; ++} ++ ++#endif /*LV_HAVE_SSE3*/ ++ ++#ifdef LV_HAVE_GENERIC ++static inline void ++volk_32fc_index_min_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) ++{ ++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++ ++ const uint32_t num_bytes = num_points * 8; ++ ++ float sq_dist = 0.0; ++ float min = FLT_MAX; ++ uint16_t index = 0; ++ ++ uint32_t i = 0; ++ ++ for (; i> 3; ++i) { ++ sq_dist = ++ lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); ++ ++ if (sq_dist < min) { ++ index = i; ++ min = sq_dist; ++ } ++ } ++ target[0] = index; ++} ++ ++#endif /*LV_HAVE_GENERIC*/ ++ ++#endif /*INCLUDED_volk_32fc_index_min_16u_a_H*/ ++ ++#ifndef INCLUDED_volk_32fc_index_min_16u_u_H ++#define INCLUDED_volk_32fc_index_min_16u_u_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_AVX2 ++#include ++#include ++ ++static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target, ++ lv_32fc_t* src0, ++ uint32_t num_points) ++{ ++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++ ++ const __m256i indices_increment = _mm256_set1_epi32(8); ++ /* ++ * At the start of each loop iteration current_indices holds the indices of ++ * the complex numbers loaded from memory. Explanation for odd order is given ++ * in implementation of vector_32fc_index_min_variant0(). ++ */ ++ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++ __m256 min_values = _mm256_set1_ps(FLT_MAX); ++ __m256i min_indices = _mm256_setzero_si256(); ++ ++ for (unsigned i = 0; i < num_points / 8u; ++i) { ++ __m256 in0 = _mm256_loadu_ps((float*)src0); ++ __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); ++ vector_32fc_index_min_variant0( ++ in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); ++ src0 += 8; ++ } ++ ++ // determine minimum value and index in the result of the vectorized loop ++ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8]; ++ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8]; ++ _mm256_store_ps(min_values_buffer, min_values); ++ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices); ++ ++ float min = FLT_MAX; ++ uint32_t index = 0; ++ for (unsigned i = 0; i < 8; i++) { ++ if (min_values_buffer[i] < min) { ++ min = min_values_buffer[i]; ++ index = min_indices_buffer[i]; ++ } ++ } ++ ++ // handle tail not processed by the vectorized loop ++ for (unsigned i = num_points & (~7u); i < num_points; ++i) { ++ const float abs_squared = ++ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); ++ if (abs_squared < min) { ++ min = abs_squared; ++ index = i; ++ } ++ ++src0; ++ } ++ ++ *target = index; ++} ++ ++#endif /*LV_HAVE_AVX2*/ ++ ++#ifdef LV_HAVE_AVX2 ++#include ++#include ++ ++static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target, ++ lv_32fc_t* src0, ++ uint32_t num_points) ++{ ++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; ++ ++ const __m256i indices_increment = _mm256_set1_epi32(8); ++ /* ++ * At the start of each loop iteration current_indices holds the indices of ++ * the complex numbers loaded from memory. Explanation for odd order is given ++ * in implementation of vector_32fc_index_min_variant0(). ++ */ ++ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++ __m256 min_values = _mm256_set1_ps(FLT_MAX); ++ __m256i min_indices = _mm256_setzero_si256(); ++ ++ for (unsigned i = 0; i < num_points / 8u; ++i) { ++ __m256 in0 = _mm256_loadu_ps((float*)src0); ++ __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); ++ vector_32fc_index_min_variant1( ++ in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); ++ src0 += 8; ++ } ++ ++ // determine minimum value and index in the result of the vectorized loop ++ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8]; ++ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8]; ++ _mm256_store_ps(min_values_buffer, min_values); ++ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices); ++ ++ float min = FLT_MAX; ++ uint32_t index = 0; ++ for (unsigned i = 0; i < 8; i++) { ++ if (min_values_buffer[i] < min) { ++ min = min_values_buffer[i]; ++ index = min_indices_buffer[i]; ++ } ++ } ++ ++ // handle tail not processed by the vectorized loop ++ for (unsigned i = num_points & (~7u); i < num_points; ++i) { ++ const float abs_squared = ++ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); ++ if (abs_squared < min) { ++ min = abs_squared; ++ index = i; ++ } ++ ++src0; ++ } ++ ++ *target = index; ++} ++ ++#endif /*LV_HAVE_AVX2*/ ++ ++#endif /*INCLUDED_volk_32fc_index_min_16u_u_H*/ +diff --git a/kernels/volk/volk_32fc_index_min_32u.h b/kernels/volk/volk_32fc_index_min_32u.h +new file mode 100644 +index 0000000..290b754 +--- /dev/null ++++ b/kernels/volk/volk_32fc_index_min_32u.h +@@ -0,0 +1,524 @@ ++/* -*- c++ -*- */ ++/* ++ * Copyright 2021 Free Software Foundation, Inc. ++ * ++ * This file is part of GNU Radio ++ * ++ * GNU Radio is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 3, or (at your option) ++ * any later version. ++ * ++ * GNU Radio is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNU Radio; see the file COPYING. If not, write to ++ * the Free Software Foundation, Inc., 51 Franklin Street, ++ * Boston, MA 02110-1301, USA. ++ */ ++ ++/*! ++ * \page volk_32fc_index_min_32u ++ * ++ * \b Overview ++ * ++ * Returns Argmin_i mag(x[i]). Finds and returns the index which contains the ++ * minimum magnitude for complex points in the given vector. ++ * ++ * Dispatcher Prototype ++ * \code ++ * void volk_32fc_index_min_32u(uint32_t* target, lv_32fc_t* src0, uint32_t ++ * num_points) \endcode ++ * ++ * \b Inputs ++ * \li src0: The complex input vector. ++ * \li num_points: The number of samples. ++ * ++ * \b Outputs ++ * \li target: The index of the point with minimum magnitude. ++ * ++ * \b Example ++ * Calculate the index of the minimum value of \f$x^2 + x\f$ for points around ++ * the unit circle. ++ * \code ++ * int N = 10; ++ * uint32_t alignment = volk_get_alignment(); ++ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment); ++ * uint32_t* min = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment); ++ * ++ * for(uint32_t ii = 0; ii < N/2; ++ii){ ++ * float real = 2.f * ((float)ii / (float)N) - 1.f; ++ * float imag = std::sqrt(1.f - real * real); ++ * in[ii] = lv_cmake(real, imag); ++ * in[ii] = in[ii] * in[ii] + in[ii]; ++ * in[N-ii] = lv_cmake(real, imag); ++ * in[N-ii] = in[N-ii] * in[N-ii] + in[N-ii]; ++ * } ++ * ++ * volk_32fc_index_min_32u(min, in, N); ++ * ++ * printf("index of min value = %u\n", *min); ++ * ++ * volk_free(in); ++ * volk_free(min); ++ * \endcode ++ */ ++ ++#ifndef INCLUDED_volk_32fc_index_min_32u_a_H ++#define INCLUDED_volk_32fc_index_min_32u_a_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_AVX2 ++#include ++#include ++ ++static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target, ++ lv_32fc_t* src0, ++ uint32_t num_points) ++{ ++ const __m256i indices_increment = _mm256_set1_epi32(8); ++ /* ++ * At the start of each loop iteration current_indices holds the indices of ++ * the complex numbers loaded from memory. Explanation for odd order is given ++ * in implementation of vector_32fc_index_min_variant0(). ++ */ ++ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++ __m256 min_values = _mm256_set1_ps(FLT_MAX); ++ __m256i min_indices = _mm256_setzero_si256(); ++ ++ for (unsigned i = 0; i < num_points / 8u; ++i) { ++ __m256 in0 = _mm256_load_ps((float*)src0); ++ __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); ++ vector_32fc_index_min_variant0( ++ in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); ++ src0 += 8; ++ } ++ ++ // determine minimum value and index in the result of the vectorized loop ++ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8]; ++ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8]; ++ _mm256_store_ps(min_values_buffer, min_values); ++ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices); ++ ++ float min = FLT_MAX; ++ uint32_t index = 0; ++ for (unsigned i = 0; i < 8; i++) { ++ if (min_values_buffer[i] < min) { ++ min = min_values_buffer[i]; ++ index = min_indices_buffer[i]; ++ } ++ } ++ ++ // handle tail not processed by the vectorized loop ++ for (unsigned i = num_points & (~7u); i < num_points; ++i) { ++ const float abs_squared = ++ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); ++ if (abs_squared < min) { ++ min = abs_squared; ++ index = i; ++ } ++ ++src0; ++ } ++ ++ *target = index; ++} ++ ++#endif /*LV_HAVE_AVX2*/ ++ ++#ifdef LV_HAVE_AVX2 ++#include ++#include ++ ++static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target, ++ lv_32fc_t* src0, ++ uint32_t num_points) ++{ ++ const __m256i indices_increment = _mm256_set1_epi32(8); ++ /* ++ * At the start of each loop iteration current_indices holds the indices of ++ * the complex numbers loaded from memory. Explanation for odd order is given ++ * in implementation of vector_32fc_index_min_variant0(). ++ */ ++ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++ __m256 min_values = _mm256_set1_ps(FLT_MAX); ++ __m256i min_indices = _mm256_setzero_si256(); ++ ++ for (unsigned i = 0; i < num_points / 8u; ++i) { ++ __m256 in0 = _mm256_load_ps((float*)src0); ++ __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); ++ vector_32fc_index_min_variant1( ++ in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); ++ src0 += 8; ++ } ++ ++ // determine minimum value and index in the result of the vectorized loop ++ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8]; ++ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8]; ++ _mm256_store_ps(min_values_buffer, min_values); ++ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices); ++ ++ float min = FLT_MAX; ++ uint32_t index = 0; ++ for (unsigned i = 0; i < 8; i++) { ++ if (min_values_buffer[i] < min) { ++ min = min_values_buffer[i]; ++ index = min_indices_buffer[i]; ++ } ++ } ++ ++ // handle tail not processed by the vectorized loop ++ for (unsigned i = num_points & (~7u); i < num_points; ++i) { ++ const float abs_squared = ++ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); ++ if (abs_squared < min) { ++ min = abs_squared; ++ index = i; ++ } ++ ++src0; ++ } ++ ++ *target = index; ++} ++ ++#endif /*LV_HAVE_AVX2*/ ++ ++#ifdef LV_HAVE_SSE3 ++#include ++#include ++ ++static inline void ++volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) ++{ ++ const uint32_t num_bytes = num_points * 8; ++ ++ union bit128 holderf; ++ union bit128 holderi; ++ float sq_dist = 0.0; ++ ++ union bit128 xmm5, xmm4; ++ __m128 xmm1, xmm2, xmm3; ++ __m128i xmm8, xmm11, xmm12, xmm9, xmm10; ++ ++ xmm5.int_vec = _mm_setzero_si128(); ++ xmm4.int_vec = _mm_setzero_si128(); ++ holderf.int_vec = _mm_setzero_si128(); ++ holderi.int_vec = _mm_setzero_si128(); ++ ++ int bound = num_bytes >> 5; ++ int i = 0; ++ ++ xmm8 = _mm_setr_epi32(0, 1, 2, 3); ++ xmm9 = _mm_setzero_si128(); ++ xmm10 = _mm_setr_epi32(4, 4, 4, 4); ++ xmm3 = _mm_set_ps1(FLT_MAX); ++ ++ for (; i < bound; ++i) { ++ xmm1 = _mm_load_ps((float*)src0); ++ xmm2 = _mm_load_ps((float*)&src0[2]); ++ ++ src0 += 4; ++ ++ xmm1 = _mm_mul_ps(xmm1, xmm1); ++ xmm2 = _mm_mul_ps(xmm2, xmm2); ++ ++ xmm1 = _mm_hadd_ps(xmm1, xmm2); ++ ++ xmm3 = _mm_min_ps(xmm1, xmm3); ++ ++ xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3); ++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++ ++ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); ++ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); ++ ++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ ++ xmm8 = _mm_add_epi32(xmm8, xmm10); ++ } ++ ++ if (num_bytes >> 4 & 1) { ++ xmm2 = _mm_load_ps((float*)src0); ++ ++ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); ++ xmm8 = bit128_p(&xmm1)->int_vec; ++ ++ xmm2 = _mm_mul_ps(xmm2, xmm2); ++ ++ src0 += 2; ++ ++ xmm1 = _mm_hadd_ps(xmm2, xmm2); ++ ++ xmm3 = _mm_min_ps(xmm1, xmm3); ++ ++ xmm10 = _mm_setr_epi32(2, 2, 2, 2); ++ ++ xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3); ++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++ ++ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec); ++ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec); ++ ++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ ++ xmm8 = _mm_add_epi32(xmm8, xmm10); ++ } ++ ++ if (num_bytes >> 3 & 1) { ++ sq_dist = ++ lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ ++ xmm2 = _mm_load1_ps(&sq_dist); ++ ++ xmm1 = xmm3; ++ ++ xmm3 = _mm_min_ss(xmm3, xmm2); ++ ++ xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3); ++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); ++ ++ xmm8 = _mm_shuffle_epi32(xmm8, 0x00); ++ ++ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec); ++ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec); ++ ++ xmm9 = _mm_add_epi32(xmm11, xmm12); ++ } ++ ++ _mm_store_ps((float*)&(holderf.f), xmm3); ++ _mm_store_si128(&(holderi.int_vec), xmm9); ++ ++ target[0] = holderi.i[0]; ++ sq_dist = holderf.f[0]; ++ target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0]; ++ sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist; ++ target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0]; ++ sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist; ++ target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0]; ++ sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist; ++} ++ ++#endif /*LV_HAVE_SSE3*/ ++ ++#ifdef LV_HAVE_GENERIC ++static inline void ++volk_32fc_index_min_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) ++{ ++ const uint32_t num_bytes = num_points * 8; ++ ++ float sq_dist = 0.0; ++ float min = FLT_MAX; ++ uint32_t index = 0; ++ ++ uint32_t i = 0; ++ ++ for (; i> 3; ++i) { ++ sq_dist = ++ lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); ++ ++ if (sq_dist < min) { ++ index = i; ++ min = sq_dist; ++ } ++ } ++ target[0] = index; ++} ++ ++#endif /*LV_HAVE_GENERIC*/ ++ ++#endif /*INCLUDED_volk_32fc_index_min_32u_a_H*/ ++ ++#ifndef INCLUDED_volk_32fc_index_min_32u_u_H ++#define INCLUDED_volk_32fc_index_min_32u_u_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_AVX2 ++#include ++#include ++ ++static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target, ++ lv_32fc_t* src0, ++ uint32_t num_points) ++{ ++ const __m256i indices_increment = _mm256_set1_epi32(8); ++ /* ++ * At the start of each loop iteration current_indices holds the indices of ++ * the complex numbers loaded from memory. Explanation for odd order is given ++ * in implementation of vector_32fc_index_min_variant0(). ++ */ ++ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++ __m256 min_values = _mm256_set1_ps(FLT_MAX); ++ __m256i min_indices = _mm256_setzero_si256(); ++ ++ for (unsigned i = 0; i < num_points / 8u; ++i) { ++ __m256 in0 = _mm256_loadu_ps((float*)src0); ++ __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); ++ vector_32fc_index_min_variant0( ++ in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); ++ src0 += 8; ++ } ++ ++ // determine minimum value and index in the result of the vectorized loop ++ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8]; ++ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8]; ++ _mm256_store_ps(min_values_buffer, min_values); ++ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices); ++ ++ float min = FLT_MAX; ++ uint32_t index = 0; ++ for (unsigned i = 0; i < 8; i++) { ++ if (min_values_buffer[i] < min) { ++ min = min_values_buffer[i]; ++ index = min_indices_buffer[i]; ++ } ++ } ++ ++ // handle tail not processed by the vectorized loop ++ for (unsigned i = num_points & (~7u); i < num_points; ++i) { ++ const float abs_squared = ++ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); ++ if (abs_squared < min) { ++ min = abs_squared; ++ index = i; ++ } ++ ++src0; ++ } ++ ++ *target = index; ++} ++ ++#endif /*LV_HAVE_AVX2*/ ++ ++#ifdef LV_HAVE_AVX2 ++#include ++#include ++ ++static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target, ++ lv_32fc_t* src0, ++ uint32_t num_points) ++{ ++ const __m256i indices_increment = _mm256_set1_epi32(8); ++ /* ++ * At the start of each loop iteration current_indices holds the indices of ++ * the complex numbers loaded from memory. Explanation for odd order is given ++ * in implementation of vector_32fc_index_min_variant0(). ++ */ ++ __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); ++ ++ __m256 min_values = _mm256_set1_ps(FLT_MAX); ++ __m256i min_indices = _mm256_setzero_si256(); ++ ++ for (unsigned i = 0; i < num_points / 8u; ++i) { ++ __m256 in0 = _mm256_loadu_ps((float*)src0); ++ __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); ++ vector_32fc_index_min_variant1( ++ in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); ++ src0 += 8; ++ } ++ ++ // determine minimum value and index in the result of the vectorized loop ++ __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8]; ++ __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8]; ++ _mm256_store_ps(min_values_buffer, min_values); ++ _mm256_store_si256((__m256i*)min_indices_buffer, min_indices); ++ ++ float min = FLT_MAX; ++ uint32_t index = 0; ++ for (unsigned i = 0; i < 8; i++) { ++ if (min_values_buffer[i] < min) { ++ min = min_values_buffer[i]; ++ index = min_indices_buffer[i]; ++ } ++ } ++ ++ // handle tail not processed by the vectorized loop ++ for (unsigned i = num_points & (~7u); i < num_points; ++i) { ++ const float abs_squared = ++ lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); ++ if (abs_squared < min) { ++ min = abs_squared; ++ index = i; ++ } ++ ++src0; ++ } ++ ++ *target = index; ++} ++ ++#endif /*LV_HAVE_AVX2*/ ++ ++#ifdef LV_HAVE_NEON ++#include ++#include ++ ++static inline void ++volk_32fc_index_min_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) ++{ ++ unsigned int number = 0; ++ const uint32_t quarter_points = num_points / 4; ++ const lv_32fc_t* src0Ptr = src0; ++ ++ uint32_t indices[4] = { 0, 1, 2, 3 }; ++ const uint32x4_t vec_indices_incr = vdupq_n_u32(4); ++ uint32x4_t vec_indices = vld1q_u32(indices); ++ uint32x4_t vec_min_indices = vec_indices; ++ ++ if (num_points) { ++ float min = *src0Ptr; ++ uint32_t index = 0; ++ ++ float32x4_t vec_min = vdupq_n_f32(*src0Ptr); ++ ++ for (; number < quarter_points; number++) { ++ // Load complex and compute magnitude squared ++ const float32x4_t vec_mag2 = ++ _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr)); ++ __VOLK_PREFETCH(src0Ptr += 4); ++ // a < b? ++ const uint32x4_t lt_mask = vcltq_f32(vec_mag2, vec_min); ++ vec_min = vbslq_f32(lt_mask, vec_mag2, vec_min); ++ vec_min_indices = vbslq_u32(lt_mask, vec_indices, vec_min_indices); ++ vec_indices = vaddq_u32(vec_indices, vec_indices_incr); ++ } ++ uint32_t tmp_min_indices[4]; ++ float tmp_min[4]; ++ vst1q_u32(tmp_min_indices, vec_min_indices); ++ vst1q_f32(tmp_min, vec_min); ++ ++ for (int i = 0; i < 4; i++) { ++ if (tmp_min[i] < min) { ++ min = tmp_min[i]; ++ index = tmp_min_indices[i]; ++ } ++ } ++ ++ // Deal with the rest ++ for (number = quarter_points * 4; number < num_points; number++) { ++ const float re = lv_creal(*src0Ptr); ++ const float im = lv_cimag(*src0Ptr); ++ if ((re * re + im * im) < min) { ++ min = *src0Ptr; ++ index = number; ++ } ++ src0Ptr++; ++ } ++ *target = index; ++ } ++} ++ ++#endif /*LV_HAVE_NEON*/ ++ ++#endif /*INCLUDED_volk_32fc_index_min_32u_u_H*/ +diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h +index 6df83ab..9f947cf 100644 +--- a/lib/kernel_tests.h ++++ b/lib/kernel_tests.h +@@ -68,6 +68,8 @@ std::vector init_test_list(volk_test_params_t test_params) + QA(VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params)) + QA(VOLK_INIT_TEST(volk_32f_index_max_16u, test_params)) + QA(VOLK_INIT_TEST(volk_32f_index_max_32u, test_params)) ++ QA(VOLK_INIT_TEST(volk_32f_index_min_16u, test_params)) ++ QA(VOLK_INIT_TEST(volk_32f_index_min_32u, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_32f_add_32fc, test_params)) + QA(VOLK_INIT_TEST(volk_32f_log2_32f, test_params.make_absolute(1e-5))) +@@ -94,6 +96,8 @@ std::vector init_test_list(volk_test_params_t test_params) + QA(VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc)) + QA(VOLK_INIT_TEST(volk_32fc_index_max_16u, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_index_max_32u, test_params)) ++ QA(VOLK_INIT_TEST(volk_32fc_index_min_16u, test_params)) ++ QA(VOLK_INIT_TEST(volk_32fc_index_min_32u, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params)) + QA(VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc_tenth)) + QA(VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params)) +-- +2.30.2 + diff --git a/patches/0002-Fix-volk_32fc_index_min_32u_neon.patch b/patches/0002-Fix-volk_32fc_index_min_32u_neon.patch new file mode 100644 index 0000000..f12b824 --- /dev/null +++ b/patches/0002-Fix-volk_32fc_index_min_32u_neon.patch @@ -0,0 +1,26 @@ +From 6d21053e58073f82f1ec9bd83707c95b77807fce Mon Sep 17 00:00:00 2001 +From: Zlika +Date: Fri, 11 Jun 2021 11:13:28 +0200 +Subject: [PATCH 02/73] Fix volk_32fc_index_min_32u_neon + +Signed-off-by: Zlika +--- + kernels/volk/volk_32fc_index_min_32u.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernels/volk/volk_32fc_index_min_32u.h b/kernels/volk/volk_32fc_index_min_32u.h +index 290b754..31eb094 100644 +--- a/kernels/volk/volk_32fc_index_min_32u.h ++++ b/kernels/volk/volk_32fc_index_min_32u.h +@@ -477,7 +477,7 @@ volk_32fc_index_min_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_poi + uint32x4_t vec_min_indices = vec_indices; + + if (num_points) { +- float min = *src0Ptr; ++ float min = FLT_MAX; + uint32_t index = 0; + + float32x4_t vec_min = vdupq_n_f32(*src0Ptr); +-- +2.30.2 + diff --git a/patches/0003-Fix-volk_32fc_index_min_32u_neon.patch b/patches/0003-Fix-volk_32fc_index_min_32u_neon.patch new file mode 100644 index 0000000..1678bca --- /dev/null +++ b/patches/0003-Fix-volk_32fc_index_min_32u_neon.patch @@ -0,0 +1,26 @@ +From ac395a54e62429ff043ba240986f27507a54df75 Mon Sep 17 00:00:00 2001 +From: Zlika +Date: Fri, 11 Jun 2021 16:46:51 +0200 +Subject: [PATCH 03/73] Fix volk_32fc_index_min_32u_neon + +Signed-off-by: Zlika +--- + kernels/volk/volk_32fc_index_min_32u.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernels/volk/volk_32fc_index_min_32u.h b/kernels/volk/volk_32fc_index_min_32u.h +index 31eb094..545f9bf 100644 +--- a/kernels/volk/volk_32fc_index_min_32u.h ++++ b/kernels/volk/volk_32fc_index_min_32u.h +@@ -480,7 +480,7 @@ volk_32fc_index_min_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_poi + float min = FLT_MAX; + uint32_t index = 0; + +- float32x4_t vec_min = vdupq_n_f32(*src0Ptr); ++ float32x4_t vec_min = vdupq_n_f32(FLT_MAX); + + for (; number < quarter_points; number++) { + // Load complex and compute magnitude squared +-- +2.30.2 + diff --git a/patches/0004-Code-cleanup.patch b/patches/0004-Code-cleanup.patch new file mode 100644 index 0000000..da0c9b1 --- /dev/null +++ b/patches/0004-Code-cleanup.patch @@ -0,0 +1,1278 @@ +From 7739ff89e4908a2d30013cd89c529daac9c26049 Mon Sep 17 00:00:00 2001 +From: Zlika +Date: Wed, 16 Jun 2021 15:11:25 +0200 +Subject: [PATCH 04/73] Code cleanup + +Signed-off-by: Zlika +--- + kernels/volk/volk_32f_index_min_16u.h | 92 +++++++--------- + kernels/volk/volk_32f_index_min_32u.h | 142 +++++++++++-------------- + kernels/volk/volk_32fc_index_min_16u.h | 82 +++++++------- + kernels/volk/volk_32fc_index_min_32u.h | 102 +++++++++--------- + 4 files changed, 190 insertions(+), 228 deletions(-) + +diff --git a/kernels/volk/volk_32f_index_min_16u.h b/kernels/volk/volk_32f_index_min_16u.h +index 848b75c..d8ffcc7 100644 +--- a/kernels/volk/volk_32f_index_min_16u.h ++++ b/kernels/volk/volk_32f_index_min_16u.h +@@ -36,11 +36,11 @@ + * + * Dispatcher Prototype + * \code +- * void volk_32f_index_min_16u(uint16_t* target, const float* src0, uint32_t num_points) ++ * void volk_32f_index_min_16u(uint16_t* target, const float* source, uint32_t num_points) + * \endcode + * + * \b Inputs +- * \li src0: The input vector of floats. ++ * \li source: The input vector of floats. + * \li num_points: The number of data points. + * + * \b Outputs +@@ -80,19 +80,17 @@ + #include + + static inline void +-volk_32f_index_min_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points) ++volk_32f_index_min_16u_a_avx(uint16_t* target, const float* source, uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +- +- uint32_t number = 0; + const uint32_t eighthPoints = num_points / 8; + +- float* inputPtr = (float*)src0; ++ float* inputPtr = (float*)source; + + __m256 indexIncrementValues = _mm256_set1_ps(8); + __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); + +- float min = src0[0]; ++ float min = source[0]; + float index = 0; + __m256 minValues = _mm256_set1_ps(min); + __m256 minValuesIndex = _mm256_setzero_ps(); +@@ -102,7 +100,7 @@ volk_32f_index_min_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_p + __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8]; + __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8]; + +- for (; number < eighthPoints; number++) { ++ for (uint32_t number = 0; number < eighthPoints; number++) { + + currentValues = _mm256_load_ps(inputPtr); + inputPtr += 8; +@@ -118,7 +116,7 @@ volk_32f_index_min_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_p + _mm256_store_ps(minValuesBuffer, minValues); + _mm256_store_ps(minIndexesBuffer, minValuesIndex); + +- for (number = 0; number < 8; number++) { ++ for (uint32_t number = 0; number < 8; number++) { + if (minValuesBuffer[number] < min) { + index = minIndexesBuffer[number]; + min = minValuesBuffer[number]; +@@ -128,11 +126,10 @@ volk_32f_index_min_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_p + } + } + +- number = eighthPoints * 8; +- for (; number < num_points; number++) { +- if (src0[number] < min) { ++ for (uint32_t number = eighthPoints * 8; number < num_points; number++) { ++ if (source[number] < min) { + index = number; +- min = src0[number]; ++ min = source[number]; + } + } + target[0] = (uint16_t)index; +@@ -144,19 +141,17 @@ volk_32f_index_min_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_p + #include + + static inline void +-volk_32f_index_min_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points) ++volk_32f_index_min_16u_a_sse4_1(uint16_t* target, const float* source, uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +- +- uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + +- float* inputPtr = (float*)src0; ++ float* inputPtr = (float*)source; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); + +- float min = src0[0]; ++ float min = source[0]; + float index = 0; + __m128 minValues = _mm_set1_ps(min); + __m128 minValuesIndex = _mm_setzero_ps(); +@@ -166,7 +161,7 @@ volk_32f_index_min_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t nu + __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; + +- for (; number < quarterPoints; number++) { ++ for (uint32_t number = 0; number < quarterPoints; number++) { + + currentValues = _mm_load_ps(inputPtr); + inputPtr += 4; +@@ -182,7 +177,7 @@ volk_32f_index_min_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t nu + _mm_store_ps(minValuesBuffer, minValues); + _mm_store_ps(minIndexesBuffer, minValuesIndex); + +- for (number = 0; number < 4; number++) { ++ for (uint32_t number = 0; number < 4; number++) { + if (minValuesBuffer[number] < min) { + index = minIndexesBuffer[number]; + min = minValuesBuffer[number]; +@@ -192,11 +187,10 @@ volk_32f_index_min_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t nu + } + } + +- number = quarterPoints * 4; +- for (; number < num_points; number++) { +- if (src0[number] < min) { ++ for (uint32_t number = quarterPoints * 4; number < num_points; number++) { ++ if (source[number] < min) { + index = number; +- min = src0[number]; ++ min = source[number]; + } + } + target[0] = (uint16_t)index; +@@ -210,19 +204,17 @@ volk_32f_index_min_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t nu + #include + + static inline void +-volk_32f_index_min_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points) ++volk_32f_index_min_16u_a_sse(uint16_t* target, const float* source, uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +- +- uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + +- float* inputPtr = (float*)src0; ++ float* inputPtr = (float*)source; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); + +- float min = src0[0]; ++ float min = source[0]; + float index = 0; + __m128 minValues = _mm_set1_ps(min); + __m128 minValuesIndex = _mm_setzero_ps(); +@@ -232,7 +224,7 @@ volk_32f_index_min_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_p + __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; + +- for (; number < quarterPoints; number++) { ++ for (uint32_t number = 0; number < quarterPoints; number++) { + + currentValues = _mm_load_ps(inputPtr); + inputPtr += 4; +@@ -250,7 +242,7 @@ volk_32f_index_min_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_p + _mm_store_ps(minValuesBuffer, minValues); + _mm_store_ps(minIndexesBuffer, minValuesIndex); + +- for (number = 0; number < 4; number++) { ++ for (uint32_t number = 0; number < 4; number++) { + if (minValuesBuffer[number] < min) { + index = minIndexesBuffer[number]; + min = minValuesBuffer[number]; +@@ -260,11 +252,10 @@ volk_32f_index_min_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_p + } + } + +- number = quarterPoints * 4; +- for (; number < num_points; number++) { +- if (src0[number] < min) { ++ for (uint32_t number = quarterPoints * 4; number < num_points; number++) { ++ if (source[number] < min) { + index = number; +- min = src0[number]; ++ min = source[number]; + } + } + target[0] = (uint16_t)index; +@@ -276,19 +267,17 @@ volk_32f_index_min_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_p + #ifdef LV_HAVE_GENERIC + + static inline void +-volk_32f_index_min_16u_generic(uint16_t* target, const float* src0, uint32_t num_points) ++volk_32f_index_min_16u_generic(uint16_t* target, const float* source, uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; + +- float min = src0[0]; ++ float min = source[0]; + uint16_t index = 0; + +- uint32_t i = 1; +- +- for (; i < num_points; ++i) { +- if (src0[i] < min) { ++ for (uint32_t i = 1; i < num_points; ++i) { ++ if (source[i] < min) { + index = i; +- min = src0[i]; ++ min = source[i]; + } + } + target[0] = index; +@@ -312,19 +301,17 @@ volk_32f_index_min_16u_generic(uint16_t* target, const float* src0, uint32_t num + #include + + static inline void +-volk_32f_index_min_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points) ++volk_32f_index_min_16u_u_avx(uint16_t* target, const float* source, uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +- +- uint32_t number = 0; + const uint32_t eighthPoints = num_points / 8; + +- float* inputPtr = (float*)src0; ++ float* inputPtr = (float*)source; + + __m256 indexIncrementValues = _mm256_set1_ps(8); + __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); + +- float min = src0[0]; ++ float min = source[0]; + float index = 0; + __m256 minValues = _mm256_set1_ps(min); + __m256 minValuesIndex = _mm256_setzero_ps(); +@@ -334,7 +321,7 @@ volk_32f_index_min_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_p + __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8]; + __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8]; + +- for (; number < eighthPoints; number++) { ++ for (uint32_t number = 0; number < eighthPoints; number++) { + + currentValues = _mm256_loadu_ps(inputPtr); + inputPtr += 8; +@@ -350,7 +337,7 @@ volk_32f_index_min_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_p + _mm256_storeu_ps(minValuesBuffer, minValues); + _mm256_storeu_ps(minIndexesBuffer, minValuesIndex); + +- for (number = 0; number < 8; number++) { ++ for (uint32_t number = 0; number < 8; number++) { + if (minValuesBuffer[number] < min) { + index = minIndexesBuffer[number]; + min = minValuesBuffer[number]; +@@ -360,11 +347,10 @@ volk_32f_index_min_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_p + } + } + +- number = eighthPoints * 8; +- for (; number < num_points; number++) { +- if (src0[number] < min) { ++ for (uint32_t number = eighthPoints * 8; number < num_points; number++) { ++ if (source[number] < min) { + index = number; +- min = src0[number]; ++ min = source[number]; + } + } + target[0] = (uint16_t)index; +diff --git a/kernels/volk/volk_32f_index_min_32u.h b/kernels/volk/volk_32f_index_min_32u.h +index 67ee426..23c2d17 100644 +--- a/kernels/volk/volk_32f_index_min_32u.h ++++ b/kernels/volk/volk_32f_index_min_32u.h +@@ -30,11 +30,11 @@ + * + * Dispatcher Prototype + * \code +- * void volk_32f_index_min_32u(uint32_t* target, const float* src0, uint32_t num_points) ++ * void volk_32f_index_min_32u(uint32_t* target, const float* source, uint32_t num_points) + * \endcode + * + * \b Inputs +- * \li src0: The input vector of floats. ++ * \li source: The input vector of floats. + * \li num_points: The number of data points. + * + * \b Outputs +@@ -73,18 +73,17 @@ + #include + + static inline void +-volk_32f_index_min_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) ++volk_32f_index_min_32u_a_sse4_1(uint32_t* target, const float* source, uint32_t num_points) + { + if (num_points > 0) { +- uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + +- float* inputPtr = (float*)src0; ++ float* inputPtr = (float*)source; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); + +- float min = src0[0]; ++ float min = source[0]; + float index = 0; + __m128 minValues = _mm_set1_ps(min); + __m128 minValuesIndex = _mm_setzero_ps(); +@@ -94,7 +93,7 @@ volk_32f_index_min_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t nu + __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; + +- for (; number < quarterPoints; number++) { ++ for (uint32_t number = 0; number < quarterPoints; number++) { + + currentValues = _mm_load_ps(inputPtr); + inputPtr += 4; +@@ -111,7 +110,7 @@ volk_32f_index_min_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t nu + _mm_store_ps(minValuesBuffer, minValues); + _mm_store_ps(minIndexesBuffer, minValuesIndex); + +- for (number = 0; number < 4; number++) { ++ for (uint32_t number = 0; number < 4; number++) { + if (minValuesBuffer[number] < min) { + index = minIndexesBuffer[number]; + min = minValuesBuffer[number]; +@@ -121,11 +120,10 @@ volk_32f_index_min_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t nu + } + } + +- number = quarterPoints * 4; +- for (; number < num_points; number++) { +- if (src0[number] < min) { ++ for (uint32_t number = quarterPoints * 4; number < num_points; number++) { ++ if (source[number] < min) { + index = number; +- min = src0[number]; ++ min = source[number]; + } + } + target[0] = (uint32_t)index; +@@ -140,18 +138,17 @@ volk_32f_index_min_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t nu + #include + + static inline void +-volk_32f_index_min_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points) ++volk_32f_index_min_32u_a_sse(uint32_t* target, const float* source, uint32_t num_points) + { + if (num_points > 0) { +- uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + +- float* inputPtr = (float*)src0; ++ float* inputPtr = (float*)source; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); + +- float min = src0[0]; ++ float min = source[0]; + float index = 0; + __m128 minValues = _mm_set1_ps(min); + __m128 minValuesIndex = _mm_setzero_ps(); +@@ -161,7 +158,7 @@ volk_32f_index_min_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_p + __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; + +- for (; number < quarterPoints; number++) { ++ for (uint32_t number = 0; number < quarterPoints; number++) { + + currentValues = _mm_load_ps(inputPtr); + inputPtr += 4; +@@ -180,7 +177,7 @@ volk_32f_index_min_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_p + _mm_store_ps(minValuesBuffer, minValues); + _mm_store_ps(minIndexesBuffer, minValuesIndex); + +- for (number = 0; number < 4; number++) { ++ for (uint32_t number = 0; number < 4; number++) { + if (minValuesBuffer[number] < min) { + index = minIndexesBuffer[number]; + min = minValuesBuffer[number]; +@@ -190,11 +187,10 @@ volk_32f_index_min_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_p + } + } + +- number = quarterPoints * 4; +- for (; number < num_points; number++) { +- if (src0[number] < min) { ++ for (uint32_t number = quarterPoints * 4; number < num_points; number++) { ++ if (source[number] < min) { + index = number; +- min = src0[number]; ++ min = source[number]; + } + } + target[0] = (uint32_t)index; +@@ -208,18 +204,17 @@ volk_32f_index_min_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_p + #include + + static inline void +-volk_32f_index_min_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points) ++volk_32f_index_min_32u_a_avx(uint32_t* target, const float* source, uint32_t num_points) + { + if (num_points > 0) { +- uint32_t number = 0; + const uint32_t quarterPoints = num_points / 8; + +- float* inputPtr = (float*)src0; ++ float* inputPtr = (float*)source; + + __m256 indexIncrementValues = _mm256_set1_ps(8); + __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); + +- float min = src0[0]; ++ float min = source[0]; + float index = 0; + __m256 minValues = _mm256_set1_ps(min); + __m256 minValuesIndex = _mm256_setzero_ps(); +@@ -229,7 +224,7 @@ volk_32f_index_min_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_p + __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8]; + __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8]; + +- for (; number < quarterPoints; number++) { ++ for (uint32_t number = 0; number < quarterPoints; number++) { + currentValues = _mm256_load_ps(inputPtr); + inputPtr += 8; + currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); +@@ -243,7 +238,7 @@ volk_32f_index_min_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_p + _mm256_store_ps(minValuesBuffer, minValues); + _mm256_store_ps(minIndexesBuffer, minValuesIndex); + +- for (number = 0; number < 8; number++) { ++ for (uint32_t number = 0; number < 8; number++) { + if (minValuesBuffer[number] < min) { + index = minIndexesBuffer[number]; + min = minValuesBuffer[number]; +@@ -253,11 +248,10 @@ volk_32f_index_min_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_p + } + } + +- number = quarterPoints * 8; +- for (; number < num_points; number++) { +- if (src0[number] < min) { ++ for (uint32_t number = quarterPoints * 8; number < num_points; number++) { ++ if (source[number] < min) { + index = number; +- min = src0[number]; ++ min = source[number]; + } + } + target[0] = (uint32_t)index; +@@ -271,19 +265,18 @@ volk_32f_index_min_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_p + #include + + static inline void +-volk_32f_index_min_32u_neon(uint32_t* target, const float* src0, uint32_t num_points) ++volk_32f_index_min_32u_neon(uint32_t* target, const float* source, uint32_t num_points) + { + if (num_points > 0) { +- uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + +- float* inputPtr = (float*)src0; ++ float* inputPtr = (float*)source; + float32x4_t indexIncrementValues = vdupq_n_f32(4); + __VOLK_ATTR_ALIGNED(16) + float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f }; + float32x4_t currentIndexes = vld1q_f32(currentIndexes_float); + +- float min = src0[0]; ++ float min = source[0]; + float index = 0; + float32x4_t minValues = vdupq_n_f32(min); + uint32x4_t minValuesIndex = vmovq_n_u32(0); +@@ -294,7 +287,7 @@ volk_32f_index_min_32u_neon(uint32_t* target, const float* src0, uint32_t num_po + __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; + +- for (; number < quarterPoints; number++) { ++ for (uint32_t number = 0; number < quarterPoints; number++) { + currentValues = vld1q_f32(inputPtr); + inputPtr += 4; + currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); +@@ -308,7 +301,7 @@ volk_32f_index_min_32u_neon(uint32_t* target, const float* src0, uint32_t num_po + // Calculate the smallest value from the remaining 4 points + vst1q_f32(minValuesBuffer, minValues); + vst1q_f32(minIndexesBuffer, vcvtq_f32_u32(minValuesIndex)); +- for (number = 0; number < 4; number++) { ++ for (uint32_t number = 0; number < 4; number++) { + if (minValuesBuffer[number] < min) { + index = minIndexesBuffer[number]; + min = minValuesBuffer[number]; +@@ -318,11 +311,10 @@ volk_32f_index_min_32u_neon(uint32_t* target, const float* src0, uint32_t num_po + } + } + +- number = quarterPoints * 4; +- for (; number < num_points; number++) { +- if (src0[number] < min) { ++ for (uint32_t number = quarterPoints * 4; number < num_points; number++) { ++ if (source[number] < min) { + index = number; +- min = src0[number]; ++ min = source[number]; + } + } + target[0] = (uint32_t)index; +@@ -335,18 +327,16 @@ volk_32f_index_min_32u_neon(uint32_t* target, const float* src0, uint32_t num_po + #ifdef LV_HAVE_GENERIC + + static inline void +-volk_32f_index_min_32u_generic(uint32_t* target, const float* src0, uint32_t num_points) ++volk_32f_index_min_32u_generic(uint32_t* target, const float* source, uint32_t num_points) + { + if (num_points > 0) { +- float min = src0[0]; ++ float min = source[0]; + uint32_t index = 0; + +- uint32_t i = 1; +- +- for (; i < num_points; ++i) { +- if (src0[i] < min) { ++ for (uint32_t i = 1; i < num_points; ++i) { ++ if (source[i] < min) { + index = i; +- min = src0[i]; ++ min = source[i]; + } + } + target[0] = index; +@@ -371,18 +361,17 @@ volk_32f_index_min_32u_generic(uint32_t* target, const float* src0, uint32_t num + #include + + static inline void +-volk_32f_index_min_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points) ++volk_32f_index_min_32u_u_avx(uint32_t* target, const float* source, uint32_t num_points) + { + if (num_points > 0) { +- uint32_t number = 0; + const uint32_t quarterPoints = num_points / 8; + +- float* inputPtr = (float*)src0; ++ float* inputPtr = (float*)source; + + __m256 indexIncrementValues = _mm256_set1_ps(8); + __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); + +- float min = src0[0]; ++ float min = source[0]; + float index = 0; + __m256 minValues = _mm256_set1_ps(min); + __m256 minValuesIndex = _mm256_setzero_ps(); +@@ -392,7 +381,7 @@ volk_32f_index_min_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_p + __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8]; + __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8]; + +- for (; number < quarterPoints; number++) { ++ for (uint32_t number = 0; number < quarterPoints; number++) { + currentValues = _mm256_loadu_ps(inputPtr); + inputPtr += 8; + currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); +@@ -406,7 +395,7 @@ volk_32f_index_min_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_p + _mm256_store_ps(minValuesBuffer, minValues); + _mm256_store_ps(minIndexesBuffer, minValuesIndex); + +- for (number = 0; number < 8; number++) { ++ for (uint32_t number = 0; number < 8; number++) { + if (minValuesBuffer[number] < min) { + index = minIndexesBuffer[number]; + min = minValuesBuffer[number]; +@@ -416,11 +405,10 @@ volk_32f_index_min_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_p + } + } + +- number = quarterPoints * 8; +- for (; number < num_points; number++) { +- if (src0[number] < min) { ++ for (uint32_t number = quarterPoints * 8; number < num_points; number++) { ++ if (source[number] < min) { + index = number; +- min = src0[number]; ++ min = source[number]; + } + } + target[0] = (uint32_t)index; +@@ -434,18 +422,17 @@ volk_32f_index_min_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_p + #include + + static inline void +-volk_32f_index_min_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) ++volk_32f_index_min_32u_u_sse4_1(uint32_t* target, const float* source, uint32_t num_points) + { + if (num_points > 0) { +- uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + +- float* inputPtr = (float*)src0; ++ float* inputPtr = (float*)source; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); + +- float min = src0[0]; ++ float min = source[0]; + float index = 0; + __m128 minValues = _mm_set1_ps(min); + __m128 minValuesIndex = _mm_setzero_ps(); +@@ -455,7 +442,7 @@ volk_32f_index_min_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t nu + __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; + +- for (; number < quarterPoints; number++) { ++ for (uint32_t number = 0; number < quarterPoints; number++) { + currentValues = _mm_loadu_ps(inputPtr); + inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); +@@ -469,7 +456,7 @@ volk_32f_index_min_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t nu + _mm_store_ps(minValuesBuffer, minValues); + _mm_store_ps(minIndexesBuffer, minValuesIndex); + +- for (number = 0; number < 4; number++) { ++ for (uint32_t number = 0; number < 4; number++) { + if (minValuesBuffer[number] < min) { + index = minIndexesBuffer[number]; + min = minValuesBuffer[number]; +@@ -479,11 +466,10 @@ volk_32f_index_min_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t nu + } + } + +- number = quarterPoints * 4; +- for (; number < num_points; number++) { +- if (src0[number] < min) { ++ for (uint32_t number = quarterPoints * 4; number < num_points; number++) { ++ if (source[number] < min) { + index = number; +- min = src0[number]; ++ min = source[number]; + } + } + target[0] = (uint32_t)index; +@@ -496,18 +482,17 @@ volk_32f_index_min_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t nu + #include + + static inline void +-volk_32f_index_min_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points) ++volk_32f_index_min_32u_u_sse(uint32_t* target, const float* source, uint32_t num_points) + { + if (num_points > 0) { +- uint32_t number = 0; + const uint32_t quarterPoints = num_points / 4; + +- float* inputPtr = (float*)src0; ++ float* inputPtr = (float*)source; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); + +- float min = src0[0]; ++ float min = source[0]; + float index = 0; + __m128 minValues = _mm_set1_ps(min); + __m128 minValuesIndex = _mm_setzero_ps(); +@@ -517,7 +502,7 @@ volk_32f_index_min_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_p + __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; + +- for (; number < quarterPoints; number++) { ++ for (uint32_t number = 0; number < quarterPoints; number++) { + currentValues = _mm_loadu_ps(inputPtr); + inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); +@@ -532,7 +517,7 @@ volk_32f_index_min_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_p + _mm_store_ps(minValuesBuffer, minValues); + _mm_store_ps(minIndexesBuffer, minValuesIndex); + +- for (number = 0; number < 4; number++) { ++ for (uint32_t number = 0; number < 4; number++) { + if (minValuesBuffer[number] < min) { + index = minIndexesBuffer[number]; + min = minValuesBuffer[number]; +@@ -542,11 +527,10 @@ volk_32f_index_min_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_p + } + } + +- number = quarterPoints * 4; +- for (; number < num_points; number++) { +- if (src0[number] < min) { ++ for (uint32_t number = quarterPoints * 4; number < num_points; number++) { ++ if (source[number] < min) { + index = number; +- min = src0[number]; ++ min = source[number]; + } + } + target[0] = (uint32_t)index; +diff --git a/kernels/volk/volk_32fc_index_min_16u.h b/kernels/volk/volk_32fc_index_min_16u.h +index 5539ebf..bf7f6e3 100644 +--- a/kernels/volk/volk_32fc_index_min_16u.h ++++ b/kernels/volk/volk_32fc_index_min_16u.h +@@ -36,11 +36,11 @@ + * + * Dispatcher Prototype + * \code +- * void volk_32fc_index_min_16u(uint16_t* target, lv_32fc_t* src0, uint32_t ++ * void volk_32fc_index_min_16u(uint16_t* target, lv_32fc_t* source, uint32_t + * num_points) \endcode + * + * \b Inputs +- * \li src0: The complex input vector. ++ * \li source: The complex input vector. + * \li num_points: The number of samples. + * + * \b Outputs +@@ -87,7 +87,7 @@ + #include + + static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target, +- lv_32fc_t* src0, ++ lv_32fc_t* source, + uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +@@ -104,11 +104,11 @@ static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target, + __m256i min_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { +- __m256 in0 = _mm256_load_ps((float*)src0); +- __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); ++ __m256 in0 = _mm256_load_ps((float*)source); ++ __m256 in1 = _mm256_load_ps((float*)(source + 4)); + vector_32fc_index_min_variant0( + in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); +- src0 += 8; ++ source += 8; + } + + // determine minimum value and index in the result of the vectorized loop +@@ -129,12 +129,12 @@ static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target, + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = +- lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); ++ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); + if (abs_squared < min) { + min = abs_squared; + index = i; + } +- ++src0; ++ ++source; + } + + *target = index; +@@ -147,7 +147,7 @@ static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target, + #include + + static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target, +- lv_32fc_t* src0, ++ lv_32fc_t* source, + uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +@@ -164,11 +164,11 @@ static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target, + __m256i min_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { +- __m256 in0 = _mm256_load_ps((float*)src0); +- __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); ++ __m256 in0 = _mm256_load_ps((float*)source); ++ __m256 in1 = _mm256_load_ps((float*)(source + 4)); + vector_32fc_index_min_variant1( + in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); +- src0 += 8; ++ source += 8; + } + + // determine minimum value and index in the result of the vectorized loop +@@ -189,12 +189,12 @@ static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target, + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = +- lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); ++ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); + if (abs_squared < min) { + min = abs_squared; + index = i; + } +- ++src0; ++ ++source; + } + + *target = index; +@@ -207,7 +207,7 @@ static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target, + #include + + static inline void +-volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) ++volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* source, uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; + const uint32_t num_bytes = num_points * 8; +@@ -225,19 +225,18 @@ volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p + holderf.int_vec = _mm_setzero_si128(); + holderi.int_vec = _mm_setzero_si128(); + +- int bound = num_bytes >> 5; +- int i = 0; +- + xmm8 = _mm_setr_epi32(0, 1, 2, 3); + xmm9 = _mm_setzero_si128(); + xmm10 = _mm_setr_epi32(4, 4, 4, 4); + xmm3 = _mm_set_ps1(FLT_MAX); + +- for (; i < bound; ++i) { +- xmm1 = _mm_load_ps((float*)src0); +- xmm2 = _mm_load_ps((float*)&src0[2]); ++ int bound = num_bytes >> 5; ++ ++ for (int i = 0; i < bound; ++i) { ++ xmm1 = _mm_load_ps((float*)source); ++ xmm2 = _mm_load_ps((float*)&source[2]); + +- src0 += 4; ++ source += 4; + + xmm1 = _mm_mul_ps(xmm1, xmm1); + xmm2 = _mm_mul_ps(xmm2, xmm2); +@@ -258,14 +257,14 @@ volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p + } + + if (num_bytes >> 4 & 1) { +- xmm2 = _mm_load_ps((float*)src0); ++ xmm2 = _mm_load_ps((float*)source); + + xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); + xmm8 = bit128_p(&xmm1)->int_vec; + + xmm2 = _mm_mul_ps(xmm2, xmm2); + +- src0 += 2; ++ source += 2; + + xmm1 = _mm_hadd_ps(xmm2, xmm2); + +@@ -286,7 +285,7 @@ volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p + + if (num_bytes >> 3 & 1) { + sq_dist = +- lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ lv_creal(source[0]) * lv_creal(source[0]) + lv_cimag(source[0]) * lv_cimag(source[0]); + + xmm2 = _mm_load1_ps(&sq_dist); + +@@ -322,21 +321,18 @@ volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* src0, uint32_t num_p + + #ifdef LV_HAVE_GENERIC + static inline void +-volk_32fc_index_min_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_points) ++volk_32fc_index_min_16u_generic(uint16_t* target, lv_32fc_t* source, uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +- + const uint32_t num_bytes = num_points * 8; + + float sq_dist = 0.0; + float min = FLT_MAX; + uint16_t index = 0; + +- uint32_t i = 0; +- +- for (; i> 3; ++i) { ++ for (uint32_t i = 0; i> 3; ++i) { + sq_dist = +- lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); ++ lv_creal(source[i]) * lv_creal(source[i]) + lv_cimag(source[i]) * lv_cimag(source[i]); + + if (sq_dist < min) { + index = i; +@@ -364,7 +360,7 @@ volk_32fc_index_min_16u_generic(uint16_t* target, lv_32fc_t* src0, uint32_t num_ + #include + + static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target, +- lv_32fc_t* src0, ++ lv_32fc_t* source, + uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +@@ -381,11 +377,11 @@ static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target, + __m256i min_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { +- __m256 in0 = _mm256_loadu_ps((float*)src0); +- __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); ++ __m256 in0 = _mm256_loadu_ps((float*)source); ++ __m256 in1 = _mm256_loadu_ps((float*)(source + 4)); + vector_32fc_index_min_variant0( + in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); +- src0 += 8; ++ source += 8; + } + + // determine minimum value and index in the result of the vectorized loop +@@ -406,12 +402,12 @@ static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target, + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = +- lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); ++ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); + if (abs_squared < min) { + min = abs_squared; + index = i; + } +- ++src0; ++ ++source; + } + + *target = index; +@@ -424,7 +420,7 @@ static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target, + #include + + static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target, +- lv_32fc_t* src0, ++ lv_32fc_t* source, + uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +@@ -441,11 +437,11 @@ static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target, + __m256i min_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { +- __m256 in0 = _mm256_loadu_ps((float*)src0); +- __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); ++ __m256 in0 = _mm256_loadu_ps((float*)source); ++ __m256 in1 = _mm256_loadu_ps((float*)(source + 4)); + vector_32fc_index_min_variant1( + in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); +- src0 += 8; ++ source += 8; + } + + // determine minimum value and index in the result of the vectorized loop +@@ -466,12 +462,12 @@ static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target, + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = +- lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); ++ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); + if (abs_squared < min) { + min = abs_squared; + index = i; + } +- ++src0; ++ ++source; + } + + *target = index; +diff --git a/kernels/volk/volk_32fc_index_min_32u.h b/kernels/volk/volk_32fc_index_min_32u.h +index 545f9bf..0539dd5 100644 +--- a/kernels/volk/volk_32fc_index_min_32u.h ++++ b/kernels/volk/volk_32fc_index_min_32u.h +@@ -30,11 +30,11 @@ + * + * Dispatcher Prototype + * \code +- * void volk_32fc_index_min_32u(uint32_t* target, lv_32fc_t* src0, uint32_t ++ * void volk_32fc_index_min_32u(uint32_t* target, lv_32fc_t* source, uint32_t + * num_points) \endcode + * + * \b Inputs +- * \li src0: The complex input vector. ++ * \li source: The complex input vector. + * \li num_points: The number of samples. + * + * \b Outputs +@@ -80,7 +80,7 @@ + #include + + static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target, +- lv_32fc_t* src0, ++ lv_32fc_t* source, + uint32_t num_points) + { + const __m256i indices_increment = _mm256_set1_epi32(8); +@@ -95,11 +95,11 @@ static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target, + __m256i min_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { +- __m256 in0 = _mm256_load_ps((float*)src0); +- __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); ++ __m256 in0 = _mm256_load_ps((float*)source); ++ __m256 in1 = _mm256_load_ps((float*)(source + 4)); + vector_32fc_index_min_variant0( + in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); +- src0 += 8; ++ source += 8; + } + + // determine minimum value and index in the result of the vectorized loop +@@ -120,12 +120,12 @@ static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target, + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = +- lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); ++ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); + if (abs_squared < min) { + min = abs_squared; + index = i; + } +- ++src0; ++ ++source; + } + + *target = index; +@@ -138,7 +138,7 @@ static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target, + #include + + static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target, +- lv_32fc_t* src0, ++ lv_32fc_t* source, + uint32_t num_points) + { + const __m256i indices_increment = _mm256_set1_epi32(8); +@@ -153,11 +153,11 @@ static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target, + __m256i min_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { +- __m256 in0 = _mm256_load_ps((float*)src0); +- __m256 in1 = _mm256_load_ps((float*)(src0 + 4)); ++ __m256 in0 = _mm256_load_ps((float*)source); ++ __m256 in1 = _mm256_load_ps((float*)(source + 4)); + vector_32fc_index_min_variant1( + in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); +- src0 += 8; ++ source += 8; + } + + // determine minimum value and index in the result of the vectorized loop +@@ -178,12 +178,12 @@ static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target, + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = +- lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); ++ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); + if (abs_squared < min) { + min = abs_squared; + index = i; + } +- ++src0; ++ ++source; + } + + *target = index; +@@ -196,7 +196,7 @@ static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target, + #include + + static inline void +-volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) ++volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* source, uint32_t num_points) + { + const uint32_t num_bytes = num_points * 8; + +@@ -213,19 +213,18 @@ volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p + holderf.int_vec = _mm_setzero_si128(); + holderi.int_vec = _mm_setzero_si128(); + +- int bound = num_bytes >> 5; +- int i = 0; +- + xmm8 = _mm_setr_epi32(0, 1, 2, 3); + xmm9 = _mm_setzero_si128(); + xmm10 = _mm_setr_epi32(4, 4, 4, 4); + xmm3 = _mm_set_ps1(FLT_MAX); + +- for (; i < bound; ++i) { +- xmm1 = _mm_load_ps((float*)src0); +- xmm2 = _mm_load_ps((float*)&src0[2]); ++ int bound = num_bytes >> 5; + +- src0 += 4; ++ for (int i = 0; i < bound; ++i) { ++ xmm1 = _mm_load_ps((float*)source); ++ xmm2 = _mm_load_ps((float*)&source[2]); ++ ++ source += 4; + + xmm1 = _mm_mul_ps(xmm1, xmm1); + xmm2 = _mm_mul_ps(xmm2, xmm2); +@@ -246,14 +245,14 @@ volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p + } + + if (num_bytes >> 4 & 1) { +- xmm2 = _mm_load_ps((float*)src0); ++ xmm2 = _mm_load_ps((float*)source); + + xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); + xmm8 = bit128_p(&xmm1)->int_vec; + + xmm2 = _mm_mul_ps(xmm2, xmm2); + +- src0 += 2; ++ source += 2; + + xmm1 = _mm_hadd_ps(xmm2, xmm2); + +@@ -274,7 +273,7 @@ volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p + + if (num_bytes >> 3 & 1) { + sq_dist = +- lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]); ++ lv_creal(source[0]) * lv_creal(source[0]) + lv_cimag(source[0]) * lv_cimag(source[0]); + + xmm2 = _mm_load1_ps(&sq_dist); + +@@ -310,7 +309,7 @@ volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_p + + #ifdef LV_HAVE_GENERIC + static inline void +-volk_32fc_index_min_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) ++volk_32fc_index_min_32u_generic(uint32_t* target, lv_32fc_t* source, uint32_t num_points) + { + const uint32_t num_bytes = num_points * 8; + +@@ -318,11 +317,9 @@ volk_32fc_index_min_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_ + float min = FLT_MAX; + uint32_t index = 0; + +- uint32_t i = 0; +- +- for (; i> 3; ++i) { ++ for (uint32_t i = 0; i> 3; ++i) { + sq_dist = +- lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]); ++ lv_creal(source[i]) * lv_creal(source[i]) + lv_cimag(source[i]) * lv_cimag(source[i]); + + if (sq_dist < min) { + index = i; +@@ -349,7 +346,7 @@ volk_32fc_index_min_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_ + #include + + static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target, +- lv_32fc_t* src0, ++ lv_32fc_t* source, + uint32_t num_points) + { + const __m256i indices_increment = _mm256_set1_epi32(8); +@@ -364,11 +361,11 @@ static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target, + __m256i min_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { +- __m256 in0 = _mm256_loadu_ps((float*)src0); +- __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); ++ __m256 in0 = _mm256_loadu_ps((float*)source); ++ __m256 in1 = _mm256_loadu_ps((float*)(source + 4)); + vector_32fc_index_min_variant0( + in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); +- src0 += 8; ++ source += 8; + } + + // determine minimum value and index in the result of the vectorized loop +@@ -389,12 +386,12 @@ static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target, + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = +- lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); ++ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); + if (abs_squared < min) { + min = abs_squared; + index = i; + } +- ++src0; ++ ++source; + } + + *target = index; +@@ -407,7 +404,7 @@ static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target, + #include + + static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target, +- lv_32fc_t* src0, ++ lv_32fc_t* source, + uint32_t num_points) + { + const __m256i indices_increment = _mm256_set1_epi32(8); +@@ -422,11 +419,11 @@ static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target, + __m256i min_indices = _mm256_setzero_si256(); + + for (unsigned i = 0; i < num_points / 8u; ++i) { +- __m256 in0 = _mm256_loadu_ps((float*)src0); +- __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4)); ++ __m256 in0 = _mm256_loadu_ps((float*)source); ++ __m256 in1 = _mm256_loadu_ps((float*)(source + 4)); + vector_32fc_index_min_variant1( + in0, in1, &min_values, &min_indices, ¤t_indices, indices_increment); +- src0 += 8; ++ source += 8; + } + + // determine minimum value and index in the result of the vectorized loop +@@ -447,12 +444,12 @@ static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target, + // handle tail not processed by the vectorized loop + for (unsigned i = num_points & (~7u); i < num_points; ++i) { + const float abs_squared = +- lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0); ++ lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source); + if (abs_squared < min) { + min = abs_squared; + index = i; + } +- ++src0; ++ ++source; + } + + *target = index; +@@ -465,11 +462,10 @@ static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target, + #include + + static inline void +-volk_32fc_index_min_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points) ++volk_32fc_index_min_32u_neon(uint32_t* target, lv_32fc_t* source, uint32_t num_points) + { +- unsigned int number = 0; + const uint32_t quarter_points = num_points / 4; +- const lv_32fc_t* src0Ptr = src0; ++ const lv_32fc_t* sourcePtr = source; + + uint32_t indices[4] = { 0, 1, 2, 3 }; + const uint32x4_t vec_indices_incr = vdupq_n_u32(4); +@@ -482,11 +478,11 @@ volk_32fc_index_min_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_poi + + float32x4_t vec_min = vdupq_n_f32(FLT_MAX); + +- for (; number < quarter_points; number++) { ++ for (uint32_t number = 0; number < quarter_points; number++) { + // Load complex and compute magnitude squared + const float32x4_t vec_mag2 = +- _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr)); +- __VOLK_PREFETCH(src0Ptr += 4); ++ _vmagnitudesquaredq_f32(vld2q_f32((float*)sourcePtr)); ++ __VOLK_PREFETCH(sourcePtr += 4); + // a < b? + const uint32x4_t lt_mask = vcltq_f32(vec_mag2, vec_min); + vec_min = vbslq_f32(lt_mask, vec_mag2, vec_min); +@@ -506,14 +502,14 @@ volk_32fc_index_min_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_poi + } + + // Deal with the rest +- for (number = quarter_points * 4; number < num_points; number++) { +- const float re = lv_creal(*src0Ptr); +- const float im = lv_cimag(*src0Ptr); ++ for (uint32_t number = quarter_points * 4; number < num_points; number++) { ++ const float re = lv_creal(*sourcePtr); ++ const float im = lv_cimag(*sourcePtr); + if ((re * re + im * im) < min) { +- min = *src0Ptr; ++ min = *sourcePtr; + index = number; + } +- src0Ptr++; ++ sourcePtr++; + } + *target = index; + } +-- +2.30.2 + diff --git a/patches/0005-Fix-clang-format-errors.patch b/patches/0005-Fix-clang-format-errors.patch new file mode 100644 index 0000000..4898b11 --- /dev/null +++ b/patches/0005-Fix-clang-format-errors.patch @@ -0,0 +1,112 @@ +From 2fb097c2f25f215bdb6a906a12aa6468d5bfc5c9 Mon Sep 17 00:00:00 2001 +From: Zlika +Date: Wed, 16 Jun 2021 15:21:46 +0200 +Subject: [PATCH 05/73] Fix clang-format errors + +Signed-off-by: Zlika +--- + kernels/volk/volk_32f_index_min_16u.h | 5 +++-- + kernels/volk/volk_32f_index_min_32u.h | 10 ++++++---- + kernels/volk/volk_32fc_index_min_16u.h | 8 ++++---- + kernels/volk/volk_32fc_index_min_32u.h | 8 ++++---- + 4 files changed, 17 insertions(+), 14 deletions(-) + +diff --git a/kernels/volk/volk_32f_index_min_16u.h b/kernels/volk/volk_32f_index_min_16u.h +index d8ffcc7..115835e 100644 +--- a/kernels/volk/volk_32f_index_min_16u.h ++++ b/kernels/volk/volk_32f_index_min_16u.h +@@ -140,8 +140,9 @@ volk_32f_index_min_16u_a_avx(uint16_t* target, const float* source, uint32_t num + #ifdef LV_HAVE_SSE4_1 + #include + +-static inline void +-volk_32f_index_min_16u_a_sse4_1(uint16_t* target, const float* source, uint32_t num_points) ++static inline void volk_32f_index_min_16u_a_sse4_1(uint16_t* target, ++ const float* source, ++ uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; + const uint32_t quarterPoints = num_points / 4; +diff --git a/kernels/volk/volk_32f_index_min_32u.h b/kernels/volk/volk_32f_index_min_32u.h +index 23c2d17..a68ba9c 100644 +--- a/kernels/volk/volk_32f_index_min_32u.h ++++ b/kernels/volk/volk_32f_index_min_32u.h +@@ -72,8 +72,9 @@ + #ifdef LV_HAVE_SSE4_1 + #include + +-static inline void +-volk_32f_index_min_32u_a_sse4_1(uint32_t* target, const float* source, uint32_t num_points) ++static inline void volk_32f_index_min_32u_a_sse4_1(uint32_t* target, ++ const float* source, ++ uint32_t num_points) + { + if (num_points > 0) { + const uint32_t quarterPoints = num_points / 4; +@@ -421,8 +422,9 @@ volk_32f_index_min_32u_u_avx(uint32_t* target, const float* source, uint32_t num + #ifdef LV_HAVE_SSE4_1 + #include + +-static inline void +-volk_32f_index_min_32u_u_sse4_1(uint32_t* target, const float* source, uint32_t num_points) ++static inline void volk_32f_index_min_32u_u_sse4_1(uint32_t* target, ++ const float* source, ++ uint32_t num_points) + { + if (num_points > 0) { + const uint32_t quarterPoints = num_points / 4; +diff --git a/kernels/volk/volk_32fc_index_min_16u.h b/kernels/volk/volk_32fc_index_min_16u.h +index bf7f6e3..8f40730 100644 +--- a/kernels/volk/volk_32fc_index_min_16u.h ++++ b/kernels/volk/volk_32fc_index_min_16u.h +@@ -284,8 +284,8 @@ volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* source, uint32_t num + } + + if (num_bytes >> 3 & 1) { +- sq_dist = +- lv_creal(source[0]) * lv_creal(source[0]) + lv_cimag(source[0]) * lv_cimag(source[0]); ++ sq_dist = lv_creal(source[0]) * lv_creal(source[0]) + ++ lv_cimag(source[0]) * lv_cimag(source[0]); + + xmm2 = _mm_load1_ps(&sq_dist); + +@@ -331,8 +331,8 @@ volk_32fc_index_min_16u_generic(uint16_t* target, lv_32fc_t* source, uint32_t nu + uint16_t index = 0; + + for (uint32_t i = 0; i> 3; ++i) { +- sq_dist = +- lv_creal(source[i]) * lv_creal(source[i]) + lv_cimag(source[i]) * lv_cimag(source[i]); ++ sq_dist = lv_creal(source[i]) * lv_creal(source[i]) + ++ lv_cimag(source[i]) * lv_cimag(source[i]); + + if (sq_dist < min) { + index = i; +diff --git a/kernels/volk/volk_32fc_index_min_32u.h b/kernels/volk/volk_32fc_index_min_32u.h +index 0539dd5..efa33ee 100644 +--- a/kernels/volk/volk_32fc_index_min_32u.h ++++ b/kernels/volk/volk_32fc_index_min_32u.h +@@ -272,8 +272,8 @@ volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* source, uint32_t num + } + + if (num_bytes >> 3 & 1) { +- sq_dist = +- lv_creal(source[0]) * lv_creal(source[0]) + lv_cimag(source[0]) * lv_cimag(source[0]); ++ sq_dist = lv_creal(source[0]) * lv_creal(source[0]) + ++ lv_cimag(source[0]) * lv_cimag(source[0]); + + xmm2 = _mm_load1_ps(&sq_dist); + +@@ -318,8 +318,8 @@ volk_32fc_index_min_32u_generic(uint32_t* target, lv_32fc_t* source, uint32_t nu + uint32_t index = 0; + + for (uint32_t i = 0; i> 3; ++i) { +- sq_dist = +- lv_creal(source[i]) * lv_creal(source[i]) + lv_cimag(source[i]) * lv_cimag(source[i]); ++ sq_dist = lv_creal(source[i]) * lv_creal(source[i]) + ++ lv_cimag(source[i]) * lv_cimag(source[i]); + + if (sq_dist < min) { + index = i; +-- +2.30.2 + diff --git a/patches/0006-New-generic-implementation-fixed-typos.patch b/patches/0006-New-generic-implementation-fixed-typos.patch new file mode 100644 index 0000000..bbf7b32 --- /dev/null +++ b/patches/0006-New-generic-implementation-fixed-typos.patch @@ -0,0 +1,100 @@ +From 78a900ad5030ce13e38994a6d2c5c74e6c80b2d2 Mon Sep 17 00:00:00 2001 +From: Magnus Lundmark +Date: Fri, 18 Jun 2021 15:16:22 +0200 +Subject: [PATCH 06/73] New generic implementation, fixed typos + +Signed-off-by: Magnus Lundmark +--- + .../volk/volk_32f_stddev_and_mean_32f_x2.h | 11 ++++-- + .../volk_32fc_x2_conjugate_dot_prod_32fc.h | 37 +++++++++++++++++-- + 2 files changed, 41 insertions(+), 7 deletions(-) + +diff --git a/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h b/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h +index f62e630..accb441 100644 +--- a/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h ++++ b/kernels/volk/volk_32f_stddev_and_mean_32f_x2.h +@@ -43,16 +43,19 @@ + * + * \b Example + * Generate random numbers with c++11's normal distribution and estimate the mean and +- * standard deviation \code int N = 1000; unsigned int alignment = volk_get_alignment(); ++ * standard deviation ++ * \code ++ * int N = 1000; ++ * unsigned int alignment = volk_get_alignment(); + * float* rand_numbers = (float*) volk_malloc(sizeof(float)*N, alignment); + * float* mean = (float*) volk_malloc(sizeof(float), alignment); + * float* stddev = (float*) volk_malloc(sizeof(float), alignment); + * +- * // Use a normal generator with 0 mean, stddev 1 ++ * // Use a normal generator with 0 mean, stddev 1000 + * std::default_random_engine generator; +- * std::normal_distribution distribution(0,1000); ++ * std::normal_distribution distribution(0, 1000); + * +- * for(unsigned int ii = 0; ii < N; ++ii){ ++ * for(unsigned int ii = 0; ii < N; ++ii) { + * rand_numbers[ii] = distribution(generator); + * } + * +diff --git a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h +index 0f69499..4aeb05a 100644 +--- a/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h ++++ b/kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h +@@ -47,12 +47,27 @@ + * + * \b Example + * \code +- * int N = 10000; ++ * unsigned int N = 1000; ++ * unsigned int alignment = volk_get_alignment(); + * +- * ++ * lv_32fc_t* a = (lv_32fc_t*) volk_malloc(sizeof(lv_32fc_t) * N, alignment); ++ * lv_32fc_t* b = (lv_32fc_t*) volk_malloc(sizeof(lv_32fc_t) * N, alignment); + * +- * volk_32fc_x2_conjugate_dot_prod_32fc(); ++ * for (int i = 0; i < N; ++i) { ++ * a[i] = lv_cmake(.50f, .50f); ++ * b[i] = lv_cmake(.50f, .75f); ++ * } + * ++ * lv_32fc_t e = (float) N * a[0] * lv_conj(b[0]); // When a and b constant ++ * lv_32fc_t res; ++ * ++ * volk_32fc_x2_conjugate_dot_prod_32fc(&res, a, b, N); ++ * ++ * printf("Expected: %8.2f%+8.2fi\n", lv_real(e), lv_imag(e)); ++ * printf("Result: %8.2f%+8.2fi\n", lv_real(res), lv_imag(res)); ++ * ++ * volk_free(a); ++ * volk_free(b); + * \endcode + */ + +@@ -70,6 +85,22 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* resul + const lv_32fc_t* taps, + unsigned int num_points) + { ++ lv_32fc_t res = lv_cmake(0.f, 0.f); ++ for (unsigned int i = 0; i < num_points; ++i) { ++ res += (*input++) * lv_conj((*taps++)); ++ } ++ *result = res; ++} ++ ++#endif /*LV_HAVE_GENERIC*/ ++ ++#ifdef LV_HAVE_GENERIC ++ ++static inline void volk_32fc_x2_conjugate_dot_prod_32fc_block(lv_32fc_t* result, ++ const lv_32fc_t* input, ++ const lv_32fc_t* taps, ++ unsigned int num_points) ++{ + + const unsigned int num_bytes = num_points * 8; + +-- +2.30.2 + diff --git a/patches/0007-Add-the-list-of-contributors-agreeing-to-LGPL-licens.patch b/patches/0007-Add-the-list-of-contributors-agreeing-to-LGPL-licens.patch new file mode 100644 index 0000000..fee1d11 --- /dev/null +++ b/patches/0007-Add-the-list-of-contributors-agreeing-to-LGPL-licens.patch @@ -0,0 +1,313 @@ +From 63d110c49e69d145eb0fa71c3cd8f27562553f1d Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marcus=20M=C3=BCller?= +Date: Thu, 1 Jul 2021 19:41:27 +0200 +Subject: [PATCH 07/73] Add the list of contributors agreeing to LGPL licensing +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +* List of contributors, with short explanation +* added LGPLv3 license text as COPYING-LGPL +* moved GPLv3 license text to COPYING-GPL +* COPYING still contains the GPLv3, but with an explanation that new + code is LGPL +* updated the contribution guide +* Removed the GPL license header from Readme, added explanation. + +Signed-off-by: Marcus Müller +--- + AUTHORS_GRANTING_LGPL_LICENSE.txt | 15 +++ + CONTRIBUTING.md | 8 +- + COPYING | 10 ++ + COPYING-GPL | 1 + + COPYING-LGPL | 175 ++++++++++++++++++++++++++++++ + README.md | 26 +---- + 6 files changed, 211 insertions(+), 24 deletions(-) + create mode 100644 AUTHORS_GRANTING_LGPL_LICENSE.txt + create mode 120000 COPYING-GPL + create mode 100644 COPYING-LGPL + +diff --git a/AUTHORS_GRANTING_LGPL_LICENSE.txt b/AUTHORS_GRANTING_LGPL_LICENSE.txt +new file mode 100644 +index 0000000..7205328 +--- /dev/null ++++ b/AUTHORS_GRANTING_LGPL_LICENSE.txt +@@ -0,0 +1,15 @@ ++VOLK is going to migrating from GPLv3 (GNU General Public license version 3.0) ++to LGPLv3 (GNU Lesser General Public License version 3.0). ++ ++This file is a list of the authors who agreed to grant an LGPL license to the ++code they contributed to this repository. In case the affected code is currently ++licensed differently (GPLv3), this gives the right to use the current ++contributions under both LGPLv3 and that other license. Future contributions by ++these authors are, however, licensed under LGPLv3, unless explicitly stated ++otherwise. ++ ++Together with the date of agreement, these authors are: ++ ++| Date | Author (as used in commits) | ++|------+-----------------------------| ++| | | +diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md +index bfbbad7..87dffde 100644 +--- a/CONTRIBUTING.md ++++ b/CONTRIBUTING.md +@@ -21,10 +21,10 @@ code. + + ## DCO Signed? + +-Any code contributions going into VOLK will become part of a GPL-licensed, +-open source repository. It is therefore imperative that code submissions belong +-to the authors, and that submitters have the authority to merge that code into +-the public VOLK codebase. ++Any code contributions going into VOLK will become part of an LPGPL-licensed ++(former contributions are GPL-licensed), open source repository. It is therefore ++imperative that code submissions belong to the authors, and that submitters have ++the authority to merge that code into the public VOLK codebase. + + For that purpose, we use the [Developer's Certificate of Origin](DCO.txt). It + is the same document used by other projects. Signing the DCO states that there +diff --git a/COPYING b/COPYING +index 94a9ed0..6c1874b 100644 +--- a/COPYING ++++ b/COPYING +@@ -1,3 +1,13 @@ ++Files in this code repository are to be licensed under the LGPLv3, which you'll ++find in the file COPYING-LGPL ++ ++However, you'll find some files that carry a license header that assigns them as ++GPLv3, which was the default license for VOLK up to version 3. You can find the ++full license text of the GPLv3 below. ++ ++================================================================================ ++ ++ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + +diff --git a/COPYING-GPL b/COPYING-GPL +new file mode 120000 +index 0000000..d24842f +--- /dev/null ++++ b/COPYING-GPL +@@ -0,0 +1 @@ ++COPYING +\ No newline at end of file +diff --git a/COPYING-LGPL b/COPYING-LGPL +new file mode 100644 +index 0000000..21ca013 +--- /dev/null ++++ b/COPYING-LGPL +@@ -0,0 +1,175 @@ ++Files in this code repository are to be licensed under the LGPLv3, which you'll ++find below. ++ ++However, you'll find some files that carry a license header that assigns them as ++GPLv3, which was the default license for VOLK up to version 3. You can find the ++full license text of the GPLv3 in the file COPYING-GPL. ++ ++================================================================================ ++ ++ ++ GNU LESSER GENERAL PUBLIC LICENSE ++ Version 3, 29 June 2007 ++ ++ Copyright (C) 2007 Free Software Foundation, Inc. ++ Everyone is permitted to copy and distribute verbatim copies ++ of this license document, but changing it is not allowed. ++ ++ ++ This version of the GNU Lesser General Public License incorporates ++the terms and conditions of version 3 of the GNU General Public ++License, supplemented by the additional permissions listed below. ++ ++ 0. Additional Definitions. ++ ++ As used herein, "this License" refers to version 3 of the GNU Lesser ++General Public License, and the "GNU GPL" refers to version 3 of the GNU ++General Public License. ++ ++ "The Library" refers to a covered work governed by this License, ++other than an Application or a Combined Work as defined below. ++ ++ An "Application" is any work that makes use of an interface provided ++by the Library, but which is not otherwise based on the Library. ++Defining a subclass of a class defined by the Library is deemed a mode ++of using an interface provided by the Library. ++ ++ A "Combined Work" is a work produced by combining or linking an ++Application with the Library. The particular version of the Library ++with which the Combined Work was made is also called the "Linked ++Version". ++ ++ The "Minimal Corresponding Source" for a Combined Work means the ++Corresponding Source for the Combined Work, excluding any source code ++for portions of the Combined Work that, considered in isolation, are ++based on the Application, and not on the Linked Version. ++ ++ The "Corresponding Application Code" for a Combined Work means the ++object code and/or source code for the Application, including any data ++and utility programs needed for reproducing the Combined Work from the ++Application, but excluding the System Libraries of the Combined Work. ++ ++ 1. Exception to Section 3 of the GNU GPL. ++ ++ You may convey a covered work under sections 3 and 4 of this License ++without being bound by section 3 of the GNU GPL. ++ ++ 2. Conveying Modified Versions. ++ ++ If you modify a copy of the Library, and, in your modifications, a ++facility refers to a function or data to be supplied by an Application ++that uses the facility (other than as an argument passed when the ++facility is invoked), then you may convey a copy of the modified ++version: ++ ++ a) under this License, provided that you make a good faith effort to ++ ensure that, in the event an Application does not supply the ++ function or data, the facility still operates, and performs ++ whatever part of its purpose remains meaningful, or ++ ++ b) under the GNU GPL, with none of the additional permissions of ++ this License applicable to that copy. ++ ++ 3. Object Code Incorporating Material from Library Header Files. ++ ++ The object code form of an Application may incorporate material from ++a header file that is part of the Library. You may convey such object ++code under terms of your choice, provided that, if the incorporated ++material is not limited to numerical parameters, data structure ++layouts and accessors, or small macros, inline functions and templates ++(ten or fewer lines in length), you do both of the following: ++ ++ a) Give prominent notice with each copy of the object code that the ++ Library is used in it and that the Library and its use are ++ covered by this License. ++ ++ b) Accompany the object code with a copy of the GNU GPL and this license ++ document. ++ ++ 4. Combined Works. ++ ++ You may convey a Combined Work under terms of your choice that, ++taken together, effectively do not restrict modification of the ++portions of the Library contained in the Combined Work and reverse ++engineering for debugging such modifications, if you also do each of ++the following: ++ ++ a) Give prominent notice with each copy of the Combined Work that ++ the Library is used in it and that the Library and its use are ++ covered by this License. ++ ++ b) Accompany the Combined Work with a copy of the GNU GPL and this license ++ document. ++ ++ c) For a Combined Work that displays copyright notices during ++ execution, include the copyright notice for the Library among ++ these notices, as well as a reference directing the user to the ++ copies of the GNU GPL and this license document. ++ ++ d) Do one of the following: ++ ++ 0) Convey the Minimal Corresponding Source under the terms of this ++ License, and the Corresponding Application Code in a form ++ suitable for, and under terms that permit, the user to ++ recombine or relink the Application with a modified version of ++ the Linked Version to produce a modified Combined Work, in the ++ manner specified by section 6 of the GNU GPL for conveying ++ Corresponding Source. ++ ++ 1) Use a suitable shared library mechanism for linking with the ++ Library. A suitable mechanism is one that (a) uses at run time ++ a copy of the Library already present on the user's computer ++ system, and (b) will operate properly with a modified version ++ of the Library that is interface-compatible with the Linked ++ Version. ++ ++ e) Provide Installation Information, but only if you would otherwise ++ be required to provide such information under section 6 of the ++ GNU GPL, and only to the extent that such information is ++ necessary to install and execute a modified version of the ++ Combined Work produced by recombining or relinking the ++ Application with a modified version of the Linked Version. (If ++ you use option 4d0, the Installation Information must accompany ++ the Minimal Corresponding Source and Corresponding Application ++ Code. If you use option 4d1, you must provide the Installation ++ Information in the manner specified by section 6 of the GNU GPL ++ for conveying Corresponding Source.) ++ ++ 5. Combined Libraries. ++ ++ You may place library facilities that are a work based on the ++Library side by side in a single library together with other library ++facilities that are not Applications and are not covered by this ++License, and convey such a combined library under terms of your ++choice, if you do both of the following: ++ ++ a) Accompany the combined library with a copy of the same work based ++ on the Library, uncombined with any other library facilities, ++ conveyed under the terms of this License. ++ ++ b) Give prominent notice with the combined library that part of it ++ is a work based on the Library, and explaining where to find the ++ accompanying uncombined form of the same work. ++ ++ 6. Revised Versions of the GNU Lesser General Public License. ++ ++ The Free Software Foundation may publish revised and/or new versions ++of the GNU Lesser General Public License from time to time. Such new ++versions will be similar in spirit to the present version, but may ++differ in detail to address new problems or concerns. ++ ++ Each version is given a distinguishing version number. If the ++Library as you received it specifies that a certain numbered version ++of the GNU Lesser General Public License "or any later version" ++applies to it, you have the option of following the terms and ++conditions either of that published version or of any later version ++published by the Free Software Foundation. If the Library as you ++received it does not specify a version number of the GNU Lesser ++General Public License, you may choose any version of the GNU Lesser ++General Public License ever published by the Free Software Foundation. ++ ++ If the Library as you received it specifies that a proxy can decide ++whether future versions of the GNU Lesser General Public License shall ++apply, that proxy's public statement of acceptance of any version is ++permanent authorization for you to choose that version for the ++Library. +diff --git a/README.md b/README.md +index 8152b60..013a9ae 100644 +--- a/README.md ++++ b/README.md +@@ -98,23 +98,9 @@ We want to make sure VOLK compiles on a wide variety of compilers. Thus, we targ + + ## License + +-> +-> Copyright 2015 Free Software Foundation, Inc. +-> +-> This file is part of VOLK +-> +-> VOLK is free software; you can redistribute it and/or modify +-> it under the terms of the GNU General Public License as published by +-> the Free Software Foundation; either version 3, or (at your option) +-> any later version. +-> +-> VOLK is distributed in the hope that it will be useful, +-> but WITHOUT ANY WARRANTY; without even the implied warranty of +-> MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +-> GNU General Public License for more details. +-> +-> You should have received a copy of the GNU General Public License +-> along with GNU Radio; see the file COPYING. If not, write to +-> the Free Software Foundation, Inc., 51 Franklin Street, +-> Boston, MA 02110-1301, USA. +-> ++VOLK is moving from the GNU General Public License version 3.0 (GPLv3) to the ++GNU Lesser General Public License version 3.0 (LGPLv3). At this point in time, ++much of the code in the repository is still GPL-licensed, but new contributors ++are asked to use the LGPLv3 for their code contributions. Existing contributors ++are very kindly requested to also allow LPGL-licensing by adding their name to ++the file `AUTHORS_GRANTING_LGPL_LICENSE.txt`. +-- +2.30.2 + diff --git a/patches/0009-Code-cleanup.patch b/patches/0009-Code-cleanup.patch new file mode 100644 index 0000000..cf6e97a --- /dev/null +++ b/patches/0009-Code-cleanup.patch @@ -0,0 +1,924 @@ +From f8714d89a3accaab78711c276c98199f1991af72 Mon Sep 17 00:00:00 2001 +From: Zlika +Date: Mon, 5 Jul 2021 13:05:18 +0200 +Subject: [PATCH 09/73] Code cleanup + +Signed-off-by: Zlika +--- + kernels/volk/volk_32f_index_min_16u.h | 6 +- + kernels/volk/volk_32f_index_min_32u.h | 602 ++++++++++++------------- + kernels/volk/volk_32fc_index_min_16u.h | 16 +- + kernels/volk/volk_32fc_index_min_32u.h | 18 +- + 4 files changed, 310 insertions(+), 332 deletions(-) + +diff --git a/kernels/volk/volk_32f_index_min_16u.h b/kernels/volk/volk_32f_index_min_16u.h +index 115835e..00acd85 100644 +--- a/kernels/volk/volk_32f_index_min_16u.h ++++ b/kernels/volk/volk_32f_index_min_16u.h +@@ -2,14 +2,14 @@ + /* + * Copyright 2021 Free Software Foundation, Inc. + * +- * This file is part of GNU Radio ++ * This file is part of VOLK + * +- * GNU Radio is free software; you can redistribute it and/or modify ++ * VOLK is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * +- * GNU Radio is distributed in the hope that it will be useful, ++ * VOLK is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. +diff --git a/kernels/volk/volk_32f_index_min_32u.h b/kernels/volk/volk_32f_index_min_32u.h +index a68ba9c..c71ee60 100644 +--- a/kernels/volk/volk_32f_index_min_32u.h ++++ b/kernels/volk/volk_32f_index_min_32u.h +@@ -2,14 +2,14 @@ + /* + * Copyright 2021 Free Software Foundation, Inc. + * +- * This file is part of GNU Radio ++ * This file is part of VOLK + * +- * GNU Radio is free software; you can redistribute it and/or modify ++ * VOLK is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * +- * GNU Radio is distributed in the hope that it will be useful, ++ * VOLK is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. +@@ -76,59 +76,57 @@ static inline void volk_32f_index_min_32u_a_sse4_1(uint32_t* target, + const float* source, + uint32_t num_points) + { +- if (num_points > 0) { +- const uint32_t quarterPoints = num_points / 4; ++ const uint32_t quarterPoints = num_points / 4; + +- float* inputPtr = (float*)source; ++ float* inputPtr = (float*)source; + +- __m128 indexIncrementValues = _mm_set1_ps(4); +- __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); ++ __m128 indexIncrementValues = _mm_set1_ps(4); ++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); + +- float min = source[0]; +- float index = 0; +- __m128 minValues = _mm_set1_ps(min); +- __m128 minValuesIndex = _mm_setzero_ps(); +- __m128 compareResults; +- __m128 currentValues; ++ float min = source[0]; ++ float index = 0; ++ __m128 minValues = _mm_set1_ps(min); ++ __m128 minValuesIndex = _mm_setzero_ps(); ++ __m128 compareResults; ++ __m128 currentValues; + +- __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; +- __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; ++ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; ++ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; + +- for (uint32_t number = 0; number < quarterPoints; number++) { ++ for (uint32_t number = 0; number < quarterPoints; number++) { + +- currentValues = _mm_load_ps(inputPtr); +- inputPtr += 4; +- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ currentValues = _mm_load_ps(inputPtr); ++ inputPtr += 4; ++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + +- compareResults = _mm_cmplt_ps(currentValues, minValues); ++ compareResults = _mm_cmplt_ps(currentValues, minValues); + +- minValuesIndex = +- _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults); +- minValues = _mm_blendv_ps(minValues, currentValues, compareResults); +- } ++ minValuesIndex = ++ _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults); ++ minValues = _mm_blendv_ps(minValues, currentValues, compareResults); ++ } + +- // Calculate the smallest value from the remaining 4 points +- _mm_store_ps(minValuesBuffer, minValues); +- _mm_store_ps(minIndexesBuffer, minValuesIndex); ++ // Calculate the smallest value from the remaining 4 points ++ _mm_store_ps(minValuesBuffer, minValues); ++ _mm_store_ps(minIndexesBuffer, minValuesIndex); + +- for (uint32_t number = 0; number < 4; number++) { +- if (minValuesBuffer[number] < min) { ++ for (uint32_t number = 0; number < 4; number++) { ++ if (minValuesBuffer[number] < min) { ++ index = minIndexesBuffer[number]; ++ min = minValuesBuffer[number]; ++ } else if (minValuesBuffer[number] == min) { ++ if (index > minIndexesBuffer[number]) + index = minIndexesBuffer[number]; +- min = minValuesBuffer[number]; +- } else if (minValuesBuffer[number] == min) { +- if (index > minIndexesBuffer[number]) +- index = minIndexesBuffer[number]; +- } + } ++ } + +- for (uint32_t number = quarterPoints * 4; number < num_points; number++) { +- if (source[number] < min) { +- index = number; +- min = source[number]; +- } ++ for (uint32_t number = quarterPoints * 4; number < num_points; number++) { ++ if (source[number] < min) { ++ index = number; ++ min = source[number]; + } +- target[0] = (uint32_t)index; + } ++ target[0] = (uint32_t)index; + } + + #endif /*LV_HAVE_SSE4_1*/ +@@ -141,61 +139,59 @@ static inline void volk_32f_index_min_32u_a_sse4_1(uint32_t* target, + static inline void + volk_32f_index_min_32u_a_sse(uint32_t* target, const float* source, uint32_t num_points) + { +- if (num_points > 0) { +- const uint32_t quarterPoints = num_points / 4; ++ const uint32_t quarterPoints = num_points / 4; + +- float* inputPtr = (float*)source; ++ float* inputPtr = (float*)source; + +- __m128 indexIncrementValues = _mm_set1_ps(4); +- __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); ++ __m128 indexIncrementValues = _mm_set1_ps(4); ++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); + +- float min = source[0]; +- float index = 0; +- __m128 minValues = _mm_set1_ps(min); +- __m128 minValuesIndex = _mm_setzero_ps(); +- __m128 compareResults; +- __m128 currentValues; ++ float min = source[0]; ++ float index = 0; ++ __m128 minValues = _mm_set1_ps(min); ++ __m128 minValuesIndex = _mm_setzero_ps(); ++ __m128 compareResults; ++ __m128 currentValues; + +- __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; +- __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; ++ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; ++ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; + +- for (uint32_t number = 0; number < quarterPoints; number++) { ++ for (uint32_t number = 0; number < quarterPoints; number++) { + +- currentValues = _mm_load_ps(inputPtr); +- inputPtr += 4; +- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ currentValues = _mm_load_ps(inputPtr); ++ inputPtr += 4; ++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + +- compareResults = _mm_cmplt_ps(currentValues, minValues); ++ compareResults = _mm_cmplt_ps(currentValues, minValues); + +- minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), +- _mm_andnot_ps(compareResults, minValuesIndex)); ++ minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), ++ _mm_andnot_ps(compareResults, minValuesIndex)); + +- minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), +- _mm_andnot_ps(compareResults, minValues)); +- } ++ minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), ++ _mm_andnot_ps(compareResults, minValues)); ++ } + +- // Calculate the smallest value from the remaining 4 points +- _mm_store_ps(minValuesBuffer, minValues); +- _mm_store_ps(minIndexesBuffer, minValuesIndex); ++ // Calculate the smallest value from the remaining 4 points ++ _mm_store_ps(minValuesBuffer, minValues); ++ _mm_store_ps(minIndexesBuffer, minValuesIndex); + +- for (uint32_t number = 0; number < 4; number++) { +- if (minValuesBuffer[number] < min) { ++ for (uint32_t number = 0; number < 4; number++) { ++ if (minValuesBuffer[number] < min) { ++ index = minIndexesBuffer[number]; ++ min = minValuesBuffer[number]; ++ } else if (minValuesBuffer[number] == min) { ++ if (index > minIndexesBuffer[number]) + index = minIndexesBuffer[number]; +- min = minValuesBuffer[number]; +- } else if (minValuesBuffer[number] == min) { +- if (index > minIndexesBuffer[number]) +- index = minIndexesBuffer[number]; +- } + } ++ } + +- for (uint32_t number = quarterPoints * 4; number < num_points; number++) { +- if (source[number] < min) { +- index = number; +- min = source[number]; +- } ++ for (uint32_t number = quarterPoints * 4; number < num_points; number++) { ++ if (source[number] < min) { ++ index = number; ++ min = source[number]; + } +- target[0] = (uint32_t)index; + } ++ target[0] = (uint32_t)index; + } + + #endif /*LV_HAVE_SSE*/ +@@ -207,56 +203,54 @@ volk_32f_index_min_32u_a_sse(uint32_t* target, const float* source, uint32_t num + static inline void + volk_32f_index_min_32u_a_avx(uint32_t* target, const float* source, uint32_t num_points) + { +- if (num_points > 0) { +- const uint32_t quarterPoints = num_points / 8; +- +- float* inputPtr = (float*)source; +- +- __m256 indexIncrementValues = _mm256_set1_ps(8); +- __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); +- +- float min = source[0]; +- float index = 0; +- __m256 minValues = _mm256_set1_ps(min); +- __m256 minValuesIndex = _mm256_setzero_ps(); +- __m256 compareResults; +- __m256 currentValues; +- +- __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8]; +- __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8]; +- +- for (uint32_t number = 0; number < quarterPoints; number++) { +- currentValues = _mm256_load_ps(inputPtr); +- inputPtr += 8; +- currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); +- compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS); +- minValuesIndex = +- _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults); +- minValues = _mm256_blendv_ps(minValues, currentValues, compareResults); +- } ++ const uint32_t quarterPoints = num_points / 8; ++ ++ float* inputPtr = (float*)source; ++ ++ __m256 indexIncrementValues = _mm256_set1_ps(8); ++ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); ++ ++ float min = source[0]; ++ float index = 0; ++ __m256 minValues = _mm256_set1_ps(min); ++ __m256 minValuesIndex = _mm256_setzero_ps(); ++ __m256 compareResults; ++ __m256 currentValues; ++ ++ __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8]; ++ __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8]; ++ ++ for (uint32_t number = 0; number < quarterPoints; number++) { ++ currentValues = _mm256_load_ps(inputPtr); ++ inputPtr += 8; ++ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); ++ compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS); ++ minValuesIndex = ++ _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults); ++ minValues = _mm256_blendv_ps(minValues, currentValues, compareResults); ++ } + +- // Calculate the smallest value from the remaining 8 points +- _mm256_store_ps(minValuesBuffer, minValues); +- _mm256_store_ps(minIndexesBuffer, minValuesIndex); ++ // Calculate the smallest value from the remaining 8 points ++ _mm256_store_ps(minValuesBuffer, minValues); ++ _mm256_store_ps(minIndexesBuffer, minValuesIndex); + +- for (uint32_t number = 0; number < 8; number++) { +- if (minValuesBuffer[number] < min) { ++ for (uint32_t number = 0; number < 8; number++) { ++ if (minValuesBuffer[number] < min) { ++ index = minIndexesBuffer[number]; ++ min = minValuesBuffer[number]; ++ } else if (minValuesBuffer[number] == min) { ++ if (index > minIndexesBuffer[number]) + index = minIndexesBuffer[number]; +- min = minValuesBuffer[number]; +- } else if (minValuesBuffer[number] == min) { +- if (index > minIndexesBuffer[number]) +- index = minIndexesBuffer[number]; +- } + } ++ } + +- for (uint32_t number = quarterPoints * 8; number < num_points; number++) { +- if (source[number] < min) { +- index = number; +- min = source[number]; +- } ++ for (uint32_t number = quarterPoints * 8; number < num_points; number++) { ++ if (source[number] < min) { ++ index = number; ++ min = source[number]; + } +- target[0] = (uint32_t)index; + } ++ target[0] = (uint32_t)index; + } + + #endif /*LV_HAVE_AVX*/ +@@ -268,58 +262,56 @@ volk_32f_index_min_32u_a_avx(uint32_t* target, const float* source, uint32_t num + static inline void + volk_32f_index_min_32u_neon(uint32_t* target, const float* source, uint32_t num_points) + { +- if (num_points > 0) { +- const uint32_t quarterPoints = num_points / 4; +- +- float* inputPtr = (float*)source; +- float32x4_t indexIncrementValues = vdupq_n_f32(4); +- __VOLK_ATTR_ALIGNED(16) +- float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f }; +- float32x4_t currentIndexes = vld1q_f32(currentIndexes_float); +- +- float min = source[0]; +- float index = 0; +- float32x4_t minValues = vdupq_n_f32(min); +- uint32x4_t minValuesIndex = vmovq_n_u32(0); +- uint32x4_t compareResults; +- uint32x4_t currentIndexes_u; +- float32x4_t currentValues; +- +- __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; +- __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; +- +- for (uint32_t number = 0; number < quarterPoints; number++) { +- currentValues = vld1q_f32(inputPtr); +- inputPtr += 4; +- currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); +- currentIndexes_u = vcvtq_u32_f32(currentIndexes); +- compareResults = vcgeq_f32(currentValues, minValues); +- minValuesIndex = vorrq_u32(vandq_u32(compareResults, minValuesIndex), +- vbicq_u32(currentIndexes_u, compareResults)); +- minValues = vminq_f32(currentValues, minValues); +- } ++ const uint32_t quarterPoints = num_points / 4; ++ ++ float* inputPtr = (float*)source; ++ float32x4_t indexIncrementValues = vdupq_n_f32(4); ++ __VOLK_ATTR_ALIGNED(16) ++ float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f }; ++ float32x4_t currentIndexes = vld1q_f32(currentIndexes_float); ++ ++ float min = source[0]; ++ float index = 0; ++ float32x4_t minValues = vdupq_n_f32(min); ++ uint32x4_t minValuesIndex = vmovq_n_u32(0); ++ uint32x4_t compareResults; ++ uint32x4_t currentIndexes_u; ++ float32x4_t currentValues; ++ ++ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; ++ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; ++ ++ for (uint32_t number = 0; number < quarterPoints; number++) { ++ currentValues = vld1q_f32(inputPtr); ++ inputPtr += 4; ++ currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); ++ currentIndexes_u = vcvtq_u32_f32(currentIndexes); ++ compareResults = vcgeq_f32(currentValues, minValues); ++ minValuesIndex = vorrq_u32(vandq_u32(compareResults, minValuesIndex), ++ vbicq_u32(currentIndexes_u, compareResults)); ++ minValues = vminq_f32(currentValues, minValues); ++ } + +- // Calculate the smallest value from the remaining 4 points +- vst1q_f32(minValuesBuffer, minValues); +- vst1q_f32(minIndexesBuffer, vcvtq_f32_u32(minValuesIndex)); +- for (uint32_t number = 0; number < 4; number++) { +- if (minValuesBuffer[number] < min) { ++ // Calculate the smallest value from the remaining 4 points ++ vst1q_f32(minValuesBuffer, minValues); ++ vst1q_f32(minIndexesBuffer, vcvtq_f32_u32(minValuesIndex)); ++ for (uint32_t number = 0; number < 4; number++) { ++ if (minValuesBuffer[number] < min) { ++ index = minIndexesBuffer[number]; ++ min = minValuesBuffer[number]; ++ } else if (minValues[number] == min) { ++ if (index > minIndexesBuffer[number]) + index = minIndexesBuffer[number]; +- min = minValuesBuffer[number]; +- } else if (minValues[number] == min) { +- if (index > minIndexesBuffer[number]) +- index = minIndexesBuffer[number]; +- } + } ++ } + +- for (uint32_t number = quarterPoints * 4; number < num_points; number++) { +- if (source[number] < min) { +- index = number; +- min = source[number]; +- } ++ for (uint32_t number = quarterPoints * 4; number < num_points; number++) { ++ if (source[number] < min) { ++ index = number; ++ min = source[number]; + } +- target[0] = (uint32_t)index; + } ++ target[0] = (uint32_t)index; + } + + #endif /*LV_HAVE_NEON*/ +@@ -330,18 +322,16 @@ volk_32f_index_min_32u_neon(uint32_t* target, const float* source, uint32_t num_ + static inline void + volk_32f_index_min_32u_generic(uint32_t* target, const float* source, uint32_t num_points) + { +- if (num_points > 0) { +- float min = source[0]; +- uint32_t index = 0; +- +- for (uint32_t i = 1; i < num_points; ++i) { +- if (source[i] < min) { +- index = i; +- min = source[i]; +- } ++ float min = source[0]; ++ uint32_t index = 0; ++ ++ for (uint32_t i = 1; i < num_points; ++i) { ++ if (source[i] < min) { ++ index = i; ++ min = source[i]; + } +- target[0] = index; + } ++ target[0] = index; + } + + #endif /*LV_HAVE_GENERIC*/ +@@ -364,56 +354,54 @@ volk_32f_index_min_32u_generic(uint32_t* target, const float* source, uint32_t n + static inline void + volk_32f_index_min_32u_u_avx(uint32_t* target, const float* source, uint32_t num_points) + { +- if (num_points > 0) { +- const uint32_t quarterPoints = num_points / 8; +- +- float* inputPtr = (float*)source; +- +- __m256 indexIncrementValues = _mm256_set1_ps(8); +- __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); +- +- float min = source[0]; +- float index = 0; +- __m256 minValues = _mm256_set1_ps(min); +- __m256 minValuesIndex = _mm256_setzero_ps(); +- __m256 compareResults; +- __m256 currentValues; +- +- __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8]; +- __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8]; +- +- for (uint32_t number = 0; number < quarterPoints; number++) { +- currentValues = _mm256_loadu_ps(inputPtr); +- inputPtr += 8; +- currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); +- compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS); +- minValuesIndex = +- _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults); +- minValues = _mm256_blendv_ps(minValues, currentValues, compareResults); +- } ++ const uint32_t quarterPoints = num_points / 8; ++ ++ float* inputPtr = (float*)source; ++ ++ __m256 indexIncrementValues = _mm256_set1_ps(8); ++ __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); ++ ++ float min = source[0]; ++ float index = 0; ++ __m256 minValues = _mm256_set1_ps(min); ++ __m256 minValuesIndex = _mm256_setzero_ps(); ++ __m256 compareResults; ++ __m256 currentValues; ++ ++ __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8]; ++ __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8]; ++ ++ for (uint32_t number = 0; number < quarterPoints; number++) { ++ currentValues = _mm256_loadu_ps(inputPtr); ++ inputPtr += 8; ++ currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); ++ compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS); ++ minValuesIndex = ++ _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults); ++ minValues = _mm256_blendv_ps(minValues, currentValues, compareResults); ++ } + +- // Calculate the smalles value from the remaining 8 points +- _mm256_store_ps(minValuesBuffer, minValues); +- _mm256_store_ps(minIndexesBuffer, minValuesIndex); ++ // Calculate the smalles value from the remaining 8 points ++ _mm256_store_ps(minValuesBuffer, minValues); ++ _mm256_store_ps(minIndexesBuffer, minValuesIndex); + +- for (uint32_t number = 0; number < 8; number++) { +- if (minValuesBuffer[number] < min) { ++ for (uint32_t number = 0; number < 8; number++) { ++ if (minValuesBuffer[number] < min) { ++ index = minIndexesBuffer[number]; ++ min = minValuesBuffer[number]; ++ } else if (minValuesBuffer[number] == min) { ++ if (index > minIndexesBuffer[number]) + index = minIndexesBuffer[number]; +- min = minValuesBuffer[number]; +- } else if (minValuesBuffer[number] == min) { +- if (index > minIndexesBuffer[number]) +- index = minIndexesBuffer[number]; +- } + } ++ } + +- for (uint32_t number = quarterPoints * 8; number < num_points; number++) { +- if (source[number] < min) { +- index = number; +- min = source[number]; +- } ++ for (uint32_t number = quarterPoints * 8; number < num_points; number++) { ++ if (source[number] < min) { ++ index = number; ++ min = source[number]; + } +- target[0] = (uint32_t)index; + } ++ target[0] = (uint32_t)index; + } + + #endif /*LV_HAVE_AVX*/ +@@ -426,56 +414,54 @@ static inline void volk_32f_index_min_32u_u_sse4_1(uint32_t* target, + const float* source, + uint32_t num_points) + { +- if (num_points > 0) { +- const uint32_t quarterPoints = num_points / 4; +- +- float* inputPtr = (float*)source; +- +- __m128 indexIncrementValues = _mm_set1_ps(4); +- __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); +- +- float min = source[0]; +- float index = 0; +- __m128 minValues = _mm_set1_ps(min); +- __m128 minValuesIndex = _mm_setzero_ps(); +- __m128 compareResults; +- __m128 currentValues; +- +- __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; +- __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; +- +- for (uint32_t number = 0; number < quarterPoints; number++) { +- currentValues = _mm_loadu_ps(inputPtr); +- inputPtr += 4; +- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); +- compareResults = _mm_cmplt_ps(currentValues, minValues); +- minValuesIndex = +- _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults); +- minValues = _mm_blendv_ps(minValues, currentValues, compareResults); +- } ++ const uint32_t quarterPoints = num_points / 4; ++ ++ float* inputPtr = (float*)source; ++ ++ __m128 indexIncrementValues = _mm_set1_ps(4); ++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); ++ ++ float min = source[0]; ++ float index = 0; ++ __m128 minValues = _mm_set1_ps(min); ++ __m128 minValuesIndex = _mm_setzero_ps(); ++ __m128 compareResults; ++ __m128 currentValues; ++ ++ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; ++ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; ++ ++ for (uint32_t number = 0; number < quarterPoints; number++) { ++ currentValues = _mm_loadu_ps(inputPtr); ++ inputPtr += 4; ++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ compareResults = _mm_cmplt_ps(currentValues, minValues); ++ minValuesIndex = ++ _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults); ++ minValues = _mm_blendv_ps(minValues, currentValues, compareResults); ++ } + +- // Calculate the smallest value from the remaining 4 points +- _mm_store_ps(minValuesBuffer, minValues); +- _mm_store_ps(minIndexesBuffer, minValuesIndex); ++ // Calculate the smallest value from the remaining 4 points ++ _mm_store_ps(minValuesBuffer, minValues); ++ _mm_store_ps(minIndexesBuffer, minValuesIndex); + +- for (uint32_t number = 0; number < 4; number++) { +- if (minValuesBuffer[number] < min) { ++ for (uint32_t number = 0; number < 4; number++) { ++ if (minValuesBuffer[number] < min) { ++ index = minIndexesBuffer[number]; ++ min = minValuesBuffer[number]; ++ } else if (minValuesBuffer[number] == min) { ++ if (index > minIndexesBuffer[number]) + index = minIndexesBuffer[number]; +- min = minValuesBuffer[number]; +- } else if (minValuesBuffer[number] == min) { +- if (index > minIndexesBuffer[number]) +- index = minIndexesBuffer[number]; +- } + } ++ } + +- for (uint32_t number = quarterPoints * 4; number < num_points; number++) { +- if (source[number] < min) { +- index = number; +- min = source[number]; +- } ++ for (uint32_t number = quarterPoints * 4; number < num_points; number++) { ++ if (source[number] < min) { ++ index = number; ++ min = source[number]; + } +- target[0] = (uint32_t)index; + } ++ target[0] = (uint32_t)index; + } + + #endif /*LV_HAVE_SSE4_1*/ +@@ -486,57 +472,55 @@ static inline void volk_32f_index_min_32u_u_sse4_1(uint32_t* target, + static inline void + volk_32f_index_min_32u_u_sse(uint32_t* target, const float* source, uint32_t num_points) + { +- if (num_points > 0) { +- const uint32_t quarterPoints = num_points / 4; +- +- float* inputPtr = (float*)source; +- +- __m128 indexIncrementValues = _mm_set1_ps(4); +- __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); +- +- float min = source[0]; +- float index = 0; +- __m128 minValues = _mm_set1_ps(min); +- __m128 minValuesIndex = _mm_setzero_ps(); +- __m128 compareResults; +- __m128 currentValues; +- +- __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; +- __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; +- +- for (uint32_t number = 0; number < quarterPoints; number++) { +- currentValues = _mm_loadu_ps(inputPtr); +- inputPtr += 4; +- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); +- compareResults = _mm_cmplt_ps(currentValues, minValues); +- minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), +- _mm_andnot_ps(compareResults, minValuesIndex)); +- minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), +- _mm_andnot_ps(compareResults, minValues)); +- } ++ const uint32_t quarterPoints = num_points / 4; ++ ++ float* inputPtr = (float*)source; ++ ++ __m128 indexIncrementValues = _mm_set1_ps(4); ++ __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); ++ ++ float min = source[0]; ++ float index = 0; ++ __m128 minValues = _mm_set1_ps(min); ++ __m128 minValuesIndex = _mm_setzero_ps(); ++ __m128 compareResults; ++ __m128 currentValues; ++ ++ __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4]; ++ __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4]; ++ ++ for (uint32_t number = 0; number < quarterPoints; number++) { ++ currentValues = _mm_loadu_ps(inputPtr); ++ inputPtr += 4; ++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ compareResults = _mm_cmplt_ps(currentValues, minValues); ++ minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes), ++ _mm_andnot_ps(compareResults, minValuesIndex)); ++ minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues), ++ _mm_andnot_ps(compareResults, minValues)); ++ } + +- // Calculate the smallest value from the remaining 4 points +- _mm_store_ps(minValuesBuffer, minValues); +- _mm_store_ps(minIndexesBuffer, minValuesIndex); ++ // Calculate the smallest value from the remaining 4 points ++ _mm_store_ps(minValuesBuffer, minValues); ++ _mm_store_ps(minIndexesBuffer, minValuesIndex); + +- for (uint32_t number = 0; number < 4; number++) { +- if (minValuesBuffer[number] < min) { ++ for (uint32_t number = 0; number < 4; number++) { ++ if (minValuesBuffer[number] < min) { ++ index = minIndexesBuffer[number]; ++ min = minValuesBuffer[number]; ++ } else if (minValuesBuffer[number] == min) { ++ if (index > minIndexesBuffer[number]) + index = minIndexesBuffer[number]; +- min = minValuesBuffer[number]; +- } else if (minValuesBuffer[number] == min) { +- if (index > minIndexesBuffer[number]) +- index = minIndexesBuffer[number]; +- } + } ++ } + +- for (uint32_t number = quarterPoints * 4; number < num_points; number++) { +- if (source[number] < min) { +- index = number; +- min = source[number]; +- } ++ for (uint32_t number = quarterPoints * 4; number < num_points; number++) { ++ if (source[number] < min) { ++ index = number; ++ min = source[number]; + } +- target[0] = (uint32_t)index; + } ++ target[0] = (uint32_t)index; + } + + #endif /*LV_HAVE_SSE*/ +diff --git a/kernels/volk/volk_32fc_index_min_16u.h b/kernels/volk/volk_32fc_index_min_16u.h +index 8f40730..6ddd8a3 100644 +--- a/kernels/volk/volk_32fc_index_min_16u.h ++++ b/kernels/volk/volk_32fc_index_min_16u.h +@@ -2,14 +2,14 @@ + /* + * Copyright 2021 Free Software Foundation, Inc. + * +- * This file is part of GNU Radio ++ * This file is part of VOLK + * +- * GNU Radio is free software; you can redistribute it and/or modify ++ * VOLK is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * +- * GNU Radio is distributed in the hope that it will be useful, ++ * VOLK is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. +@@ -210,7 +210,6 @@ static inline void + volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* source, uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +- const uint32_t num_bytes = num_points * 8; + + union bit128 holderf; + union bit128 holderi; +@@ -230,7 +229,7 @@ volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* source, uint32_t num + xmm10 = _mm_setr_epi32(4, 4, 4, 4); + xmm3 = _mm_set_ps1(FLT_MAX); + +- int bound = num_bytes >> 5; ++ int bound = num_points >> 2; + + for (int i = 0; i < bound; ++i) { + xmm1 = _mm_load_ps((float*)source); +@@ -256,7 +255,7 @@ volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* source, uint32_t num + xmm8 = _mm_add_epi32(xmm8, xmm10); + } + +- if (num_bytes >> 4 & 1) { ++ if (num_points >> 1 & 1) { + xmm2 = _mm_load_ps((float*)source); + + xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); +@@ -283,7 +282,7 @@ volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* source, uint32_t num + xmm8 = _mm_add_epi32(xmm8, xmm10); + } + +- if (num_bytes >> 3 & 1) { ++ if (num_points & 1) { + sq_dist = lv_creal(source[0]) * lv_creal(source[0]) + + lv_cimag(source[0]) * lv_cimag(source[0]); + +@@ -324,13 +323,12 @@ static inline void + volk_32fc_index_min_16u_generic(uint16_t* target, lv_32fc_t* source, uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +- const uint32_t num_bytes = num_points * 8; + + float sq_dist = 0.0; + float min = FLT_MAX; + uint16_t index = 0; + +- for (uint32_t i = 0; i> 3; ++i) { ++ for (uint32_t i = 0; i < num_points; ++i) { + sq_dist = lv_creal(source[i]) * lv_creal(source[i]) + + lv_cimag(source[i]) * lv_cimag(source[i]); + +diff --git a/kernels/volk/volk_32fc_index_min_32u.h b/kernels/volk/volk_32fc_index_min_32u.h +index efa33ee..d5e2a00 100644 +--- a/kernels/volk/volk_32fc_index_min_32u.h ++++ b/kernels/volk/volk_32fc_index_min_32u.h +@@ -2,14 +2,14 @@ + /* + * Copyright 2021 Free Software Foundation, Inc. + * +- * This file is part of GNU Radio ++ * This file is part of VOLK + * +- * GNU Radio is free software; you can redistribute it and/or modify ++ * VOLK is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * +- * GNU Radio is distributed in the hope that it will be useful, ++ * VOLK is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. +@@ -198,8 +198,6 @@ static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target, + static inline void + volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* source, uint32_t num_points) + { +- const uint32_t num_bytes = num_points * 8; +- + union bit128 holderf; + union bit128 holderi; + float sq_dist = 0.0; +@@ -218,7 +216,7 @@ volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* source, uint32_t num + xmm10 = _mm_setr_epi32(4, 4, 4, 4); + xmm3 = _mm_set_ps1(FLT_MAX); + +- int bound = num_bytes >> 5; ++ int bound = num_points >> 2; + + for (int i = 0; i < bound; ++i) { + xmm1 = _mm_load_ps((float*)source); +@@ -244,7 +242,7 @@ volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* source, uint32_t num + xmm8 = _mm_add_epi32(xmm8, xmm10); + } + +- if (num_bytes >> 4 & 1) { ++ if (num_points >> 1 & 1) { + xmm2 = _mm_load_ps((float*)source); + + xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec); +@@ -271,7 +269,7 @@ volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* source, uint32_t num + xmm8 = _mm_add_epi32(xmm8, xmm10); + } + +- if (num_bytes >> 3 & 1) { ++ if (num_points & 1) { + sq_dist = lv_creal(source[0]) * lv_creal(source[0]) + + lv_cimag(source[0]) * lv_cimag(source[0]); + +@@ -311,13 +309,11 @@ volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* source, uint32_t num + static inline void + volk_32fc_index_min_32u_generic(uint32_t* target, lv_32fc_t* source, uint32_t num_points) + { +- const uint32_t num_bytes = num_points * 8; +- + float sq_dist = 0.0; + float min = FLT_MAX; + uint32_t index = 0; + +- for (uint32_t i = 0; i> 3; ++i) { ++ for (uint32_t i = 0; i < num_points; ++i) { + sq_dist = lv_creal(source[i]) * lv_creal(source[i]) + + lv_cimag(source[i]) * lv_cimag(source[i]); + +-- +2.30.2 + diff --git a/patches/0010-Fix-clang-format-errors.patch b/patches/0010-Fix-clang-format-errors.patch new file mode 100644 index 0000000..012eba0 --- /dev/null +++ b/patches/0010-Fix-clang-format-errors.patch @@ -0,0 +1,57 @@ +From c68e666420a840cbdeb9529f23af19b6c8e37391 Mon Sep 17 00:00:00 2001 +From: Zlika +Date: Mon, 5 Jul 2021 13:08:29 +0200 +Subject: [PATCH 10/73] Fix clang-format errors + +Signed-off-by: Zlika +--- + kernels/volk/volk_32f_index_min_32u.h | 12 ++++-------- + 1 file changed, 4 insertions(+), 8 deletions(-) + +diff --git a/kernels/volk/volk_32f_index_min_32u.h b/kernels/volk/volk_32f_index_min_32u.h +index c71ee60..92bafbf 100644 +--- a/kernels/volk/volk_32f_index_min_32u.h ++++ b/kernels/volk/volk_32f_index_min_32u.h +@@ -101,8 +101,7 @@ static inline void volk_32f_index_min_32u_a_sse4_1(uint32_t* target, + + compareResults = _mm_cmplt_ps(currentValues, minValues); + +- minValuesIndex = +- _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults); ++ minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults); + minValues = _mm_blendv_ps(minValues, currentValues, compareResults); + } + +@@ -225,8 +224,7 @@ volk_32f_index_min_32u_a_avx(uint32_t* target, const float* source, uint32_t num + inputPtr += 8; + currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); + compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS); +- minValuesIndex = +- _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults); ++ minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults); + minValues = _mm256_blendv_ps(minValues, currentValues, compareResults); + } + +@@ -376,8 +374,7 @@ volk_32f_index_min_32u_u_avx(uint32_t* target, const float* source, uint32_t num + inputPtr += 8; + currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); + compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS); +- minValuesIndex = +- _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults); ++ minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults); + minValues = _mm256_blendv_ps(minValues, currentValues, compareResults); + } + +@@ -436,8 +433,7 @@ static inline void volk_32f_index_min_32u_u_sse4_1(uint32_t* target, + inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + compareResults = _mm_cmplt_ps(currentValues, minValues); +- minValuesIndex = +- _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults); ++ minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults); + minValues = _mm_blendv_ps(minValues, currentValues, compareResults); + } + +-- +2.30.2 + diff --git a/patches/0011-Code-cleanup.patch b/patches/0011-Code-cleanup.patch new file mode 100644 index 0000000..0396584 --- /dev/null +++ b/patches/0011-Code-cleanup.patch @@ -0,0 +1,139 @@ +From 924b3fffbb9fa6218499a1fa0378262890c3c76d Mon Sep 17 00:00:00 2001 +From: Zlika +Date: Mon, 5 Jul 2021 14:55:30 +0200 +Subject: [PATCH 11/73] Code cleanup + +Signed-off-by: Zlika +--- + kernels/volk/volk_32fc_index_min_16u.h | 12 ++++++------ + kernels/volk/volk_32fc_index_min_32u.h | 14 +++++++------- + 2 files changed, 13 insertions(+), 13 deletions(-) + +diff --git a/kernels/volk/volk_32fc_index_min_16u.h b/kernels/volk/volk_32fc_index_min_16u.h +index 6ddd8a3..e355626 100644 +--- a/kernels/volk/volk_32fc_index_min_16u.h ++++ b/kernels/volk/volk_32fc_index_min_16u.h +@@ -87,7 +87,7 @@ + #include + + static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target, +- lv_32fc_t* source, ++ const lv_32fc_t* source, + uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +@@ -147,7 +147,7 @@ static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target, + #include + + static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target, +- lv_32fc_t* source, ++ const lv_32fc_t* source, + uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +@@ -207,7 +207,7 @@ static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target, + #include + + static inline void +-volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* source, uint32_t num_points) ++volk_32fc_index_min_16u_a_sse3(uint16_t* target, const lv_32fc_t* source, uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; + +@@ -320,7 +320,7 @@ volk_32fc_index_min_16u_a_sse3(uint16_t* target, lv_32fc_t* source, uint32_t num + + #ifdef LV_HAVE_GENERIC + static inline void +-volk_32fc_index_min_16u_generic(uint16_t* target, lv_32fc_t* source, uint32_t num_points) ++volk_32fc_index_min_16u_generic(uint16_t* target, const lv_32fc_t* source, uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; + +@@ -358,7 +358,7 @@ volk_32fc_index_min_16u_generic(uint16_t* target, lv_32fc_t* source, uint32_t nu + #include + + static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target, +- lv_32fc_t* source, ++ const lv_32fc_t* source, + uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +@@ -418,7 +418,7 @@ static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target, + #include + + static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target, +- lv_32fc_t* source, ++ const lv_32fc_t* source, + uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; +diff --git a/kernels/volk/volk_32fc_index_min_32u.h b/kernels/volk/volk_32fc_index_min_32u.h +index d5e2a00..72fb040 100644 +--- a/kernels/volk/volk_32fc_index_min_32u.h ++++ b/kernels/volk/volk_32fc_index_min_32u.h +@@ -80,7 +80,7 @@ + #include + + static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target, +- lv_32fc_t* source, ++ const lv_32fc_t* source, + uint32_t num_points) + { + const __m256i indices_increment = _mm256_set1_epi32(8); +@@ -138,7 +138,7 @@ static inline void volk_32fc_index_min_32u_a_avx2_variant_0(uint32_t* target, + #include + + static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target, +- lv_32fc_t* source, ++ const lv_32fc_t* source, + uint32_t num_points) + { + const __m256i indices_increment = _mm256_set1_epi32(8); +@@ -196,7 +196,7 @@ static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target, + #include + + static inline void +-volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* source, uint32_t num_points) ++volk_32fc_index_min_32u_a_sse3(uint32_t* target, const lv_32fc_t* source, uint32_t num_points) + { + union bit128 holderf; + union bit128 holderi; +@@ -307,7 +307,7 @@ volk_32fc_index_min_32u_a_sse3(uint32_t* target, lv_32fc_t* source, uint32_t num + + #ifdef LV_HAVE_GENERIC + static inline void +-volk_32fc_index_min_32u_generic(uint32_t* target, lv_32fc_t* source, uint32_t num_points) ++volk_32fc_index_min_32u_generic(uint32_t* target, const lv_32fc_t* source, uint32_t num_points) + { + float sq_dist = 0.0; + float min = FLT_MAX; +@@ -342,7 +342,7 @@ volk_32fc_index_min_32u_generic(uint32_t* target, lv_32fc_t* source, uint32_t nu + #include + + static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target, +- lv_32fc_t* source, ++ const lv_32fc_t* source, + uint32_t num_points) + { + const __m256i indices_increment = _mm256_set1_epi32(8); +@@ -400,7 +400,7 @@ static inline void volk_32fc_index_min_32u_u_avx2_variant_0(uint32_t* target, + #include + + static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target, +- lv_32fc_t* source, ++ const lv_32fc_t* source, + uint32_t num_points) + { + const __m256i indices_increment = _mm256_set1_epi32(8); +@@ -458,7 +458,7 @@ static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target, + #include + + static inline void +-volk_32fc_index_min_32u_neon(uint32_t* target, lv_32fc_t* source, uint32_t num_points) ++volk_32fc_index_min_32u_neon(uint32_t* target, const lv_32fc_t* source, uint32_t num_points) + { + const uint32_t quarter_points = num_points / 4; + const lv_32fc_t* sourcePtr = source; +-- +2.30.2 + diff --git a/patches/0012-Fix-clang-format-errors.patch b/patches/0012-Fix-clang-format-errors.patch new file mode 100644 index 0000000..35fa463 --- /dev/null +++ b/patches/0012-Fix-clang-format-errors.patch @@ -0,0 +1,82 @@ +From e06454128245cdf206808cf2532b41c5fee54453 Mon Sep 17 00:00:00 2001 +From: Zlika +Date: Mon, 5 Jul 2021 15:04:17 +0200 +Subject: [PATCH 12/73] Fix clang-format errors + +Signed-off-by: Zlika +--- + kernels/volk/volk_32fc_index_min_16u.h | 10 ++++++---- + kernels/volk/volk_32fc_index_min_32u.h | 15 +++++++++------ + 2 files changed, 15 insertions(+), 10 deletions(-) + +diff --git a/kernels/volk/volk_32fc_index_min_16u.h b/kernels/volk/volk_32fc_index_min_16u.h +index e355626..64fcf7b 100644 +--- a/kernels/volk/volk_32fc_index_min_16u.h ++++ b/kernels/volk/volk_32fc_index_min_16u.h +@@ -206,8 +206,9 @@ static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target, + #include + #include + +-static inline void +-volk_32fc_index_min_16u_a_sse3(uint16_t* target, const lv_32fc_t* source, uint32_t num_points) ++static inline void volk_32fc_index_min_16u_a_sse3(uint16_t* target, ++ const lv_32fc_t* source, ++ uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; + +@@ -319,8 +320,9 @@ volk_32fc_index_min_16u_a_sse3(uint16_t* target, const lv_32fc_t* source, uint32 + #endif /*LV_HAVE_SSE3*/ + + #ifdef LV_HAVE_GENERIC +-static inline void +-volk_32fc_index_min_16u_generic(uint16_t* target, const lv_32fc_t* source, uint32_t num_points) ++static inline void volk_32fc_index_min_16u_generic(uint16_t* target, ++ const lv_32fc_t* source, ++ uint32_t num_points) + { + num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points; + +diff --git a/kernels/volk/volk_32fc_index_min_32u.h b/kernels/volk/volk_32fc_index_min_32u.h +index 72fb040..2fb0d7e 100644 +--- a/kernels/volk/volk_32fc_index_min_32u.h ++++ b/kernels/volk/volk_32fc_index_min_32u.h +@@ -195,8 +195,9 @@ static inline void volk_32fc_index_min_32u_a_avx2_variant_1(uint32_t* target, + #include + #include + +-static inline void +-volk_32fc_index_min_32u_a_sse3(uint32_t* target, const lv_32fc_t* source, uint32_t num_points) ++static inline void volk_32fc_index_min_32u_a_sse3(uint32_t* target, ++ const lv_32fc_t* source, ++ uint32_t num_points) + { + union bit128 holderf; + union bit128 holderi; +@@ -306,8 +307,9 @@ volk_32fc_index_min_32u_a_sse3(uint32_t* target, const lv_32fc_t* source, uint32 + #endif /*LV_HAVE_SSE3*/ + + #ifdef LV_HAVE_GENERIC +-static inline void +-volk_32fc_index_min_32u_generic(uint32_t* target, const lv_32fc_t* source, uint32_t num_points) ++static inline void volk_32fc_index_min_32u_generic(uint32_t* target, ++ const lv_32fc_t* source, ++ uint32_t num_points) + { + float sq_dist = 0.0; + float min = FLT_MAX; +@@ -457,8 +459,9 @@ static inline void volk_32fc_index_min_32u_u_avx2_variant_1(uint32_t* target, + #include + #include + +-static inline void +-volk_32fc_index_min_32u_neon(uint32_t* target, const lv_32fc_t* source, uint32_t num_points) ++static inline void volk_32fc_index_min_32u_neon(uint32_t* target, ++ const lv_32fc_t* source, ++ uint32_t num_points) + { + const uint32_t quarter_points = num_points / 4; + const lv_32fc_t* sourcePtr = source; +-- +2.30.2 + diff --git a/patches/0055-asan-Fix-volk_malloc-alignment-bug.patch b/patches/0055-asan-Fix-volk_malloc-alignment-bug.patch new file mode 100644 index 0000000..a854f96 --- /dev/null +++ b/patches/0055-asan-Fix-volk_malloc-alignment-bug.patch @@ -0,0 +1,44 @@ +From a0837c094fa4725e3362e05da82e78233c104975 Mon Sep 17 00:00:00 2001 +From: AlexandreRouma +Date: Thu, 30 Sep 2021 13:52:32 +0200 +Subject: [PATCH 55/73] asan: Fix volk_malloc alignment bug + +ASAN (the Address Sanitizer used by GCC) requires memory allocations to +be a multiple of the alignment. To replicate the bug, use a version of +libvolk without this fix, call volk_malloc() with a number of byte +that's not a multiple of the alignment and compile it with the Address +sanitizer enable (-fsanitize=address). The software will error out +and complain about the alignement. This patch fixes it by adding the +missing number of bytes to the size variable so that it becomes a +multiple of the alignment. + +Signed-off-by: AlexandreRouma +--- + lib/volk_malloc.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/lib/volk_malloc.c b/lib/volk_malloc.c +index 8e84c14..f489ef8 100644 +--- a/lib/volk_malloc.c ++++ b/lib/volk_malloc.c +@@ -50,6 +50,17 @@ + + void* volk_malloc(size_t size, size_t alignment) + { ++ if ((size == 0) || (alignment == 0)) { ++ fprintf(stderr, "VOLK: Error allocating memory: either size or alignment is 0"); ++ return NULL; ++ } ++ // Tweak size to satisfy ASAN (the GCC address sanitizer). ++ // Calling 'volk_malloc' might therefor result in the allocation of more memory than ++ // requested for correct alignment. Any allocation size change here will in general not ++ // impact the end result since initial size alignment is required either way. ++ if (size % alignment) { ++ size += alignment - (size % alignment); ++ } + #if HAVE_POSIX_MEMALIGN + // quoting posix_memalign() man page: + // "alignment must be a power of two and a multiple of sizeof(void *)" +-- +2.30.2 + diff --git a/patches/0056-format-Fix-code-format.patch b/patches/0056-format-Fix-code-format.patch new file mode 100644 index 0000000..f6caa75 --- /dev/null +++ b/patches/0056-format-Fix-code-format.patch @@ -0,0 +1,31 @@ +From a307c9727be4b4b608b5e6b1ae3f46218df479c2 Mon Sep 17 00:00:00 2001 +From: Johannes Demel +Date: Sat, 2 Oct 2021 11:38:46 +0200 +Subject: [PATCH 56/73] format: Fix code format + +I was too quick to merge a PR and missed a formatting issue. Thus, I fix +it now. + +Signed-off-by: Johannes Demel +--- + lib/volk_malloc.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/lib/volk_malloc.c b/lib/volk_malloc.c +index f489ef8..0410d29 100644 +--- a/lib/volk_malloc.c ++++ b/lib/volk_malloc.c +@@ -56,8 +56,8 @@ void* volk_malloc(size_t size, size_t alignment) + } + // Tweak size to satisfy ASAN (the GCC address sanitizer). + // Calling 'volk_malloc' might therefor result in the allocation of more memory than +- // requested for correct alignment. Any allocation size change here will in general not +- // impact the end result since initial size alignment is required either way. ++ // requested for correct alignment. Any allocation size change here will in general ++ // not impact the end result since initial size alignment is required either way. + if (size % alignment) { + size += alignment - (size % alignment); + } +-- +2.30.2 + diff --git a/patches/make-acc-happy b/patches/make-acc-happy new file mode 100644 index 0000000..bcef894 --- /dev/null +++ b/patches/make-acc-happy @@ -0,0 +1,60 @@ +From 799245ea6e9e05cc0ed0fabe783fbbe1a5054fd4 Mon Sep 17 00:00:00 2001 +From: "A. Maitland Bottoms" +Date: Tue, 27 Mar 2018 22:02:59 -0400 +Subject: [PATCH 2/6] make acc happy + +The abi-compliance-checker grabs all the .h files it finds +and tries to compile them all. Even though some are not +appropriate for the architecture being run on. Being careful +with preprocessor protections avoids problems. +--- + include/volk/volk_neon_intrinsics.h | 2 ++ + kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h | 1 + + kernels/volk/volk_8u_x2_encodeframepolar_8u.h | 3 --- + 3 files changed, 3 insertions(+), 3 deletions(-) + +--- a/include/volk/volk_neon_intrinsics.h ++++ b/include/volk/volk_neon_intrinsics.h +@@ -79,6 +79,7 @@ + + #ifndef INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ + #define INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ ++#ifdef LV_HAVE_NEON + #include + + +@@ -294,4 +295,5 @@ + #endif + } + ++#endif /*LV_HAVE_NEON*/ + #endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */ +--- a/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h ++++ b/kernels/volk/volk_32f_8u_polarbutterflypuppet_32f.h +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include + + + static inline void sanitize_bytes(unsigned char* u, const int elements) +--- a/kernels/volk/volk_8u_x2_encodeframepolar_8u.h ++++ b/kernels/volk/volk_8u_x2_encodeframepolar_8u.h +@@ -60,8 +60,6 @@ + } + } + +-#ifdef LV_HAVE_GENERIC +- + static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame, + unsigned char* temp, + unsigned int frame_size) +@@ -81,7 +79,6 @@ + --stage; + } + } +-#endif /* LV_HAVE_GENERIC */ + + #ifdef LV_HAVE_SSSE3 + #include diff --git a/patches/optional-static-apps b/patches/optional-static-apps new file mode 100644 index 0000000..399ee9b --- /dev/null +++ b/patches/optional-static-apps @@ -0,0 +1,20 @@ +--- a/apps/CMakeLists.txt ++++ b/apps/CMakeLists.txt +@@ -62,7 +62,7 @@ + target_link_libraries(volk_profile PRIVATE std::filesystem) + endif() + +-if(ENABLE_STATIC_LIBS) ++if(ENABLE_STATIC_LIBS AND ENABLE_STATIC_APPS) + target_link_libraries(volk_profile PRIVATE volk_static) + set_target_properties(volk_profile PROPERTIES LINK_FLAGS "-static") + else() +@@ -79,7 +79,7 @@ + add_executable(volk-config-info volk-config-info.cc ${CMAKE_CURRENT_SOURCE_DIR}/volk_option_helpers.cc + ) + +-if(ENABLE_STATIC_LIBS) ++if(ENABLE_STATIC_LIBS AND ENABLE_STATIC_APPS) + target_link_libraries(volk-config-info volk_static) + set_target_properties(volk-config-info PROPERTIES LINK_FLAGS "-static") + else() diff --git a/patches/remove-external-HTML-resources b/patches/remove-external-HTML-resources new file mode 100644 index 0000000..63f503e --- /dev/null +++ b/patches/remove-external-HTML-resources @@ -0,0 +1,20 @@ +--- a/cpu_features/README.md ++++ b/cpu_features/README.md +@@ -1,4 +1,4 @@ +-# cpu_features [![Build Status](https://travis-ci.org/google/cpu_features.svg?branch=master)](https://travis-ci.org/google/cpu_features) [![Build status](https://ci.appveyor.com/api/projects/status/46d1owsj7n8dsylq/branch/master?svg=true)](https://ci.appveyor.com/project/gchatelet/cpu-features/branch/master) ++# cpu_features + + A cross-platform C library to retrieve CPU features (such as available + instructions) at runtime. +--- a/README.md ++++ b/README.md +@@ -1,9 +1,3 @@ +-[![Build Status](https://travis-ci.com/gnuradio/volk.svg?branch=master)](https://travis-ci.com/gnuradio/volk) [![Build status](https://ci.appveyor.com/api/projects/status/5o56mgw0do20jlh3/branch/master?svg=true)](https://ci.appveyor.com/project/gnuradio/volk/branch/master) +-![Check PR Formatting](https://github.com/gnuradio/volk/workflows/Check%20PR%20Formatting/badge.svg) +-![Run VOLK tests](https://github.com/gnuradio/volk/workflows/Run%20VOLK%20tests/badge.svg) +- +-![VOLK Logo](/docs/volk_logo.png) +- + # Welcome to VOLK! + + VOLK is a sub-project of GNU Radio. Please see http://libvolk.org for bug diff --git a/patches/series b/patches/series new file mode 100644 index 0000000..b02dd34 --- /dev/null +++ b/patches/series @@ -0,0 +1,18 @@ +0001-Add-volk_32f-c-_index_min_16-32u.patch +0002-Fix-volk_32fc_index_min_32u_neon.patch +0003-Fix-volk_32fc_index_min_32u_neon.patch +0004-Code-cleanup.patch +0005-Fix-clang-format-errors.patch +0006-New-generic-implementation-fixed-typos.patch +0007-Add-the-list-of-contributors-agreeing-to-LGPL-licens.patch +0009-Code-cleanup.patch +0010-Fix-clang-format-errors.patch +0011-Code-cleanup.patch +0012-Fix-clang-format-errors.patch +0055-asan-Fix-volk_malloc-alignment-bug.patch +0056-format-Fix-code-format.patch +make-acc-happy +optional-static-apps +remove-external-HTML-resources +skip-cpu_features-on-kfreebsd +use-system-cpu-features-package.patch diff --git a/patches/skip-cpu_features-on-kfreebsd b/patches/skip-cpu_features-on-kfreebsd new file mode 100644 index 0000000..b96c9f2 --- /dev/null +++ b/patches/skip-cpu_features-on-kfreebsd @@ -0,0 +1,20 @@ +Subject: skip cpu_freatures on kfreebsd +Author: A. Maitland Bottoms + + Avoid #error "Unsupported OS" on kFreeBSD + +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -133,8 +133,10 @@ + ######################################################################## + + # cpu_features - sensible defaults, user settable option +-if(CMAKE_SYSTEM_PROCESSOR MATCHES +- "(^mips)|(^arm)|(^aarch64)|(x86_64)|(AMD64|amd64)|(^i.86$)|(^powerpc)|(^ppc)") ++message(STATUS "Building Volk for ${CMAKE_SYSTEM_NAME} on ${CMAKE_SYSTEM_PROCESSOR}") ++if((CMAKE_SYSTEM_PROCESSOR MATCHES ++ "(^mips)|(^arm)|(^aarch64)|(x86_64)|(AMD64|amd64)|(^i.86$)|(^powerpc)|(^ppc)") ++ AND (NOT CMAKE_SYSTEM_NAME MATCHES "kFreeBSD")) + option(VOLK_CPU_FEATURES "Volk uses cpu_features" ON) + else() + option(VOLK_CPU_FEATURES "Volk uses cpu_features" OFF) diff --git a/patches/use-system-cpu-features-package.patch b/patches/use-system-cpu-features-package.patch new file mode 100644 index 0000000..cb48bc3 --- /dev/null +++ b/patches/use-system-cpu-features-package.patch @@ -0,0 +1,37 @@ +Description: use system cpu_features package + +Author: Shengjing Zhu +Last-Update: 2020-12-26 + +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -142,17 +142,7 @@ + option(VOLK_CPU_FEATURES "Volk uses cpu_features" OFF) + endif() + if (VOLK_CPU_FEATURES) +- if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/cpu_features/CMakeLists.txt" ) +- message(FATAL_ERROR "cpu_features/CMakeLists.txt not found. Did you forget to git clone recursively?\nFix with: git submodule update --init") +- endif() +- message(STATUS "Building Volk with cpu_features") +- set(BUILD_PIC ON CACHE BOOL +- "Build cpu_features with Position Independent Code (PIC)." +- FORCE) +- set(BUILD_SHARED_LIBS_SAVED "${BUILD_SHARED_LIBS}") +- set(BUILD_SHARED_LIBS OFF) +- add_subdirectory(cpu_features) +- set(BUILD_SHARED_LIBS "${BUILD_SHARED_LIBS_SAVED}") ++ find_package(CpuFeatures) + else() + message(STATUS "Building Volk without cpu_features") + endif() +--- a/lib/CMakeLists.txt ++++ b/lib/CMakeLists.txt +@@ -517,7 +517,7 @@ + if(VOLK_CPU_FEATURES) + set_source_files_properties(volk_cpu.c PROPERTIES COMPILE_DEFINITIONS "VOLK_CPU_FEATURES=1") + target_include_directories(volk_obj +- PRIVATE $ ++ PRIVATE $ + ) + endif() + diff --git a/rules b/rules new file mode 100755 index 0000000..ce2504a --- /dev/null +++ b/rules @@ -0,0 +1,19 @@ +#!/usr/bin/make -f +DEB_HOST_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH) +export DEB_HOST_MULTIARCH +#export DH_VERBOSE=1 + +%: + dh $@ --with python3 + +override_dh_auto_configure: + dh_auto_configure -- -DLIB_SUFFIX="/$(DEB_HOST_MULTIARCH)" \ + -DENABLE_STATIC_LIBS=On -DPYTHON_EXECUTABLE=/usr/bin/python3 \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo + +override_dh_auto_build-indep: + cmake --build obj-* --target all + cmake --build obj-* --target volk_doc + +override_dh_auto_test: + - dh_auto_test -- CTEST_TEST_TIMEOUT=60 diff --git a/source/format b/source/format new file mode 100644 index 0000000..163aaf8 --- /dev/null +++ b/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/volk-config-info.1 b/volk-config-info.1 new file mode 100644 index 0000000..e8d6efd --- /dev/null +++ b/volk-config-info.1 @@ -0,0 +1,45 @@ +.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.40.10. +.TH VOLK-CONFIG-INFO "1" "July 2014" "volk-config-info 0.1" "User Commands" +.SH NAME +volk-config-info \- pkgconfig-like tool for Vector Optimized Library of Kernels 0.1 +.SH DESCRIPTION +.SS "Program options: volk-config-info [options]:" +.TP +\fB\-h\fR [ \fB\-\-help\fR ] +print help message +.TP +\fB\-\-prefix\fR +print VOLK installation prefix +.TP +\fB\-\-builddate\fR +print VOLK build date (RFC2822 format) +.TP +\fB\-\-cc\fR +print VOLK C compiler version +.TP +\fB\-\-cflags\fR +print VOLK CFLAGS +.TP +\fB\-\-all\-machines\fR +print VOLK machines built into library +.TP +\fB\-\-avail\-machines\fR +print VOLK machines the current platform can use +.TP +\fB\-\-machine\fR +print the VOLK machine that will be used +.TP +\fB\-v\fR [ \fB\-\-version\fR ] +print VOLK version +.SH "SEE ALSO" +The full documentation for +.B volk-config-info +is maintained as a Texinfo manual. If the +.B info +and +.B volk-config-info +programs are properly installed at your site, the command +.IP +.B info volk-config-info +.PP +should give you access to the complete manual. diff --git a/volk_modtool.1 b/volk_modtool.1 new file mode 100644 index 0000000..752e7f5 --- /dev/null +++ b/volk_modtool.1 @@ -0,0 +1,112 @@ +.TH GNURADIO "1" "August 2013" "volk_modtool 3.7" "User Commands" +.SH NAME +volk_modtool \- tailor VOLK modules +.SH DESCRIPTION +The volk_modtool tool is installed along with VOLK as a way of helping +to construct, add to, and interogate the VOLK library or companion +libraries. +.P +volk_modtool is installed into $prefix/bin. +.P +VOLK modtool enables creating standalone (out-of-tree) VOLK modules +and provides a few tools for sharing VOLK kernels between VOLK +modules. If you need to design or work with VOLK kernels away from +the canonical VOLK library, this is the tool. If you need to tailor +your own VOLK library for whatever reason, this is the tool. +.P +The canonical VOLK library installs a volk.h and a libvolk.so. Your +own library will install volk_$name.h and libvolk_$name.so. Ya Gronk? +Good. +.P +There isn't a substantial difference between the canonical VOLK +module and any other VOLK module. They're all peers. Any module +created via VOLK modtool will come complete with a default +volk_modtool.cfg file associating the module with the base from which +it came, its distinctive $name and its destination (or path). These +values (created from user input if VOLK modtool runs without a +user-supplied config file or a default config file) serve as default +values for some VOLK modtool actions. It's more or less intended for +the user to change directories to the top level of a created VOLK +module and then run volk_modtool to take advantage of the values +stored in the default volk_modtool.cfg file. +.P +Apart from creating new VOLK modules, VOLK modtool allows you to list +the names of kernels in other modules, list the names of kernels in +the current module, add kernels from another module into the current +module, and remove kernels from the current module. When moving +kernels between modules, VOLK modtool does its best to keep the qa +and profiling code for those kernels intact. If the base has a test +or a profiling call for some kernel, those calls will follow the +kernel when VOLK modtool adds that kernel. If QA or profiling +requires a puppet kernel, the puppet kernel will follow the original +kernel when VOLK modtool adds that original kernel. VOLK modtool +respects puppets. +.P +====================================================================== +.P +.SH Installing a new VOLK Library: +.P +Run the command "volk_modtool -i". This will ask you three questions: +.P + name: // the name to give your VOLK library: volk_ + destination: // directory new source tree is built under -- must exists. + // It will create /volk_ + base: // the directory containing the original VOLK source code +.P +This will build a new skeleton directory in the destination provided +with the name volk_. It will contain the necessary structure to +build: +.P + mkdir build + cd build + cmake -DCMAKE_INSTALL_PREFIX=/opt/volk ../ + make + sudo make install +.P +Right now, the library is empty and contains no kernels. Kernels can +be added from another VOLK library using the '-a' option. If not +specified, the kernel will be extracted from the base VOLK +directory. Using the '-b' allows us to specify another VOLK library to +use for this purpose. +.P + volk_modtool -a -n 32fc_x2_conjugate_dot_prod_32fc +.P +This will put the code for the new kernel into +/volk_/kernels/volk_/ +.P +Other kernels must be added by hand. See the following webpages for +more information about creating VOLK kernels: + http://gnuradio.org/doc/doxygen/volk_guide.html + http://gnuradio.org/redmine/projects/gnuradio/wiki/Volk +.P +====================================================================== +.P +.SH OPTIONS +.P +Options for Adding and Removing Kernels: + -a, --add_kernel + Add kernel from existing VOLK module. Uses the base VOLK module + unless -b is used. Use -n to specify the kernel name. + Requires: -n. + Optional: -b +.P + -A, --add_all_kernels + Add all kernels from existing VOLK module. Uses the base VOLK + module unless -b is used. + Optional: -b +.P + -x, --remove_kernel + Remove kernel from module. + Required: -n. + Optional: -b +.P +Options for Listing Kernels: + -l, --list + Lists all kernels available in the base VOLK module. +.P + -k, --kernels + Lists all kernels in this VOLK module. +.P + -r, --remote-list + Lists all kernels in another VOLK module that is specified + using the -b option. diff --git a/volk_profile.1 b/volk_profile.1 new file mode 100644 index 0000000..405facb --- /dev/null +++ b/volk_profile.1 @@ -0,0 +1,5 @@ +.TH UHD_FFT "1" "March 2012" "volk_profile 3.5" "User Commands" +.SH NAME +volk_profile \- Quality Assurance application for libvolk functions +.SH DESCRIPTION +Writes profile results to a file. diff --git a/watch b/watch new file mode 100644 index 0000000..de9e7d1 --- /dev/null +++ b/watch @@ -0,0 +1,4 @@ +version=4 + opts="filenamemangle=s%(?:.*?)?volk-?(\d[\d.]*)\.tar\.xz%volk_$1.orig.tar.xz%" \ + https://github.com/gnuradio/volk/releases \ + (?:.*?/)?volk-?(\d[\d.]*)\.tar\.xz debian uupdate -- 2.30.2