From: Tom Rondeau <tom@trondeau.com>
Date: Tue, 7 Apr 2015 18:37:28 +0000 (-0400)
Subject: [PATCH 1/7] volk: accurate exp kernel.
X-Git-Tag: archive/raspbian/2.2.1-2+rpi1^2~11
X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=21246c7d335661b902967e417e53e653c681de90;p=volk.git

[PATCH 1/7] volk: accurate exp kernel.

A more accurate exp VOLK kernel than volk_32f_expfast_32f.Taken from
code licensed with zlib.

Gbp-Pq: Name 0001-volk-accurate-exp-kernel.patch
---

diff --git a/kernels/volk/volk_32f_exp_32f.h b/kernels/volk/volk_32f_exp_32f.h
new file mode 100644
index 0000000..19c3d9d
--- /dev/null
+++ b/kernels/volk/volk_32f_exp_32f.h
@@ -0,0 +1,298 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2015-2020 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING.  If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+/* SIMD (SSE4) implementation of exp
+   Inspired by Intel Approximate Math library, and based on the
+   corresponding algorithms of the cephes math library
+*/
+
+/* Copyright (C) 2007  Julien Pommier
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+*/
+
+/*!
+ * \page volk_32f_exp_32f
+ *
+ * \b Overview
+ *
+ * Computes exponential of input vector and stores results in output vector.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+ * void volk_32f_exp_32f(float* bVector, const float* aVector, unsigned int num_points)
+ * \endcode
+ *
+ * \b Inputs
+ * \li aVector: The input vector of floats.
+ * \li num_points: The number of data points.
+ *
+ * \b Outputs
+ * \li bVector: The vector where results will be stored.
+ *
+ * \b Example
+ * \code
+ *   int N = 10;
+ *   unsigned int alignment = volk_get_alignment();
+ *   float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *   float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
+ *
+ *   in[0] = 0;
+ *   in[1] = 0.5;
+ *   in[2] = std::sqrt(2.f)/2.f;
+ *   in[3] = std::sqrt(3.f)/2.f;
+ *   in[4] = in[5] = 1;
+ *   for(unsigned int ii = 6; ii < N; ++ii){
+ *       in[ii] = - in[N-ii-1];
+ *   }
+ *
+ *   volk_32f_exp_32f(out, in, N);
+ *
+ *   for(unsigned int ii = 0; ii < N; ++ii){
+ *       printf("exp(%1.3f) = %1.3f\n", in[ii], out[ii]);
+ *   }
+ *
+ *   volk_free(in);
+ *   volk_free(out);
+ * \endcode
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <inttypes.h>
+
+#ifndef INCLUDED_volk_32f_exp_32f_a_H
+#define INCLUDED_volk_32f_exp_32f_a_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_exp_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+  float* bPtr = bVector;
+  const float* aPtr = aVector;
+
+  unsigned int number = 0;
+  unsigned int quarterPoints = num_points / 4;
+
+  // Declare variables and constants
+  __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
+  __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
+  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+  __m128i emm0, pi32_0x7f;
+
+  one = _mm_set1_ps(1.0);
+  exp_hi = _mm_set1_ps(88.3762626647949);
+  exp_lo = _mm_set1_ps(-88.3762626647949);
+  log2EF = _mm_set1_ps(1.44269504088896341);
+  half = _mm_set1_ps(0.5);
+  exp_C1 = _mm_set1_ps(0.693359375);
+  exp_C2 = _mm_set1_ps(-2.12194440e-4);
+  pi32_0x7f = _mm_set1_epi32(0x7f);
+
+  exp_p0 = _mm_set1_ps(1.9875691500e-4);
+  exp_p1 = _mm_set1_ps(1.3981999507e-3);
+  exp_p2 = _mm_set1_ps(8.3334519073e-3);
+  exp_p3 = _mm_set1_ps(4.1665795894e-2);
+  exp_p4 = _mm_set1_ps(1.6666665459e-1);
+  exp_p5 = _mm_set1_ps(5.0000001201e-1);
+
+  for(;number < quarterPoints; number++) {
+    aVal = _mm_load_ps(aPtr);
+    tmp = _mm_setzero_ps();
+
+    aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
+
+    /* express exp(x) as exp(g + n*log(2)) */
+    fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
+
+    emm0 = _mm_cvttps_epi32(fx);
+    tmp = _mm_cvtepi32_ps(emm0);
+
+    mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+    fx = _mm_sub_ps(tmp, mask);
+
+    tmp = _mm_mul_ps(fx, exp_C1);
+    z = _mm_mul_ps(fx, exp_C2);
+    aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
+    z = _mm_mul_ps(aVal, aVal);
+
+    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
+    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
+    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
+    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
+    y = _mm_add_ps(y, one);
+
+    emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
+
+    pow2n = _mm_castsi128_ps(emm0);
+    bVal = _mm_mul_ps(y, pow2n);
+
+    _mm_store_ps(bPtr, bVal);
+    aPtr += 4;
+    bPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++) {
+    *bPtr++ = expf(*aPtr++);
+  }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_exp_32f_a_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+  float* bPtr = bVector;
+  const float* aPtr = aVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++) {
+    *bPtr++ = expf(*aPtr++);
+  }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_exp_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_exp_32f_u_H
+#define INCLUDED_volk_32f_exp_32f_u_H
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+
+static inline void
+volk_32f_exp_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
+{
+  float* bPtr = bVector;
+  const float* aPtr = aVector;
+
+  unsigned int number = 0;
+  unsigned int quarterPoints = num_points / 4;
+
+  // Declare variables and constants
+  __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
+  __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
+  __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+  __m128i emm0, pi32_0x7f;
+
+  one = _mm_set1_ps(1.0);
+  exp_hi = _mm_set1_ps(88.3762626647949);
+  exp_lo = _mm_set1_ps(-88.3762626647949);
+  log2EF = _mm_set1_ps(1.44269504088896341);
+  half = _mm_set1_ps(0.5);
+  exp_C1 = _mm_set1_ps(0.693359375);
+  exp_C2 = _mm_set1_ps(-2.12194440e-4);
+  pi32_0x7f = _mm_set1_epi32(0x7f);
+
+  exp_p0 = _mm_set1_ps(1.9875691500e-4);
+  exp_p1 = _mm_set1_ps(1.3981999507e-3);
+  exp_p2 = _mm_set1_ps(8.3334519073e-3);
+  exp_p3 = _mm_set1_ps(4.1665795894e-2);
+  exp_p4 = _mm_set1_ps(1.6666665459e-1);
+  exp_p5 = _mm_set1_ps(5.0000001201e-1);
+
+
+  for(;number < quarterPoints; number++) {
+    aVal = _mm_loadu_ps(aPtr);
+    tmp = _mm_setzero_ps();
+
+    aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
+
+    /* express exp(x) as exp(g + n*log(2)) */
+    fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
+
+    emm0 = _mm_cvttps_epi32(fx);
+    tmp = _mm_cvtepi32_ps(emm0);
+
+    mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+    fx = _mm_sub_ps(tmp, mask);
+
+    tmp = _mm_mul_ps(fx, exp_C1);
+    z = _mm_mul_ps(fx, exp_C2);
+    aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
+    z = _mm_mul_ps(aVal, aVal);
+
+    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
+    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
+    y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
+    y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
+    y = _mm_add_ps(y, one);
+
+    emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
+
+    pow2n = _mm_castsi128_ps(emm0);
+    bVal = _mm_mul_ps(y, pow2n);
+
+    _mm_storeu_ps(bPtr, bVal);
+    aPtr += 4;
+    bPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++){
+    *bPtr++ = expf(*aPtr++);
+  }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void
+volk_32f_exp_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
+{
+  float* bPtr = bVector;
+  const float* aPtr = aVector;
+  unsigned int number = 0;
+
+  for(number = 0; number < num_points; number++){
+    *bPtr++ = expf(*aPtr++);
+  }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_32f_exp_32f_u_H */
diff --git a/lib/kernel_tests.h b/lib/kernel_tests.h
index c009c3f..8552488 100644
--- a/lib/kernel_tests.h
+++ b/lib/kernel_tests.h
@@ -144,6 +144,8 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
     QA(VOLK_INIT_TEST(volk_32fc_x2_s32fc_multiply_conjugate_add_32fc, test_params))
     QA(VOLK_INIT_PUPP(volk_8u_x3_encodepolarpuppet_8u, volk_8u_x3_encodepolar_8u_x2, test_params))
     QA(VOLK_INIT_PUPP(volk_32f_8u_polarbutterflypuppet_32f, volk_32f_8u_polarbutterfly_32f, test_params))
+    QA(VOLK_INIT_TEST(volk_32f_exp_32f,                               test_params))
+
     // no one uses these, so don't test them
     //VOLK_PROFILE(volk_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
     //VOLK_PROFILE(volk_16i_branch_4_state_8, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);