kernel: Adds unaligned protokernels to `32f_x2_s32f_interleave_16ic` and `32f_x2_subt...
authorDamian Miralles <dmiralles2009@gmail.com>
Wed, 13 Dec 2017 20:27:17 +0000 (13:27 -0700)
committerA. Maitland Bottoms <bottoms@debian.org>
Sun, 4 Feb 2018 18:12:21 +0000 (18:12 +0000)
Adds unaligned versions of the aforementioned kernels; relative speed
improvements are shown in both cases.

Gbp-Pq: Name 0015-kernel-Adds-unaligned-protokernles-to-32f_x2_s32f_in.patch

kernels/volk/volk_32f_x2_s32f_interleave_16ic.h
kernels/volk/volk_32f_x2_subtract_32f.h

index 99f1b5eee3094835e64ed55477007e764dde1b70..20f66ff2bc61c1ab65dd4e9ae483a97c7a099a84 100644 (file)
@@ -214,3 +214,66 @@ volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector, const float*
 
 
 #endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H */
+
+#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
+#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
+
+#include <volk/volk_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void
+volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector, const float* iBuffer,
+                                        const float* qBuffer, const float scalar, unsigned int num_points)
+{ // Writes num_points interleaved (I, Q) int16 pairs to complexVector, each value being iBuffer[n] or qBuffer[n] times scalar; all vector loads/stores are the unaligned variants.
+  unsigned int number = 0;
+  const float* iBufferPtr = iBuffer;
+  const float* qBufferPtr = qBuffer;
+
+  __m256 vScalar = _mm256_set1_ps(scalar); // scalar replicated into all 8 float lanes
+
+  const unsigned int eighthPoints = num_points / 8; // number of full 8-point vector iterations
+
+  __m256 iValue, qValue, cplxValue1, cplxValue2;
+  __m256i intValue1, intValue2;
+
+  int16_t* complexVectorPtr = (int16_t*)complexVector; // output is addressed as raw int16 values, two (I then Q) per point
+
+  for(;number < eighthPoints; number++){ // vector path: 8 points per pass
+    iValue = _mm256_loadu_ps(iBufferPtr); // i0..i7
+    qValue = _mm256_loadu_ps(qBufferPtr); // q0..q7
+
+    // Interleave the lower two I/Q values of each 128-bit half (i0 q0 i1 q1 | i4 q4 i5 q5), then scale
+    cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
+    cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
+
+    // Interleave the upper two I/Q values of each 128-bit half (i2 q2 i3 q3 | i6 q6 i7 q7), then scale
+    cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
+    cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
+
+    intValue1 = _mm256_cvtps_epi32(cplxValue1); // float -> int32; rounding follows the current MXCSR mode
+    intValue2 = _mm256_cvtps_epi32(cplxValue2);
+
+    intValue1 = _mm256_packs_epi32(intValue1, intValue2); // saturating int32 -> int16 pack, done per 128-bit half, which puts the points back in order 0..7
+
+    _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1); // unaligned 32-byte store: 16 int16 = 8 complex points
+    complexVectorPtr += 16;
+
+    iBufferPtr += 8;
+    qBufferPtr += 8;
+  }
+
+  number = eighthPoints * 8; // index of the first point the vector loop did not handle
+  complexVectorPtr = (int16_t*)(&complexVector[number]);
+  for(; number < num_points; number++){ // scalar tail for the remaining num_points % 8 points
+    *complexVectorPtr++ = (int16_t)(*iBufferPtr++ * scalar); // NOTE(review): this cast truncates toward zero and does not saturate, unlike the vector path above — confirm the difference is acceptable
+    *complexVectorPtr++ = (int16_t)(*qBufferPtr++ * scalar);
+  }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H */
index 4a452fd92ed9ecd9912b16c551dcd847b1a85527..b7f36cf5cf188c9f5450cf10f2d064590a23252d 100644 (file)
@@ -176,3 +176,48 @@ volk_32f_x2_subtract_32f_u_orc(float* cVector, const float* aVector,
 
 
 #endif /* INCLUDED_volk_32f_x2_subtract_32f_a_H */
+
+
+#ifndef INCLUDED_volk_32f_x2_subtract_32f_u_H
+#define INCLUDED_volk_32f_x2_subtract_32f_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void
+volk_32f_x2_subtract_32f_u_avx(float* cVector, const float* aVector,
+                               const float* bVector, unsigned int num_points)
+{ // Computes cVector[n] = aVector[n] - bVector[n] for n in [0, num_points); all vector loads/stores are the unaligned variants.
+  unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8; // number of full 8-float vector iterations
+
+  float* cPtr = cVector;
+  const float* aPtr = aVector;
+  const float* bPtr = bVector;
+
+  __m256 aVal, bVal, cVal;
+  for(;number < eighthPoints; number++){ // vector path: 8 floats per pass
+
+    aVal = _mm256_loadu_ps(aPtr);
+    bVal = _mm256_loadu_ps(bPtr);
+
+    cVal = _mm256_sub_ps(aVal, bVal); // element-wise a - b
+
+    _mm256_storeu_ps(cPtr,cVal); // Unaligned store of the 8 differences into the output vector
+
+    aPtr += 8;
+    bPtr += 8;
+    cPtr += 8;
+  }
+
+  number = eighthPoints * 8; // index of the first element the vector loop did not handle
+  for(;number < num_points; number++){ // scalar tail for the remaining num_points % 8 elements
+    *cPtr++ = (*aPtr++) - (*bPtr++);
+  }
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_x2_subtract_32f_u_H */