* \li outputVector: The output vector.
*
* \b Example
- * Convert floats from [-1,1] to 16-bit integers with a scale of 5 to maintain smallest
+ * Convert floats from [-1,1] to 8-bit integers with a scale of 5.1 to maintain smallest
delta
* int N = 10;
* unsigned int alignment = volk_get_alignment();
* float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment);
- * int16_t* out = (int16_t*)volk_malloc(sizeof(int16_t)*N, alignment);
+ * int8_t* out = (int8_t*)volk_malloc(sizeof(int8_t)*N, alignment);
*
* for(unsigned int ii = 0; ii < N; ++ii){
* increasing[ii] = 2.f * ((float)ii / (float)N) - 1.f;
* float scale = 5.1f;
*
- * volk_32f_s32f_convert_32i(out, increasing, scale, N);
+ * volk_32f_s32f_convert_8i(out, increasing, scale, N);
*
* for(unsigned int ii = 0; ii < N; ++ii){
* printf("out[%u] = %i\n", ii, out[ii]);
#define INCLUDED_volk_32f_s32f_convert_8i_u_H
#include <inttypes.h>
-#include <stdio.h>
static inline void volk_32f_s32f_convert_8i_single(int8_t* out, const float in)
{
- float min_val = INT8_MIN;
- float max_val = INT8_MAX;
+ const float min_val = INT8_MIN;
+ const float max_val = INT8_MAX;
if (in > max_val) {
*out = (int8_t)(max_val);
} else if (in < min_val) {
}
}
+#ifdef LV_HAVE_GENERIC
+/* Portable kernel (no SIMD intrinsics): multiplies each of the num_points input floats by scalar and passes the product to volk_32f_s32f_convert_8i_single, which writes the int8_t result into outputVector. */
+static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector,
+ const float* inputVector,
+ const float scalar,
+ unsigned int num_points)
+{
+ const float* inputVectorPtr = inputVector; /* read cursor over the input samples */
+
+ for (unsigned int number = 0; number < num_points; number++) {
+ const float r = *inputVectorPtr++ * scalar; /* scale the current sample, then advance the cursor */
+ volk_32f_s32f_convert_8i_single(&outputVector[number], r); /* float-to-int8 conversion is delegated to the shared helper */
+ }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
const float scalar,
unsigned int num_points)
{
- unsigned int number = 0;
-
const unsigned int thirtysecondPoints = num_points / 32;
const float* inputVectorPtr = (const float*)inputVector;
int8_t* outputVectorPtr = outputVector;
- float min_val = INT8_MIN;
- float max_val = INT8_MAX;
- float r;
+ const float min_val = INT8_MIN;
+ const float max_val = INT8_MAX;
+ const __m256 vmin_val = _mm256_set1_ps(min_val);
+ const __m256 vmax_val = _mm256_set1_ps(max_val);
- __m256 vScalar = _mm256_set1_ps(scalar);
- __m256 inputVal1, inputVal2, inputVal3, inputVal4;
- __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
- __m256 vmin_val = _mm256_set1_ps(min_val);
- __m256 vmax_val = _mm256_set1_ps(max_val);
- __m256i intInputVal;
+ const __m256 vScalar = _mm256_set1_ps(scalar);
- for (; number < thirtysecondPoints; number++) {
- inputVal1 = _mm256_loadu_ps(inputVectorPtr);
+ for (unsigned int number = 0; number < thirtysecondPoints; number++) {
+ __m256 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
inputVectorPtr += 8;
- inputVal2 = _mm256_loadu_ps(inputVectorPtr);
+ __m256 inputVal2 = _mm256_loadu_ps(inputVectorPtr);
inputVectorPtr += 8;
- inputVal3 = _mm256_loadu_ps(inputVectorPtr);
+ __m256 inputVal3 = _mm256_loadu_ps(inputVectorPtr);
inputVectorPtr += 8;
- inputVal4 = _mm256_loadu_ps(inputVectorPtr);
+ __m256 inputVal4 = _mm256_loadu_ps(inputVectorPtr);
inputVectorPtr += 8;
inputVal1 = _mm256_max_ps(
inputVal4 = _mm256_max_ps(
_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
- intInputVal1 = _mm256_cvtps_epi32(inputVal1);
- intInputVal2 = _mm256_cvtps_epi32(inputVal2);
- intInputVal3 = _mm256_cvtps_epi32(inputVal3);
- intInputVal4 = _mm256_cvtps_epi32(inputVal4);
+ __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+ __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
+ __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
+ __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);
intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
- intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+ const __m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
_mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal);
outputVectorPtr += 32;
}
- number = thirtysecondPoints * 32;
- for (; number < num_points; number++) {
- r = inputVector[number] * scalar;
+ for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
+ float r = inputVector[number] * scalar;
volk_32f_s32f_convert_8i_single(&outputVector[number], r);
}
}
const float scalar,
unsigned int num_points)
{
- unsigned int number = 0;
-
const unsigned int sixteenthPoints = num_points / 16;
const float* inputVectorPtr = (const float*)inputVector;
int8_t* outputVectorPtr = outputVector;
- float min_val = INT8_MIN;
- float max_val = INT8_MAX;
- float r;
+ const float min_val = INT8_MIN;
+ const float max_val = INT8_MAX;
+ const __m128 vmin_val = _mm_set_ps1(min_val);
+ const __m128 vmax_val = _mm_set_ps1(max_val);
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 inputVal1, inputVal2, inputVal3, inputVal4;
- __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
+ const __m128 vScalar = _mm_set_ps1(scalar);
- for (; number < sixteenthPoints; number++) {
- inputVal1 = _mm_loadu_ps(inputVectorPtr);
+ for (unsigned int number = 0; number < sixteenthPoints; number++) {
+ __m128 inputVal1 = _mm_loadu_ps(inputVectorPtr);
inputVectorPtr += 4;
- inputVal2 = _mm_loadu_ps(inputVectorPtr);
+ __m128 inputVal2 = _mm_loadu_ps(inputVectorPtr);
inputVectorPtr += 4;
- inputVal3 = _mm_loadu_ps(inputVectorPtr);
+ __m128 inputVal3 = _mm_loadu_ps(inputVectorPtr);
inputVectorPtr += 4;
- inputVal4 = _mm_loadu_ps(inputVectorPtr);
+ __m128 inputVal4 = _mm_loadu_ps(inputVectorPtr);
inputVectorPtr += 4;
inputVal1 =
inputVal4 =
_mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
- intInputVal1 = _mm_cvtps_epi32(inputVal1);
- intInputVal2 = _mm_cvtps_epi32(inputVal2);
- intInputVal3 = _mm_cvtps_epi32(inputVal3);
- intInputVal4 = _mm_cvtps_epi32(inputVal4);
+ __m128i intInputVal1 = _mm_cvtps_epi32(inputVal1);
+ __m128i intInputVal2 = _mm_cvtps_epi32(inputVal2);
+ __m128i intInputVal3 = _mm_cvtps_epi32(inputVal3);
+ __m128i intInputVal4 = _mm_cvtps_epi32(inputVal4);
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
outputVectorPtr += 16;
}
- number = sixteenthPoints * 16;
- for (; number < num_points; number++) {
- r = inputVector[number] * scalar;
+ for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
+ const float r = inputVector[number] * scalar;
volk_32f_s32f_convert_8i_single(&outputVector[number], r);
}
}
const float scalar,
unsigned int num_points)
{
- unsigned int number = 0;
- size_t inner_loop;
-
const unsigned int quarterPoints = num_points / 4;
const float* inputVectorPtr = (const float*)inputVector;
int8_t* outputVectorPtr = outputVector;
- float min_val = INT8_MIN;
- float max_val = INT8_MAX;
- float r;
+ const float min_val = INT8_MIN;
+ const float max_val = INT8_MAX;
+ const __m128 vmin_val = _mm_set_ps1(min_val);
+ const __m128 vmax_val = _mm_set_ps1(max_val);
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 ret;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
+ const __m128 vScalar = _mm_set_ps1(scalar);
__VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
- for (; number < quarterPoints; number++) {
- ret = _mm_loadu_ps(inputVectorPtr);
+ for (unsigned int number = 0; number < quarterPoints; number++) {
+ __m128 ret = _mm_loadu_ps(inputVectorPtr);
inputVectorPtr += 4;
ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
_mm_store_ps(outputFloatBuffer, ret);
- for (inner_loop = 0; inner_loop < 4; inner_loop++) {
+ for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
*outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
}
}
- number = quarterPoints * 4;
- for (; number < num_points; number++) {
- r = inputVector[number] * scalar;
+ for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
+ const float r = inputVector[number] * scalar;
volk_32f_s32f_convert_8i_single(&outputVector[number], r);
}
}
#endif /* LV_HAVE_SSE */
-#ifdef LV_HAVE_GENERIC
-
-static inline void volk_32f_s32f_convert_8i_generic(int8_t* outputVector,
- const float* inputVector,
- const float scalar,
- unsigned int num_points)
-{
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
- float r;
-
- for (number = 0; number < num_points; number++) {
- r = *inputVectorPtr++ * scalar;
- volk_32f_s32f_convert_8i_single(&outputVector[number], r);
- }
-}
-
-#endif /* LV_HAVE_GENERIC */
-
-
#endif /* INCLUDED_volk_32f_s32f_convert_8i_u_H */
#ifndef INCLUDED_volk_32f_s32f_convert_8i_a_H
#define INCLUDED_volk_32f_s32f_convert_8i_a_H
#include <inttypes.h>
-#include <stdio.h>
-#include <volk/volk_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
const float scalar,
unsigned int num_points)
{
- unsigned int number = 0;
-
const unsigned int thirtysecondPoints = num_points / 32;
const float* inputVectorPtr = (const float*)inputVector;
int8_t* outputVectorPtr = outputVector;
- float min_val = INT8_MIN;
- float max_val = INT8_MAX;
- float r;
+ const float min_val = INT8_MIN;
+ const float max_val = INT8_MAX;
+ const __m256 vmin_val = _mm256_set1_ps(min_val);
+ const __m256 vmax_val = _mm256_set1_ps(max_val);
- __m256 vScalar = _mm256_set1_ps(scalar);
- __m256 inputVal1, inputVal2, inputVal3, inputVal4;
- __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
- __m256 vmin_val = _mm256_set1_ps(min_val);
- __m256 vmax_val = _mm256_set1_ps(max_val);
- __m256i intInputVal;
+ const __m256 vScalar = _mm256_set1_ps(scalar);
- for (; number < thirtysecondPoints; number++) {
- inputVal1 = _mm256_load_ps(inputVectorPtr);
+ for (unsigned int number = 0; number < thirtysecondPoints; number++) {
+ __m256 inputVal1 = _mm256_load_ps(inputVectorPtr);
inputVectorPtr += 8;
- inputVal2 = _mm256_load_ps(inputVectorPtr);
+ __m256 inputVal2 = _mm256_load_ps(inputVectorPtr);
inputVectorPtr += 8;
- inputVal3 = _mm256_load_ps(inputVectorPtr);
+ __m256 inputVal3 = _mm256_load_ps(inputVectorPtr);
inputVectorPtr += 8;
- inputVal4 = _mm256_load_ps(inputVectorPtr);
+ __m256 inputVal4 = _mm256_load_ps(inputVectorPtr);
inputVectorPtr += 8;
inputVal1 = _mm256_max_ps(
inputVal4 = _mm256_max_ps(
_mm256_min_ps(_mm256_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
- intInputVal1 = _mm256_cvtps_epi32(inputVal1);
- intInputVal2 = _mm256_cvtps_epi32(inputVal2);
- intInputVal3 = _mm256_cvtps_epi32(inputVal3);
- intInputVal4 = _mm256_cvtps_epi32(inputVal4);
+ __m256i intInputVal1 = _mm256_cvtps_epi32(inputVal1);
+ __m256i intInputVal2 = _mm256_cvtps_epi32(inputVal2);
+ __m256i intInputVal3 = _mm256_cvtps_epi32(inputVal3);
+ __m256i intInputVal4 = _mm256_cvtps_epi32(inputVal4);
intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
intInputVal3 = _mm256_permute4x64_epi64(intInputVal3, 0b11011000);
intInputVal1 = _mm256_packs_epi16(intInputVal1, intInputVal3);
- intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+ __m256i intInputVal = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
_mm256_store_si256((__m256i*)outputVectorPtr, intInputVal);
outputVectorPtr += 32;
}
- number = thirtysecondPoints * 32;
- for (; number < num_points; number++) {
- r = inputVector[number] * scalar;
+ for (unsigned int number = thirtysecondPoints * 32; number < num_points; number++) {
+ const float r = inputVector[number] * scalar;
volk_32f_s32f_convert_8i_single(&outputVector[number], r);
}
}
const float scalar,
unsigned int num_points)
{
- unsigned int number = 0;
-
const unsigned int sixteenthPoints = num_points / 16;
const float* inputVectorPtr = (const float*)inputVector;
int8_t* outputVectorPtr = outputVector;
- float min_val = INT8_MIN;
- float max_val = INT8_MAX;
- float r;
+ const float min_val = INT8_MIN;
+ const float max_val = INT8_MAX;
+ const __m128 vmin_val = _mm_set_ps1(min_val);
+ const __m128 vmax_val = _mm_set_ps1(max_val);
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 inputVal1, inputVal2, inputVal3, inputVal4;
- __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
+ const __m128 vScalar = _mm_set_ps1(scalar);
- for (; number < sixteenthPoints; number++) {
- inputVal1 = _mm_load_ps(inputVectorPtr);
+ for (unsigned int number = 0; number < sixteenthPoints; number++) {
+ __m128 inputVal1 = _mm_load_ps(inputVectorPtr);
inputVectorPtr += 4;
- inputVal2 = _mm_load_ps(inputVectorPtr);
+ __m128 inputVal2 = _mm_load_ps(inputVectorPtr);
inputVectorPtr += 4;
- inputVal3 = _mm_load_ps(inputVectorPtr);
+ __m128 inputVal3 = _mm_load_ps(inputVectorPtr);
inputVectorPtr += 4;
- inputVal4 = _mm_load_ps(inputVectorPtr);
+ __m128 inputVal4 = _mm_load_ps(inputVectorPtr);
inputVectorPtr += 4;
inputVal1 =
inputVal4 =
_mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal4, vScalar), vmax_val), vmin_val);
- intInputVal1 = _mm_cvtps_epi32(inputVal1);
- intInputVal2 = _mm_cvtps_epi32(inputVal2);
- intInputVal3 = _mm_cvtps_epi32(inputVal3);
- intInputVal4 = _mm_cvtps_epi32(inputVal4);
+ __m128i intInputVal1 = _mm_cvtps_epi32(inputVal1);
+ __m128i intInputVal2 = _mm_cvtps_epi32(inputVal2);
+ __m128i intInputVal3 = _mm_cvtps_epi32(inputVal3);
+ __m128i intInputVal4 = _mm_cvtps_epi32(inputVal4);
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
intInputVal3 = _mm_packs_epi32(intInputVal3, intInputVal4);
outputVectorPtr += 16;
}
- number = sixteenthPoints * 16;
- for (; number < num_points; number++) {
- r = inputVector[number] * scalar;
+ for (unsigned int number = sixteenthPoints * 16; number < num_points; number++) {
+ const float r = inputVector[number] * scalar;
volk_32f_s32f_convert_8i_single(&outputVector[number], r);
}
}
const float scalar,
unsigned int num_points)
{
- unsigned int number = 0;
- size_t inner_loop;
-
const unsigned int quarterPoints = num_points / 4;
const float* inputVectorPtr = (const float*)inputVector;
+ int8_t* outputVectorPtr = outputVector;
- float min_val = INT8_MIN;
- float max_val = INT8_MAX;
- float r;
+ const float min_val = INT8_MIN;
+ const float max_val = INT8_MAX;
+ const __m128 vmin_val = _mm_set_ps1(min_val);
+ const __m128 vmax_val = _mm_set_ps1(max_val);
- int8_t* outputVectorPtr = outputVector;
- __m128 vScalar = _mm_set_ps1(scalar);
- __m128 ret;
- __m128 vmin_val = _mm_set_ps1(min_val);
- __m128 vmax_val = _mm_set_ps1(max_val);
+ const __m128 vScalar = _mm_set_ps1(scalar);
__VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
- for (; number < quarterPoints; number++) {
- ret = _mm_load_ps(inputVectorPtr);
+ for (unsigned int number = 0; number < quarterPoints; number++) {
+ __m128 ret = _mm_load_ps(inputVectorPtr);
inputVectorPtr += 4;
ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
_mm_store_ps(outputFloatBuffer, ret);
- for (inner_loop = 0; inner_loop < 4; inner_loop++) {
+ for (size_t inner_loop = 0; inner_loop < 4; inner_loop++) {
*outputVectorPtr++ = (int8_t)(rintf(outputFloatBuffer[inner_loop]));
}
}
- number = quarterPoints * 4;
- for (; number < num_points; number++) {
- r = inputVector[number] * scalar;
+ for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
+ const float r = inputVector[number] * scalar;
volk_32f_s32f_convert_8i_single(&outputVector[number], r);
}
}
#endif /* LV_HAVE_SSE */
-#ifdef LV_HAVE_GENERIC
-
-static inline void volk_32f_s32f_convert_8i_a_generic(int8_t* outputVector,
- const float* inputVector,
- const float scalar,
- unsigned int num_points)
-{
- const float* inputVectorPtr = inputVector;
- unsigned int number = 0;
- float r;
-
- for (number = 0; number < num_points; number++) {
- r = *inputVectorPtr++ * scalar;
- volk_32f_s32f_convert_8i_single(&outputVector[number], r);
- }
-}
-
-#endif /* LV_HAVE_GENERIC */
-
-
#endif /* INCLUDED_volk_32f_s32f_convert_8i_a_H */