54 #ifndef INCLUDED_volk_16ic_magnitude_16i_a_H
55 #define INCLUDED_volk_16ic_magnitude_16i_a_H
64 #include <immintrin.h>
66 static inline void volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector,
68 unsigned int num_points)
70 unsigned int number = 0;
71 const unsigned int eighthPoints = num_points / 8;
73 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
74 int16_t* magnitudeVectorPtr = magnitudeVector;
76 __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
77 __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX);
79 __m128i short1, short2;
80 __m256 cplxValue1, cplxValue2, result;
81 __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
83 for (; number < eighthPoints; number++) {
85 int1 = _mm256_load_si256((__m256i*)complexVectorPtr);
86 complexVectorPtr += 16;
87 short1 = _mm256_extracti128_si256(int1, 0);
88 short2 = _mm256_extracti128_si256(int1, 1);
90 int1 = _mm256_cvtepi16_epi32(short1);
91 int2 = _mm256_cvtepi16_epi32(short2);
92 cplxValue1 = _mm256_cvtepi32_ps(int1);
93 cplxValue2 = _mm256_cvtepi32_ps(int2);
95 cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
96 cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
98 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);
99 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
101 result = _mm256_hadd_ps(cplxValue1, cplxValue2);
103 result = _mm256_sqrt_ps(result);
105 result = _mm256_mul_ps(result, vScalar);
107 int1 = _mm256_cvtps_epi32(result);
108 int1 = _mm256_packs_epi32(int1, int1);
109 int1 = _mm256_permutevar8x32_epi32(
111 short1 = _mm256_extracti128_si256(int1, 0);
112 _mm_store_si128((__m128i*)magnitudeVectorPtr, short1);
113 magnitudeVectorPtr += 8;
116 number = eighthPoints * 8;
117 magnitudeVectorPtr = &magnitudeVector[number];
118 complexVectorPtr = (
const int16_t*)&complexVector[number];
119 for (; number < num_points; number++) {
120 const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
121 const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
122 const float val1Result =
123 sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
124 *magnitudeVectorPtr++ = (int16_t)
rintf(val1Result);
130 #include <pmmintrin.h>
134 unsigned int num_points)
136 unsigned int number = 0;
137 const unsigned int quarterPoints = num_points / 4;
139 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
140 int16_t* magnitudeVectorPtr = magnitudeVector;
142 __m128 vScalar = _mm_set_ps1(SHRT_MAX);
143 __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX);
145 __m128 cplxValue1, cplxValue2, result;
150 for (; number < quarterPoints; number++) {
152 inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
153 inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
154 inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
155 inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
157 inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
158 inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
159 inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
160 inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
162 cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
163 cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
165 complexVectorPtr += 8;
167 cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
168 cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
170 cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1);
171 cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2);
173 result = _mm_hadd_ps(cplxValue1, cplxValue2);
175 result = _mm_sqrt_ps(result);
177 result = _mm_mul_ps(result, vScalar);
179 _mm_store_ps(outputFloatBuffer, result);
180 *magnitudeVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[0]);
181 *magnitudeVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[1]);
182 *magnitudeVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[2]);
183 *magnitudeVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[3]);
186 number = quarterPoints * 4;
187 magnitudeVectorPtr = &magnitudeVector[number];
188 complexVectorPtr = (
const int16_t*)&complexVector[number];
189 for (; number < num_points; number++) {
190 const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
191 const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
192 const float val1Result =
193 sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
194 *magnitudeVectorPtr++ = (int16_t)
rintf(val1Result);
200 #include <xmmintrin.h>
204 unsigned int num_points)
206 unsigned int number = 0;
207 const unsigned int quarterPoints = num_points / 4;
209 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
210 int16_t* magnitudeVectorPtr = magnitudeVector;
212 __m128 vScalar = _mm_set_ps1(SHRT_MAX);
213 __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX);
215 __m128 cplxValue1, cplxValue2, iValue, qValue, result;
220 for (; number < quarterPoints; number++) {
222 inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
223 inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
224 inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
225 inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
227 cplxValue1 = _mm_load_ps(inputFloatBuffer);
228 complexVectorPtr += 4;
230 inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
231 inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
232 inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
233 inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
235 cplxValue2 = _mm_load_ps(inputFloatBuffer);
236 complexVectorPtr += 4;
238 cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
239 cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
242 iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
244 qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
246 iValue = _mm_mul_ps(iValue, iValue);
247 qValue = _mm_mul_ps(qValue, qValue);
249 result = _mm_add_ps(iValue, qValue);
251 result = _mm_sqrt_ps(result);
253 result = _mm_mul_ps(result, vScalar);
255 _mm_store_ps(outputFloatBuffer, result);
256 *magnitudeVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[0]);
257 *magnitudeVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[1]);
258 *magnitudeVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[2]);
259 *magnitudeVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[3]);
262 number = quarterPoints * 4;
263 magnitudeVectorPtr = &magnitudeVector[number];
264 complexVectorPtr = (
const int16_t*)&complexVector[number];
265 for (; number < num_points; number++) {
266 const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
267 const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
268 const float val1Result =
269 sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
270 *magnitudeVectorPtr++ = (int16_t)
rintf(val1Result);
275 #ifdef LV_HAVE_GENERIC
279 unsigned int num_points)
281 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
282 int16_t* magnitudeVectorPtr = magnitudeVector;
283 unsigned int number = 0;
284 const float scalar = SHRT_MAX;
285 for (number = 0; number < num_points; number++) {
286 float real = ((float)(*complexVectorPtr++)) / scalar;
287 float imag = ((float)(*complexVectorPtr++)) / scalar;
288 *magnitudeVectorPtr++ =
289 (int16_t)
rintf(sqrtf((real * real) + (imag * imag)) * scalar);
294 #ifdef LV_HAVE_ORC_DISABLED
295 extern void volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector,
298 unsigned int num_points);
300 static inline void volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector,
302 unsigned int num_points)
304 volk_16ic_magnitude_16i_a_orc_impl(
305 magnitudeVector, complexVector, SHRT_MAX, num_points);
313 #ifndef INCLUDED_volk_16ic_magnitude_16i_u_H
314 #define INCLUDED_volk_16ic_magnitude_16i_u_H
316 #include <inttypes.h>
322 #include <immintrin.h>
324 static inline void volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector,
326 unsigned int num_points)
328 unsigned int number = 0;
329 const unsigned int eighthPoints = num_points / 8;
331 const int16_t* complexVectorPtr = (
const int16_t*)complexVector;
332 int16_t* magnitudeVectorPtr = magnitudeVector;
334 __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
335 __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX);
337 __m128i short1, short2;
338 __m256 cplxValue1, cplxValue2, result;
339 __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
341 for (; number < eighthPoints; number++) {
343 int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
344 complexVectorPtr += 16;
345 short1 = _mm256_extracti128_si256(int1, 0);
346 short2 = _mm256_extracti128_si256(int1, 1);
348 int1 = _mm256_cvtepi16_epi32(short1);
349 int2 = _mm256_cvtepi16_epi32(short2);
350 cplxValue1 = _mm256_cvtepi32_ps(int1);
351 cplxValue2 = _mm256_cvtepi32_ps(int2);
353 cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
354 cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
356 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);
357 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);
359 result = _mm256_hadd_ps(cplxValue1, cplxValue2);
361 result = _mm256_sqrt_ps(result);
363 result = _mm256_mul_ps(result, vScalar);
365 int1 = _mm256_cvtps_epi32(result);
366 int1 = _mm256_packs_epi32(int1, int1);
367 int1 = _mm256_permutevar8x32_epi32(
369 short1 = _mm256_extracti128_si256(int1, 0);
370 _mm_storeu_si128((__m128i*)magnitudeVectorPtr, short1);
371 magnitudeVectorPtr += 8;
374 number = eighthPoints * 8;
375 magnitudeVectorPtr = &magnitudeVector[number];
376 complexVectorPtr = (
const int16_t*)&complexVector[number];
377 for (; number < num_points; number++) {
378 const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
379 const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
380 const float val1Result =
381 sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
382 *magnitudeVectorPtr++ = (int16_t)
rintf(val1Result);
387 #ifdef LV_HAVE_NEONV7
388 #include <arm_neon.h>
391 static inline void volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector,
393 unsigned int num_points)
395 unsigned int number = 0;
396 unsigned int quarter_points = num_points / 4;
398 const float scalar = SHRT_MAX;
399 const float inv_scalar = 1.0f / scalar;
401 int16_t* magnitudeVectorPtr = magnitudeVector;
402 const lv_16sc_t* complexVectorPtr = complexVector;
407 for (number = 0; number < quarter_points; number++) {
408 const int16x4x2_t c16_vec = vld2_s16((int16_t*)complexVectorPtr);
410 c_vec.val[0] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[0]));
411 c_vec.val[1] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[1]));
413 c_vec.val[0] = vmulq_n_f32(c_vec.val[0], inv_scalar);
414 c_vec.val[1] = vmulq_n_f32(c_vec.val[1], inv_scalar);
417 mag_vec = vmulq_f32(mag_vec_squared,
_vinvsqrtq_f32(mag_vec_squared));
419 mag_vec = vmulq_n_f32(mag_vec, scalar);
422 mag_vec = vaddq_f32(mag_vec, vdupq_n_f32(0.5));
423 const int16x4_t mag16_vec = vmovn_s32(vcvtq_s32_f32(mag_vec));
424 vst1_s16(magnitudeVectorPtr, mag16_vec);
426 magnitudeVectorPtr += 4;
427 complexVectorPtr += 4;
431 for (number = quarter_points * 4; number < num_points; number++) {
432 const float real =
lv_creal(*complexVectorPtr) * inv_scalar;
433 const float imag =
lv_cimag(*complexVectorPtr) * inv_scalar;
434 *magnitudeVectorPtr =
435 (int16_t)
rintf(sqrtf((real * real) + (imag * imag)) * scalar);
437 magnitudeVectorPtr++;
static float rintf(float x)
Definition: config.h:37
static void volk_16ic_magnitude_16i_generic(int16_t *magnitudeVector, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_magnitude_16i.h:277
static void volk_16ic_magnitude_16i_a_sse(int16_t *magnitudeVector, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_magnitude_16i.h:202
static void volk_16ic_magnitude_16i_a_sse3(int16_t *magnitudeVector, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_magnitude_16i.h:132
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
#define lv_cimag(x)
Definition: volk_complex.h:89
#define lv_creal(x)
Definition: volk_complex.h:87
short complex lv_16sc_t
Definition: volk_complex.h:62
static float32x4_t _vinvsqrtq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:97
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:87