70 #ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
71 #define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
/*
 * Generic (portable C) kernel: volk_32f_stddev_and_mean_32f_x2_generic.
 * Writes a standard deviation and a mean for num_points floats read from
 * inputBuffer.
 *
 * NOTE(review): this listing omits several original lines (the function
 * header, the declaration and updates of Sum[], the branch bodies), so the
 * comments below describe only the statements that are visible.
 */
80 #ifdef LV_HAVE_GENERIC
84 const float* inputBuffer,
85 unsigned int num_points)
87 const float* in_ptr = inputBuffer;
/* Empty input is special-cased; the body of this branch is not visible. */
88 if (num_points == 0) {
90 }
/* Single-sample input is special-cased too; its body is not visible. */
else if (num_points == 1) {
/* Two independent running accumulators, one per interleaved lane. */
97 float SquareSum[2] = { 0.f, 0.f };
/* The main loop consumes samples in pairs, hence num_points / 2. */
101 uint32_t half_points = num_points / 2;
/*
 * Each iteration reads two consecutive samples, one per lane. The counter
 * starts at 1 -- presumably the first pair seeded Sum[] on lines that are
 * not shown here; TODO confirm.
 */
103 for (uint32_t number = 1; number < half_points; number++) {
104 float Val0 = (*in_ptr++);
105 float Val1 = (*in_ptr++);
106 float n = (float)number;
107 float n_plus_one = n + 1.f;
/* Weight 1 / (n * (n + 1)) applied to each squared update term. */
108 float r = 1.f / (n * n_plus_one);
/* Per lane: SquareSum += r * ((n + 1) * Val - Sum)^2. */
113 SquareSum[0] += r * powf(n_plus_one * Val0 - Sum[0], 2);
114 SquareSum[1] += r * powf(n_plus_one * Val1 - Sum[1], 2);
/*
 * Fold lane 1 into lane 0: both square sums plus the cross term
 * 0.5 / half_points * (Sum[0] - Sum[1])^2.
 * NOTE(review): this line calls pow() (double precision) while the
 * neighbouring lines use powf(); consider powf() for consistency.
 */
117 SquareSum[0] += SquareSum[1] + .5f / half_points * pow(Sum[0] - Sum[1], 2);
/*
 * Number of samples covered by the paired loop; the loop below picks up
 * whatever is left (one sample when num_points is odd).
 */
120 uint32_t points_done = half_points * 2;
122 for (; points_done < num_points; points_done++) {
123 float Val = (*in_ptr++);
124 float n = (float)points_done;
125 float n_plus_one = n + 1.f;
126 float r = 1.f / (n * n_plus_one);
128 SquareSum[0] += r * powf(n_plus_one * Val - Sum[0], 2);
/* Both outputs are normalised by num_points (not num_points - 1). */
130 *stddev = sqrtf(SquareSum[0] / num_points);
131 *mean = Sum[0] / num_points;
/*
 * Visible part of what appears to be update_square_sum_1_val (it uses len,
 * val and Sum, matching that helper's parameter list).
 *
 * The expression below evaluates
 *     1 / (n * (n + 1)) * ((n + 1) * val - Sum)^2     with n = (float)len.
 * The beginning of the statement it belongs to is not visible in this
 * listing -- presumably the term is added to SquareSum and returned;
 * TODO confirm against the full file.
 */
141 float n = (float)len;
142 float n_plus_one = n + 1.f;
144 1.f / (n * n_plus_one) * (n_plus_one *
val - Sum) * (n_plus_one *
val - Sum);
/*
 * Visible part of add_square_sums: merges two partial results
 * (SquareSum0, Sum0) and (SquareSum1, Sum1) into one square sum.
 *
 * Returns SquareSum0 + SquareSum1 + 0.5 / len * (Sum0 - Sum1)^2.
 * A single len describes both partitions, so they are presumably expected
 * to have the same length -- TODO confirm with the callers.
 */
149 const float SquareSum1,
154 float n = (float)len;
155 return SquareSum0 + SquareSum1 + .5f / n * (Sum0 - Sum1) * (Sum0 - Sum1);
/*
 * Visible part of accrue_result: combines the per-partition entries of
 * PartialSquareSums / PartialSums in several stages.
 *
 * NOTE(review): the lines that compute stages, idx and offset, and the
 * call that consumes the arguments on lines 180-181, are not present in
 * this listing; the comments below cover only what is shown.
 */
160 const uint32_t NumberOfPartitions,
161 const uint32_t PartitionLen)
/* Working copies of the two size parameters. */
164 uint32_t accumulators = NumberOfPartitions;
167 uint32_t partition_len = PartitionLen;
/*
 * Halve accumulators until it reaches zero. The loop body is not visible;
 * presumably it counts the iterations into stages -- TODO confirm.
 */
169 while (accumulators >>= 1) {
/* Restore the accumulator count that the loop above consumed. */
172 accumulators = NumberOfPartitions;
174 for (uint32_t s = 0; s < stages; s++) {
177 for (uint32_t a = 0; a < accumulators; a++) {
/* The entry at idx is combined with its partner at idx + offset. */
180 PartialSquareSums[idx + offset],
181 PartialSums[idx + offset],
/* The partner's plain sum is folded into the entry at idx. */
183 PartialSums[idx] += PartialSums[idx + offset];
192 #include <arm_neon.h>
/*
 * NEON kernel: volk_32f_stddev_and_mean_32f_x2_neon.
 *
 * NOTE(review): this listing omits several original lines (the function
 * header, the in_ptr advances, the per-iteration accumulate calls, the
 * SumLocal / SquareSumLocal declarations and the reduction), so the
 * comments below describe only the visible statements.
 */
197 const float* inputBuffer,
198 unsigned int num_points)
/*
 * Inputs shorter than one 8-sample step are special-cased; the branch body
 * is not visible -- presumably it defers to the generic kernel; TODO confirm.
 */
200 if (num_points < 8) {
205 const float* in_ptr = inputBuffer;
/* Number of 8-sample steps (two 4-float vectors per step). */
210 const uint32_t eigth_points = num_points / 8;
/* The running sums are seeded directly from loaded input vectors. */
212 float32x4_t Sum0, Sum1;
214 Sum0 = vld1q_f32((
const float32_t*)in_ptr);
218 Sum1 = vld1q_f32((
const float32_t*)in_ptr);
/* Square-sum accumulators start at zero. */
222 float32x4_t SquareSum0 = { 0.f };
223 float32x4_t SquareSum1 = { 0.f };
225 float32x4_t Values0, Values1;
226 float32x4_t Aux0, Aux1;
227 float32x4_t Reciprocal;
/* Starts at 1 because the sums were already seeded by the loads above. */
229 for (uint32_t number = 1; number < eigth_points; number++) {
230 Values0 = vld1q_f32(in_ptr);
234 Values1 = vld1q_f32(in_ptr);
238 float n = (float)number;
239 float n_plus_one = n + 1.f;
/* Broadcast the weight 1 / (n * (n + 1)) to every lane. */
240 Reciprocal = vdupq_n_f32(1.f / (n * n_plus_one));
/* Update the running sum, then broadcast (n + 1) as the auxiliary factor. */
242 Sum0 = vaddq_f32(Sum0, Values0);
243 Aux0 = vdupq_n_f32(n_plus_one);
247 Sum1 = vaddq_f32(Sum1, Values1);
248 Aux1 = vdupq_n_f32(n_plus_one);
/*
 * Spill the vector accumulators into scalar arrays: elements 0-3 come from
 * the first vector, elements 4-7 from the second.
 */
253 vst1q_f32(&SumLocal[0], Sum0);
254 vst1q_f32(&SumLocal[4], Sum1);
255 vst1q_f32(&SquareSumLocal[0], SquareSum0);
256 vst1q_f32(&SquareSumLocal[4], SquareSum1);
/* Samples covered by the vector loop; the scalar loop handles the rest. */
260 uint32_t points_done = eigth_points * 8;
262 for (; points_done < num_points; points_done++) {
263 float val = (*in_ptr++);
/*
 * Only element 0 is read, so the partial results must have been reduced
 * into element 0 on lines not shown here -- TODO confirm.
 */
269 *stddev = sqrtf(SquareSumLocal[0] / num_points);
270 *mean = SumLocal[0] / num_points;
276 #include <xmmintrin.h>
/*
 * SSE kernel for unaligned input: volk_32f_stddev_and_mean_32f_x2_u_sse.
 * All input loads use _mm_loadu_ps.
 *
 * NOTE(review): this listing omits several original lines (the function
 * header, the in_ptr advances, the per-iteration accumulate calls, the
 * SumLocal / SquareSumLocal declarations and the reduction), so the
 * comments below describe only the visible statements.
 */
280 const float* inputBuffer,
281 unsigned int num_points)
/*
 * Inputs shorter than one 8-sample step are special-cased; the branch body
 * is not visible -- presumably it defers to the generic kernel; TODO confirm.
 */
283 if (num_points < 8) {
288 const float* in_ptr = inputBuffer;
/* Number of 8-sample steps (two 4-float vectors per step). */
294 const uint32_t eigth_points = num_points / 8;
/* The running sums are seeded directly from loaded input vectors. */
296 __m128 Sum0 = _mm_loadu_ps(in_ptr);
298 __m128 Sum1 = _mm_loadu_ps(in_ptr);
/* Square-sum accumulators start at zero. */
300 __m128 SquareSum0 = _mm_setzero_ps();
301 __m128 SquareSum1 = _mm_setzero_ps();
302 __m128 Values0, Values1;
/* Starts at 1 because the sums were already seeded by the loads above. */
306 for (uint32_t number = 1; number < eigth_points; number++) {
307 Values0 = _mm_loadu_ps(in_ptr);
311 Values1 = _mm_loadu_ps(in_ptr);
315 float n = (float)number;
316 float n_plus_one = n + 1.f;
/* Broadcast the weight 1 / (n * (n + 1)) to every lane. */
317 Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
/* Update the running sum, then broadcast (n + 1) as the auxiliary factor. */
319 Sum0 = _mm_add_ps(Sum0, Values0);
320 Aux0 = _mm_set_ps1(n_plus_one);
324 Sum1 = _mm_add_ps(Sum1, Values1);
325 Aux1 = _mm_set_ps1(n_plus_one);
/*
 * Spill the vector accumulators into scalar arrays: elements 0-3 come from
 * the first vector, elements 4-7 from the second.
 * NOTE(review): _mm_store_ps needs a 16-byte aligned destination; the
 * declarations of SumLocal / SquareSumLocal are not visible here --
 * presumably they carry an alignment attribute; TODO confirm.
 */
330 _mm_store_ps(&SumLocal[0], Sum0);
331 _mm_store_ps(&SumLocal[4], Sum1);
332 _mm_store_ps(&SquareSumLocal[0], SquareSum0);
333 _mm_store_ps(&SquareSumLocal[4], SquareSum1);
/* Samples covered by the vector loop; the scalar loop handles the rest. */
337 uint32_t points_done = eigth_points * 8;
339 for (; points_done < num_points; points_done++) {
340 float val = (*in_ptr++);
/*
 * Only element 0 is read, so the partial results must have been reduced
 * into element 0 on lines not shown here -- TODO confirm.
 */
346 *stddev = sqrtf(SquareSumLocal[0] / num_points);
347 *mean = SumLocal[0] / num_points;
352 #include <immintrin.h>
/*
 * AVX kernel for unaligned input: volk_32f_stddev_and_mean_32f_x2_u_avx.
 * All input loads use _mm256_loadu_ps.
 *
 * NOTE(review): this listing omits several original lines (the function
 * header, the in_ptr advances, the per-iteration accumulate calls and the
 * SumLocal / SquareSumLocal declarations), so the comments below describe
 * only the visible statements.
 */
357 const float* inputBuffer,
358 unsigned int num_points)
/*
 * Inputs shorter than one 16-sample step are special-cased; the branch body
 * is not visible -- presumably it defers to the generic kernel; TODO confirm.
 */
360 if (num_points < 16) {
365 const float* in_ptr = inputBuffer;
/* Number of 16-sample steps (two 8-float vectors per step). */
370 const unsigned int sixteenth_points = num_points / 16;
/* The running sums are seeded directly from loaded input vectors. */
372 __m256 Sum0 = _mm256_loadu_ps(in_ptr);
374 __m256 Sum1 = _mm256_loadu_ps(in_ptr);
/* Square-sum accumulators start at zero. */
377 __m256 SquareSum0 = _mm256_setzero_ps();
378 __m256 SquareSum1 = _mm256_setzero_ps();
379 __m256 Values0, Values1;
/* Starts at 1 because the sums were already seeded by the loads above. */
383 for (uint32_t number = 1; number < sixteenth_points; number++) {
384 Values0 = _mm256_loadu_ps(in_ptr);
388 Values1 = _mm256_loadu_ps(in_ptr);
392 float n = (float)number;
393 float n_plus_one = n + 1.f;
/* Broadcast the weight 1 / (n * (n + 1)) to every lane. */
395 Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
/* Update the running sum, then broadcast (n + 1) as the auxiliary factor. */
397 Sum0 = _mm256_add_ps(Sum0, Values0);
398 Aux0 = _mm256_set1_ps(n_plus_one);
402 Sum1 = _mm256_add_ps(Sum1, Values1);
403 Aux1 = _mm256_set1_ps(n_plus_one);
/*
 * Spill the vector accumulators into scalar arrays: elements 0-7 come from
 * the first vector, elements 8-15 from the second.
 * NOTE(review): _mm256_store_ps needs a 32-byte aligned destination; the
 * declarations of SumLocal / SquareSumLocal are not visible here --
 * presumably they carry an alignment attribute; TODO confirm.
 */
408 _mm256_store_ps(&SumLocal[0], Sum0);
409 _mm256_store_ps(&SumLocal[8], Sum1);
410 _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
411 _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
/*
 * Combine the 16 per-lane partial results, each covering sixteenth_points
 * samples. The final reads below use element 0 of both arrays.
 */
413 accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
/* Samples covered by the vector loop; the scalar loop handles the rest. */
415 uint32_t points_done = sixteenth_points * 16;
417 for (; points_done < num_points; points_done++) {
418 float val = (*in_ptr++);
/* Both outputs are normalised by num_points (not num_points - 1). */
424 *stddev = sqrtf(SquareSumLocal[0] / num_points);
425 *mean = SumLocal[0] / num_points;
430 #include <xmmintrin.h>
/*
 * SSE kernel for aligned input: volk_32f_stddev_and_mean_32f_x2_a_sse.
 * All input loads use _mm_load_ps, which requires inputBuffer to be
 * 16-byte aligned.
 *
 * NOTE(review): this listing omits several original lines (the function
 * header, the in_ptr advances, the per-iteration accumulate calls, the
 * SumLocal / SquareSumLocal declarations and the reduction), so the
 * comments below describe only the visible statements.
 */
434 const float* inputBuffer,
435 unsigned int num_points)
/*
 * Inputs shorter than one 8-sample step are special-cased; the branch body
 * is not visible -- presumably it defers to the generic kernel; TODO confirm.
 */
437 if (num_points < 8) {
442 const float* in_ptr = inputBuffer;
/* Number of 8-sample steps (two 4-float vectors per step). */
448 const uint32_t eigth_points = num_points / 8;
/* The running sums are seeded directly from loaded input vectors. */
450 __m128 Sum0 = _mm_load_ps(in_ptr);
452 __m128 Sum1 = _mm_load_ps(in_ptr);
/* Square-sum accumulators start at zero. */
454 __m128 SquareSum0 = _mm_setzero_ps();
455 __m128 SquareSum1 = _mm_setzero_ps();
456 __m128 Values0, Values1;
/* Starts at 1 because the sums were already seeded by the loads above. */
460 for (uint32_t number = 1; number < eigth_points; number++) {
461 Values0 = _mm_load_ps(in_ptr);
465 Values1 = _mm_load_ps(in_ptr);
469 float n = (float)number;
470 float n_plus_one = n + 1.f;
/* Broadcast the weight 1 / (n * (n + 1)) to every lane. */
471 Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
/* Update the running sum, then broadcast (n + 1) as the auxiliary factor. */
473 Sum0 = _mm_add_ps(Sum0, Values0);
474 Aux0 = _mm_set_ps1(n_plus_one);
478 Sum1 = _mm_add_ps(Sum1, Values1);
479 Aux1 = _mm_set_ps1(n_plus_one);
/*
 * Spill the vector accumulators into scalar arrays: elements 0-3 come from
 * the first vector, elements 4-7 from the second.
 */
484 _mm_store_ps(&SumLocal[0], Sum0);
485 _mm_store_ps(&SumLocal[4], Sum1);
486 _mm_store_ps(&SquareSumLocal[0], SquareSum0);
487 _mm_store_ps(&SquareSumLocal[4], SquareSum1);
/* Samples covered by the vector loop; the scalar loop handles the rest. */
491 uint32_t points_done = eigth_points * 8;
493 for (; points_done < num_points; points_done++) {
494 float val = (*in_ptr++);
/*
 * Only element 0 is read, so the partial results must have been reduced
 * into element 0 on lines not shown here -- TODO confirm.
 */
500 *stddev = sqrtf(SquareSumLocal[0] / num_points);
501 *mean = SumLocal[0] / num_points;
506 #include <immintrin.h>
/*
 * AVX kernel for aligned input: volk_32f_stddev_and_mean_32f_x2_a_avx.
 * All input loads use _mm256_load_ps, which requires inputBuffer to be
 * 32-byte aligned.
 *
 * NOTE(review): this listing omits several original lines (the function
 * header, the in_ptr advances, the per-iteration accumulate calls and the
 * SumLocal / SquareSumLocal declarations), so the comments below describe
 * only the visible statements.
 */
510 const float* inputBuffer,
511 unsigned int num_points)
/*
 * Inputs shorter than one 16-sample step are special-cased; the branch body
 * is not visible -- presumably it defers to the generic kernel; TODO confirm.
 */
513 if (num_points < 16) {
518 const float* in_ptr = inputBuffer;
/* Number of 16-sample steps (two 8-float vectors per step). */
523 const unsigned int sixteenth_points = num_points / 16;
/* The running sums are seeded directly from loaded input vectors. */
525 __m256 Sum0 = _mm256_load_ps(in_ptr);
527 __m256 Sum1 = _mm256_load_ps(in_ptr);
/* Square-sum accumulators start at zero. */
530 __m256 SquareSum0 = _mm256_setzero_ps();
531 __m256 SquareSum1 = _mm256_setzero_ps();
532 __m256 Values0, Values1;
/* Starts at 1 because the sums were already seeded by the loads above. */
536 for (uint32_t number = 1; number < sixteenth_points; number++) {
537 Values0 = _mm256_load_ps(in_ptr);
541 Values1 = _mm256_load_ps(in_ptr);
545 float n = (float)number;
546 float n_plus_one = n + 1.f;
/* Broadcast the weight 1 / (n * (n + 1)) to every lane. */
548 Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
/* Update the running sum, then broadcast (n + 1) as the auxiliary factor. */
550 Sum0 = _mm256_add_ps(Sum0, Values0);
551 Aux0 = _mm256_set1_ps(n_plus_one);
555 Sum1 = _mm256_add_ps(Sum1, Values1);
556 Aux1 = _mm256_set1_ps(n_plus_one);
/*
 * Spill the vector accumulators into scalar arrays: elements 0-7 come from
 * the first vector, elements 8-15 from the second.
 */
561 _mm256_store_ps(&SumLocal[0], Sum0);
562 _mm256_store_ps(&SumLocal[8], Sum1);
563 _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
564 _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
/*
 * Combine the 16 per-lane partial results, each covering sixteenth_points
 * samples. The final reads below use element 0 of both arrays.
 */
566 accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
/* Samples covered by the vector loop; the scalar loop handles the rest. */
568 uint32_t points_done = sixteenth_points * 16;
570 for (; points_done < num_points; points_done++) {
571 float val = (*in_ptr++);
/* Both outputs are normalised by num_points (not num_points - 1). */
577 *stddev = sqrtf(SquareSumLocal[0] / num_points);
578 *mean = SumLocal[0] / num_points;
val
Definition: volk_arch_defs.py:66
static void volk_32f_stddev_and_mean_32f_x2_u_sse(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:278
static float add_square_sums(const float SquareSum0, const float Sum0, const float SquareSum1, const float Sum1, const uint32_t len)
Definition: volk_32f_stddev_and_mean_32f_x2.h:147
static void accrue_result(float *PartialSquareSums, float *PartialSums, const uint32_t NumberOfPartitions, const uint32_t PartitionLen)
Definition: volk_32f_stddev_and_mean_32f_x2.h:158
static void volk_32f_stddev_and_mean_32f_x2_u_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:355
static void volk_32f_stddev_and_mean_32f_x2_generic(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:82
static void volk_32f_stddev_and_mean_32f_x2_a_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:508
static void volk_32f_stddev_and_mean_32f_x2_neon(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:195
static void volk_32f_stddev_and_mean_32f_x2_a_sse(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:432
static float update_square_sum_1_val(const float SquareSum, const float Sum, const uint32_t len, const float val)
Definition: volk_32f_stddev_and_mean_32f_x2.h:135
static __m256 _mm256_accumulate_square_sum_ps(__m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
Definition: volk_avx_intrinsics.h:198
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
static float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc, float32x4_t acc, float32x4_t val, float32x4_t rec, float32x4_t aux)
Definition: volk_neon_intrinsics.h:281
static __m128 _mm_accumulate_square_sum_ps(__m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
Definition: volk_sse_intrinsics.h:62