volk_32fc_index_max_32u.h
/* -*- c++ -*- */
/*
 * Copyright 2016, 2018-2020 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

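/*!
 * \page volk_32fc_index_max_32u
 *
 * \b Overview
 *
 * Returns the index of the complex point with the maximum magnitude in the
 * input vector. All kernels below compare squared magnitudes
 * (re * re + im * im), which preserves the ordering of the comparison while
 * avoiding a square root per point.
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_32fc_index_max_32u(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
 * \endcode
 *
 * \b Inputs
 * \li src0: The complex input vector.
 * \li num_points: The number of samples.
 *
 * \b Outputs
 * \li target: The index of the point with maximum magnitude.
 *
 * \b Example
 * A minimal usage sketch via the dispatcher (volk_malloc, volk_get_alignment,
 * lv_cmake and volk_free are the standard VOLK helpers):
 * \code
 *   int N = 8;
 *   uint32_t alignment = volk_get_alignment();
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
 *   uint32_t* max_index = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
 *
 *   for (int ii = 0; ii < N; ++ii) {
 *       // magnitude grows with ii, so the expected result is N - 1
 *       in[ii] = lv_cmake((float)ii, (float)-ii);
 *   }
 *
 *   volk_32fc_index_max_32u(max_index, in, N);
 *   printf("index of max magnitude = %u\n", *max_index);
 *
 *   volk_free(in);
 *   volk_free(max_index);
 * \endcode
 */
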
#ifndef INCLUDED_volk_32fc_index_max_32u_a_H
#define INCLUDED_volk_32fc_index_max_32u_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

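/*
 * Note on the index order used by the AVX2 variants (a sketch, based on
 * vector_32fc_index_max_variant0()/variant1() in volk_avx2_intrinsics.h):
 * the squared magnitudes are formed with _mm256_hadd_ps, which interleaves
 * the two 128-bit lanes of its operands. With in0 holding points 0..3 and
 * in1 holding points 4..7, the results land in lane order
 * 0, 1, 4, 5, 2, 3, 6, 7, which is why the index vectors below are
 * initialized in that permuted order.
 */
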
static inline void volk_32fc_index_max_32u_a_avx2_variant_0(uint32_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_32u_a_avx2_variant_1(uint32_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

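/*
 * The kernel below keeps four running maxima in xmm3 and their indices in
 * xmm9. Without a blend instruction, the per-lane index select is done with
 * masks: after xmm3 = max(candidate, xmm3), _mm_cmpeq_ps marks lanes where
 * the candidate became the new maximum and _mm_cmplt_ps marks lanes where it
 * did not. The two masks are disjoint, so
 * (candidate_indices & eq_mask) + (old_indices & lt_mask) performs the
 * per-lane select.
 */
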
static inline void
volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    const uint32_t num_bytes = num_points * 8;

    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    int bound = num_bytes >> 5; // number of full 4-point (32-byte) blocks
    int i = 0;

    xmm8 = _mm_setr_epi32(0, 1, 2, 3); // indices of the current block
    xmm9 = _mm_setzero_si128();        // indices of the per-lane maxima
    xmm10 = _mm_setr_epi32(4, 4, 4, 4);
    xmm3 = _mm_setzero_ps(); // per-lane maxima (magnitudes squared)

    for (; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)src0);
        xmm2 = _mm_load_ps((float*)&src0[2]);

        src0 += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        // horizontal add yields the four magnitudes squared in point order
        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3); // lanes keeping the old maximum
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3); // lanes taking the new candidate

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // two remaining complex points (16 bytes)?
    if (num_bytes >> 4 & 1) {
        xmm2 = _mm_load_ps((float*)src0);

        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        src0 += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // one final complex point (8 bytes)?
    if (num_bytes >> 3 & 1) {
        sq_dist =
            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        xmm3 = _mm_max_ss(xmm3, xmm2);

        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    // reduce the four per-lane maxima to a single value and index
    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/

#ifdef LV_HAVE_GENERIC
static inline void
volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    const uint32_t num_bytes = num_points * 8;

    float sq_dist = 0.0;
    float max = 0.0;
    uint32_t index = 0;

    uint32_t i = 0;

    for (; i < (num_bytes >> 3); ++i) {
        sq_dist =
            lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);

        if (sq_dist > max) {
            index = i;
            max = sq_dist;
        }
    }
    target[0] = index;
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/

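/*
 * The unaligned (_u) AVX2 kernels below mirror the aligned (_a) kernels
 * above; the only difference is that they use _mm256_loadu_ps instead of
 * _mm256_load_ps, so src0 does not need to be aligned to a 32-byte boundary.
 * The NEON kernel has no separate aligned variant.
 */
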
#ifndef INCLUDED_volk_32fc_index_max_32u_u_H
#define INCLUDED_volk_32fc_index_max_32u_u_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_32u_u_avx2_variant_0(uint32_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

static inline void volk_32fc_index_max_32u_u_avx2_variant_1(uint32_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>

static inline void
volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    unsigned int number = 0;
    const uint32_t quarter_points = num_points / 4;
    const lv_32fc_t* src0Ptr = src0;

    uint32_t indices[4] = { 0, 1, 2, 3 };
    const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
    uint32x4_t vec_indices = vld1q_u32(indices);
    uint32x4_t vec_max_indices = vec_indices;

    if (num_points) {
        // magnitudes squared are non-negative, so zero is a safe initial maximum
        float max = 0.f;
        uint32_t index = 0;

        float32x4_t vec_max = vdupq_n_f32(0.f);

        for (; number < quarter_points; number++) {
            // Load complex and compute magnitude squared
            const float32x4_t vec_mag2 =
                _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
            __VOLK_PREFETCH(src0Ptr += 4);
            // a > b?
            const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max);
            vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max);
            vec_max_indices = vbslq_u32(gt_mask, vec_indices, vec_max_indices);
            vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
        }
        uint32_t tmp_max_indices[4];
        float tmp_max[4];
        vst1q_u32(tmp_max_indices, vec_max_indices);
        vst1q_f32(tmp_max, vec_max);

        for (int i = 0; i < 4; i++) {
            if (tmp_max[i] > max) {
                max = tmp_max[i];
                index = tmp_max_indices[i];
            }
        }

        // Deal with the rest
        for (number = quarter_points * 4; number < num_points; number++) {
            const float re = lv_creal(*src0Ptr);
            const float im = lv_cimag(*src0Ptr);
            const float sq_dist = re * re + im * im;
            if (sq_dist > max) {
                max = sq_dist;
                index = number;
            }
            src0Ptr++;
        }
        *target = index;
    }
}

#endif /*LV_HAVE_NEON*/

#endif /*INCLUDED_volk_32fc_index_max_32u_u_H*/