Vector Optimized Library of Kernels  2.5.0
Architecture-tuned implementations of math kernels
volk_32f_index_max_16u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
71 #ifndef INCLUDED_volk_32f_index_max_16u_a_H
72 #define INCLUDED_volk_32f_index_max_16u_a_H
73 
74 #include <inttypes.h>
75 #include <limits.h>
76 #include <stdio.h>
77 #include <volk/volk_common.h>
78 
79 #ifdef LV_HAVE_AVX
80 #include <immintrin.h>
81 
82 static inline void
83 volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points)
84 {
85  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
86 
87  uint32_t number = 0;
88  const uint32_t eighthPoints = num_points / 8;
89 
90  float* inputPtr = (float*)src0;
91 
92  __m256 indexIncrementValues = _mm256_set1_ps(8);
93  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
94 
95  float max = src0[0];
96  float index = 0;
97  __m256 maxValues = _mm256_set1_ps(max);
98  __m256 maxValuesIndex = _mm256_setzero_ps();
99  __m256 compareResults;
100  __m256 currentValues;
101 
102  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
103  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
104 
105  for (; number < eighthPoints; number++) {
106 
107  currentValues = _mm256_load_ps(inputPtr);
108  inputPtr += 8;
109  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
110 
111  compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
112 
113  maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
114  maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
115  }
116 
117  // Calculate the largest value from the remaining 4 points
118  _mm256_store_ps(maxValuesBuffer, maxValues);
119  _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
120 
121  for (number = 0; number < 8; number++) {
122  if (maxValuesBuffer[number] > max) {
123  index = maxIndexesBuffer[number];
124  max = maxValuesBuffer[number];
125  } else if (maxValuesBuffer[number] == max) {
126  if (index > maxIndexesBuffer[number])
127  index = maxIndexesBuffer[number];
128  }
129  }
130 
131  number = eighthPoints * 8;
132  for (; number < num_points; number++) {
133  if (src0[number] > max) {
134  index = number;
135  max = src0[number];
136  }
137  }
138  target[0] = (uint16_t)index;
139 }
140 
141 #endif /*LV_HAVE_AVX*/
142 
143 #ifdef LV_HAVE_SSE4_1
144 #include <smmintrin.h>
145 
146 static inline void
147 volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points)
148 {
149  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
150 
151  uint32_t number = 0;
152  const uint32_t quarterPoints = num_points / 4;
153 
154  float* inputPtr = (float*)src0;
155 
156  __m128 indexIncrementValues = _mm_set1_ps(4);
157  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
158 
159  float max = src0[0];
160  float index = 0;
161  __m128 maxValues = _mm_set1_ps(max);
162  __m128 maxValuesIndex = _mm_setzero_ps();
163  __m128 compareResults;
164  __m128 currentValues;
165 
166  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
167  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
168 
169  for (; number < quarterPoints; number++) {
170 
171  currentValues = _mm_load_ps(inputPtr);
172  inputPtr += 4;
173  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
174 
175  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
176 
177  maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
178  maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
179  }
180 
181  // Calculate the largest value from the remaining 4 points
182  _mm_store_ps(maxValuesBuffer, maxValues);
183  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
184 
185  for (number = 0; number < 4; number++) {
186  if (maxValuesBuffer[number] > max) {
187  index = maxIndexesBuffer[number];
188  max = maxValuesBuffer[number];
189  } else if (maxValuesBuffer[number] == max) {
190  if (index > maxIndexesBuffer[number])
191  index = maxIndexesBuffer[number];
192  }
193  }
194 
195  number = quarterPoints * 4;
196  for (; number < num_points; number++) {
197  if (src0[number] > max) {
198  index = number;
199  max = src0[number];
200  }
201  }
202  target[0] = (uint16_t)index;
203 }
204 
205 #endif /*LV_HAVE_SSE4_1*/
206 
207 
208 #ifdef LV_HAVE_SSE
209 
210 #include <xmmintrin.h>
211 
212 static inline void
213 volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points)
214 {
215  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
216 
217  uint32_t number = 0;
218  const uint32_t quarterPoints = num_points / 4;
219 
220  float* inputPtr = (float*)src0;
221 
222  __m128 indexIncrementValues = _mm_set1_ps(4);
223  __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
224 
225  float max = src0[0];
226  float index = 0;
227  __m128 maxValues = _mm_set1_ps(max);
228  __m128 maxValuesIndex = _mm_setzero_ps();
229  __m128 compareResults;
230  __m128 currentValues;
231 
232  __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
233  __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
234 
235  for (; number < quarterPoints; number++) {
236 
237  currentValues = _mm_load_ps(inputPtr);
238  inputPtr += 4;
239  currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
240 
241  compareResults = _mm_cmpgt_ps(currentValues, maxValues);
242 
243  maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
244  _mm_andnot_ps(compareResults, maxValuesIndex));
245  maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
246  _mm_andnot_ps(compareResults, maxValues));
247  }
248 
249  // Calculate the largest value from the remaining 4 points
250  _mm_store_ps(maxValuesBuffer, maxValues);
251  _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
252 
253  for (number = 0; number < 4; number++) {
254  if (maxValuesBuffer[number] > max) {
255  index = maxIndexesBuffer[number];
256  max = maxValuesBuffer[number];
257  } else if (maxValuesBuffer[number] == max) {
258  if (index > maxIndexesBuffer[number])
259  index = maxIndexesBuffer[number];
260  }
261  }
262 
263  number = quarterPoints * 4;
264  for (; number < num_points; number++) {
265  if (src0[number] > max) {
266  index = number;
267  max = src0[number];
268  }
269  }
270  target[0] = (uint16_t)index;
271 }
272 
273 #endif /*LV_HAVE_SSE*/
274 
275 
276 #ifdef LV_HAVE_GENERIC
277 
/*!
 * \brief Finds the index of the largest float in a buffer (portable C).
 *
 * At most USHRT_MAX points are examined so that the result fits in a
 * uint16_t. The comparison is strictly-greater, so when the maximum occurs
 * more than once the lowest index is reported. For num_points == 0 the buffer
 * is not read and index 0 is written.
 *
 * \param target     target[0] receives the index of the maximum value.
 * \param src0       Input samples.
 * \param num_points Number of samples available in src0.
 */
static inline void
volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    if (num_points == 0) {
        /* Nothing to scan: report index 0 without touching src0[0]. */
        target[0] = 0;
        return;
    }

    float max = src0[0];
    uint16_t index = 0;

    for (uint32_t i = 1; i < num_points; ++i) {
        if (src0[i] > max) {
            /* i < USHRT_MAX after the clamp above, so the cast cannot truncate. */
            index = (uint16_t)i;
            max = src0[i];
        }
    }
    target[0] = index;
}
296 
297 #endif /*LV_HAVE_GENERIC*/
298 
299 
300 #endif /*INCLUDED_volk_32f_index_max_16u_a_H*/
301 
302 
303 #ifndef INCLUDED_volk_32f_index_max_16u_u_H
304 #define INCLUDED_volk_32f_index_max_16u_u_H
305 
306 #include <inttypes.h>
307 #include <limits.h>
308 #include <stdio.h>
309 #include <volk/volk_common.h>
310 
311 #ifdef LV_HAVE_AVX
312 #include <immintrin.h>
313 
314 static inline void
315 volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points)
316 {
317  num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
318 
319  uint32_t number = 0;
320  const uint32_t eighthPoints = num_points / 8;
321 
322  float* inputPtr = (float*)src0;
323 
324  __m256 indexIncrementValues = _mm256_set1_ps(8);
325  __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
326 
327  float max = src0[0];
328  float index = 0;
329  __m256 maxValues = _mm256_set1_ps(max);
330  __m256 maxValuesIndex = _mm256_setzero_ps();
331  __m256 compareResults;
332  __m256 currentValues;
333 
334  __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
335  __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
336 
337  for (; number < eighthPoints; number++) {
338 
339  currentValues = _mm256_loadu_ps(inputPtr);
340  inputPtr += 8;
341  currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
342 
343  compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
344 
345  maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
346  maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
347  }
348 
349  // Calculate the largest value from the remaining 4 points
350  _mm256_storeu_ps(maxValuesBuffer, maxValues);
351  _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
352 
353  for (number = 0; number < 8; number++) {
354  if (maxValuesBuffer[number] > max) {
355  index = maxIndexesBuffer[number];
356  max = maxValuesBuffer[number];
357  } else if (maxValuesBuffer[number] == max) {
358  if (index > maxIndexesBuffer[number])
359  index = maxIndexesBuffer[number];
360  }
361  }
362 
363  number = eighthPoints * 8;
364  for (; number < num_points; number++) {
365  if (src0[number] > max) {
366  index = number;
367  max = src0[number];
368  }
369  }
370  target[0] = (uint16_t)index;
371 }
372 
373 #endif /*LV_HAVE_AVX*/
374 
375 #endif /*INCLUDED_volk_32f_index_max_16u_u_H*/
static void volk_32f_index_max_16u_u_avx(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:315
static void volk_32f_index_max_16u_a_avx(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:83
static void volk_32f_index_max_16u_generic(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:279
static void volk_32f_index_max_16u_a_sse(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:213
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
for i
Definition: volk_config_fixed.tmpl.h:25