Vector Optimized Library of Kernels  2.5.0
Architecture-tuned implementations of math kernels
volk_32fc_32f_multiply_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
55 #ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a_H
56 #define INCLUDED_volk_32fc_32f_multiply_32fc_a_H
57 
58 #include <inttypes.h>
59 #include <stdio.h>
60 
61 #ifdef LV_HAVE_AVX
62 #include <immintrin.h>
63 
64 static inline void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector,
65  const lv_32fc_t* aVector,
66  const float* bVector,
67  unsigned int num_points)
68 {
69  unsigned int number = 0;
70  const unsigned int eighthPoints = num_points / 8;
71 
72  lv_32fc_t* cPtr = cVector;
73  const lv_32fc_t* aPtr = aVector;
74  const float* bPtr = bVector;
75 
76  __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;
77 
78  __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
79 
80  for (; number < eighthPoints; number++) {
81 
82  aVal1 = _mm256_load_ps((float*)aPtr);
83  aPtr += 4;
84 
85  aVal2 = _mm256_load_ps((float*)aPtr);
86  aPtr += 4;
87 
88  bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7
89  bPtr += 8;
90 
91  bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3
92  bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7
93 
94  bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3
95  bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7
96 
97  cVal1 = _mm256_mul_ps(aVal1, bVal1);
98  cVal2 = _mm256_mul_ps(aVal2, bVal2);
99 
100  _mm256_store_ps((float*)cPtr,
101  cVal1); // Store the results back into the C container
102  cPtr += 4;
103 
104  _mm256_store_ps((float*)cPtr,
105  cVal2); // Store the results back into the C container
106  cPtr += 4;
107  }
108 
109  number = eighthPoints * 8;
110  for (; number < num_points; ++number) {
111  *cPtr++ = (*aPtr++) * (*bPtr++);
112  }
113 }
114 #endif /* LV_HAVE_AVX */
115 
116 
117 #ifdef LV_HAVE_SSE
118 #include <xmmintrin.h>
119 
120 static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector,
121  const lv_32fc_t* aVector,
122  const float* bVector,
123  unsigned int num_points)
124 {
125  unsigned int number = 0;
126  const unsigned int quarterPoints = num_points / 4;
127 
128  lv_32fc_t* cPtr = cVector;
129  const lv_32fc_t* aPtr = aVector;
130  const float* bPtr = bVector;
131 
132  __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
133  for (; number < quarterPoints; number++) {
134 
135  aVal1 = _mm_load_ps((const float*)aPtr);
136  aPtr += 2;
137 
138  aVal2 = _mm_load_ps((const float*)aPtr);
139  aPtr += 2;
140 
141  bVal = _mm_load_ps(bPtr);
142  bPtr += 4;
143 
144  bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1, 1, 0, 0));
145  bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3, 3, 2, 2));
146 
147  cVal = _mm_mul_ps(aVal1, bVal1);
148 
149  _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
150  cPtr += 2;
151 
152  cVal = _mm_mul_ps(aVal2, bVal2);
153 
154  _mm_store_ps((float*)cPtr, cVal); // Store the results back into the C container
155 
156  cPtr += 2;
157  }
158 
159  number = quarterPoints * 4;
160  for (; number < num_points; number++) {
161  *cPtr++ = (*aPtr++) * (*bPtr);
162  bPtr++;
163  }
164 }
165 #endif /* LV_HAVE_SSE */
166 
167 
168 #ifdef LV_HAVE_GENERIC
169 
170 static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector,
171  const lv_32fc_t* aVector,
172  const float* bVector,
173  unsigned int num_points)
174 {
175  lv_32fc_t* cPtr = cVector;
176  const lv_32fc_t* aPtr = aVector;
177  const float* bPtr = bVector;
178  unsigned int number = 0;
179 
180  for (number = 0; number < num_points; number++) {
181  *cPtr++ = (*aPtr++) * (*bPtr++);
182  }
183 }
184 #endif /* LV_HAVE_GENERIC */
185 
186 
187 #ifdef LV_HAVE_NEON
188 #include <arm_neon.h>
189 
/*!
 * \brief Scales each complex sample of aVector by the matching real value of
 *        bVector and writes the products to cVector (ARM NEON).
 *
 * \param cVector    Destination for num_points complex products.
 * \param aVector    num_points complex input samples.
 * \param bVector    num_points real scale factors.
 * \param num_points Number of complex samples to process.
 *
 * Four samples are handled per vector iteration; the remaining
 * num_points % 4 samples are processed one at a time.
 */
static inline void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const float* bPtr = bVector;
    const unsigned int quarter_points = num_points / 4;
    unsigned int number;

    for (number = 0; number < quarter_points; number++) {
        /* vld2q_f32 splits the eight loaded floats into two registers with a
         * stride of two, so each register lines up element-for-element with
         * the four scale factors. The input is only read: keep it const. */
        const float32x4x2_t inputVector = vld2q_f32((const float*)aPtr);
        const float32x4_t tapsVector = vld1q_f32(bPtr);
        float32x4x2_t outputVector;

        outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
        outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);

        /* vst2q_f32 re-interleaves the two registers on store. */
        vst2q_f32((float*)cPtr, outputVector);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the samples that do not fill a full group of four. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
220 #endif /* LV_HAVE_NEON */
221 
222 
223 #ifdef LV_HAVE_ORC
224 
225 extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
226  const lv_32fc_t* aVector,
227  const float* bVector,
228  unsigned int num_points);
229 
230 static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector,
231  const lv_32fc_t* aVector,
232  const float* bVector,
233  unsigned int num_points)
234 {
235  volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
236 }
237 
238 #endif /* LV_HAVE_ORC */
239 
240 
241 #endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a_H */
static void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:190
static void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:170
static void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:64
static void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:120
float complex lv_32fc_t
Definition: volk_complex.h:65