Vector Optimized Library of Kernels  2.5.0
Architecture-tuned implementations of math kernels
volk_neon_intrinsics.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2015 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*
24  * Copyright (c) 2016-2019 ARM Limited.
25  *
26  * SPDX-License-Identifier: MIT
27  *
28  * Permission is hereby granted, free of charge, to any person obtaining a copy
29  * of this software and associated documentation files (the "Software"), to
30  * deal in the Software without restriction, including without limitation the
31  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
32  * sell copies of the Software, and to permit persons to whom the Software is
33  * furnished to do so, subject to the following conditions:
34  *
35  * The above copyright notice and this permission notice shall be included in all
36  * copies or substantial portions of the Software.
37  *
38  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
39  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
40  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
43  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
44  * SOFTWARE.
45  *
46  * _vtaylor_polyq_f32
47  * _vlogq_f32
48  *
49  */
50 
51 /* Copyright (C) 2011 Julien Pommier
52 
53  This software is provided 'as-is', without any express or implied
54  warranty. In no event will the authors be held liable for any damages
55  arising from the use of this software.
56 
57  Permission is granted to anyone to use this software for any purpose,
58  including commercial applications, and to alter it and redistribute it
59  freely, subject to the following restrictions:
60 
61  1. The origin of this software must not be misrepresented; you must not
62  claim that you wrote the original software. If you use this software
63  in a product, an acknowledgment in the product documentation would be
64  appreciated but is not required.
65  2. Altered source versions must be plainly marked as such, and must not be
66  misrepresented as being the original software.
67  3. This notice may not be removed or altered from any source distribution.
68 
69  (this is the zlib license)
70 
71  _vsincosq_f32
72 
73 */
74 
75 /*
76  * This file is intended to hold NEON intrinsics of intrinsics.
77  * They should be used in VOLK kernels to avoid copy-pasta.
78  */
79 
80 #ifndef INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
81 #define INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
82 #ifdef LV_HAVE_NEON
83 #include <arm_neon.h>
84 
85 
86 /* Magnitude squared for float32x4x2_t */
87 static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
88 {
89  float32x4_t iValue, qValue, result;
90  iValue = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]); // Square the values
91  qValue = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]); // Square the values
92  result = vaddq_f32(iValue, qValue); // Add the I2 and Q2 values
93  return result;
94 }
95 
96 /* Inverse square root for float32x4_t */
97 static inline float32x4_t _vinvsqrtq_f32(float32x4_t x)
98 {
99  float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
100  sqrt_reciprocal = vmulq_f32(
101  vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
102  sqrt_reciprocal = vmulq_f32(
103  vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
104 
105  return sqrt_reciprocal;
106 }
107 
108 /* Inverse */
109 static inline float32x4_t _vinvq_f32(float32x4_t x)
110 {
111  // Newton's method
112  float32x4_t recip = vrecpeq_f32(x);
113  recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
114  recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
115  return recip;
116 }
117 
118 /* Complex multiplication for float32x4x2_t */
119 static inline float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val,
120  float32x4x2_t b_val)
121 {
122  float32x4x2_t tmp_real;
123  float32x4x2_t tmp_imag;
124  float32x4x2_t c_val;
125 
126  // multiply the real*real and imag*imag to get real result
127  // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
128  tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
129  // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
130  tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
131  // Multiply cross terms to get the imaginary result
132  // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
133  tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
134  // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
135  tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
136  // combine the products
137  c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
138  c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
139  return c_val;
140 }
141 
142 /* From ARM Compute Library, MIT license */
143 static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
144 {
145  float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x);
146  float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x);
147  float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x);
148  float32x4_t cD = vmlaq_f32(coeffs[3], coeffs[7], x);
149  float32x4_t x2 = vmulq_f32(x, x);
150  float32x4_t x4 = vmulq_f32(x2, x2);
151  float32x4_t res = vmlaq_f32(vmlaq_f32(cA, cB, x2), vmlaq_f32(cC, cD, x2), x4);
152  return res;
153 }
154 
155 /* Natural logarithm.
156  * From ARM Compute Library, MIT license */
157 static inline float32x4_t _vlogq_f32(float32x4_t x)
158 {
159  const float32x4_t log_tab[8] = {
160  vdupq_n_f32(-2.29561495781f), vdupq_n_f32(-2.47071170807f),
161  vdupq_n_f32(-5.68692588806f), vdupq_n_f32(-0.165253549814f),
162  vdupq_n_f32(5.17591238022f), vdupq_n_f32(0.844007015228f),
163  vdupq_n_f32(4.58445882797f), vdupq_n_f32(0.0141278216615f),
164  };
165 
166  const int32x4_t CONST_127 = vdupq_n_s32(127); // 127
167  const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
168 
169  // Extract exponent
170  int32x4_t m = vsubq_s32(
171  vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
172  float32x4_t val =
173  vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
174 
175  // Polynomial Approximation
176  float32x4_t poly = _vtaylor_polyq_f32(val, log_tab);
177 
178  // Reconstruct
179  poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
180 
181  return poly;
182 }
183 
184 /* Evaluation of 4 sines & cosines at once.
185  * Optimized from here (zlib license)
186  * http://gruntthepeon.free.fr/ssemath/ */
187 static inline float32x4x2_t _vsincosq_f32(float32x4_t x)
188 {
189  const float32x4_t c_minus_cephes_DP1 = vdupq_n_f32(-0.78515625);
190  const float32x4_t c_minus_cephes_DP2 = vdupq_n_f32(-2.4187564849853515625e-4);
191  const float32x4_t c_minus_cephes_DP3 = vdupq_n_f32(-3.77489497744594108e-8);
192  const float32x4_t c_sincof_p0 = vdupq_n_f32(-1.9515295891e-4);
193  const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3);
194  const float32x4_t c_sincof_p2 = vdupq_n_f32(-1.6666654611e-1);
195  const float32x4_t c_coscof_p0 = vdupq_n_f32(2.443315711809948e-005);
196  const float32x4_t c_coscof_p1 = vdupq_n_f32(-1.388731625493765e-003);
197  const float32x4_t c_coscof_p2 = vdupq_n_f32(4.166664568298827e-002);
198  const float32x4_t c_cephes_FOPI = vdupq_n_f32(1.27323954473516); // 4 / M_PI
199 
200  const float32x4_t CONST_1 = vdupq_n_f32(1.f);
201  const float32x4_t CONST_1_2 = vdupq_n_f32(0.5f);
202  const float32x4_t CONST_0 = vdupq_n_f32(0.f);
203  const uint32x4_t CONST_2 = vdupq_n_u32(2);
204  const uint32x4_t CONST_4 = vdupq_n_u32(4);
205 
206  uint32x4_t emm2;
207 
208  uint32x4_t sign_mask_sin, sign_mask_cos;
209  sign_mask_sin = vcltq_f32(x, CONST_0);
210  x = vabsq_f32(x);
211  // scale by 4/pi
212  float32x4_t y = vmulq_f32(x, c_cephes_FOPI);
213 
214  // store the integer part of y in mm0
215  emm2 = vcvtq_u32_f32(y);
216  /* j=(j+1) & (~1) (see the cephes sources) */
217  emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
218  emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
219  y = vcvtq_f32_u32(emm2);
220 
221  /* get the polynom selection mask
222  there is one polynom for 0 <= x <= Pi/4
223  and another one for Pi/4<x<=Pi/2
224  Both branches will be computed. */
225  const uint32x4_t poly_mask = vtstq_u32(emm2, CONST_2);
226 
227  // The magic pass: "Extended precision modular arithmetic"
228  x = vmlaq_f32(x, y, c_minus_cephes_DP1);
229  x = vmlaq_f32(x, y, c_minus_cephes_DP2);
230  x = vmlaq_f32(x, y, c_minus_cephes_DP3);
231 
232  sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, CONST_4));
233  sign_mask_cos = vtstq_u32(vsubq_u32(emm2, CONST_2), CONST_4);
234 
235  /* Evaluate the first polynom (0 <= x <= Pi/4) in y1,
236  and the second polynom (Pi/4 <= x <= 0) in y2 */
237  float32x4_t y1, y2;
238  float32x4_t z = vmulq_f32(x, x);
239 
240  y1 = vmlaq_f32(c_coscof_p1, z, c_coscof_p0);
241  y1 = vmlaq_f32(c_coscof_p2, z, y1);
242  y1 = vmulq_f32(y1, z);
243  y1 = vmulq_f32(y1, z);
244  y1 = vmlsq_f32(y1, z, CONST_1_2);
245  y1 = vaddq_f32(y1, CONST_1);
246 
247  y2 = vmlaq_f32(c_sincof_p1, z, c_sincof_p0);
248  y2 = vmlaq_f32(c_sincof_p2, z, y2);
249  y2 = vmulq_f32(y2, z);
250  y2 = vmlaq_f32(x, x, y2);
251 
252  /* select the correct result from the two polynoms */
253  const float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
254  const float32x4_t yc = vbslq_f32(poly_mask, y2, y1);
255 
256  float32x4x2_t sincos;
257  sincos.val[0] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
258  sincos.val[1] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
259 
260  return sincos;
261 }
262 
263 static inline float32x4_t _vsinq_f32(float32x4_t x)
264 {
265  const float32x4x2_t sincos = _vsincosq_f32(x);
266  return sincos.val[0];
267 }
268 
269 static inline float32x4_t _vcosq_f32(float32x4_t x)
270 {
271  const float32x4x2_t sincos = _vsincosq_f32(x);
272  return sincos.val[1];
273 }
274 
275 static inline float32x4_t _vtanq_f32(float32x4_t x)
276 {
277  const float32x4x2_t sincos = _vsincosq_f32(x);
278  return vmulq_f32(sincos.val[0], _vinvq_f32(sincos.val[1]));
279 }
280 
281 static inline float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc,
282  float32x4_t acc,
283  float32x4_t val,
284  float32x4_t rec,
285  float32x4_t aux)
286 {
287  aux = vmulq_f32(aux, val);
288  aux = vsubq_f32(aux, acc);
289  aux = vmulq_f32(aux, aux);
290 #ifdef LV_HAVE_NEONV8
291  return vfmaq_f32(sq_acc, aux, rec);
292 #else
293  aux = vmulq_f32(aux, rec);
294  return vaddq_f32(sq_acc, aux);
295 #endif
296 }
297 
298 #endif /*LV_HAVE_NEON*/
299 #endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */
val
Definition: volk_arch_defs.py:66
static float32x4_t _vinvsqrtq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:97
static float32x4_t _vinvq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:109
static float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
Definition: volk_neon_intrinsics.h:119
static float32x4_t _vsinq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:263
static float32x4_t _vlogq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:157
static float32x4_t _vcosq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:269
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:87
static float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc, float32x4_t acc, float32x4_t val, float32x4_t rec, float32x4_t aux)
Definition: volk_neon_intrinsics.h:281
static float32x4x2_t _vsincosq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:187
static float32x4_t _vtanq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:275
static float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
Definition: volk_neon_intrinsics.h:143