#ifndef __always_inline
#define __always_inline inline __attribute__((always_inline))
#endif

/* Octo-Viterbi butterfly
 * Compute 8 butterflies in parallel: add/subtract the branch metric,
 * keep the surviving (maximum) accumulated metric, and record the
 * selection as an all-ones/all-zeros decision mask.
 */
#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
{ \
	M3 = _mm_adds_epi16(M0, M2); \
	M4 = _mm_subs_epi16(M1, M2); \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_adds_epi16(M1, M2); \
	M2 = _mm_max_epi16(M3, M4); \
	M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
	M4 = _mm_max_epi16(M0, M1); \
	M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
}

/* Byte shuffle mask separating even and odd 16-bit elements */
#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0

/* Two-register deinterleave of path metrics for K = 5 (16 states) */
#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M2); \
	M1 = _mm_shuffle_epi8(M1, M2); \
	M2 = _mm_unpacklo_epi64(M0, M1); \
	M3 = _mm_unpackhi_epi64(M0, M1); \
}

/* Eight-register deinterleave of path metrics for K = 7 (64 states) */
#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
			    M8, M9, M10, M11, M12, M13, M14, M15) \
{ \
	M8 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M8); \
	M1 = _mm_shuffle_epi8(M1, M8); \
	M2 = _mm_shuffle_epi8(M2, M8); \
	M3 = _mm_shuffle_epi8(M3, M8); \
	M4 = _mm_shuffle_epi8(M4, M8); \
	M5 = _mm_shuffle_epi8(M5, M8); \
	M6 = _mm_shuffle_epi8(M6, M8); \
	M7 = _mm_shuffle_epi8(M7, M8); \
	M8 = _mm_unpacklo_epi64(M0, M1); \
	M9 = _mm_unpackhi_epi64(M0, M1); \
	M10 = _mm_unpacklo_epi64(M2, M3); \
	M11 = _mm_unpackhi_epi64(M2, M3); \
	M12 = _mm_unpacklo_epi64(M4, M5); \
	M13 = _mm_unpackhi_epi64(M4, M5); \
	M14 = _mm_unpacklo_epi64(M6, M7); \
	M15 = _mm_unpackhi_epi64(M6, M7); \
}

/* Branch metrics for rate 1/2 (N = 2): correlate the received symbols
 * with the expected trellis outputs and horizontally sum the pairs.
 */
#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M6 = _mm_hadds_epi16(M0, M1); \
	M7 = _mm_hadds_epi16(M2, M3); \
}

/* Branch metrics for rate 1/4 (N = 4): two rounds of horizontal adds */
#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M0 = _mm_hadds_epi16(M0, M1); \
	M1 = _mm_hadds_epi16(M2, M3); \
	M5 = _mm_hadds_epi16(M0, M1); \
}

/* Minimum of 8 packed 16-bit values: use PHMINPOSUW on SSE4.1 capable
 * machines, otherwise fall back to a shuffle/min reduction.
 */
#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
#define SSE_MINPOS(M0, M1) \
{ \
	if (sse41_supported) { \
		M0 = _mm_minpos_epu16(M0); \
	} else { \
		M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
		M0 = _mm_min_epi16(M0, M1); \
	} \
}
#else
#define SSE_MINPOS(M0, M1) \
{ \
	M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
	M0 = _mm_min_epi16(M0, M1); \
}
#endif

/* Normalization for K = 5: subtract the minimum accumulated metric
 * from all path metrics to avoid 16-bit saturation.
 */
#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_min_epi16(M0, M1); \
	SSE_MINPOS(M2, M3) \
	SSE_BROADCAST(M2) \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_subs_epi16(M1, M2); \
}

/* Normalization for K = 7: reduce 8 registers of path metrics to their
 * minimum and subtract it from every metric.
 */
#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
{ \
	M8 = _mm_min_epi16(M0, M1); \
	M9 = _mm_min_epi16(M2, M3); \
	M10 = _mm_min_epi16(M4, M5); \
	M11 = _mm_min_epi16(M6, M7); \
	M8 = _mm_min_epi16(M8, M9); \
	M10 = _mm_min_epi16(M10, M11); \
	M8 = _mm_min_epi16(M8, M10); \
	SSE_MINPOS(M8, M9) \
	SSE_BROADCAST(M8) \
	M0 = _mm_subs_epi16(M0, M8); \
	M1 = _mm_subs_epi16(M1, M8); \
	M2 = _mm_subs_epi16(M2, M8); \
	M3 = _mm_subs_epi16(M3, M8); \
	M4 = _mm_subs_epi16(M4, M8); \
	M5 = _mm_subs_epi16(M5, M8); \
	M6 = _mm_subs_epi16(M6, M8); \
	M7 = _mm_subs_epi16(M7, M8); \
}

/* Combined BMU/PMU step for K = 5 (16 states), rate 1/2 (N = 2) */
static __always_inline void _sse_metrics_k5_n2(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load and duplicate the input symbols */
	m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load the expected branch outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);

	/* (BMU) Compute branch metrics */
	m0 = _mm_sign_epi16(m2, m0);
	m1 = _mm_sign_epi16(m2, m1);
	m2 = _mm_hadds_epi16(m0, m1);

	/* (PMU) Load accumulated path metrics and run the butterflies */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}
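
/* Illustrative only, not part of the accelerated implementation: a scalar
 * sketch of what SSE_BUTTERFLY computes per 16-bit lane, using the same
 * decision convention (all-ones mask when the add/subtract branch wins).
 * The helper names below are hypothetical; <stdint.h> is assumed to be
 * available, as the rest of this file already relies on int16_t.
 */
static inline int16_t _example_sat16(int32_t v)
{
	/* Mirror the saturating behaviour of _mm_adds/_mm_subs_epi16 */
	if (v > INT16_MAX)
		return INT16_MAX;
	if (v < INT16_MIN)
		return INT16_MIN;
	return (int16_t) v;
}

static inline void _example_scalar_butterfly(int16_t s0, int16_t s1, int16_t bm,
	int16_t *sum0, int16_t *dec0, int16_t *sum1, int16_t *dec1)
{
	int16_t a = _example_sat16((int32_t) s0 + bm);	/* M3 = adds(M0, M2) */
	int16_t b = _example_sat16((int32_t) s1 - bm);	/* M4 = subs(M1, M2) */
	int16_t c = _example_sat16((int32_t) s0 - bm);	/* M0 = subs(M0, M2) */
	int16_t d = _example_sat16((int32_t) s1 + bm);	/* M1 = adds(M1, M2) */

	*sum0 = a > b ? a : b;		/* M2 = max(M3, M4) */
	*dec0 = a >= b ? -1 : 0;	/* M3 = cmpgt | cmpeq (all-ones mask) */
	*sum1 = c > d ? c : d;		/* M4 = max(M0, M1) */
	*dec1 = c >= d ? -1 : 0;	/* M1 = cmpgt | cmpeq (all-ones mask) */
}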

/* Combined BMU/PMU step for K = 5 (16 states), rate 1/4 (N = 4) */
static __always_inline void _sse_metrics_k5_n4(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load and duplicate the input symbols */
	m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load the expected branch outputs and compute metrics */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

	/* (PMU) Load accumulated path metrics and run the butterflies */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}

/* Combined BMU/PMU step for K = 7 (64 states), rate 1/2 (N = 2) */
static __always_inline void _sse_metrics_k7_n2(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
		m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave into even and odd packed registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load and duplicate the input symbols */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load the expected branch outputs and compute metrics */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}

/* Combined BMU/PMU step for K = 7 (64 states), rate 1/4 (N = 4) */
static __always_inline void _sse_metrics_k7_n4(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7;
	__m128i m8, m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave into even and odd packed registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load and duplicate the input symbols */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load the expected branch outputs and compute metrics */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

	m0 = _mm_load_si128((__m128i *) &out[64]);
	m1 = _mm_load_si128((__m128i *) &out[72]);
	m2 = _mm_load_si128((__m128i *) &out[80]);
	m3 = _mm_load_si128((__m128i *) &out[88]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

	m0 = _mm_load_si128((__m128i *) &out[96]);
	m1 = _mm_load_si128((__m128i *) &out[104]);
	m2 = _mm_load_si128((__m128i *) &out[112]);
	m3 = _mm_load_si128((__m128i *) &out[120]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}
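
/* Illustrative only: a minimal sketch of how a caller might step one of the
 * metric kernels across a frame for K = 5, N = 2 (16 trellis states). The
 * function and parameter names below are assumptions for illustration; the
 * real trellis driver lives in conv_acc.c. Assumptions: all buffers are
 * 16-byte aligned (the kernels use aligned loads/stores), seq is padded so
 * the 8-byte read of each input pair stays in bounds, out_table holds the
 * 16 expected branch outputs, sums holds 16 path metrics, and paths_base
 * holds len * 16 decision words. The normalization period is arbitrary here.
 */
static void example_forward_pass_k5_n2(const int16_t *seq, int len,
	const int16_t *out_table, int16_t *sums, int16_t *paths_base)
{
	int i;

	for (i = 0; i < len; i++)
		_sse_metrics_k5_n2(&seq[i * 2], out_table, sums,
				   &paths_base[i * 16], (i % 32) == 0);
}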