#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

#include "cpl_port.h"

/* 64-bit x86 CPUs are guaranteed to support SSE2 */
#if (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION)

/* Requires SSE2 */
#include <emmintrin.h>
#include <string.h>

#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#include "gdal_priv_templates.hpp"

static inline __m128i GDALCopyInt16ToXMM(const void* ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    unsigned short s;
    memcpy(&s, ptr, 2);
    return _mm_cvtsi32_si128(s);
#else
    return _mm_cvtsi32_si128(*static_cast<const unsigned short*>(ptr));
#endif
}
static inline __m128i GDALCopyInt32ToXMM(const void* ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt32 i;
    memcpy(&i, ptr, 4);
    return _mm_cvtsi32_si128(i);
#else
    return _mm_cvtsi32_si128(*static_cast<const GInt32*>(ptr));
#endif
}
static inline __m128i GDALCopyInt64ToXMM(const void* ptr)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt64 i;
    memcpy(&i, ptr, 8);
    return _mm_cvtsi64_si128(i);
#else
    return _mm_cvtsi64_si128(*static_cast<const GInt64*>(ptr));
#endif
}
static inline void GDALCopyXMMToInt16(const __m128i xmm, void* pDest)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
    memcpy(pDest, &i, 2);
#else
    *static_cast<GInt16*>(pDest) = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
#endif
}
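/* The Store2Val()/Store4Val() methods below also rely on 32-bit and 64-bit
   XMM-to-memory helpers that this extract omits.  A minimal sketch following
   the same aligned-access pattern as GDALCopyXMMToInt16 (assumed, not
   verbatim from the original header): */

static inline void GDALCopyXMMToInt32(const __m128i xmm, void* pDest)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt32 i = _mm_cvtsi128_si32(xmm);  /* extract the low 32 bits */
    memcpy(pDest, &i, 4);               /* byte-wise store for strict-alignment CPUs */
#else
    *static_cast<GInt32*>(pDest) = _mm_cvtsi128_si32(xmm);
#endif
}

static inline void GDALCopyXMMToInt64(const __m128i xmm, void* pDest)
{
#ifdef CPL_CPU_REQUIRES_ALIGNED_ACCESS
    GInt64 i = _mm_cvtsi128_si64(xmm);  /* extract the low 64 bits */
    memcpy(pDest, &i, 8);
#else
    *static_cast<GInt64*>(pDest) = _mm_cvtsi128_si64(xmm);
#endif
}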
class XMMReg2Double
{
  public:
    __m128d xmm;

    XMMReg2Double() {}
    XMMReg2Double(double val) { xmm = _mm_load_sd(&val); }
    XMMReg2Double(const XMMReg2Double& other) : xmm(other.xmm) {}
    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }
    static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double& cond, const XMMReg2Double& true_expr, const XMMReg2Double& false_expr)
    {
        /* Branchless select: (cond & true_expr) | (~cond & false_expr) */
        XMMReg2Double reg;
        reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm), _mm_andnot_pd(cond.xmm, false_expr.xmm));
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
        return reg;
    }
    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        xmm = _mm_load1_pd(ptr);
    }

    inline void nsLoad2Val(const double* ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double* ptr)
    {
        xmm = _mm_load_pd(ptr);
    }

    inline void nsLoad2Val(const float* ptr)
    {
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(xmm_i, xmm_i);  /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srai_epi32(xmm_i, 16);         /* arithmetic shift sign-extends each 32-bit lane */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());  /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }
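    /* Worked example for the non-SSE4.1 signed-short path above (a sketch):
       for ptr = {-2, 3}, GDALCopyInt32ToXMM yields 16-bit lanes (FFFE, 0003, 0, ...).
       _mm_unpacklo_epi16(x, x) duplicates each lane: (FFFE, FFFE, 0003, 0003),
       i.e. 32-bit lanes 0xFFFEFFFE and 0x00030003.  _mm_srai_epi32(..., 16)
       arithmetic-shifts each 32-bit lane, giving 0xFFFFFFFE = -2 and
       0x00000003 = 3, which _mm_cvtepi32_pd converts to (-2.0, 3.0). */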
    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm = _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3,2,3,2)));
    }

    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,2,3,2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }
    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator/ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_div_pd(xmm, other.xmm);
        return ret;
    }

    inline double GetHorizSum() const
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(xmm, xmm, _MM_SHUFFLE2(0,1));  /* transfer high word into low word of xmm2 */
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
    }
    inline void Store2Val(double* ptr) const
    {
        _mm_storeu_pd(ptr, xmm);
    }

    inline void Store2ValAligned(double* ptr) const
    {
        _mm_store_pd(ptr, xmm);
    }

    inline void Store2Val(float* ptr) const
    {
        __m128i xmm_i = _mm_castps_si128( _mm_cvtpd_ps(xmm) );
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64*>(ptr));
    }

    inline void Store2Val(unsigned char* ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5)));  /* round the 2 doubles to 2 integers */
        tmp = _mm_or_si128(tmp, _mm_srli_si128(tmp, 2));  /* move the two 32-bit values into adjacent 16-bit slots */
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16*>(ptr));
    }

    inline void Store2Val(unsigned short* ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5)));  /* round the 2 doubles to 2 integers */
        tmp = _mm_or_si128(tmp, _mm_srli_si128(tmp, 2));  /* move the two 32-bit values into adjacent 16-bit slots */
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32*>(ptr));
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        _mm_storeu_si128( reinterpret_cast<__m128i*>(ptr), _mm_castpd_si128(xmm) );
    }

    inline operator double () const
    {
        return _mm_cvtsd_f64(xmm);
    }
};
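/* Illustrative usage (a sketch, not part of the original header): the class
   lets pairwise double arithmetic read like scalar code.  Assuming
   hypothetical arrays padfIn1/padfIn2/padfOut with an even element count
   nCount:

       for( int i = 0; i < nCount; i += 2 )
       {
           XMMReg2Double v1 = XMMReg2Double::Load2Val(padfIn1 + i);
           XMMReg2Double v2 = XMMReg2Double::Load2Val(padfIn2 + i);
           (v1 * v2 + v1).Store2Val(padfOut + i);  // one expression, 2 lanes at once
       }
*/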
#else

#warning "Software emulation of SSE2 !"

class XMMReg2Double
{
  public:
    double low;
    double high;

    XMMReg2Double() {}
    XMMReg2Double(double val) { low = val; high = 0.0; }
    XMMReg2Double(const XMMReg2Double& other) : low(other.low), high(other.high) {}
    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }
    static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;

        if (expr1.low == expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high == expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;

        if (expr1.low != expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high != expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;

        if (expr1.low > expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high > expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }
    static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        int low1[2], high1[2];
        int low2[2], high2[2];
        memcpy(low1, &expr1.low, sizeof(double));
        memcpy(high1, &expr1.high, sizeof(double));
        memcpy(low2, &expr2.low, sizeof(double));
        memcpy(high2, &expr2.high, sizeof(double));
        low1[0] &= low2[0];
        low1[1] &= low2[1];
        high1[0] &= high2[0];
        high1[1] &= high2[1];
        memcpy(&reg.low, low1, sizeof(double));
        memcpy(&reg.high, high1, sizeof(double));
        return reg;
    }
    static inline XMMReg2Double Ternary(const XMMReg2Double& cond, const XMMReg2Double& true_expr, const XMMReg2Double& false_expr)
    {
        XMMReg2Double reg;
        if( cond.low != 0 )
            reg.low = true_expr.low;
        else
            reg.low = false_expr.low;
        if( cond.high != 0 )
            reg.high = true_expr.high;
        else
            reg.high = false_expr.high;
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
        return reg;
    }
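    /* Note on the emulation above: like the real SSE2 compare instructions,
       a "true" lane is a double whose 8 bytes are all 0xFF (hence the
       memset) and a "false" lane is 0.0.  Ternary() only tests the mask
       against zero, so the masks compose, e.g. (sketch with hypothetical
       values a and b):

           XMMReg2Double mask = XMMReg2Double::Equals(a, b);
           XMMReg2Double res  = XMMReg2Double::Ternary(mask, a, b);  // a where a == b, else b
    */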
    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }
    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        low = ptr[0];
        high = ptr[0];
    }

    inline void nsLoad2Val(const double* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2ValAligned(const double* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const float* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }
    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.low = ptr[0];
        low.high = ptr[1];
        high.low = ptr[2];
        high.high = ptr[3];
    }

    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }
    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }
    inline XMMReg2Double operator+ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg2Double operator/ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }
    inline double GetHorizSum() const
    {
        return low + high;
    }

    inline void Store2Val(double* ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2ValAligned(double* ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2Val(float* ptr) const
    {
        ptr[0] = static_cast<float>(low);
        ptr[1] = static_cast<float>(high);
    }

    void Store2Val(unsigned char* ptr) const
    {
        ptr[0] = (unsigned char)(low + 0.5);
        ptr[1] = (unsigned char)(high + 0.5);
    }

    void Store2Val(unsigned short* ptr) const
    {
        ptr[0] = (GUInt16)(low + 0.5);
        ptr[1] = (GUInt16)(high + 0.5);
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        memcpy(ptr, &low, 8);
        memcpy(ptr + 8, &high, 8);
    }
    inline operator double () const
    {
        return low;
    }
};

#endif /* defined(__x86_64) || defined(_M_X64) */

#ifdef __AVX__

#include <immintrin.h>

class XMMReg4Double
{
  public:
    __m256d ymm;

    XMMReg4Double() {}
    XMMReg4Double(const XMMReg4Double& other) : ymm(other.ymm) {}
    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.Zeroize();
        return reg;
    }

    inline void Zeroize()
    {
        ymm = _mm256_setzero_pd();
    }
    static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        ymm = _mm256_set1_pd(*ptr);
    }
    static inline XMMReg4Double Load4Val(const unsigned char* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned char* ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm_i = _mm_cvtepu8_epi32(xmm_i);  /* AVX implies SSE4.1 */
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const short* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const short* ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const unsigned short* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned short* ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const double* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const double* ptr)
    {
        ymm = _mm256_loadu_pd(ptr);
    }

    static inline XMMReg4Double Load4ValAligned(const double* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4ValAligned(ptr);
        return reg;
    }

    inline void nsLoad4ValAligned(const double* ptr)
    {
        ymm = _mm256_load_pd(ptr);
    }

    static inline XMMReg4Double Load4Val(const float* ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const float* ptr)
    {
        ymm = _mm256_cvtps_pd( _mm_loadu_ps(ptr) );
    }
    static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double& cond, const XMMReg4Double& true_expr, const XMMReg4Double& false_expr)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm), _mm256_andnot_pd(cond.ymm, false_expr.ymm));
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
        return reg;
    }
    inline XMMReg4Double& operator= (const XMMReg4Double& other)
    {
        ymm = other.ymm;
        return *this;
    }

    inline XMMReg4Double& operator+= (const XMMReg4Double& other)
    {
        ymm = _mm256_add_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double& operator*= (const XMMReg4Double& other)
    {
        ymm = _mm256_mul_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double operator+ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_add_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator- (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_sub_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator* (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_mul_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator/ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_div_pd(ymm, other.ymm);
        return ret;
    }
    void AddToLow( const XMMReg2Double& other )
    {
        __m256d ymm2 = _mm256_setzero_pd();
        ymm2 = _mm256_insertf128_pd( ymm2, other.xmm, 0);
        ymm = _mm256_add_pd(ymm, ymm2);
    }
    inline double GetHorizSum() const
    {
        __m256d ymm_tmp1, ymm_tmp2;
        ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
        ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
        ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
        return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
    }
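    /* How the reduction above works, for lanes ymm = (a0, a1, a2, a3):
       _mm256_hadd_pd(ymm, ymm)            -> (a0+a1, a0+a1, a2+a3, a2+a3)
       _mm256_permute2f128_pd(tmp, tmp, 1) -> swaps the two 128-bit halves
       _mm256_add_pd(...)                  -> every lane holds a0+a1+a2+a3,
       and the scalar result is read from the lowest lane. */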
    inline void Store4Val(unsigned char* ptr) const
    {
        __m128i xmm_i = _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        xmm_i = _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24)));  /* gather the 4 low bytes; AVX implies SSSE3 */
        GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32*>(ptr));
    }

    inline void Store4Val(unsigned short* ptr) const
    {
        __m128i xmm_i = _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        xmm_i = _mm_packus_epi32(xmm_i, xmm_i);  /* pack uint32 to uint16; AVX implies SSE4.1 */
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64*>(ptr));
    }

    inline void Store4Val(float* ptr) const
    {
        _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
    }

    inline void Store4Val(double* ptr) const
    {
        _mm256_storeu_pd(ptr, ymm);
    }

    inline void StoreMask(unsigned char* ptr) const
    {
        _mm256_storeu_si256( reinterpret_cast<__m256i*>(ptr), _mm256_castpd_si256(ymm) );
    }
};

#else

class XMMReg4Double
{
  public:
    XMMReg2Double low, high;

    XMMReg4Double() {}
    XMMReg4Double(const XMMReg4Double& other) : low(other.low), high(other.high) {}
    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.low.Zeroize();
        reg.high.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad1ValHighAndLow(ptr);
        reg.high = reg.low;
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned char* ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const short* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned short* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4ValAligned(const double* ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2ValAligned(ptr);
        reg.high.nsLoad2ValAligned(ptr+2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const float* ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }
    static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::And(expr1.low, expr2.low);
        reg.high = XMMReg2Double::And(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double& cond, const XMMReg4Double& true_expr, const XMMReg4Double& false_expr)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
        reg.high = XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
        return reg;
    }
    inline XMMReg4Double& operator= (const XMMReg4Double& other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg4Double& operator+= (const XMMReg4Double& other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg4Double& operator*= (const XMMReg4Double& other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }
    inline XMMReg4Double operator+ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg4Double operator- (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg4Double operator* (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg4Double operator/ (const XMMReg4Double& other) const
    {
        XMMReg4Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }
    void AddToLow( const XMMReg2Double& other )
    {
        low += other;
    }

    inline double GetHorizSum() const
    {
        return (low + high).GetHorizSum();
    }
    inline void Store4Val(unsigned char* ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
    }
    inline void Store4Val(unsigned short* ptr) const
    {
#if 1
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
#else
        __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
        __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
        xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
#if __SSE4_1__
        xmm0 = _mm_packus_epi32(xmm0, xmm0);  /* pack uint32 to uint16 */
#else
        /* shift into signed range, pack with signed saturation, shift back */
        xmm0 = _mm_add_epi32( xmm0, _mm_set1_epi32(-32768) );
        xmm0 = _mm_packs_epi32( xmm0, xmm0 );
        xmm0 = _mm_sub_epi16( xmm0, _mm_set1_epi16(-32768) );
#endif
        GDALCopyXMMToInt64(xmm0, (GInt64*)ptr);
#endif
    }
    inline void Store4Val(float* ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
    }

    inline void Store4Val(double* ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr+2);
    }
    inline void StoreMask(unsigned char* ptr) const
    {
        low.StoreMask(ptr);
        high.StoreMask(ptr+16);
    }
};

#endif /* __AVX__ */
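/* Illustrative usage (a sketch, not part of the original header):
   XMMReg4Double exposes the same interface whether it is backed by one AVX
   register or by two XMMReg2Double halves.  A dot product over hypothetical
   arrays padfX/padfY whose length nCount is a multiple of 4:

       XMMReg4Double sum = XMMReg4Double::Zero();
       for( int i = 0; i < nCount; i += 4 )
       {
           sum += XMMReg4Double::Load4Val(padfX + i) *
                  XMMReg4Double::Load4Val(padfY + i);
       }
       double dfDot = sum.GetHorizSum();
*/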
/* CPL portability types referenced above (defined in cpl_port.h):
   GInt16 = short, GUInt16 = unsigned short, GInt32 = int,
   GInt64 = GIntBig (signed 64-bit). */

#endif /* GDALSSE_PRIV_H_INCLUDED */