#if !defined(SIMDE__MMX_H)
#define SIMDE__MMX_H

#include "simde-common.h"
#if defined(SIMDE_MMX_FORCE_NATIVE)
#define SIMDE_MMX_NATIVE
#elif defined(__MMX__) && !defined(SIMDE_MMX_NO_NATIVE) && \
	!defined(SIMDE_NO_NATIVE)
#define SIMDE_MMX_NATIVE
#elif defined(__ARM_NEON) && !defined(SIMDE_MMX_NO_NEON) && \
	!defined(SIMDE_NO_NEON)
#define SIMDE_MMX_NEON
#endif
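
/* Backend selection: SIMDE_MMX_FORCE_NATIVE forces the MMX intrinsics;
 * otherwise the native path is used when the compiler defines __MMX__, and
 * NEON is used on ARM. When neither macro ends up defined, the portable
 * scalar fallbacks below are compiled. */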
#if defined(SIMDE_MMX_NATIVE)
#include <mmintrin.h>
#else
#if defined(SIMDE_MMX_NEON)
#include <arm_neon.h>
#endif
#endif
typedef union {
#if defined(SIMDE__ENABLE_GCC_VEC_EXT)
	int8_t i8 __attribute__((__vector_size__(8), __may_alias__));
	int16_t i16 __attribute__((__vector_size__(8), __may_alias__));
	int32_t i32 __attribute__((__vector_size__(8), __may_alias__));
	int64_t i64 __attribute__((__vector_size__(8), __may_alias__));
	uint8_t u8 __attribute__((__vector_size__(8), __may_alias__));
	uint16_t u16 __attribute__((__vector_size__(8), __may_alias__));
	uint32_t u32 __attribute__((__vector_size__(8), __may_alias__));
	uint64_t u64 __attribute__((__vector_size__(8), __may_alias__));
	simde_float32 f32 __attribute__((__vector_size__(8), __may_alias__));
#else
	int8_t i8[8];
	int16_t i16[4];
	int32_t i32[2];
	int64_t i64[1];
	uint8_t u8[8];
	uint16_t u16[4];
	uint32_t u32[2];
	uint64_t u64[1];
	simde_float32 f32[2];
#endif

#if defined(SIMDE_MMX_NATIVE)
	__m64 n;
#elif defined(SIMDE_MMX_NEON)
	int8x8_t neon_i8;
	int16x4_t neon_i16;
	int32x2_t neon_i32;
	int64x1_t neon_i64;
	uint8x8_t neon_u8;
	uint16x4_t neon_u16;
	uint32x2_t neon_u32;
	uint64x1_t neon_u64;
	float32x2_t neon_f32;
#endif
} simde__m64;
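
/* All views alias the same 64 bits: portable code indexes lanes through the
 * arrays (or GCC vector extensions), while the native and NEON builds add
 * __m64 / NEON members so values can be handed to intrinsics unchanged.
 * An illustrative sketch (values are hypothetical):
 *
 *   simde__m64 v = simde_mm_set_pi16(4, 3, 2, 1);
 *   // v.i16[0] == 1: e0 is the least-significant lane
 *   // v.u64[0] == 0x0004000300020001
 */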
#if defined(SIMDE_MMX_NATIVE)
HEDLEY_STATIC_ASSERT(sizeof(__m64) == sizeof(simde__m64),
		     "__m64 size doesn't match simde__m64 size");
#define SIMDE__M64_C(expr) ((simde__m64){.n = (expr)})
#elif defined(SIMDE_MMX_NEON)
#define SIMDE__M64_NEON_C(T, expr) \
	(simde__m64) { .neon_##T = (expr) }
#endif
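
/* SIMDE__M64_C wraps a native __m64 value in a simde__m64; SIMDE__M64_NEON_C
 * does the same for a NEON vector, selecting the union member via token
 * pasting (e.g. SIMDE__M64_NEON_C(i8, ...) initializes .neon_i8). */

/* The padd{b,w,d} family below adds lanes with ordinary two's-complement
 * wrap-around on overflow. */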
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_add_pi8(a.n, b.n));

	for (size_t i = 0; i < 8; i++) {
		r.i8[i] = a.i8[i] + b.i8[i];
	}

#define simde_m_paddb(a, b) simde_mm_add_pi8(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_add_pi16(a.n, b.n));

	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
		r.i16[i] = a.i16[i] + b.i16[i];
	}

#define simde_m_paddw(a, b) simde_mm_add_pi16(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_add_pi32(a.n, b.n));

	for (size_t i = 0; i < (8 / sizeof(int32_t)); i++) {
		r.i32[i] = a.i32[i] + b.i32[i];
	}

#define simde_m_paddd(a, b) simde_mm_add_pi32(a, b)
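
/* Saturating additions clamp instead of wrapping: padds{b,w} clamp to the
 * signed lane range and paddus{b,w} to the unsigned range. For example,
 * where simde_mm_add_pi8 turns 127 + 1 into -128, simde_mm_adds_pi8
 * yields 127. */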
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_adds_pi8(a.n, b.n));

	for (int i = 0; i < 8; i++) {
		if ((b.i8[i] > 0) && (a.i8[i] > (INT8_MAX - b.i8[i]))) {
			r.i8[i] = INT8_MAX;
		} else if ((b.i8[i] < 0) && (a.i8[i] < (INT8_MIN - b.i8[i]))) {
			r.i8[i] = INT8_MIN;
		} else {
			r.i8[i] = a.i8[i] + b.i8[i];
		}
	}

#define simde_m_paddsb(a, b) simde_mm_adds_pi8(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_adds_pu8(a.n, b.n));

	for (size_t i = 0; i < 8; i++) {
		const int32_t x = a.u8[i] + b.u8[i];
		if (x < 0)
			r.u8[i] = 0;
		else if (x > UINT8_MAX)
			r.u8[i] = UINT8_MAX;
		else
			r.u8[i] = (uint8_t)x;
	}

#define simde_m_paddusb(a, b) simde_mm_adds_pu8(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_adds_pi16(a.n, b.n));

	for (int i = 0; i < 4; i++) {
		if ((b.i16[i] > 0) && (a.i16[i] > (INT16_MAX - b.i16[i]))) {
			r.i16[i] = INT16_MAX;
		} else if ((b.i16[i] < 0) &&
			   (a.i16[i] < (INT16_MIN - b.i16[i]))) {
			r.i16[i] = INT16_MIN;
		} else {
			r.i16[i] = a.i16[i] + b.i16[i];
		}
	}

#define simde_m_paddsw(a, b) simde_mm_adds_pi16(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_adds_pu16(a.n, b.n));

	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
		const uint32_t x = a.u16[i] + b.u16[i];
		if (x > UINT16_MAX)
			r.u16[i] = UINT16_MAX;
		else
			r.u16[i] = (uint16_t)x;
	}

#define simde_m_paddusw(a, b) simde_mm_adds_pu16(a, b)
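
/* The bitwise operations work on all 64 bits at once. Note that pandn
 * computes (~a) & b, i.e. the first operand is the one that gets
 * inverted. */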
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_and_si64(a.n, b.n));

#define simde_m_pand(a, b) simde_mm_and_si64(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_andnot_si64(a.n, b.n));

#define simde_m_pandn(a, b) simde_mm_andnot_si64(a, b)
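
/* Comparisons build a per-lane mask: all ones where the predicate holds,
 * all zeros elsewhere. The portable versions exploit the fact that a C
 * comparison evaluates to 0 or 1, multiplying by 0xff / 0xffff /
 * 0xffffffff to widen that to a full lane. */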
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_cmpeq_pi8(a.n, b.n));

	for (int i = 0; i < 8; i++) {
		r.i8[i] = (a.i8[i] == b.i8[i]) * 0xff;
	}

#define simde_m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_cmpeq_pi16(a.n, b.n));

	for (int i = 0; i < 4; i++) {
		r.i16[i] = (a.i16[i] == b.i16[i]) * 0xffff;
	}

#define simde_m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_cmpeq_pi32(a.n, b.n));

	for (int i = 0; i < 2; i++) {
		r.i32[i] = (a.i32[i] == b.i32[i]) * 0xffffffff;
	}

#define simde_m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_cmpgt_pi8(a.n, b.n));

	for (int i = 0; i < 8; i++) {
		r.i8[i] = (a.i8[i] > b.i8[i]) * 0xff;
	}

#define simde_m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_cmpgt_pi16(a.n, b.n));

	for (int i = 0; i < 4; i++) {
		r.i16[i] = (a.i16[i] > b.i16[i]) * 0xffff;
	}

#define simde_m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_cmpgt_pi32(a.n, b.n));

	for (int i = 0; i < 2; i++) {
		r.i32[i] = (a.i32[i] > b.i32[i]) * 0xffffffff;
	}

#define simde_m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b)
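
/* Conversions between simde__m64 and scalars. The 64-bit variants are only
 * forwarded to the native intrinsics on AMD64 builds (and not under PGI,
 * whose headers lack them); elsewhere the union makes them simple copies. */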
#if defined(SIMDE_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI)
	return _mm_cvtm64_si64(a.n);

#define simde_m_to_int64(a) simde_mm_cvtm64_si64(a)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_cvtsi32_si64(a));

#define simde_m_from_int(a) simde_mm_cvtsi32_si64(a)
#if defined(SIMDE_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI)
	return SIMDE__M64_C(_mm_cvtsi64_m64(a));

#define simde_m_from_int64(a) simde_mm_cvtsi64_m64(a)
#if defined(SIMDE_MMX_NATIVE)
	return _mm_cvtsi64_si32(a.n);
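
/* simde_mm_empty corresponds to EMMS, which resets the shared x87/MMX
 * register state after native MMX code. The portable and NEON paths keep no
 * such state, so there it compiles to nothing. */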
#if defined(SIMDE_MMX_NATIVE)
	_mm_empty();

#define simde_m_empty() simde_mm_empty()
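
/* pmaddwd multiplies the four pairs of signed 16-bit lanes and then adds
 * adjacent 32-bit products, e.g.
 * r.i32[0] = a.i16[0]*b.i16[0] + a.i16[1]*b.i16[1]. */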
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_madd_pi16(a.n, b.n));

	for (int i = 0; i < 4; i += 2) {
		r.i32[i / 2] = (a.i16[i] * b.i16[i]) +
			       (a.i16[i + 1] * b.i16[i + 1]);
	}

#define simde_m_pmaddwd(a, b) simde_mm_madd_pi16(a, b)
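
/* pmulhw keeps the high 16 bits of each 32-bit product, pmullw the low
 * 16 bits; using both recovers the full 32-bit product of two lanes. */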
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_mulhi_pi16(a.n, b.n));

	for (int i = 0; i < 4; i++) {
		r.i16[i] = (int16_t)((a.i16[i] * b.i16[i]) >> 16);
	}

#define simde_m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_mullo_pi16(a.n, b.n));

	for (int i = 0; i < 4; i++) {
		r.i16[i] = (int16_t)((a.i16[i] * b.i16[i]) & 0xffff);
	}

#define simde_m_pmullw(a, b) simde_mm_mullo_pi16(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_or_si64(a.n, b.n));

#define simde_m_por(a, b) simde_mm_or_si64(a, b)
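
/* Pack operations narrow with saturation: packsswb squeezes the four int16
 * lanes of a and of b into eight int8 lanes (a first, then b), packssdw
 * does int32 -> int16, and packuswb clamps int16 values into uint8. */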
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_packs_pi16(a.n, b.n));

	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
		if (a.i16[i] < INT8_MIN) {
			r.i8[i] = INT8_MIN;
		} else if (a.i16[i] > INT8_MAX) {
			r.i8[i] = INT8_MAX;
		} else {
			r.i8[i] = (int8_t)a.i16[i];
		}
	}

	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
		if (b.i16[i] < INT8_MIN) {
			r.i8[i + 4] = INT8_MIN;
		} else if (b.i16[i] > INT8_MAX) {
			r.i8[i + 4] = INT8_MAX;
		} else {
			r.i8[i + 4] = (int8_t)b.i16[i];
		}
	}

#define simde_m_packsswb(a, b) simde_mm_packs_pi16(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_packs_pi32(a.n, b.n));

	for (size_t i = 0; i < (8 / sizeof(a.i32[0])); i++) {
		if (a.i32[i] < INT16_MIN) {
			r.i16[i] = INT16_MIN;
		} else if (a.i32[i] > INT16_MAX) {
			r.i16[i] = INT16_MAX;
		} else {
			r.i16[i] = (int16_t)a.i32[i];
		}
	}

	for (size_t i = 0; i < (8 / sizeof(b.i32[0])); i++) {
		if (b.i32[i] < INT16_MIN) {
			r.i16[i + 2] = INT16_MIN;
		} else if (b.i32[i] > INT16_MAX) {
			r.i16[i + 2] = INT16_MAX;
		} else {
			r.i16[i + 2] = (int16_t)b.i32[i];
		}
	}

#define simde_m_packssdw(a, b) simde_mm_packs_pi32(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_packs_pu16(a.n, b.n));

	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
		if (a.i16[i] > UINT8_MAX) {
			r.u8[i] = UINT8_MAX;
		} else if (a.i16[i] < 0) {
			r.u8[i] = 0;
		} else {
			r.u8[i] = (uint8_t)a.i16[i];
		}
	}

	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
		if (b.i16[i] > UINT8_MAX) {
			r.u8[i + 4] = UINT8_MAX;
		} else if (b.i16[i] < 0) {
			r.u8[i + 4] = 0;
		} else {
			r.u8[i + 4] = (uint8_t)b.i16[i];
		}
	}

#define simde_m_packuswb(a, b) simde_mm_packs_pu16(a, b)
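
/* The set family builds a vector from scalars, with e0 as the
 * least-significant lane; setr takes its arguments in reversed (memory)
 * order and set1 broadcasts a single value to every lane. The unsigned
 * simde_x_mm_set_pu* helpers are simde-specific extensions rather than part
 * of the Intel API. */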
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_set_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
			    int8_t e3, int8_t e2, int8_t e1, int8_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0));
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_x_mm_set_pu8(uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4,
			      uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_set_pi8((int8_t)e7, (int8_t)e6, (int8_t)e5,
					(int8_t)e4, (int8_t)e3, (int8_t)e2,
					(int8_t)e1, (int8_t)e0));
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_set_pi16(e3, e2, e1, e0));
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_set_pi16((int16_t)e3, (int16_t)e2, (int16_t)e1,
					 (int16_t)e0));
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_set_pi32((int32_t)e1, (int32_t)e0));
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_set_pi32(e1, e0));
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_set1_pi8(a));

#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_set1_pi16(a));

#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_set1_pi32(a));
SIMDE__FUNCTION_ATTRIBUTES
simde__m64 simde_mm_setr_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
			     int8_t e3, int8_t e2, int8_t e1, int8_t e0)
{
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0));
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_setr_pi16(e3, e2, e1, e0));

#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_setr_pi32(e1, e0));

#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_setzero_si64());
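
/* Logical shifts come in register and immediate forms: _mm_sll_*/_mm_srl_*
 * read the count from the low 64 bits of an __m64 and must zero the result
 * when the count exceeds the lane width (hence the range check and memset),
 * while _mm_slli_*/_mm_srli_* take an immediate count. */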
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_sll_pi16(a.n, count.n));

	if (count.u64[0] > 15) {
		memset(&r, 0, sizeof(r));
		return r;
	}

	for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
		r.u16[i] = a.u16[i] << count.u64[0];
	}

#define simde_m_psllw(a, count) simde_mm_sll_pi16(a, count)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_sll_pi32(a.n, count.n));

	if (count.u64[0] > 31) {
		memset(&r, 0, sizeof(r));
		return r;
	}

	for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) {
		r.u32[i] = a.u32[i] << count.u64[0];
	}

#define simde_m_pslld(a, count) simde_mm_sll_pi32(a, count)
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
	return SIMDE__M64_C(_mm_slli_pi16(a.n, count));

	for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
		r.u16[i] = a.u16[i] << count;
	}

#define simde_m_psllwi(a, count) simde_mm_slli_pi16(a, count)
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
	return SIMDE__M64_C(_mm_slli_pi32(a.n, count));

	for (size_t i = 0; i < (8 / sizeof(int)); i++) {
		r.u32[i] = a.u32[i] << count;
	}

#define simde_m_pslldi(a, b) simde_mm_slli_pi32(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_slli_si64(a.n, count));

	r.u64[0] = a.u64[0] << count;

#define simde_m_psllqi(a, count) simde_mm_slli_si64(a, count)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_sll_si64(a.n, count.n));

	if (count.u64[0] > 63) {
		memset(&r, 0, sizeof(r));
		return r;
	}
	r.u64[0] = a.u64[0] << count.u64[0];

#define simde_m_psllq(a, count) simde_mm_sll_si64(a, count)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_srl_pi16(a.n, count.n));

	if (count.u64[0] > 15) {
		memset(&r, 0, sizeof(r));
		return r;
	}

	for (size_t i = 0; i < sizeof(r.u16) / sizeof(r.u16[0]); i++) {
		r.u16[i] = a.u16[i] >> count.u64[0];
	}

#define simde_m_psrlw(a, count) simde_mm_srl_pi16(a, count)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_srl_pi32(a.n, count.n));

	if (count.u64[0] > 31) {
		memset(&r, 0, sizeof(r));
		return r;
	}

	for (size_t i = 0; i < sizeof(r.u32) / sizeof(r.u32[0]); i++) {
		r.u32[i] = a.u32[i] >> count.u64[0];
	}

#define simde_m_psrld(a, count) simde_mm_srl_pi32(a, count)
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
	return SIMDE__M64_C(_mm_srli_pi16(a.n, count));

	for (size_t i = 0; i < (8 / sizeof(uint16_t)); i++) {
		r.u16[i] = a.u16[i] >> count;
	}

#define simde_m_psrlwi(a, count) simde_mm_srli_pi16(a, count)
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
	return SIMDE__M64_C(_mm_srli_pi32(a.n, count));

	for (size_t i = 0; i < (8 / sizeof(int)); i++) {
		r.u32[i] = a.u32[i] >> count;
	}

#define simde_m_psrldi(a, count) simde_mm_srli_pi32(a, count)
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
	return SIMDE__M64_C(_mm_srli_si64(a.n, count));

	r.u64[0] = a.u64[0] >> count;

#define simde_m_psrlqi(a, count) simde_mm_srli_si64(a, count)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_srl_si64(a.n, count.n));

	if (count.u64[0] > 63) {
		memset(&r, 0, sizeof(r));
		return r;
	}
	r.u64[0] = a.u64[0] >> count.u64[0];

#define simde_m_psrlq(a, count) simde_mm_srl_si64(a, count)
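
/* Arithmetic right shifts replicate the sign bit. The portable versions
 * shift the unsigned representation and OR in a precomputed mask m of high
 * bits whenever the lane is negative; multiplying by is_neg (0 or 1)
 * applies the mask without a branch. */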
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
	return SIMDE__M64_C(_mm_srai_pi16(a.n, count));

	const uint16_t m =
		(uint16_t)((~0U) << ((sizeof(int16_t) * CHAR_BIT) - count));

	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
		const uint16_t is_neg = (uint16_t)(
			a.u16[i] >> ((sizeof(int16_t) * CHAR_BIT) - 1));
		r.u16[i] = (a.u16[i] >> count) | (m * is_neg);
	}

#define simde_m_psrawi(a, count) simde_mm_srai_pi16(a, count)
#if defined(SIMDE_MMX_NATIVE) && !defined(__PGI)
	return SIMDE__M64_C(_mm_srai_pi32(a.n, count));

	const uint32_t m =
		(uint32_t)((~0U) << ((sizeof(int) * CHAR_BIT) - count));

	for (size_t i = 0; i < (8 / sizeof(int)); i++) {
		const uint32_t is_neg = (uint32_t)(
			a.u32[i] >> ((sizeof(int) * CHAR_BIT) - 1));
		r.u32[i] = (a.u32[i] >> count) | (m * is_neg);
	}

#define simde_m_psradi(a, count) simde_mm_srai_pi32(a, count)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_sra_pi16(a.n, count.n));

	int cnt = (int)count.i64[0];

	if (cnt > 15 || cnt < 0) {
		for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
		     i++) {
			r.u16[i] = (a.i16[i] < 0) ? 0xffff : 0x0000;
		}
	} else {
		const uint16_t m = (uint16_t)(
			(~0U) << ((sizeof(int16_t) * CHAR_BIT) - cnt));
		for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
		     i++) {
			const uint16_t is_neg = a.i16[i] < 0;
			r.u16[i] = (a.u16[i] >> cnt) | (m * is_neg);
		}
	}

#define simde_m_psraw(a, count) simde_mm_sra_pi16(a, count)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_sra_pi32(a.n, count.n));

	const uint64_t cnt = count.u64[0];

	if (cnt > 31) {
		for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
		     i++) {
			r.u32[i] = (a.i32[i] < 0) ? UINT32_MAX : 0;
		}
	} else if (cnt == 0) {
		memcpy(&r, &a, sizeof(r));
	} else {
		const uint32_t m = (uint32_t)(
			(~0U) << ((sizeof(int32_t) * CHAR_BIT) - cnt));
		for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
		     i++) {
			const uint32_t is_neg = a.i32[i] < 0;
			r.u32[i] = (a.u32[i] >> cnt) | (m * is_neg);
		}
	}

#define simde_m_psrad(a, b) simde_mm_sra_pi32(a, b)
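
/* Subtraction mirrors addition: psub{b,w,d} wrap on overflow, psubs{b,w}
 * saturate to the signed range, and psubus{b,w} saturate to the unsigned
 * range, so simde_mm_subs_pu8 keeps 0 - 1 at 0. */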
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_sub_pi8(a.n, b.n));

	for (size_t i = 0; i < 8; i++) {
		r.i8[i] = a.i8[i] - b.i8[i];
	}

#define simde_m_psubb(a, b) simde_mm_sub_pi8(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_sub_pi16(a.n, b.n));

	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
		r.i16[i] = a.i16[i] - b.i16[i];
	}

#define simde_m_psubw(a, b) simde_mm_sub_pi16(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_sub_pi32(a.n, b.n));

	for (size_t i = 0; i < (8 / sizeof(int)); i++) {
		r.i32[i] = a.i32[i] - b.i32[i];
	}

#define simde_m_psubd(a, b) simde_mm_sub_pi32(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_subs_pi8(a.n, b.n));

	for (size_t i = 0; i < 8; i++) {
		if ((b.i8[i] > 0) && (a.i8[i] < (INT8_MIN + b.i8[i]))) {
			r.i8[i] = INT8_MIN;
		} else if ((b.i8[i] < 0) && (a.i8[i] > (INT8_MAX + b.i8[i]))) {
			r.i8[i] = INT8_MAX;
		} else {
			r.i8[i] = a.i8[i] - b.i8[i];
		}
	}

#define simde_m_psubsb(a, b) simde_mm_subs_pi8(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_subs_pu8(a.n, b.n));

	for (size_t i = 0; i < 8; i++) {
		const int32_t x = a.u8[i] - b.u8[i];
		if (x < 0) {
			r.u8[i] = 0;
		} else if (x > UINT8_MAX) {
			r.u8[i] = UINT8_MAX;
		} else {
			r.u8[i] = (uint8_t)x;
		}
	}

#define simde_m_psubusb(a, b) simde_mm_subs_pu8(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_subs_pi16(a.n, b.n));

	for (size_t i = 0; i < (8 / sizeof(int16_t)); i++) {
		if ((b.i16[i] > 0) && (a.i16[i] < (INT16_MIN + b.i16[i]))) {
			r.i16[i] = INT16_MIN;
		} else if ((b.i16[i] < 0) &&
			   (a.i16[i] > (INT16_MAX + b.i16[i]))) {
			r.i16[i] = INT16_MAX;
		} else {
			r.i16[i] = a.i16[i] - b.i16[i];
		}
	}

#define simde_m_psubsw(a, b) simde_mm_subs_pi16(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_subs_pu16(a.n, b.n));

	for (size_t i = 0; i < (8 / sizeof(uint16_t)); i++) {
		const int x = a.u16[i] - b.u16[i];
		if (x < 0) {
			r.u16[i] = 0;
		} else if (x > UINT16_MAX) {
			r.u16[i] = UINT16_MAX;
		} else {
			r.u16[i] = (uint16_t)x;
		}
	}

#define simde_m_psubusw(a, b) simde_mm_subs_pu16(a, b)
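
/* Unpack operations interleave lanes of the two operands: punpckhbw takes
 * the high four bytes of each (a4 b4 a5 b5 a6 b6 a7 b7) and punpcklbw the
 * low four; the wd/dq forms do the same with 16- and 32-bit lanes. */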
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_unpackhi_pi8(a.n, b.n));

#define simde_m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b)

#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_unpackhi_pi16(a.n, b.n));

#define simde_m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b)

#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_unpackhi_pi32(a.n, b.n));

#define simde_m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b)

#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_unpacklo_pi8(a.n, b.n));

#define simde_m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b)

#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_unpacklo_pi16(a.n, b.n));

#define simde_m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b)

#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_unpacklo_pi32(a.n, b.n));

#define simde_m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b)
#if defined(SIMDE_MMX_NATIVE)
	return SIMDE__M64_C(_mm_xor_si64(a.n, b.n));

#define simde_m_pxor(a, b) simde_mm_xor_si64(a, b)
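
/* _m_to_int is the original MMX spelling for extracting the low 32 bits of
 * a vector; natively it forwards straight to _m_to_int. */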
#if defined(SIMDE_MMX_NATIVE)
	return _m_to_int(a.n);