1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 *===-----------------------------------------------------------------------===
28 #error "SSE2 instruction set not enabled"
31 #include <xmmintrin.h>
33 typedef double __m128d __attribute__((__vector_size__(16)));
34 typedef long long __m128i __attribute__((__vector_size__(16)));
37 typedef double __v2df __attribute__ ((__vector_size__ (16)));
38 typedef long long __v2di __attribute__ ((__vector_size__ (16)));
39 typedef short __v8hi __attribute__((__vector_size__(16)));
40 typedef char __v16qi __attribute__((__vector_size__(16)));
42 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
43 _mm_add_sd(__m128d a, __m128d b)
49 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
50 _mm_add_pd(__m128d a, __m128d b)
55 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
56 _mm_sub_sd(__m128d a, __m128d b)
62 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
63 _mm_sub_pd(__m128d a, __m128d b)
68 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
69 _mm_mul_sd(__m128d a, __m128d b)
75 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
76 _mm_mul_pd(__m128d a, __m128d b)
81 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
82 _mm_div_sd(__m128d a, __m128d b)
88 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
89 _mm_div_pd(__m128d a, __m128d b)
94 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
95 _mm_sqrt_sd(__m128d a, __m128d b)
97 __m128d c = __builtin_ia32_sqrtsd(b);
98 return (__m128d) { c[0], a[1] };
101 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
102 _mm_sqrt_pd(__m128d a)
104 return __builtin_ia32_sqrtpd(a);
107 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
108 _mm_min_sd(__m128d a, __m128d b)
110 return __builtin_ia32_minsd(a, b);
113 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
114 _mm_min_pd(__m128d a, __m128d b)
116 return __builtin_ia32_minpd(a, b);
119 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
120 _mm_max_sd(__m128d a, __m128d b)
122 return __builtin_ia32_maxsd(a, b);
125 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
126 _mm_max_pd(__m128d a, __m128d b)
128 return __builtin_ia32_maxpd(a, b);
131 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
132 _mm_and_pd(__m128d a, __m128d b)
134 return (__m128d)((__v4si)a & (__v4si)b);
137 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
138 _mm_andnot_pd(__m128d a, __m128d b)
140 return (__m128d)(~(__v4si)a & (__v4si)b);
143 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
144 _mm_or_pd(__m128d a, __m128d b)
146 return (__m128d)((__v4si)a | (__v4si)b);
149 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
150 _mm_xor_pd(__m128d a, __m128d b)
152 return (__m128d)((__v4si)a ^ (__v4si)b);
155 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
156 _mm_cmpeq_pd(__m128d a, __m128d b)
158 return (__m128d)__builtin_ia32_cmppd(a, b, 0);
161 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
162 _mm_cmplt_pd(__m128d a, __m128d b)
164 return (__m128d)__builtin_ia32_cmppd(a, b, 1);
167 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
168 _mm_cmple_pd(__m128d a, __m128d b)
170 return (__m128d)__builtin_ia32_cmppd(a, b, 2);
173 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
174 _mm_cmpgt_pd(__m128d a, __m128d b)
176 return (__m128d)__builtin_ia32_cmppd(b, a, 1);
179 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
180 _mm_cmpge_pd(__m128d a, __m128d b)
182 return (__m128d)__builtin_ia32_cmppd(b, a, 2);
185 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
186 _mm_cmpord_pd(__m128d a, __m128d b)
188 return (__m128d)__builtin_ia32_cmppd(a, b, 7);
191 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
192 _mm_cmpunord_pd(__m128d a, __m128d b)
194 return (__m128d)__builtin_ia32_cmppd(a, b, 3);
197 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
198 _mm_cmpneq_pd(__m128d a, __m128d b)
200 return (__m128d)__builtin_ia32_cmppd(a, b, 4);
203 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
204 _mm_cmpnlt_pd(__m128d a, __m128d b)
206 return (__m128d)__builtin_ia32_cmppd(a, b, 5);
209 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
210 _mm_cmpnle_pd(__m128d a, __m128d b)
212 return (__m128d)__builtin_ia32_cmppd(a, b, 6);
215 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
216 _mm_cmpngt_pd(__m128d a, __m128d b)
218 return (__m128d)__builtin_ia32_cmppd(b, a, 5);
221 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
222 _mm_cmpnge_pd(__m128d a, __m128d b)
224 return (__m128d)__builtin_ia32_cmppd(b, a, 6);
227 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
228 _mm_cmpeq_sd(__m128d a, __m128d b)
230 return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
233 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
234 _mm_cmplt_sd(__m128d a, __m128d b)
236 return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
239 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
240 _mm_cmple_sd(__m128d a, __m128d b)
242 return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
245 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
246 _mm_cmpgt_sd(__m128d a, __m128d b)
248 return (__m128d)__builtin_ia32_cmpsd(b, a, 1);
251 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
252 _mm_cmpge_sd(__m128d a, __m128d b)
254 return (__m128d)__builtin_ia32_cmpsd(b, a, 2);
257 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
258 _mm_cmpord_sd(__m128d a, __m128d b)
260 return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
263 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
264 _mm_cmpunord_sd(__m128d a, __m128d b)
266 return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
269 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
270 _mm_cmpneq_sd(__m128d a, __m128d b)
272 return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
275 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
276 _mm_cmpnlt_sd(__m128d a, __m128d b)
278 return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
281 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
282 _mm_cmpnle_sd(__m128d a, __m128d b)
284 return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
287 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
288 _mm_cmpngt_sd(__m128d a, __m128d b)
290 return (__m128d)__builtin_ia32_cmpsd(b, a, 5);
293 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
294 _mm_cmpnge_sd(__m128d a, __m128d b)
296 return (__m128d)__builtin_ia32_cmpsd(b, a, 6);
299 static __inline__ int __attribute__((__always_inline__, __nodebug__))
300 _mm_comieq_sd(__m128d a, __m128d b)
302 return __builtin_ia32_comisdeq(a, b);
305 static __inline__ int __attribute__((__always_inline__, __nodebug__))
306 _mm_comilt_sd(__m128d a, __m128d b)
308 return __builtin_ia32_comisdlt(a, b);
311 static __inline__ int __attribute__((__always_inline__, __nodebug__))
312 _mm_comile_sd(__m128d a, __m128d b)
314 return __builtin_ia32_comisdle(a, b);
317 static __inline__ int __attribute__((__always_inline__, __nodebug__))
318 _mm_comigt_sd(__m128d a, __m128d b)
320 return __builtin_ia32_comisdgt(a, b);
323 static __inline__ int __attribute__((__always_inline__, __nodebug__))
324 _mm_comineq_sd(__m128d a, __m128d b)
326 return __builtin_ia32_comisdneq(a, b);
329 static __inline__ int __attribute__((__always_inline__, __nodebug__))
330 _mm_ucomieq_sd(__m128d a, __m128d b)
332 return __builtin_ia32_ucomisdeq(a, b);
335 static __inline__ int __attribute__((__always_inline__, __nodebug__))
336 _mm_ucomilt_sd(__m128d a, __m128d b)
338 return __builtin_ia32_ucomisdlt(a, b);
341 static __inline__ int __attribute__((__always_inline__, __nodebug__))
342 _mm_ucomile_sd(__m128d a, __m128d b)
344 return __builtin_ia32_ucomisdle(a, b);
347 static __inline__ int __attribute__((__always_inline__, __nodebug__))
348 _mm_ucomigt_sd(__m128d a, __m128d b)
350 return __builtin_ia32_ucomisdgt(a, b);
353 static __inline__ int __attribute__((__always_inline__, __nodebug__))
354 _mm_ucomineq_sd(__m128d a, __m128d b)
356 return __builtin_ia32_ucomisdneq(a, b);
359 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
360 _mm_cvtpd_ps(__m128d a)
362 return __builtin_ia32_cvtpd2ps(a);
365 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
366 _mm_cvtps_pd(__m128 a)
368 return __builtin_ia32_cvtps2pd(a);
371 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
372 _mm_cvtepi32_pd(__m128i a)
374 return __builtin_ia32_cvtdq2pd((__v4si)a);
377 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
378 _mm_cvtpd_epi32(__m128d a)
380 return __builtin_ia32_cvtpd2dq(a);
383 static __inline__ int __attribute__((__always_inline__, __nodebug__))
384 _mm_cvtsd_si32(__m128d a)
386 return __builtin_ia32_cvtsd2si(a);
389 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
390 _mm_cvtsd_ss(__m128 a, __m128d b)
396 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
397 _mm_cvtsi32_sd(__m128d a, int b)
403 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
404 _mm_cvtss_sd(__m128d a, __m128 b)
410 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
411 _mm_cvttpd_epi32(__m128d a)
413 return (__m128i)__builtin_ia32_cvttpd2dq(a);
416 static __inline__ int __attribute__((__always_inline__, __nodebug__))
417 _mm_cvttsd_si32(__m128d a)
422 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
423 _mm_cvtpd_pi32(__m128d a)
425 return (__m64)__builtin_ia32_cvtpd2pi(a);
428 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
429 _mm_cvttpd_pi32(__m128d a)
431 return (__m64)__builtin_ia32_cvttpd2pi(a);
434 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
435 _mm_cvtpi32_pd(__m64 a)
437 return __builtin_ia32_cvtpi2pd((__v2si)a);
440 static __inline__ double __attribute__((__always_inline__, __nodebug__))
441 _mm_cvtsd_f64(__m128d a)
446 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
447 _mm_load_pd(double const *dp)
449 return *(__m128d*)dp;
452 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
453 _mm_load1_pd(double const *dp)
455 return (__m128d){ dp[0], dp[0] };
458 #define _mm_load_pd1(dp) _mm_load1_pd(dp)
460 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
461 _mm_loadr_pd(double const *dp)
463 return (__m128d){ dp[1], dp[0] };
466 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
467 _mm_loadu_pd(double const *dp)
471 } __attribute__((packed, may_alias));
472 return ((struct __loadu_pd*)dp)->v;
475 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
476 _mm_load_sd(double const *dp)
478 return (__m128d){ *dp, 0.0 };
481 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
482 _mm_loadh_pd(__m128d a, double const *dp)
484 return (__m128d){ a[0], *dp };
487 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
488 _mm_loadl_pd(__m128d a, double const *dp)
490 return (__m128d){ *dp, a[1] };
493 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
496 return (__m128d){ w, 0 };
499 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
500 _mm_set1_pd(double w)
502 return (__m128d){ w, w };
505 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
506 _mm_set_pd(double w, double x)
508 return (__m128d){ x, w };
511 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
512 _mm_setr_pd(double w, double x)
514 return (__m128d){ w, x };
517 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
520 return (__m128d){ 0, 0 };
523 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
524 _mm_move_sd(__m128d a, __m128d b)
526 return (__m128d){ b[0], a[1] };
529 static __inline__ void __attribute__((__always_inline__, __nodebug__))
530 _mm_store_sd(double *dp, __m128d a)
535 static __inline__ void __attribute__((__always_inline__, __nodebug__))
536 _mm_store1_pd(double *dp, __m128d a)
542 static __inline__ void __attribute__((__always_inline__, __nodebug__))
543 _mm_store_pd(double *dp, __m128d a)
548 static __inline__ void __attribute__((__always_inline__, __nodebug__))
549 _mm_storeu_pd(double *dp, __m128d a)
551 __builtin_ia32_storeupd(dp, a);
554 static __inline__ void __attribute__((__always_inline__, __nodebug__))
555 _mm_storer_pd(double *dp, __m128d a)
561 static __inline__ void __attribute__((__always_inline__, __nodebug__))
562 _mm_storeh_pd(double *dp, __m128d a)
567 static __inline__ void __attribute__((__always_inline__, __nodebug__))
568 _mm_storel_pd(double *dp, __m128d a)
573 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
574 _mm_add_epi8(__m128i a, __m128i b)
576 return (__m128i)((__v16qi)a + (__v16qi)b);
579 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
580 _mm_add_epi16(__m128i a, __m128i b)
582 return (__m128i)((__v8hi)a + (__v8hi)b);
585 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
586 _mm_add_epi32(__m128i a, __m128i b)
588 return (__m128i)((__v4si)a + (__v4si)b);
591 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
592 _mm_add_si64(__m64 a, __m64 b)
597 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
598 _mm_add_epi64(__m128i a, __m128i b)
603 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
604 _mm_adds_epi8(__m128i a, __m128i b)
606 return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
609 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
610 _mm_adds_epi16(__m128i a, __m128i b)
612 return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
615 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
616 _mm_adds_epu8(__m128i a, __m128i b)
618 return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
621 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
622 _mm_adds_epu16(__m128i a, __m128i b)
624 return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
627 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
628 _mm_avg_epu8(__m128i a, __m128i b)
630 return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
633 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
634 _mm_avg_epu16(__m128i a, __m128i b)
636 return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
639 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
640 _mm_madd_epi16(__m128i a, __m128i b)
642 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
645 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
646 _mm_max_epi16(__m128i a, __m128i b)
648 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
651 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
652 _mm_max_epu8(__m128i a, __m128i b)
654 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
657 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
658 _mm_min_epi16(__m128i a, __m128i b)
660 return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
663 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
664 _mm_min_epu8(__m128i a, __m128i b)
666 return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
669 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
670 _mm_mulhi_epi16(__m128i a, __m128i b)
672 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
675 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
676 _mm_mulhi_epu16(__m128i a, __m128i b)
678 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
681 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
682 _mm_mullo_epi16(__m128i a, __m128i b)
684 return (__m128i)((__v8hi)a * (__v8hi)b);
687 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
688 _mm_mul_su32(__m64 a, __m64 b)
690 return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
693 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
694 _mm_mul_epu32(__m128i a, __m128i b)
696 return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
699 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
700 _mm_sad_epu8(__m128i a, __m128i b)
702 return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
705 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
706 _mm_sub_epi8(__m128i a, __m128i b)
708 return (__m128i)((__v16qi)a - (__v16qi)b);
711 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
712 _mm_sub_epi16(__m128i a, __m128i b)
714 return (__m128i)((__v8hi)a - (__v8hi)b);
717 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
718 _mm_sub_epi32(__m128i a, __m128i b)
720 return (__m128i)((__v4si)a - (__v4si)b);
723 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
724 _mm_sub_si64(__m64 a, __m64 b)
729 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
730 _mm_sub_epi64(__m128i a, __m128i b)
735 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
736 _mm_subs_epi8(__m128i a, __m128i b)
738 return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
741 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
742 _mm_subs_epi16(__m128i a, __m128i b)
744 return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
747 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
748 _mm_subs_epu8(__m128i a, __m128i b)
750 return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
753 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
754 _mm_subs_epu16(__m128i a, __m128i b)
756 return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
759 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
760 _mm_and_si128(__m128i a, __m128i b)
765 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
766 _mm_andnot_si128(__m128i a, __m128i b)
771 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
772 _mm_or_si128(__m128i a, __m128i b)
777 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
778 _mm_xor_si128(__m128i a, __m128i b)
783 #define _mm_slli_si128(VEC, IMM) \
784 ((__m128i)__builtin_ia32_pslldqi128((__m128i)(VEC), (IMM)*8))
786 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
787 _mm_slli_epi16(__m128i a, int count)
789 return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
792 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
793 _mm_sll_epi16(__m128i a, __m128i count)
795 return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
798 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
799 _mm_slli_epi32(__m128i a, int count)
801 return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
804 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
805 _mm_sll_epi32(__m128i a, __m128i count)
807 return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
810 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
811 _mm_slli_epi64(__m128i a, int count)
813 return __builtin_ia32_psllqi128(a, count);
816 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
817 _mm_sll_epi64(__m128i a, __m128i count)
819 return __builtin_ia32_psllq128(a, count);
822 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
823 _mm_srai_epi16(__m128i a, int count)
825 return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
828 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
829 _mm_sra_epi16(__m128i a, __m128i count)
831 return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
834 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
835 _mm_srai_epi32(__m128i a, int count)
837 return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
840 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
841 _mm_sra_epi32(__m128i a, __m128i count)
843 return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
847 #define _mm_srli_si128(VEC, IMM) \
848 ((__m128i)__builtin_ia32_psrldqi128((__m128i)(VEC), (IMM)*8))
850 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
851 _mm_srli_epi16(__m128i a, int count)
853 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
856 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
857 _mm_srl_epi16(__m128i a, __m128i count)
859 return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
862 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
863 _mm_srli_epi32(__m128i a, int count)
865 return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
868 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
869 _mm_srl_epi32(__m128i a, __m128i count)
871 return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
874 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
875 _mm_srli_epi64(__m128i a, int count)
877 return __builtin_ia32_psrlqi128(a, count);
880 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
881 _mm_srl_epi64(__m128i a, __m128i count)
883 return __builtin_ia32_psrlq128(a, count);
886 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
887 _mm_cmpeq_epi8(__m128i a, __m128i b)
889 return (__m128i)((__v16qi)a == (__v16qi)b);
892 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
893 _mm_cmpeq_epi16(__m128i a, __m128i b)
895 return (__m128i)((__v8hi)a == (__v8hi)b);
898 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
899 _mm_cmpeq_epi32(__m128i a, __m128i b)
901 return (__m128i)((__v4si)a == (__v4si)b);
904 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
905 _mm_cmpgt_epi8(__m128i a, __m128i b)
907 return (__m128i)((__v16qi)a > (__v16qi)b);
910 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
911 _mm_cmpgt_epi16(__m128i a, __m128i b)
913 return (__m128i)((__v8hi)a > (__v8hi)b);
916 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
917 _mm_cmpgt_epi32(__m128i a, __m128i b)
919 return (__m128i)((__v4si)a > (__v4si)b);
922 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
923 _mm_cmplt_epi8(__m128i a, __m128i b)
925 return _mm_cmpgt_epi8(b,a);
928 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
929 _mm_cmplt_epi16(__m128i a, __m128i b)
931 return _mm_cmpgt_epi16(b,a);
934 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
935 _mm_cmplt_epi32(__m128i a, __m128i b)
937 return _mm_cmpgt_epi32(b,a);
941 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
942 _mm_cvtsi64_sd(__m128d a, long long b)
948 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
949 _mm_cvtsd_si64(__m128d a)
951 return __builtin_ia32_cvtsd2si64(a);
954 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
955 _mm_cvttsd_si64(__m128d a)
961 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
962 _mm_cvtepi32_ps(__m128i a)
964 return __builtin_ia32_cvtdq2ps((__v4si)a);
967 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
968 _mm_cvtps_epi32(__m128 a)
970 return (__m128i)__builtin_ia32_cvtps2dq(a);
973 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
974 _mm_cvttps_epi32(__m128 a)
976 return (__m128i)__builtin_ia32_cvttps2dq(a);
979 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
980 _mm_cvtsi32_si128(int a)
982 return (__m128i)(__v4si){ a, 0, 0, 0 };
986 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
987 _mm_cvtsi64_si128(long long a)
989 return (__m128i){ a, 0 };
993 static __inline__ int __attribute__((__always_inline__, __nodebug__))
994 _mm_cvtsi128_si32(__m128i a)
996 __v4si b = (__v4si)a;
1001 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1002 _mm_cvtsi128_si64(__m128i a)
1008 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1009 _mm_load_si128(__m128i const *p)
1014 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1015 _mm_loadu_si128(__m128i const *p)
1017 struct __loadu_si128 {
1019 } __attribute__((packed, may_alias));
1020 return ((struct __loadu_si128*)p)->v;
1023 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1024 _mm_loadl_epi64(__m128i const *p)
1026 return (__m128i) { *(long long*)p, 0};
1029 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1030 _mm_set_epi64x(long long q1, long long q0)
1032 return (__m128i){ q0, q1 };
1035 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1036 _mm_set_epi64(__m64 q1, __m64 q0)
1038 return (__m128i){ (long long)q0, (long long)q1 };
1041 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1042 _mm_set_epi32(int i3, int i2, int i1, int i0)
1044 return (__m128i)(__v4si){ i0, i1, i2, i3};
1047 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1048 _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1050 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1053 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1054 _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1056 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1059 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1060 _mm_set1_epi64x(long long q)
1062 return (__m128i){ q, q };
1065 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1066 _mm_set1_epi64(__m64 q)
1068 return (__m128i){ (long long)q, (long long)q };
1071 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1072 _mm_set1_epi32(int i)
1074 return (__m128i)(__v4si){ i, i, i, i };
1077 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1078 _mm_set1_epi16(short w)
1080 return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
1083 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1084 _mm_set1_epi8(char b)
1086 return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
1089 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1090 _mm_setr_epi64(__m64 q0, __m64 q1)
1092 return (__m128i){ (long long)q0, (long long)q1 };
1095 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1096 _mm_setr_epi32(int i0, int i1, int i2, int i3)
1098 return (__m128i)(__v4si){ i0, i1, i2, i3};
1101 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1102 _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1104 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1107 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1108 _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1110 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1113 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1114 _mm_setzero_si128(void)
1116 return (__m128i){ 0LL, 0LL };
1119 static __inline__ void __attribute__((__always_inline__, __nodebug__))
1120 _mm_store_si128(__m128i *p, __m128i b)
1125 static __inline__ void __attribute__((__always_inline__, __nodebug__))
1126 _mm_storeu_si128(__m128i *p, __m128i b)
1128 __builtin_ia32_storedqu((char *)p, (__v16qi)b);
1131 static __inline__ void __attribute__((__always_inline__, __nodebug__))
1132 _mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
1134 __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
1137 static __inline__ void __attribute__((__always_inline__, __nodebug__))
1138 _mm_storel_epi64(__m128i *p, __m128i a)
1140 __builtin_ia32_storelv4si((__v2si *)p, a);
1143 static __inline__ void __attribute__((__always_inline__, __nodebug__))
1144 _mm_stream_pd(double *p, __m128d a)
1146 __builtin_ia32_movntpd(p, a);
1149 static __inline__ void __attribute__((__always_inline__, __nodebug__))
1150 _mm_stream_si128(__m128i *p, __m128i a)
1152 __builtin_ia32_movntdq(p, a);
1155 static __inline__ void __attribute__((__always_inline__, __nodebug__))
1156 _mm_stream_si32(int *p, int a)
1158 __builtin_ia32_movnti(p, a);
1161 static __inline__ void __attribute__((__always_inline__, __nodebug__))
1162 _mm_clflush(void const *p)
1164 __builtin_ia32_clflush(p);
1167 static __inline__ void __attribute__((__always_inline__, __nodebug__))
1170 __builtin_ia32_lfence();
1173 static __inline__ void __attribute__((__always_inline__, __nodebug__))
1176 __builtin_ia32_mfence();
1179 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1180 _mm_packs_epi16(__m128i a, __m128i b)
1182 return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
1185 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1186 _mm_packs_epi32(__m128i a, __m128i b)
1188 return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
1191 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1192 _mm_packus_epi16(__m128i a, __m128i b)
1194 return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
1197 static __inline__ int __attribute__((__always_inline__, __nodebug__))
1198 _mm_extract_epi16(__m128i a, int imm)
1200 __v8hi b = (__v8hi)a;
1201 return (unsigned short)b[imm];
1204 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1205 _mm_insert_epi16(__m128i a, int b, int imm)
1207 __v8hi c = (__v8hi)a;
1212 static __inline__ int __attribute__((__always_inline__, __nodebug__))
1213 _mm_movemask_epi8(__m128i a)
1215 return __builtin_ia32_pmovmskb128((__v16qi)a);
/*
 * Rearranges the four 32-bit lanes of a.  Result lane k (k = 0..3) is the
 * lane of a named by the k-th 2-bit field of imm, counting from the least
 * significant bits.  Every index is in 0..3, so the second shuffle operand
 * (_mm_set1_epi32(0)) is never selected.  imm must be an integer constant
 * expression, as __builtin_shufflevector requires constant indices.
 */
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_shufflevector((__v4si)(a), (__v4si) _mm_set1_epi32(0), \
                                    ((imm) & 0x03) >> 0, \
                                    ((imm) & 0x0c) >> 2, \
                                    ((imm) & 0x30) >> 4, \
                                    ((imm) & 0xc0) >> 6))
/*
 * Rearranges the low four 16-bit lanes of a.  Result lane k (k = 0..3) is the
 * lane of a named by the k-th 2-bit field of imm, counting from the least
 * significant bits; lanes 4..7 are copied through unchanged.  Every index is
 * in 0..7, so the second shuffle operand (_mm_set1_epi16(0)) is never
 * selected.  imm must be an integer constant expression.
 */
#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) _mm_set1_epi16(0), \
                                    (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
                                    4, 5, 6, 7))
/*
 * Rearranges the high four 16-bit lanes of a.  Lanes 0..3 are copied through
 * unchanged; result lane 4+k (k = 0..3) is lane 4 + (k-th 2-bit field of imm)
 * of a, counting fields from the least significant bits.  Every index is in
 * 0..7, so the second shuffle operand (_mm_set1_epi16(0)) is never selected.
 * imm must be an integer constant expression.
 */
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) _mm_set1_epi16(0), \
                                    0, 1, 2, 3, \
                                    4 + (((imm) & 0x03) >> 0), \
                                    4 + (((imm) & 0x0c) >> 2), \
                                    4 + (((imm) & 0x30) >> 4), \
                                    4 + (((imm) & 0xc0) >> 6)))
1236 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1237 _mm_unpackhi_epi8(__m128i a, __m128i b)
1239 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1242 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1243 _mm_unpackhi_epi16(__m128i a, __m128i b)
1245 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1248 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1249 _mm_unpackhi_epi32(__m128i a, __m128i b)
1251 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
1254 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1255 _mm_unpackhi_epi64(__m128i a, __m128i b)
1257 return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
1260 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1261 _mm_unpacklo_epi8(__m128i a, __m128i b)
1263 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1266 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1267 _mm_unpacklo_epi16(__m128i a, __m128i b)
1269 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1272 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1273 _mm_unpacklo_epi32(__m128i a, __m128i b)
1275 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
1278 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1279 _mm_unpacklo_epi64(__m128i a, __m128i b)
1281 return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
1284 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
1285 _mm_movepi64_pi64(__m128i a)
1290 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1291 _mm_movpi64_pi64(__m64 a)
1293 return (__m128i){ (long long)a, 0 };
1296 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1297 _mm_move_epi64(__m128i a)
1299 return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2);
1302 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1303 _mm_unpackhi_pd(__m128d a, __m128d b)
1305 return __builtin_shufflevector(a, b, 1, 2+1);
1308 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1309 _mm_unpacklo_pd(__m128d a, __m128d b)
1311 return __builtin_shufflevector(a, b, 0, 2+0);
1314 static __inline__ int __attribute__((__always_inline__, __nodebug__))
1315 _mm_movemask_pd(__m128d a)
1317 return __builtin_ia32_movmskpd(a);
/*
 * Builds a two-double vector from one lane of each input.  Bit 0 of i picks
 * which lane of a becomes result lane 0; bit 1 of i picks which lane of b
 * becomes result lane 1 (the "+ 2" moves the index into b's half of the
 * shuffle index space).  i must be an integer constant expression.
 */
#define _mm_shuffle_pd(a, b, i) \
  (__builtin_shufflevector((__m128d)(a), (__m128d)(b), \
                           (i) & 1, \
                           (((i) & 2) >> 1) + 2))
1324 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1325 _mm_castpd_ps(__m128d in)
1330 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1331 _mm_castpd_si128(__m128d in)
1336 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1337 _mm_castps_pd(__m128 in)
1342 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1343 _mm_castps_si128(__m128 in)
1348 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1349 _mm_castsi128_ps(__m128i in)
1354 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1355 _mm_castsi128_pd(__m128i in)
/*
 * Emits a single "pause" instruction through inline assembly.  The asm is
 * marked volatile so the compiler does not drop it even though it has no
 * outputs.  Takes no arguments and returns nothing.
 */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_pause(void)
{
  __asm__ volatile ("pause");
}
/*
 * Packs two lane selectors into one immediate: x is placed in bit 1 and y in
 * bit 0, i.e. the value is (x << 1) | y.
 */
#define _MM_SHUFFLE2(x, y) ((y) | ((x) << 1))
1368 #endif /* __SSE2__ */
1370 #endif /* __EMMINTRIN_H */