1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 *===-----------------------------------------------------------------------===
28 #error "SSE2 instruction set not enabled"
31 #include <xmmintrin.h>
/* Public SSE2 vector types: 16-byte vectors with double and long long
 * elements respectively. */
33 typedef double __m128d __attribute__((__vector_size__(16)));
34 typedef long long __m128i __attribute__((__vector_size__(16)));
/* Internal 16-byte vector views with double, long long, short and char
 * elements.  The wrappers below cast to these before calling builtins or
 * using vector operators.  __v4si, __v2si, __m64 and __m128 are used in this
 * file but not declared here -- presumably they come from <xmmintrin.h>. */
37 typedef double __v2df __attribute__ ((__vector_size__ (16)));
38 typedef long long __v2di __attribute__ ((__vector_size__ (16)));
39 typedef short __v8hi __attribute__((__vector_size__(16)));
40 typedef char __v16qi __attribute__((__vector_size__(16)));
/* Double-precision add / sub / mul / div entry points, each in an _sd and a
 * _pd flavour taking two __m128d operands and returning __m128d.
 * NOTE(review): only the declarator lines are present in this listing; the
 * bodies are not visible, so the element-wise behaviour is presumed from the
 * names -- confirm against the full header. */
42 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
43 _mm_add_sd(__m128d a, __m128d b)
49 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
50 _mm_add_pd(__m128d a, __m128d b)
55 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
56 _mm_sub_sd(__m128d a, __m128d b)
62 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
63 _mm_sub_pd(__m128d a, __m128d b)
68 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
69 _mm_mul_sd(__m128d a, __m128d b)
75 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
76 _mm_mul_pd(__m128d a, __m128d b)
81 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
82 _mm_div_sd(__m128d a, __m128d b)
88 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
89 _mm_div_pd(__m128d a, __m128d b)
/* Scalar square root: the sqrtsd builtin is applied to b only; the returned
 * vector takes element 0 from that result and element 1 from a. */
94 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
95 _mm_sqrt_sd(__m128d a, __m128d b)
97 __m128d c = __builtin_ia32_sqrtsd(b);
98 return (__m128d) { c[0], a[1] };
/* Packed square root: forwards a to the sqrtpd builtin. */
101 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
102 _mm_sqrt_pd(__m128d a)
104 return __builtin_ia32_sqrtpd(a);
/* Minimum / maximum, scalar (_sd) and packed (_pd): each forwards (a, b)
 * unchanged to the builtin of the same name. */
107 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
108 _mm_min_sd(__m128d a, __m128d b)
110 return __builtin_ia32_minsd(a, b);
113 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
114 _mm_min_pd(__m128d a, __m128d b)
116 return __builtin_ia32_minpd(a, b);
119 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
120 _mm_max_sd(__m128d a, __m128d b)
122 return __builtin_ia32_maxsd(a, b);
125 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
126 _mm_max_pd(__m128d a, __m128d b)
128 return __builtin_ia32_maxpd(a, b);
/* Bitwise logic on double vectors.  Both operands are reinterpreted as
 * __v4si, combined with the integer operator, and the result is cast back
 * to __m128d. */
131 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
132 _mm_and_pd(__m128d a, __m128d b)
134 return (__m128d)((__v4si)a & (__v4si)b);
/* Note the operand roles: the complement is applied to a, i.e. (~a) & b. */
137 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
138 _mm_andnot_pd(__m128d a, __m128d b)
140 return (__m128d)(~(__v4si)a & (__v4si)b);
143 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
144 _mm_or_pd(__m128d a, __m128d b)
146 return (__m128d)((__v4si)a | (__v4si)b);
149 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
150 _mm_xor_pd(__m128d a, __m128d b)
152 return (__m128d)((__v4si)a ^ (__v4si)b);
/* Packed double comparisons.  Every wrapper forwards to the cmppd builtin
 * with a predicate immediate; as used below the immediates pair up as
 * 0 = eq, 1 = lt, 2 = le, 3 = unord, 4 = neq, 5 = nlt, 6 = nle, 7 = ord. */
155 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
156 _mm_cmpeq_pd(__m128d a, __m128d b)
158 return (__m128d)__builtin_ia32_cmppd(a, b, 0);
161 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
162 _mm_cmplt_pd(__m128d a, __m128d b)
164 return (__m128d)__builtin_ia32_cmppd(a, b, 1);
167 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
168 _mm_cmple_pd(__m128d a, __m128d b)
170 return (__m128d)__builtin_ia32_cmppd(a, b, 2);
/* gt / ge are expressed as lt / le with the operands swapped. */
173 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
174 _mm_cmpgt_pd(__m128d a, __m128d b)
176 return (__m128d)__builtin_ia32_cmppd(b, a, 1);
179 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
180 _mm_cmpge_pd(__m128d a, __m128d b)
182 return (__m128d)__builtin_ia32_cmppd(b, a, 2);
185 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
186 _mm_cmpord_pd(__m128d a, __m128d b)
188 return (__m128d)__builtin_ia32_cmppd(a, b, 7);
191 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
192 _mm_cmpunord_pd(__m128d a, __m128d b)
194 return (__m128d)__builtin_ia32_cmppd(a, b, 3);
197 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
198 _mm_cmpneq_pd(__m128d a, __m128d b)
200 return (__m128d)__builtin_ia32_cmppd(a, b, 4);
203 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
204 _mm_cmpnlt_pd(__m128d a, __m128d b)
206 return (__m128d)__builtin_ia32_cmppd(a, b, 5);
209 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
210 _mm_cmpnle_pd(__m128d a, __m128d b)
212 return (__m128d)__builtin_ia32_cmppd(a, b, 6);
/* ngt / nge are expressed as nlt / nle with the operands swapped. */
215 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
216 _mm_cmpngt_pd(__m128d a, __m128d b)
218 return (__m128d)__builtin_ia32_cmppd(b, a, 5);
221 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
222 _mm_cmpnge_pd(__m128d a, __m128d b)
224 return (__m128d)__builtin_ia32_cmppd(b, a, 6);
/* Scalar double comparisons with operands in natural order: each forwards
 * (a, b) to the cmpsd builtin with predicate 0 (eq), 1 (lt) or 2 (le). */
227 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
228 _mm_cmpeq_sd(__m128d a, __m128d b)
230 return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
233 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
234 _mm_cmplt_sd(__m128d a, __m128d b)
236 return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
239 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
240 _mm_cmple_sd(__m128d a, __m128d b)
242 return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
245 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
246 _mm_cmpgt_sd(__m128d a, __m128d b)
248 return (__m128d)__builtin_ia32_cmpsd(b, a, 1);
251 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
252 _mm_cmpge_sd(__m128d a, __m128d b)
254 return (__m128d)__builtin_ia32_cmpsd(b, a, 2);
/* Scalar double comparisons with operands in natural order: each forwards
 * (a, b) to the cmpsd builtin with predicate 7 (ord), 3 (unord), 4 (neq),
 * 5 (nlt) or 6 (nle). */
257 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
258 _mm_cmpord_sd(__m128d a, __m128d b)
260 return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
263 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
264 _mm_cmpunord_sd(__m128d a, __m128d b)
266 return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
269 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
270 _mm_cmpneq_sd(__m128d a, __m128d b)
272 return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
275 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
276 _mm_cmpnlt_sd(__m128d a, __m128d b)
278 return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
281 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
282 _mm_cmpnle_sd(__m128d a, __m128d b)
284 return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
287 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
288 _mm_cmpngt_sd(__m128d a, __m128d b)
290 return (__m128d)__builtin_ia32_cmpsd(b, a, 5);
293 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
294 _mm_cmpnge_sd(__m128d a, __m128d b)
296 return (__m128d)__builtin_ia32_cmpsd(b, a, 6);
/* Scalar double comparisons that return an int.  Each _mm_comi<op>_sd
 * wrapper returns the value of the matching __builtin_ia32_comisd<op>
 * builtin called with (a, b) in that order. */
299 static __inline__ int __attribute__((__always_inline__, __nodebug__))
300 _mm_comieq_sd(__m128d a, __m128d b)
302 return __builtin_ia32_comisdeq(a, b);
305 static __inline__ int __attribute__((__always_inline__, __nodebug__))
306 _mm_comilt_sd(__m128d a, __m128d b)
308 return __builtin_ia32_comisdlt(a, b);
311 static __inline__ int __attribute__((__always_inline__, __nodebug__))
312 _mm_comile_sd(__m128d a, __m128d b)
314 return __builtin_ia32_comisdle(a, b);
317 static __inline__ int __attribute__((__always_inline__, __nodebug__))
318 _mm_comigt_sd(__m128d a, __m128d b)
320 return __builtin_ia32_comisdgt(a, b);
323 static __inline__ int __attribute__((__always_inline__, __nodebug__))
324 _mm_comige_sd(__m128d a, __m128d b)
326 return __builtin_ia32_comisdge(a, b);
329 static __inline__ int __attribute__((__always_inline__, __nodebug__))
330 _mm_comineq_sd(__m128d a, __m128d b)
332 return __builtin_ia32_comisdneq(a, b);
/* Same set of six predicates routed to the __builtin_ia32_ucomisd<op>
 * builtins.  NOTE(review): the behavioural difference from the comisd
 * variants is not visible in this file -- see the instruction reference. */
335 static __inline__ int __attribute__((__always_inline__, __nodebug__))
336 _mm_ucomieq_sd(__m128d a, __m128d b)
338 return __builtin_ia32_ucomisdeq(a, b);
341 static __inline__ int __attribute__((__always_inline__, __nodebug__))
342 _mm_ucomilt_sd(__m128d a, __m128d b)
344 return __builtin_ia32_ucomisdlt(a, b);
347 static __inline__ int __attribute__((__always_inline__, __nodebug__))
348 _mm_ucomile_sd(__m128d a, __m128d b)
350 return __builtin_ia32_ucomisdle(a, b);
353 static __inline__ int __attribute__((__always_inline__, __nodebug__))
354 _mm_ucomigt_sd(__m128d a, __m128d b)
356 return __builtin_ia32_ucomisdgt(a, b);
359 static __inline__ int __attribute__((__always_inline__, __nodebug__))
360 _mm_ucomige_sd(__m128d a, __m128d b)
362 return __builtin_ia32_ucomisdge(a, b);
365 static __inline__ int __attribute__((__always_inline__, __nodebug__))
366 _mm_ucomineq_sd(__m128d a, __m128d b)
368 return __builtin_ia32_ucomisdneq(a, b);
/* Conversions involving double vectors.  Where a body line is present the
 * wrapper forwards to the builtin named in it; integer vector arguments are
 * cast to __v4si / __v2si first. */
371 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
372 _mm_cvtpd_ps(__m128d a)
374 return __builtin_ia32_cvtpd2ps(a);
377 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
378 _mm_cvtps_pd(__m128 a)
380 return __builtin_ia32_cvtps2pd(a);
383 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
384 _mm_cvtepi32_pd(__m128i a)
386 return __builtin_ia32_cvtdq2pd((__v4si)a);
389 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
390 _mm_cvtpd_epi32(__m128d a)
392 return __builtin_ia32_cvtpd2dq(a);
395 static __inline__ int __attribute__((__always_inline__, __nodebug__))
396 _mm_cvtsd_si32(__m128d a)
398 return __builtin_ia32_cvtsd2si(a);
/* NOTE(review): the bodies of the next three wrappers are not present in
 * this listing; only their signatures can be documented here. */
401 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
402 _mm_cvtsd_ss(__m128 a, __m128d b)
408 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
409 _mm_cvtsi32_sd(__m128d a, int b)
415 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
416 _mm_cvtss_sd(__m128d a, __m128 b)
/* The "tt" spellings route to the cvtt* builtins. */
422 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
423 _mm_cvttpd_epi32(__m128d a)
425 return (__m128i)__builtin_ia32_cvttpd2dq(a);
/* NOTE(review): body not present in this listing. */
428 static __inline__ int __attribute__((__always_inline__, __nodebug__))
429 _mm_cvttsd_si32(__m128d a)
/* __m64 flavours: the builtin result is cast to __m64, and an __m64
 * argument is cast to __v2si. */
434 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
435 _mm_cvtpd_pi32(__m128d a)
437 return (__m64)__builtin_ia32_cvtpd2pi(a);
440 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
441 _mm_cvttpd_pi32(__m128d a)
443 return (__m64)__builtin_ia32_cvttpd2pi(a);
446 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
447 _mm_cvtpi32_pd(__m64 a)
449 return __builtin_ia32_cvtpi2pd((__v2si)a);
/* NOTE(review): body not present in this listing. */
452 static __inline__ double __attribute__((__always_inline__, __nodebug__))
453 _mm_cvtsd_f64(__m128d a)
/* Load two doubles by dereferencing dp as a whole __m128d.
 * NOTE(review): presumably dp must be 16-byte aligned for this direct
 * vector dereference -- confirm. */
458 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
459 _mm_load_pd(double const *dp)
461 return *(__m128d*)dp;
/* Read one double through a packed, may_alias wrapper struct and place it in
 * both result elements.  The struct's member declaration line is not present
 * in this listing; the member is accessed as `u`. */
464 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
465 _mm_load1_pd(double const *dp)
467 struct __mm_load1_pd_struct {
469 } __attribute__((__packed__, __may_alias__));
470 double u = ((struct __mm_load1_pd_struct*)dp)->u;
471 return (__m128d){ u, u };
/* Alternate spelling of _mm_load1_pd. */
474 #define _mm_load_pd1(dp) _mm_load1_pd(dp)
/* Load two doubles as in _mm_load_pd, then swap the two elements with
 * __builtin_shufflevector (indices 1, 0). */
476 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
477 _mm_loadr_pd(double const *dp)
479 __m128d u = *(__m128d*)dp;
480 return __builtin_shufflevector(u, u, 1, 0);
483 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
484 _mm_loadu_pd(double const *dp)
488 } __attribute__((packed, may_alias));
489 return ((struct __loadu_pd*)dp)->v;
/* Single-double loads.  Each reads one double from dp through a packed,
 * may_alias wrapper struct (member `u`; its declaration line is not present
 * in this listing) and builds the result as noted per function. */
/* Result: { *dp, 0 }. */
492 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
493 _mm_load_sd(double const *dp)
495 struct __mm_load_sd_struct {
497 } __attribute__((__packed__, __may_alias__));
498 double u = ((struct __mm_load_sd_struct*)dp)->u;
499 return (__m128d){ u, 0 };
/* Result: { a[0], *dp } -- replaces the high element of a. */
502 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
503 _mm_loadh_pd(__m128d a, double const *dp)
505 struct __mm_loadh_pd_struct {
507 } __attribute__((__packed__, __may_alias__));
508 double u = ((struct __mm_loadh_pd_struct*)dp)->u;
509 return (__m128d){ a[0], u };
/* Result: { *dp, a[1] } -- replaces the low element of a. */
512 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
513 _mm_loadl_pd(__m128d a, double const *dp)
515 struct __mm_loadl_pd_struct {
517 } __attribute__((__packed__, __may_alias__));
518 double u = ((struct __mm_loadl_pd_struct*)dp)->u;
519 return (__m128d){ u, a[1] };
/* Constructors for __m128d values. */
/* Result: { w, 0 }.  NOTE(review): the declarator line with this function's
 * name is not present in this listing (presumably _mm_set_sd -- confirm). */
522 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
525 return (__m128d){ w, 0 };
/* Result: { w, w }. */
528 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
529 _mm_set1_pd(double w)
531 return (__m128d){ w, w };
/* Result: { x, w } -- the FIRST argument becomes element 1. */
534 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
535 _mm_set_pd(double w, double x)
537 return (__m128d){ x, w };
/* Result: { w, x } -- arguments are given in element order. */
540 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
541 _mm_setr_pd(double w, double x)
543 return (__m128d){ w, x };
/* Result: { 0, 0 }.  NOTE(review): the declarator line with this function's
 * name is not present in this listing (presumably _mm_setzero_pd -- confirm). */
546 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
549 return (__m128d){ 0, 0 };
/* Result: { b[0], a[1] } -- low element from b, high element from a. */
552 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
553 _mm_move_sd(__m128d a, __m128d b)
555 return (__m128d){ b[0], a[1] };
/* Stores of __m128d values.  The single-element stores write through a
 * packed, may_alias wrapper struct whose member declaration line is not
 * present in this listing; the member is accessed as `u`. */
/* Write a[0] to *dp. */
558 static __inline__ void __attribute__((__always_inline__, __nodebug__))
559 _mm_store_sd(double *dp, __m128d a)
561 struct __mm_store_sd_struct {
563 } __attribute__((__packed__, __may_alias__));
564 ((struct __mm_store_sd_struct*)dp)->u = a[0];
/* Write a[0] to both u[0] and u[1], i.e. two consecutive copies of the low
 * element. */
567 static __inline__ void __attribute__((__always_inline__, __nodebug__))
568 _mm_store1_pd(double *dp, __m128d a)
570 struct __mm_store1_pd_struct {
572 } __attribute__((__packed__, __may_alias__));
573 ((struct __mm_store1_pd_struct*)dp)->u[0] = a[0];
574 ((struct __mm_store1_pd_struct*)dp)->u[1] = a[0];
/* NOTE(review): body not present in this listing. */
577 static __inline__ void __attribute__((__always_inline__, __nodebug__))
578 _mm_store_pd(double *dp, __m128d a)
/* Forward to the storeupd builtin. */
583 static __inline__ void __attribute__((__always_inline__, __nodebug__))
584 _mm_storeu_pd(double *dp, __m128d a)
586 __builtin_ia32_storeupd(dp, a);
/* Swap the two elements of a (shuffle indices 1, 0).  NOTE(review): the
 * statement that writes the swapped vector to dp is not present in this
 * listing. */
589 static __inline__ void __attribute__((__always_inline__, __nodebug__))
590 _mm_storer_pd(double *dp, __m128d a)
592 a = __builtin_shufflevector(a, a, 1, 0);
/* Write a[1] (the high element) to *dp. */
596 static __inline__ void __attribute__((__always_inline__, __nodebug__))
597 _mm_storeh_pd(double *dp, __m128d a)
599 struct __mm_storeh_pd_struct {
601 } __attribute__((__packed__, __may_alias__));
602 ((struct __mm_storeh_pd_struct*)dp)->u = a[1];
/* Write a[0] (the low element) to *dp.
 * NOTE(review): the local struct tag is __mm_storeh_pd_struct although this
 * is the storel variant.  The tag is function-local, so this is harmless,
 * but it looks like a copy-paste leftover. */
605 static __inline__ void __attribute__((__always_inline__, __nodebug__))
606 _mm_storel_pd(double *dp, __m128d a)
608 struct __mm_storeh_pd_struct {
610 } __attribute__((__packed__, __may_alias__));
611 ((struct __mm_storeh_pd_struct*)dp)->u = a[0];
614 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
615 _mm_add_epi8(__m128i a, __m128i b)
617 return (__m128i)((__v16qi)a + (__v16qi)b);
620 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
621 _mm_add_epi16(__m128i a, __m128i b)
623 return (__m128i)((__v8hi)a + (__v8hi)b);
626 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
627 _mm_add_epi32(__m128i a, __m128i b)
629 return (__m128i)((__v4si)a + (__v4si)b);
/* NOTE(review): the bodies of the two 64-bit adds are not present in this
 * listing; only their signatures can be documented here. */
632 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
633 _mm_add_si64(__m64 a, __m64 b)
638 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
639 _mm_add_epi64(__m128i a, __m128i b)
/* Integer wrappers over the padds* / paddus* / pavg* / pmaddwd / pmax* /
 * pmin* / pmulh* builtins.  Each casts both operands to the lane type the
 * builtin expects (__v16qi for byte lanes, __v8hi for 16-bit lanes) and
 * casts the result back to __m128i.  Note that the epu* (unsigned) variants
 * also pass __v16qi / __v8hi; which builtin is called is what differs. */
644 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
645 _mm_adds_epi8(__m128i a, __m128i b)
647 return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
650 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
651 _mm_adds_epi16(__m128i a, __m128i b)
653 return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
656 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
657 _mm_adds_epu8(__m128i a, __m128i b)
659 return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
662 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
663 _mm_adds_epu16(__m128i a, __m128i b)
665 return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
668 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
669 _mm_avg_epu8(__m128i a, __m128i b)
671 return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
674 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
675 _mm_avg_epu16(__m128i a, __m128i b)
677 return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
680 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
681 _mm_madd_epi16(__m128i a, __m128i b)
683 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
686 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
687 _mm_max_epi16(__m128i a, __m128i b)
689 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
692 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
693 _mm_max_epu8(__m128i a, __m128i b)
695 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
698 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
699 _mm_min_epi16(__m128i a, __m128i b)
701 return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
704 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
705 _mm_min_epu8(__m128i a, __m128i b)
707 return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
710 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
711 _mm_mulhi_epi16(__m128i a, __m128i b)
713 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
716 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
717 _mm_mulhi_epu16(__m128i a, __m128i b)
719 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
722 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
723 _mm_mullo_epi16(__m128i a, __m128i b)
725 return (__m128i)((__v8hi)a * (__v8hi)b);
/* Wrappers over the pmuludq builtins.  The __m64 flavour casts its operands
 * to __v2si, the 128-bit flavour to __v4si; the builtin result is returned
 * without a further cast. */
728 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
729 _mm_mul_su32(__m64 a, __m64 b)
731 return __builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
734 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
735 _mm_mul_epu32(__m128i a, __m128i b)
737 return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
/* Forward both operands, viewed as sixteen byte lanes, to the psadbw128
 * builtin. */
740 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
741 _mm_sad_epu8(__m128i a, __m128i b)
743 return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
746 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
747 _mm_sub_epi8(__m128i a, __m128i b)
749 return (__m128i)((__v16qi)a - (__v16qi)b);
752 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
753 _mm_sub_epi16(__m128i a, __m128i b)
755 return (__m128i)((__v8hi)a - (__v8hi)b);
758 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
759 _mm_sub_epi32(__m128i a, __m128i b)
761 return (__m128i)((__v4si)a - (__v4si)b);
/* NOTE(review): the bodies of the two 64-bit subtracts are not present in
 * this listing; only their signatures can be documented here. */
764 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
765 _mm_sub_si64(__m64 a, __m64 b)
770 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
771 _mm_sub_epi64(__m128i a, __m128i b)
/* Wrappers over the psubs* / psubus* builtins: operands are cast to __v16qi
 * (byte lanes) or __v8hi (16-bit lanes), and the result is cast back to
 * __m128i. */
776 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
777 _mm_subs_epi8(__m128i a, __m128i b)
779 return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
782 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
783 _mm_subs_epi16(__m128i a, __m128i b)
785 return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
788 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
789 _mm_subs_epu8(__m128i a, __m128i b)
791 return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
794 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
795 _mm_subs_epu16(__m128i a, __m128i b)
797 return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
/* 128-bit integer logic entry points.  NOTE(review): the bodies of these
 * four wrappers are not present in this listing. */
800 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
801 _mm_and_si128(__m128i a, __m128i b)
806 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
807 _mm_andnot_si128(__m128i a, __m128i b)
812 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
813 _mm_or_si128(__m128i a, __m128i b)
818 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
819 _mm_xor_si128(__m128i a, __m128i b)
/* Whole-register left shift.  A macro rather than a function; IMM is
 * multiplied by 8 before being handed to the pslldqi128 builtin.
 * NOTE(review): presumably IMM is a byte count and the builtin takes bits
 * -- confirm. */
824 #define _mm_slli_si128(VEC, IMM) \
825 ((__m128i)__builtin_ia32_pslldqi128((__m128i)(VEC), (IMM)*8))
/* Per-lane shifts.  Naming pattern as used below:
 *   _mm_s{l,r}{l,a}i_epiN take the count as an int and call the "...i128"
 *   builtin; _mm_s{l,r}{l,a}_epiN take the count as an __m128i.
 * 16-bit variants cast operands to __v8hi and 32-bit variants to __v4si;
 * the 64-bit variants pass __m128i straight through. */
827 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
828 _mm_slli_epi16(__m128i a, int count)
830 return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
833 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
834 _mm_sll_epi16(__m128i a, __m128i count)
836 return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
839 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
840 _mm_slli_epi32(__m128i a, int count)
842 return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
845 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
846 _mm_sll_epi32(__m128i a, __m128i count)
848 return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
851 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
852 _mm_slli_epi64(__m128i a, int count)
854 return __builtin_ia32_psllqi128(a, count);
857 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
858 _mm_sll_epi64(__m128i a, __m128i count)
860 return __builtin_ia32_psllq128(a, count);
/* "sra" right shifts (psra* builtins); only 16- and 32-bit lanes appear. */
863 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
864 _mm_srai_epi16(__m128i a, int count)
866 return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
869 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
870 _mm_sra_epi16(__m128i a, __m128i count)
872 return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
875 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
876 _mm_srai_epi32(__m128i a, int count)
878 return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
881 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
882 _mm_sra_epi32(__m128i a, __m128i count)
884 return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
/* Whole-register right shift; same IMM*8 scaling as _mm_slli_si128, routed
 * to the psrldqi128 builtin. */
888 #define _mm_srli_si128(VEC, IMM) \
889 ((__m128i)__builtin_ia32_psrldqi128((__m128i)(VEC), (IMM)*8))
/* "srl" right shifts (psrl* builtins). */
891 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
892 _mm_srli_epi16(__m128i a, int count)
894 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
897 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
898 _mm_srl_epi16(__m128i a, __m128i count)
900 return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
903 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
904 _mm_srli_epi32(__m128i a, int count)
906 return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
909 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
910 _mm_srl_epi32(__m128i a, __m128i count)
912 return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
915 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
916 _mm_srli_epi64(__m128i a, int count)
918 return __builtin_ia32_psrlqi128(a, count);
921 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
922 _mm_srl_epi64(__m128i a, __m128i count)
924 return __builtin_ia32_psrlq128(a, count);
/* Lane-wise integer equality, written with the vector == operator on the
 * operands viewed as 16 x char, 8 x short or 4 x int lanes; the comparison
 * result is cast back to __m128i. */
927 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
928 _mm_cmpeq_epi8(__m128i a, __m128i b)
930 return (__m128i)((__v16qi)a == (__v16qi)b);
933 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
934 _mm_cmpeq_epi16(__m128i a, __m128i b)
936 return (__m128i)((__v8hi)a == (__v8hi)b);
939 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
940 _mm_cmpeq_epi32(__m128i a, __m128i b)
942 return (__m128i)((__v4si)a == (__v4si)b);
945 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
946 _mm_cmpgt_epi8(__m128i a, __m128i b)
948 return (__m128i)((__v16qi)a > (__v16qi)b);
/* Lane-wise greater-than on 8 x short and 4 x int lanes, written with the
 * vector > operator. */
951 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
952 _mm_cmpgt_epi16(__m128i a, __m128i b)
954 return (__m128i)((__v8hi)a > (__v8hi)b);
957 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
958 _mm_cmpgt_epi32(__m128i a, __m128i b)
960 return (__m128i)((__v4si)a > (__v4si)b);
/* Less-than is implemented as greater-than with the operands swapped. */
963 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
964 _mm_cmplt_epi8(__m128i a, __m128i b)
966 return _mm_cmpgt_epi8(b,a);
969 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
970 _mm_cmplt_epi16(__m128i a, __m128i b)
972 return _mm_cmpgt_epi16(b,a);
975 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
976 _mm_cmplt_epi32(__m128i a, __m128i b)
978 return _mm_cmpgt_epi32(b,a);
/* Conversions using 64-bit scalars (long long). */
/* NOTE(review): body not present in this listing. */
982 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
983 _mm_cvtsi64_sd(__m128d a, long long b)
/* Forward to the cvtsd2si64 builtin. */
989 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
990 _mm_cvtsd_si64(__m128d a)
992 return __builtin_ia32_cvtsd2si64(a);
/* NOTE(review): body not present in this listing. */
995 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
996 _mm_cvttsd_si64(__m128d a)
/* Conversions between 4 x int and __m128, forwarded to the cvtdq2ps /
 * cvtps2dq / cvttps2dq builtins. */
1002 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1003 _mm_cvtepi32_ps(__m128i a)
1005 return __builtin_ia32_cvtdq2ps((__v4si)a);
1008 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1009 _mm_cvtps_epi32(__m128 a)
1011 return (__m128i)__builtin_ia32_cvtps2dq(a);
1014 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1015 _mm_cvttps_epi32(__m128 a)
1017 return (__m128i)__builtin_ia32_cvttps2dq(a);
/* Scalar -> vector: a goes into element 0 and the other elements are 0. */
1020 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1021 _mm_cvtsi32_si128(int a)
1023 return (__m128i)(__v4si){ a, 0, 0, 0 };
1027 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1028 _mm_cvtsi64_si128(long long a)
1030 return (__m128i){ a, 0 };
/* Vector -> scalar.  The visible line views a as 4 x int; NOTE(review): the
 * return statement is not present in this listing. */
1034 static __inline__ int __attribute__((__always_inline__, __nodebug__))
1035 _mm_cvtsi128_si32(__m128i a)
1037 __v4si b = (__v4si)a;
/* NOTE(review): body not present in this listing. */
1042 static __inline__ long long __attribute__((__always_inline__, __nodebug__))
1043 _mm_cvtsi128_si64(__m128i a)
/* NOTE(review): body not present in this listing. */
1049 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1050 _mm_load_si128(__m128i const *p)
1055 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1056 _mm_loadu_si128(__m128i const *p)
1058 struct __loadu_si128 {
1060 } __attribute__((packed, may_alias));
1061 return ((struct __loadu_si128*)p)->v;
/* Read one value through a packed, may_alias wrapper struct (member `u`;
 * its declaration line is not present in this listing) into element 0, with
 * element 1 set to 0. */
1064 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1065 _mm_loadl_epi64(__m128i const *p)
1067 struct __mm_loadl_epi64_struct {
1069 } __attribute__((__packed__, __may_alias__));
1070 return (__m128i) { ((struct __mm_loadl_epi64_struct*)p)->u, 0};
/* _mm_set_*: arguments are listed from the highest element down to the
 * lowest, so the LAST parameter becomes element 0 of the vector. */
1073 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1074 _mm_set_epi64x(long long q1, long long q0)
1076 return (__m128i){ q0, q1 };
/* __m64 arguments are converted with a (long long) cast. */
1079 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1080 _mm_set_epi64(__m64 q1, __m64 q0)
1082 return (__m128i){ (long long)q0, (long long)q1 };
1085 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1086 _mm_set_epi32(int i3, int i2, int i1, int i0)
1088 return (__m128i)(__v4si){ i0, i1, i2, i3};
1091 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1092 _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1094 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1097 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1098 _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1100 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
/* _mm_set1_*: the single argument is replicated into every element. */
1103 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1104 _mm_set1_epi64x(long long q)
1106 return (__m128i){ q, q };
1109 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1110 _mm_set1_epi64(__m64 q)
1112 return (__m128i){ (long long)q, (long long)q };
1115 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1116 _mm_set1_epi32(int i)
1118 return (__m128i)(__v4si){ i, i, i, i };
1121 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1122 _mm_set1_epi16(short w)
1124 return (__m128i)(__v8hi){ w, w, w, w, w, w, w, w };
1127 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1128 _mm_set1_epi8(char b)
1130 return (__m128i)(__v16qi){ b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b };
/* _mm_setr_*: arguments are listed in element order, so the FIRST parameter
 * becomes element 0. */
1133 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1134 _mm_setr_epi64(__m64 q0, __m64 q1)
1136 return (__m128i){ (long long)q0, (long long)q1 };
1139 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1140 _mm_setr_epi32(int i0, int i1, int i2, int i3)
1142 return (__m128i)(__v4si){ i0, i1, i2, i3};
1145 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1146 _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1148 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1151 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1152 _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1154 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
/* All-zero vector. */
1157 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1158 _mm_setzero_si128(void)
1160 return (__m128i){ 0LL, 0LL };
1163 static __inline__ void __attribute__((__always_inline__, __nodebug__))
1164 _mm_store_si128(__m128i *p, __m128i b)
1169 static __inline__ void __attribute__((__always_inline__, __nodebug__))
1170 _mm_storeu_si128(__m128i *p, __m128i b)
1172 __builtin_ia32_storedqu((char *)p, (__v16qi)b);
1175 static __inline__ void __attribute__((__always_inline__, __nodebug__))
1176 _mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
1178 __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
1181 static __inline__ void __attribute__((__always_inline__, __nodebug__))
1182 _mm_storel_epi64(__m128i *p, __m128i a)
1184 __builtin_ia32_storelv4si((__v2si *)p, a);
1187 static __inline__ void __attribute__((__always_inline__, __nodebug__))
1188 _mm_stream_pd(double *p, __m128d a)
1190 __builtin_ia32_movntpd(p, a);
1193 static __inline__ void __attribute__((__always_inline__, __nodebug__))
1194 _mm_stream_si128(__m128i *p, __m128i a)
1196 __builtin_ia32_movntdq(p, a);
/* Passes the destination p and the int a straight through to
   __builtin_ia32_movnti. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_stream_si32(int *p, int a)
{
  __builtin_ia32_movnti(p, a);
}
/* Passes the address p to __builtin_ia32_clflush; nothing is read or written
   through p by this wrapper itself. */
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm_clflush(void const *p)
{
  __builtin_ia32_clflush(p);
}
/* A void inline function whose only visible statement calls
   __builtin_ia32_lfence().
   NOTE(review): the line carrying the function name and parameter list is
   missing from this listing (presumably _mm_lfence(void), matching the
   builtin's name -- confirm against the upstream header), as are the
   braces. */
1211 static __inline__ void __attribute__((__always_inline__, __nodebug__))
1214 __builtin_ia32_lfence();
/* A void inline function whose only visible statement calls
   __builtin_ia32_mfence().
   NOTE(review): the line carrying the function name and parameter list is
   missing from this listing (presumably _mm_mfence(void), matching the
   builtin's name -- confirm against the upstream header), as are the
   braces. */
1217 static __inline__ void __attribute__((__always_inline__, __nodebug__))
1220 __builtin_ia32_mfence();
1223 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1224 _mm_packs_epi16(__m128i a, __m128i b)
1226 return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
1229 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1230 _mm_packs_epi32(__m128i a, __m128i b)
1232 return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
1235 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1236 _mm_packus_epi16(__m128i a, __m128i b)
1238 return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
1241 static __inline__ int __attribute__((__always_inline__, __nodebug__))
1242 _mm_extract_epi16(__m128i a, int imm)
1244 __v8hi b = (__v8hi)a;
1245 return (unsigned short)b[imm];
/* _mm_insert_epi16(a, b, imm), returning __m128i.  The only visible body
   statement copies a into the local c, viewed as eight 16-bit lanes.
   NOTE(review): the statements that use b and imm, and the return
   statement, are missing from this listing (as are the braces).  Restore
   them from the upstream header rather than reconstructing them here. */
1248 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1249 _mm_insert_epi16(__m128i a, int b, int imm)
1251 __v8hi c = (__v8hi)a;
1256 static __inline__ int __attribute__((__always_inline__, __nodebug__))
1257 _mm_movemask_epi8(__m128i a)
1259 return __builtin_ia32_pmovmskb128((__v16qi)a);
/* Permutes the four 32-bit elements of a: result element k is the element of
   a selected by bits [2k+1:2k] of imm.  imm feeds __builtin_shufflevector
   indices, so it has to be an integer constant expression.  The zero vector
   is only a placeholder second operand: every index is below 4, so none of
   its elements is ever selected. */
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_shufflevector((__v4si)(a), (__v4si) _mm_set1_epi32(0), \
                                    (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6))
/* Start of the _mm_shufflelo_epi16(a, imm) macro: a shuffle of a viewed as
   eight 16-bit lanes, whose first four indices are the four 2-bit fields of
   imm.
   NOTE(review): the macro is truncated in this listing -- the last visible
   line still ends in a continuation backslash and the remaining shuffle
   indices plus the closing parentheses are missing.  Restore the final line
   from the upstream header. */
1268 #define _mm_shufflelo_epi16(a, imm) \
1269 ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) _mm_set1_epi16(0), \
1270 (imm) & 0x3, ((imm) & 0xc) >> 2, \
1271 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
/* Permutes the upper four 16-bit elements of a and leaves elements 0..3 in
   place: result element 4+k is element 4 + (bits [2k+1:2k] of imm) of a.
   imm feeds __builtin_shufflevector indices, so it has to be an integer
   constant expression.  The zero vector is only a placeholder second
   operand: every index is below 8, so none of its elements is selected. */
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_shufflevector((__v8hi)(a), (__v8hi) _mm_set1_epi16(0), \
                                    0, 1, 2, 3, \
                                    4 + (((imm) & 0x03) >> 0), \
                                    4 + (((imm) & 0x0c) >> 2), \
                                    4 + (((imm) & 0x30) >> 4), \
                                    4 + (((imm) & 0xc0) >> 6)))
1280 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1281 _mm_unpackhi_epi8(__m128i a, __m128i b)
1283 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
1286 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1287 _mm_unpackhi_epi16(__m128i a, __m128i b)
1289 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
1292 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1293 _mm_unpackhi_epi32(__m128i a, __m128i b)
1295 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4+2, 3, 4+3);
1298 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1299 _mm_unpackhi_epi64(__m128i a, __m128i b)
1301 return (__m128i)__builtin_shufflevector(a, b, 1, 2+1);
1304 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1305 _mm_unpacklo_epi8(__m128i a, __m128i b)
1307 return (__m128i)__builtin_shufflevector((__v16qi)a, (__v16qi)b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
1310 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1311 _mm_unpacklo_epi16(__m128i a, __m128i b)
1313 return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
1316 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1317 _mm_unpacklo_epi32(__m128i a, __m128i b)
1319 return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4+0, 1, 4+1);
1322 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1323 _mm_unpacklo_epi64(__m128i a, __m128i b)
1325 return (__m128i)__builtin_shufflevector(a, b, 0, 2+0);
/* Declarator of _mm_movepi64_pi64: takes a __m128i value a and returns
   __m64.
   NOTE(review): no body statement for this function is present in this
   listing (lines appear to have been dropped).  Restore the body from the
   upstream header; do not infer it from the signature. */
1328 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__))
1329 _mm_movepi64_pi64(__m128i a)
1334 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1335 _mm_movpi64_pi64(__m64 a)
1337 return (__m128i){ (long long)a, 0 };
1340 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1341 _mm_move_epi64(__m128i a)
1343 return __builtin_shufflevector(a, (__m128i){ 0 }, 0, 2);
1346 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1347 _mm_unpackhi_pd(__m128d a, __m128d b)
1349 return __builtin_shufflevector(a, b, 1, 2+1);
1352 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1353 _mm_unpacklo_pd(__m128d a, __m128d b)
1355 return __builtin_shufflevector(a, b, 0, 2+0);
1358 static __inline__ int __attribute__((__always_inline__, __nodebug__))
1359 _mm_movemask_pd(__m128d a)
1361 return __builtin_ia32_movmskpd(a);
/* Builds a two-double vector from a and b: element 0 is a[bit 0 of i] and
   element 1 is b[bit 1 of i] (shuffle indices 2 and 3 address b).  i feeds
   __builtin_shufflevector indices, so it has to be an integer constant
   expression. */
#define _mm_shuffle_pd(a, b, i) \
  (__builtin_shufflevector((__m128d)(a), (__m128d)(b), (i) & 1, \
                           (((i) & 2) >> 1) + 2))
/* Declarator of _mm_castpd_ps: takes a __m128d value in and returns __m128.
   NOTE(review): the function body is missing from this listing; restore it
   from the upstream header. */
1368 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1369 _mm_castpd_ps(__m128d in)
/* Declarator of _mm_castpd_si128: takes a __m128d value in and returns
   __m128i.
   NOTE(review): the function body is missing from this listing; restore it
   from the upstream header. */
1374 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1375 _mm_castpd_si128(__m128d in)
/* Declarator of _mm_castps_pd: takes a __m128 value in and returns __m128d.
   NOTE(review): the function body is missing from this listing; restore it
   from the upstream header. */
1380 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1381 _mm_castps_pd(__m128 in)
/* Declarator of _mm_castps_si128: takes a __m128 value in and returns
   __m128i.
   NOTE(review): the function body is missing from this listing; restore it
   from the upstream header. */
1386 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
1387 _mm_castps_si128(__m128 in)
/* Declarator of _mm_castsi128_ps: takes a __m128i value in and returns
   __m128.
   NOTE(review): the function body is missing from this listing; restore it
   from the upstream header. */
1392 static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
1393 _mm_castsi128_ps(__m128i in)
/* Declarator of _mm_castsi128_pd: takes a __m128i value in and returns
   __m128d.
   NOTE(review): the function body is missing from this listing; restore it
   from the upstream header. */
1398 static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
1399 _mm_castsi128_pd(__m128i in)
/* A void inline function whose only visible statement is a volatile inline
   asm containing the single instruction "pause".
   NOTE(review): the line carrying the function name and parameter list is
   missing from this listing (presumably _mm_pause(void) -- confirm against
   the upstream header), as are the braces. */
1404 static __inline__ void __attribute__((__always_inline__, __nodebug__))
1407 __asm__ volatile ("pause");
/* Packs two one-bit selectors into a two-bit immediate: x lands in bit 1 and
   y in bit 0, so _MM_SHUFFLE2(1, 0) == 2. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
1412 #endif /* __SSE2__ */
1414 #endif /* __EMMINTRIN_H */