1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
3 * Permission is hereby granted, free of charge, to any person obtaining a copy
4 * of this software and associated documentation files (the "Software"), to deal
5 * in the Software without restriction, including without limitation the rights
6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 * copies of the Software, and to permit persons to whom the Software is
8 * furnished to do so, subject to the following conditions:
10 * The above copyright notice and this permission notice shall be included in
11 * all copies or substantial portions of the Software.
13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 * THE SOFTWARE.
21 *===-----------------------------------------------------------------------===
28 #error "SSE2 instruction set not enabled"
31 #include <xmmintrin.h>
/* Public 128-bit vector types: each is declared as a 16-byte vector of the
 * named element type (double, long long). */
33 typedef double __m128d __attribute__((__vector_size__(16)));
34 typedef long long __m128i __attribute__((__vector_size__(16)));
/* Internal 16-byte vector types with explicit element types (double,
 * long long, short, char); the code in this file uses them as cast targets
 * to select the lane width of an operation. */
37 typedef double __v2df __attribute__ ((__vector_size__ (16)));
38 typedef long long __v2di __attribute__ ((__vector_size__ (16)));
39 typedef short __v8hi __attribute__((__vector_size__(16)));
40 typedef char __v16qi __attribute__((__vector_size__(16)));
42 /* Define the default attributes for the functions in this file. */
43 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
45 static __inline__ __m128d __DEFAULT_FN_ATTRS
46 _mm_add_sd(__m128d __a, __m128d __b)
52 static __inline__ __m128d __DEFAULT_FN_ATTRS
53 _mm_add_pd(__m128d __a, __m128d __b)
58 static __inline__ __m128d __DEFAULT_FN_ATTRS
59 _mm_sub_sd(__m128d __a, __m128d __b)
65 static __inline__ __m128d __DEFAULT_FN_ATTRS
66 _mm_sub_pd(__m128d __a, __m128d __b)
71 static __inline__ __m128d __DEFAULT_FN_ATTRS
72 _mm_mul_sd(__m128d __a, __m128d __b)
78 static __inline__ __m128d __DEFAULT_FN_ATTRS
79 _mm_mul_pd(__m128d __a, __m128d __b)
84 static __inline__ __m128d __DEFAULT_FN_ATTRS
85 _mm_div_sd(__m128d __a, __m128d __b)
91 static __inline__ __m128d __DEFAULT_FN_ATTRS
92 _mm_div_pd(__m128d __a, __m128d __b)
97 static __inline__ __m128d __DEFAULT_FN_ATTRS
98 _mm_sqrt_sd(__m128d __a, __m128d __b)
100 __m128d __c = __builtin_ia32_sqrtsd(__b);
101 return (__m128d) { __c[0], __a[1] };
104 static __inline__ __m128d __DEFAULT_FN_ATTRS
105 _mm_sqrt_pd(__m128d __a)
107 return __builtin_ia32_sqrtpd(__a);
110 static __inline__ __m128d __DEFAULT_FN_ATTRS
111 _mm_min_sd(__m128d __a, __m128d __b)
113 return __builtin_ia32_minsd(__a, __b);
116 static __inline__ __m128d __DEFAULT_FN_ATTRS
117 _mm_min_pd(__m128d __a, __m128d __b)
119 return __builtin_ia32_minpd(__a, __b);
122 static __inline__ __m128d __DEFAULT_FN_ATTRS
123 _mm_max_sd(__m128d __a, __m128d __b)
125 return __builtin_ia32_maxsd(__a, __b);
128 static __inline__ __m128d __DEFAULT_FN_ATTRS
129 _mm_max_pd(__m128d __a, __m128d __b)
131 return __builtin_ia32_maxpd(__a, __b);
134 static __inline__ __m128d __DEFAULT_FN_ATTRS
135 _mm_and_pd(__m128d __a, __m128d __b)
137 return (__m128d)((__v4si)__a & (__v4si)__b);
140 static __inline__ __m128d __DEFAULT_FN_ATTRS
141 _mm_andnot_pd(__m128d __a, __m128d __b)
143 return (__m128d)(~(__v4si)__a & (__v4si)__b);
146 static __inline__ __m128d __DEFAULT_FN_ATTRS
147 _mm_or_pd(__m128d __a, __m128d __b)
149 return (__m128d)((__v4si)__a | (__v4si)__b);
152 static __inline__ __m128d __DEFAULT_FN_ATTRS
153 _mm_xor_pd(__m128d __a, __m128d __b)
155 return (__m128d)((__v4si)__a ^ (__v4si)__b);
158 static __inline__ __m128d __DEFAULT_FN_ATTRS
159 _mm_cmpeq_pd(__m128d __a, __m128d __b)
161 return (__m128d)__builtin_ia32_cmpeqpd(__a, __b);
164 static __inline__ __m128d __DEFAULT_FN_ATTRS
165 _mm_cmplt_pd(__m128d __a, __m128d __b)
167 return (__m128d)__builtin_ia32_cmpltpd(__a, __b);
170 static __inline__ __m128d __DEFAULT_FN_ATTRS
171 _mm_cmple_pd(__m128d __a, __m128d __b)
173 return (__m128d)__builtin_ia32_cmplepd(__a, __b);
176 static __inline__ __m128d __DEFAULT_FN_ATTRS
177 _mm_cmpgt_pd(__m128d __a, __m128d __b)
179 return (__m128d)__builtin_ia32_cmpltpd(__b, __a);
182 static __inline__ __m128d __DEFAULT_FN_ATTRS
183 _mm_cmpge_pd(__m128d __a, __m128d __b)
185 return (__m128d)__builtin_ia32_cmplepd(__b, __a);
188 static __inline__ __m128d __DEFAULT_FN_ATTRS
189 _mm_cmpord_pd(__m128d __a, __m128d __b)
191 return (__m128d)__builtin_ia32_cmpordpd(__a, __b);
194 static __inline__ __m128d __DEFAULT_FN_ATTRS
195 _mm_cmpunord_pd(__m128d __a, __m128d __b)
197 return (__m128d)__builtin_ia32_cmpunordpd(__a, __b);
200 static __inline__ __m128d __DEFAULT_FN_ATTRS
201 _mm_cmpneq_pd(__m128d __a, __m128d __b)
203 return (__m128d)__builtin_ia32_cmpneqpd(__a, __b);
206 static __inline__ __m128d __DEFAULT_FN_ATTRS
207 _mm_cmpnlt_pd(__m128d __a, __m128d __b)
209 return (__m128d)__builtin_ia32_cmpnltpd(__a, __b);
212 static __inline__ __m128d __DEFAULT_FN_ATTRS
213 _mm_cmpnle_pd(__m128d __a, __m128d __b)
215 return (__m128d)__builtin_ia32_cmpnlepd(__a, __b);
218 static __inline__ __m128d __DEFAULT_FN_ATTRS
219 _mm_cmpngt_pd(__m128d __a, __m128d __b)
221 return (__m128d)__builtin_ia32_cmpnltpd(__b, __a);
224 static __inline__ __m128d __DEFAULT_FN_ATTRS
225 _mm_cmpnge_pd(__m128d __a, __m128d __b)
227 return (__m128d)__builtin_ia32_cmpnlepd(__b, __a);
230 static __inline__ __m128d __DEFAULT_FN_ATTRS
231 _mm_cmpeq_sd(__m128d __a, __m128d __b)
233 return (__m128d)__builtin_ia32_cmpeqsd(__a, __b);
236 static __inline__ __m128d __DEFAULT_FN_ATTRS
237 _mm_cmplt_sd(__m128d __a, __m128d __b)
239 return (__m128d)__builtin_ia32_cmpltsd(__a, __b);
242 static __inline__ __m128d __DEFAULT_FN_ATTRS
243 _mm_cmple_sd(__m128d __a, __m128d __b)
245 return (__m128d)__builtin_ia32_cmplesd(__a, __b);
248 static __inline__ __m128d __DEFAULT_FN_ATTRS
249 _mm_cmpgt_sd(__m128d __a, __m128d __b)
251 __m128d __c = __builtin_ia32_cmpltsd(__b, __a);
252 return (__m128d) { __c[0], __a[1] };
255 static __inline__ __m128d __DEFAULT_FN_ATTRS
256 _mm_cmpge_sd(__m128d __a, __m128d __b)
258 __m128d __c = __builtin_ia32_cmplesd(__b, __a);
259 return (__m128d) { __c[0], __a[1] };
262 static __inline__ __m128d __DEFAULT_FN_ATTRS
263 _mm_cmpord_sd(__m128d __a, __m128d __b)
265 return (__m128d)__builtin_ia32_cmpordsd(__a, __b);
268 static __inline__ __m128d __DEFAULT_FN_ATTRS
269 _mm_cmpunord_sd(__m128d __a, __m128d __b)
271 return (__m128d)__builtin_ia32_cmpunordsd(__a, __b);
274 static __inline__ __m128d __DEFAULT_FN_ATTRS
275 _mm_cmpneq_sd(__m128d __a, __m128d __b)
277 return (__m128d)__builtin_ia32_cmpneqsd(__a, __b);
280 static __inline__ __m128d __DEFAULT_FN_ATTRS
281 _mm_cmpnlt_sd(__m128d __a, __m128d __b)
283 return (__m128d)__builtin_ia32_cmpnltsd(__a, __b);
286 static __inline__ __m128d __DEFAULT_FN_ATTRS
287 _mm_cmpnle_sd(__m128d __a, __m128d __b)
289 return (__m128d)__builtin_ia32_cmpnlesd(__a, __b);
292 static __inline__ __m128d __DEFAULT_FN_ATTRS
293 _mm_cmpngt_sd(__m128d __a, __m128d __b)
295 __m128d __c = __builtin_ia32_cmpnltsd(__b, __a);
296 return (__m128d) { __c[0], __a[1] };
299 static __inline__ __m128d __DEFAULT_FN_ATTRS
300 _mm_cmpnge_sd(__m128d __a, __m128d __b)
302 __m128d __c = __builtin_ia32_cmpnlesd(__b, __a);
303 return (__m128d) { __c[0], __a[1] };
306 static __inline__ int __DEFAULT_FN_ATTRS
307 _mm_comieq_sd(__m128d __a, __m128d __b)
309 return __builtin_ia32_comisdeq(__a, __b);
312 static __inline__ int __DEFAULT_FN_ATTRS
313 _mm_comilt_sd(__m128d __a, __m128d __b)
315 return __builtin_ia32_comisdlt(__a, __b);
318 static __inline__ int __DEFAULT_FN_ATTRS
319 _mm_comile_sd(__m128d __a, __m128d __b)
321 return __builtin_ia32_comisdle(__a, __b);
324 static __inline__ int __DEFAULT_FN_ATTRS
325 _mm_comigt_sd(__m128d __a, __m128d __b)
327 return __builtin_ia32_comisdgt(__a, __b);
330 static __inline__ int __DEFAULT_FN_ATTRS
331 _mm_comige_sd(__m128d __a, __m128d __b)
333 return __builtin_ia32_comisdge(__a, __b);
336 static __inline__ int __DEFAULT_FN_ATTRS
337 _mm_comineq_sd(__m128d __a, __m128d __b)
339 return __builtin_ia32_comisdneq(__a, __b);
342 static __inline__ int __DEFAULT_FN_ATTRS
343 _mm_ucomieq_sd(__m128d __a, __m128d __b)
345 return __builtin_ia32_ucomisdeq(__a, __b);
348 static __inline__ int __DEFAULT_FN_ATTRS
349 _mm_ucomilt_sd(__m128d __a, __m128d __b)
351 return __builtin_ia32_ucomisdlt(__a, __b);
354 static __inline__ int __DEFAULT_FN_ATTRS
355 _mm_ucomile_sd(__m128d __a, __m128d __b)
357 return __builtin_ia32_ucomisdle(__a, __b);
360 static __inline__ int __DEFAULT_FN_ATTRS
361 _mm_ucomigt_sd(__m128d __a, __m128d __b)
363 return __builtin_ia32_ucomisdgt(__a, __b);
366 static __inline__ int __DEFAULT_FN_ATTRS
367 _mm_ucomige_sd(__m128d __a, __m128d __b)
369 return __builtin_ia32_ucomisdge(__a, __b);
372 static __inline__ int __DEFAULT_FN_ATTRS
373 _mm_ucomineq_sd(__m128d __a, __m128d __b)
375 return __builtin_ia32_ucomisdneq(__a, __b);
378 static __inline__ __m128 __DEFAULT_FN_ATTRS
379 _mm_cvtpd_ps(__m128d __a)
381 return __builtin_ia32_cvtpd2ps(__a);
384 static __inline__ __m128d __DEFAULT_FN_ATTRS
385 _mm_cvtps_pd(__m128 __a)
387 return __builtin_ia32_cvtps2pd(__a);
390 static __inline__ __m128d __DEFAULT_FN_ATTRS
391 _mm_cvtepi32_pd(__m128i __a)
393 return __builtin_ia32_cvtdq2pd((__v4si)__a);
396 static __inline__ __m128i __DEFAULT_FN_ATTRS
397 _mm_cvtpd_epi32(__m128d __a)
399 return __builtin_ia32_cvtpd2dq(__a);
402 static __inline__ int __DEFAULT_FN_ATTRS
403 _mm_cvtsd_si32(__m128d __a)
405 return __builtin_ia32_cvtsd2si(__a);
408 static __inline__ __m128 __DEFAULT_FN_ATTRS
409 _mm_cvtsd_ss(__m128 __a, __m128d __b)
415 static __inline__ __m128d __DEFAULT_FN_ATTRS
416 _mm_cvtsi32_sd(__m128d __a, int __b)
422 static __inline__ __m128d __DEFAULT_FN_ATTRS
423 _mm_cvtss_sd(__m128d __a, __m128 __b)
429 static __inline__ __m128i __DEFAULT_FN_ATTRS
430 _mm_cvttpd_epi32(__m128d __a)
432 return (__m128i)__builtin_ia32_cvttpd2dq(__a);
435 static __inline__ int __DEFAULT_FN_ATTRS
436 _mm_cvttsd_si32(__m128d __a)
441 static __inline__ __m64 __DEFAULT_FN_ATTRS
442 _mm_cvtpd_pi32(__m128d __a)
444 return (__m64)__builtin_ia32_cvtpd2pi(__a);
447 static __inline__ __m64 __DEFAULT_FN_ATTRS
448 _mm_cvttpd_pi32(__m128d __a)
450 return (__m64)__builtin_ia32_cvttpd2pi(__a);
453 static __inline__ __m128d __DEFAULT_FN_ATTRS
454 _mm_cvtpi32_pd(__m64 __a)
456 return __builtin_ia32_cvtpi2pd((__v2si)__a);
459 static __inline__ double __DEFAULT_FN_ATTRS
460 _mm_cvtsd_f64(__m128d __a)
465 static __inline__ __m128d __DEFAULT_FN_ATTRS
466 _mm_load_pd(double const *__dp)
468 return *(__m128d*)__dp;
/* Reads one double from __dp and returns a vector holding that value in
 * both elements. The read goes through a local struct declared with the
 * __packed__ and __may_alias__ attributes.
 * NOTE(review): the struct-pointer cast drops the const qualifier of __dp. */
471 static __inline__ __m128d __DEFAULT_FN_ATTRS
472 _mm_load1_pd(double const *__dp)
474 struct __mm_load1_pd_struct {
476 } __attribute__((__packed__, __may_alias__));
477 double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
478 return (__m128d){ __u, __u };
/* Alternate spelling: expands to a call of _mm_load1_pd with the same
 * argument. */
481 #define _mm_load_pd1(dp) _mm_load1_pd(dp)
483 static __inline__ __m128d __DEFAULT_FN_ATTRS
484 _mm_loadr_pd(double const *__dp)
486 __m128d __u = *(__m128d*)__dp;
487 return __builtin_shufflevector(__u, __u, 1, 0);
/* Returns the __v member read through __dp reinterpreted as a pointer to
 * the local struct __loadu_pd, which carries the __packed__ and
 * __may_alias__ attributes.
 * NOTE(review): the struct-pointer cast drops the const qualifier of __dp. */
490 static __inline__ __m128d __DEFAULT_FN_ATTRS
491 _mm_loadu_pd(double const *__dp)
495 } __attribute__((__packed__, __may_alias__));
496 return ((struct __loadu_pd*)__dp)->__v;
/* Reads one double from __dp (through a local __packed__, __may_alias__
 * struct) and returns it in element 0, with element 1 set to 0.
 * NOTE(review): the struct-pointer cast drops the const qualifier of __dp. */
499 static __inline__ __m128d __DEFAULT_FN_ATTRS
500 _mm_load_sd(double const *__dp)
502 struct __mm_load_sd_struct {
504 } __attribute__((__packed__, __may_alias__));
505 double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
506 return (__m128d){ __u, 0 };
/* Reads one double from __dp (through a local __packed__, __may_alias__
 * struct) and returns { __a[0], loaded value }: element 1 is replaced,
 * element 0 comes from __a.
 * NOTE(review): the struct-pointer cast drops the const qualifier of __dp. */
509 static __inline__ __m128d __DEFAULT_FN_ATTRS
510 _mm_loadh_pd(__m128d __a, double const *__dp)
512 struct __mm_loadh_pd_struct {
514 } __attribute__((__packed__, __may_alias__));
515 double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
516 return (__m128d){ __a[0], __u };
/* Reads one double from __dp (through a local __packed__, __may_alias__
 * struct) and returns { loaded value, __a[1] }: element 0 is replaced,
 * element 1 comes from __a.
 * NOTE(review): the struct-pointer cast drops the const qualifier of __dp. */
519 static __inline__ __m128d __DEFAULT_FN_ATTRS
520 _mm_loadl_pd(__m128d __a, double const *__dp)
522 struct __mm_loadl_pd_struct {
524 } __attribute__((__packed__, __may_alias__));
525 double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
526 return (__m128d){ __u, __a[1] };
529 static __inline__ __m128d __DEFAULT_FN_ATTRS
530 _mm_set_sd(double __w)
532 return (__m128d){ __w, 0 };
535 static __inline__ __m128d __DEFAULT_FN_ATTRS
536 _mm_set1_pd(double __w)
538 return (__m128d){ __w, __w };
541 static __inline__ __m128d __DEFAULT_FN_ATTRS
542 _mm_set_pd(double __w, double __x)
544 return (__m128d){ __x, __w };
547 static __inline__ __m128d __DEFAULT_FN_ATTRS
548 _mm_setr_pd(double __w, double __x)
550 return (__m128d){ __w, __x };
553 static __inline__ __m128d __DEFAULT_FN_ATTRS
556 return (__m128d){ 0, 0 };
559 static __inline__ __m128d __DEFAULT_FN_ATTRS
560 _mm_move_sd(__m128d __a, __m128d __b)
562 return (__m128d){ __b[0], __a[1] };
/* Stores element 0 of __a to *__dp. The write goes through a local struct
 * declared with the __packed__ and __may_alias__ attributes. */
565 static __inline__ void __DEFAULT_FN_ATTRS
566 _mm_store_sd(double *__dp, __m128d __a)
568 struct __mm_store_sd_struct {
570 } __attribute__((__packed__, __may_alias__));
571 ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
/* Stores element 0 of __a into both __u[0] and __u[1] of a local
 * __packed__, __may_alias__ struct overlaid on __dp, i.e. writes the same
 * value to two consecutive slots starting at __dp. */
574 static __inline__ void __DEFAULT_FN_ATTRS
575 _mm_store1_pd(double *__dp, __m128d __a)
577 struct __mm_store1_pd_struct {
579 } __attribute__((__packed__, __may_alias__));
580 ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
581 ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
584 static __inline__ void __DEFAULT_FN_ATTRS
585 _mm_store_pd(double *__dp, __m128d __a)
587 *(__m128d *)__dp = __a;
590 static __inline__ void __DEFAULT_FN_ATTRS
591 _mm_storeu_pd(double *__dp, __m128d __a)
593 __builtin_ia32_storeupd(__dp, __a);
596 static __inline__ void __DEFAULT_FN_ATTRS
597 _mm_storer_pd(double *__dp, __m128d __a)
599 __a = __builtin_shufflevector(__a, __a, 1, 0);
600 *(__m128d *)__dp = __a;
/* Stores element 1 of __a to *__dp. The write goes through a local struct
 * declared with the __packed__ and __may_alias__ attributes. */
603 static __inline__ void __DEFAULT_FN_ATTRS
604 _mm_storeh_pd(double *__dp, __m128d __a)
606 struct __mm_storeh_pd_struct {
608 } __attribute__((__packed__, __may_alias__));
609 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
/* Stores element 0 of __a to *__dp. The write goes through a local struct
 * declared with the __packed__ and __may_alias__ attributes.
 * NOTE(review): the local struct tag is __mm_storeh_pd_struct, the same tag
 * _mm_storeh_pd uses; it looks copy-pasted. The tag is local to this
 * function so behavior is unaffected, but __mm_storel_pd_struct would match
 * the naming used by the other helpers. */
612 static __inline__ void __DEFAULT_FN_ATTRS
613 _mm_storel_pd(double *__dp, __m128d __a)
615 struct __mm_storeh_pd_struct {
617 } __attribute__((__packed__, __may_alias__));
618 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
621 static __inline__ __m128i __DEFAULT_FN_ATTRS
622 _mm_add_epi8(__m128i __a, __m128i __b)
624 return (__m128i)((__v16qi)__a + (__v16qi)__b);
627 static __inline__ __m128i __DEFAULT_FN_ATTRS
628 _mm_add_epi16(__m128i __a, __m128i __b)
630 return (__m128i)((__v8hi)__a + (__v8hi)__b);
633 static __inline__ __m128i __DEFAULT_FN_ATTRS
634 _mm_add_epi32(__m128i __a, __m128i __b)
636 return (__m128i)((__v4si)__a + (__v4si)__b);
639 static __inline__ __m64 __DEFAULT_FN_ATTRS
640 _mm_add_si64(__m64 __a, __m64 __b)
645 static __inline__ __m128i __DEFAULT_FN_ATTRS
646 _mm_add_epi64(__m128i __a, __m128i __b)
651 static __inline__ __m128i __DEFAULT_FN_ATTRS
652 _mm_adds_epi8(__m128i __a, __m128i __b)
654 return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
657 static __inline__ __m128i __DEFAULT_FN_ATTRS
658 _mm_adds_epi16(__m128i __a, __m128i __b)
660 return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
663 static __inline__ __m128i __DEFAULT_FN_ATTRS
664 _mm_adds_epu8(__m128i __a, __m128i __b)
666 return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
669 static __inline__ __m128i __DEFAULT_FN_ATTRS
670 _mm_adds_epu16(__m128i __a, __m128i __b)
672 return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
675 static __inline__ __m128i __DEFAULT_FN_ATTRS
676 _mm_avg_epu8(__m128i __a, __m128i __b)
678 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
681 static __inline__ __m128i __DEFAULT_FN_ATTRS
682 _mm_avg_epu16(__m128i __a, __m128i __b)
684 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
687 static __inline__ __m128i __DEFAULT_FN_ATTRS
688 _mm_madd_epi16(__m128i __a, __m128i __b)
690 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
693 static __inline__ __m128i __DEFAULT_FN_ATTRS
694 _mm_max_epi16(__m128i __a, __m128i __b)
696 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
699 static __inline__ __m128i __DEFAULT_FN_ATTRS
700 _mm_max_epu8(__m128i __a, __m128i __b)
702 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
705 static __inline__ __m128i __DEFAULT_FN_ATTRS
706 _mm_min_epi16(__m128i __a, __m128i __b)
708 return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
711 static __inline__ __m128i __DEFAULT_FN_ATTRS
712 _mm_min_epu8(__m128i __a, __m128i __b)
714 return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
717 static __inline__ __m128i __DEFAULT_FN_ATTRS
718 _mm_mulhi_epi16(__m128i __a, __m128i __b)
720 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
723 static __inline__ __m128i __DEFAULT_FN_ATTRS
724 _mm_mulhi_epu16(__m128i __a, __m128i __b)
726 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
729 static __inline__ __m128i __DEFAULT_FN_ATTRS
730 _mm_mullo_epi16(__m128i __a, __m128i __b)
732 return (__m128i)((__v8hi)__a * (__v8hi)__b);
735 static __inline__ __m64 __DEFAULT_FN_ATTRS
736 _mm_mul_su32(__m64 __a, __m64 __b)
738 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
741 static __inline__ __m128i __DEFAULT_FN_ATTRS
742 _mm_mul_epu32(__m128i __a, __m128i __b)
744 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
747 static __inline__ __m128i __DEFAULT_FN_ATTRS
748 _mm_sad_epu8(__m128i __a, __m128i __b)
750 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
753 static __inline__ __m128i __DEFAULT_FN_ATTRS
754 _mm_sub_epi8(__m128i __a, __m128i __b)
756 return (__m128i)((__v16qi)__a - (__v16qi)__b);
759 static __inline__ __m128i __DEFAULT_FN_ATTRS
760 _mm_sub_epi16(__m128i __a, __m128i __b)
762 return (__m128i)((__v8hi)__a - (__v8hi)__b);
765 static __inline__ __m128i __DEFAULT_FN_ATTRS
766 _mm_sub_epi32(__m128i __a, __m128i __b)
768 return (__m128i)((__v4si)__a - (__v4si)__b);
771 static __inline__ __m64 __DEFAULT_FN_ATTRS
772 _mm_sub_si64(__m64 __a, __m64 __b)
777 static __inline__ __m128i __DEFAULT_FN_ATTRS
778 _mm_sub_epi64(__m128i __a, __m128i __b)
783 static __inline__ __m128i __DEFAULT_FN_ATTRS
784 _mm_subs_epi8(__m128i __a, __m128i __b)
786 return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
789 static __inline__ __m128i __DEFAULT_FN_ATTRS
790 _mm_subs_epi16(__m128i __a, __m128i __b)
792 return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
795 static __inline__ __m128i __DEFAULT_FN_ATTRS
796 _mm_subs_epu8(__m128i __a, __m128i __b)
798 return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
801 static __inline__ __m128i __DEFAULT_FN_ATTRS
802 _mm_subs_epu16(__m128i __a, __m128i __b)
804 return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
807 static __inline__ __m128i __DEFAULT_FN_ATTRS
808 _mm_and_si128(__m128i __a, __m128i __b)
813 static __inline__ __m128i __DEFAULT_FN_ATTRS
814 _mm_andnot_si128(__m128i __a, __m128i __b)
819 static __inline__ __m128i __DEFAULT_FN_ATTRS
820 _mm_or_si128(__m128i __a, __m128i __b)
825 static __inline__ __m128i __DEFAULT_FN_ATTRS
826 _mm_xor_si128(__m128i __a, __m128i __b)
/* Byte-wise left shift of `a` by `imm` bytes, built as a 16-element byte
 * shuffle whose first source is an all-zero vector (indices 0..15) and
 * whose second source is `a` (indices 16..31).
 * - If any of bits 4..7 of imm is set, every index is 0, which selects a
 *   byte of the zero vector, so the result is all zero.
 * - Otherwise result byte k uses index 16 + k - (imm & 0xF): an index below
 *   16 picks a zero byte, an index of 16 or more picks byte
 *   k - (imm & 0xF) of `a`.
 * imm is expanded into every index expression, and
 * __builtin_shufflevector requires constant indices, so imm must be a
 * compile-time constant. */
831 #define _mm_slli_si128(a, imm) __extension__ ({ \
832 (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(), \
833 (__v16qi)(__m128i)(a), \
834 ((imm)&0xF0) ? 0 : 16 - ((imm)&0xF), \
835 ((imm)&0xF0) ? 0 : 17 - ((imm)&0xF), \
836 ((imm)&0xF0) ? 0 : 18 - ((imm)&0xF), \
837 ((imm)&0xF0) ? 0 : 19 - ((imm)&0xF), \
838 ((imm)&0xF0) ? 0 : 20 - ((imm)&0xF), \
839 ((imm)&0xF0) ? 0 : 21 - ((imm)&0xF), \
840 ((imm)&0xF0) ? 0 : 22 - ((imm)&0xF), \
841 ((imm)&0xF0) ? 0 : 23 - ((imm)&0xF), \
842 ((imm)&0xF0) ? 0 : 24 - ((imm)&0xF), \
843 ((imm)&0xF0) ? 0 : 25 - ((imm)&0xF), \
844 ((imm)&0xF0) ? 0 : 26 - ((imm)&0xF), \
845 ((imm)&0xF0) ? 0 : 27 - ((imm)&0xF), \
846 ((imm)&0xF0) ? 0 : 28 - ((imm)&0xF), \
847 ((imm)&0xF0) ? 0 : 29 - ((imm)&0xF), \
848 ((imm)&0xF0) ? 0 : 30 - ((imm)&0xF), \
849 ((imm)&0xF0) ? 0 : 31 - ((imm)&0xF)); })
/* Alternate spelling: forwards both arguments to _mm_slli_si128. */
#define _mm_bslli_si128(a, imm) _mm_slli_si128((a), (imm))
854 static __inline__ __m128i __DEFAULT_FN_ATTRS
855 _mm_slli_epi16(__m128i __a, int __count)
857 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
860 static __inline__ __m128i __DEFAULT_FN_ATTRS
861 _mm_sll_epi16(__m128i __a, __m128i __count)
863 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
866 static __inline__ __m128i __DEFAULT_FN_ATTRS
867 _mm_slli_epi32(__m128i __a, int __count)
869 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
872 static __inline__ __m128i __DEFAULT_FN_ATTRS
873 _mm_sll_epi32(__m128i __a, __m128i __count)
875 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
878 static __inline__ __m128i __DEFAULT_FN_ATTRS
879 _mm_slli_epi64(__m128i __a, int __count)
881 return __builtin_ia32_psllqi128(__a, __count);
884 static __inline__ __m128i __DEFAULT_FN_ATTRS
885 _mm_sll_epi64(__m128i __a, __m128i __count)
887 return __builtin_ia32_psllq128(__a, __count);
890 static __inline__ __m128i __DEFAULT_FN_ATTRS
891 _mm_srai_epi16(__m128i __a, int __count)
893 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
896 static __inline__ __m128i __DEFAULT_FN_ATTRS
897 _mm_sra_epi16(__m128i __a, __m128i __count)
899 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
902 static __inline__ __m128i __DEFAULT_FN_ATTRS
903 _mm_srai_epi32(__m128i __a, int __count)
905 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
908 static __inline__ __m128i __DEFAULT_FN_ATTRS
909 _mm_sra_epi32(__m128i __a, __m128i __count)
911 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
/* Byte-wise right shift of `a` by `imm` bytes, built as a 16-element byte
 * shuffle whose first source is `a` (indices 0..15) and whose second source
 * is an all-zero vector (indices 16..31).
 * - If any of bits 4..7 of imm is set, every index is 16, which selects a
 *   byte of the zero vector, so the result is all zero.
 * - Otherwise result byte k uses index (imm & 0xF) + k: an index below 16
 *   picks that byte of `a`, an index of 16 or more picks a zero byte.
 * imm is expanded into every index expression, and
 * __builtin_shufflevector requires constant indices, so imm must be a
 * compile-time constant. */
914 #define _mm_srli_si128(a, imm) __extension__ ({ \
915 (__m128i)__builtin_shufflevector((__v16qi)(__m128i)(a), \
916 (__v16qi)_mm_setzero_si128(), \
917 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 0, \
918 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 1, \
919 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 2, \
920 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 3, \
921 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 4, \
922 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 5, \
923 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 6, \
924 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 7, \
925 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 8, \
926 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 9, \
927 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 10, \
928 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 11, \
929 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 12, \
930 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 13, \
931 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 14, \
932 ((imm)&0xF0) ? 16 : ((imm)&0xF) + 15); })
/* Alternate spelling: forwards both arguments to _mm_srli_si128. */
#define _mm_bsrli_si128(a, imm) _mm_srli_si128((a), (imm))
937 static __inline__ __m128i __DEFAULT_FN_ATTRS
938 _mm_srli_epi16(__m128i __a, int __count)
940 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
943 static __inline__ __m128i __DEFAULT_FN_ATTRS
944 _mm_srl_epi16(__m128i __a, __m128i __count)
946 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
949 static __inline__ __m128i __DEFAULT_FN_ATTRS
950 _mm_srli_epi32(__m128i __a, int __count)
952 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
955 static __inline__ __m128i __DEFAULT_FN_ATTRS
956 _mm_srl_epi32(__m128i __a, __m128i __count)
958 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
961 static __inline__ __m128i __DEFAULT_FN_ATTRS
962 _mm_srli_epi64(__m128i __a, int __count)
964 return __builtin_ia32_psrlqi128(__a, __count);
967 static __inline__ __m128i __DEFAULT_FN_ATTRS
968 _mm_srl_epi64(__m128i __a, __m128i __count)
970 return __builtin_ia32_psrlq128(__a, __count);
973 static __inline__ __m128i __DEFAULT_FN_ATTRS
974 _mm_cmpeq_epi8(__m128i __a, __m128i __b)
976 return (__m128i)((__v16qi)__a == (__v16qi)__b);
979 static __inline__ __m128i __DEFAULT_FN_ATTRS
980 _mm_cmpeq_epi16(__m128i __a, __m128i __b)
982 return (__m128i)((__v8hi)__a == (__v8hi)__b);
985 static __inline__ __m128i __DEFAULT_FN_ATTRS
986 _mm_cmpeq_epi32(__m128i __a, __m128i __b)
988 return (__m128i)((__v4si)__a == (__v4si)__b);
991 static __inline__ __m128i __DEFAULT_FN_ATTRS
992 _mm_cmpgt_epi8(__m128i __a, __m128i __b)
994 /* This function always performs a signed comparison, but __v16qi is a char
995 which may be signed or unsigned. */
996 typedef signed char __v16qs __attribute__((__vector_size__(16)));
997 return (__m128i)((__v16qs)__a > (__v16qs)__b);
1000 static __inline__ __m128i __DEFAULT_FN_ATTRS
1001 _mm_cmpgt_epi16(__m128i __a, __m128i __b)
1003 return (__m128i)((__v8hi)__a > (__v8hi)__b);
1006 static __inline__ __m128i __DEFAULT_FN_ATTRS
1007 _mm_cmpgt_epi32(__m128i __a, __m128i __b)
1009 return (__m128i)((__v4si)__a > (__v4si)__b);
1012 static __inline__ __m128i __DEFAULT_FN_ATTRS
1013 _mm_cmplt_epi8(__m128i __a, __m128i __b)
1015 return _mm_cmpgt_epi8(__b, __a);
1018 static __inline__ __m128i __DEFAULT_FN_ATTRS
1019 _mm_cmplt_epi16(__m128i __a, __m128i __b)
1021 return _mm_cmpgt_epi16(__b, __a);
1024 static __inline__ __m128i __DEFAULT_FN_ATTRS
1025 _mm_cmplt_epi32(__m128i __a, __m128i __b)
1027 return _mm_cmpgt_epi32(__b, __a);
1031 static __inline__ __m128d __DEFAULT_FN_ATTRS
1032 _mm_cvtsi64_sd(__m128d __a, long long __b)
1038 static __inline__ long long __DEFAULT_FN_ATTRS
1039 _mm_cvtsd_si64(__m128d __a)
1041 return __builtin_ia32_cvtsd2si64(__a);
1044 static __inline__ long long __DEFAULT_FN_ATTRS
1045 _mm_cvttsd_si64(__m128d __a)
1051 static __inline__ __m128 __DEFAULT_FN_ATTRS
1052 _mm_cvtepi32_ps(__m128i __a)
1054 return __builtin_ia32_cvtdq2ps((__v4si)__a);
1057 static __inline__ __m128i __DEFAULT_FN_ATTRS
1058 _mm_cvtps_epi32(__m128 __a)
1060 return (__m128i)__builtin_ia32_cvtps2dq(__a);
1063 static __inline__ __m128i __DEFAULT_FN_ATTRS
1064 _mm_cvttps_epi32(__m128 __a)
1066 return (__m128i)__builtin_ia32_cvttps2dq(__a);
1069 static __inline__ __m128i __DEFAULT_FN_ATTRS
1070 _mm_cvtsi32_si128(int __a)
1072 return (__m128i)(__v4si){ __a, 0, 0, 0 };
1076 static __inline__ __m128i __DEFAULT_FN_ATTRS
1077 _mm_cvtsi64_si128(long long __a)
1079 return (__m128i){ __a, 0 };
1083 static __inline__ int __DEFAULT_FN_ATTRS
1084 _mm_cvtsi128_si32(__m128i __a)
1086 __v4si __b = (__v4si)__a;
1091 static __inline__ long long __DEFAULT_FN_ATTRS
1092 _mm_cvtsi128_si64(__m128i __a)
1098 static __inline__ __m128i __DEFAULT_FN_ATTRS
1099 _mm_load_si128(__m128i const *__p)
/* Returns the __v member read through __p reinterpreted as a pointer to the
 * local struct __loadu_si128, which carries the __packed__ and
 * __may_alias__ attributes.
 * NOTE(review): the struct-pointer cast drops the const qualifier of __p. */
1104 static __inline__ __m128i __DEFAULT_FN_ATTRS
1105 _mm_loadu_si128(__m128i const *__p)
1107 struct __loadu_si128 {
1109 } __attribute__((__packed__, __may_alias__));
1110 return ((struct __loadu_si128*)__p)->__v;
/* Reads the __u member through __p (via a local __packed__, __may_alias__
 * struct) and returns it as element 0 of the result, with element 1 set
 * to 0.
 * NOTE(review): the struct-pointer cast drops the const qualifier of __p. */
1113 static __inline__ __m128i __DEFAULT_FN_ATTRS
1114 _mm_loadl_epi64(__m128i const *__p)
1116 struct __mm_loadl_epi64_struct {
1118 } __attribute__((__packed__, __may_alias__));
1119 return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
1122 static __inline__ __m128i __DEFAULT_FN_ATTRS
1123 _mm_set_epi64x(long long q1, long long q0)
1125 return (__m128i){ q0, q1 };
1128 static __inline__ __m128i __DEFAULT_FN_ATTRS
1129 _mm_set_epi64(__m64 q1, __m64 q0)
1131 return (__m128i){ (long long)q0, (long long)q1 };
1134 static __inline__ __m128i __DEFAULT_FN_ATTRS
1135 _mm_set_epi32(int i3, int i2, int i1, int i0)
1137 return (__m128i)(__v4si){ i0, i1, i2, i3};
1140 static __inline__ __m128i __DEFAULT_FN_ATTRS
1141 _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0)
1143 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1146 static __inline__ __m128i __DEFAULT_FN_ATTRS
1147 _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0)
1149 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1152 static __inline__ __m128i __DEFAULT_FN_ATTRS
1153 _mm_set1_epi64x(long long __q)
1155 return (__m128i){ __q, __q };
1158 static __inline__ __m128i __DEFAULT_FN_ATTRS
1159 _mm_set1_epi64(__m64 __q)
1161 return (__m128i){ (long long)__q, (long long)__q };
1164 static __inline__ __m128i __DEFAULT_FN_ATTRS
1165 _mm_set1_epi32(int __i)
1167 return (__m128i)(__v4si){ __i, __i, __i, __i };
1170 static __inline__ __m128i __DEFAULT_FN_ATTRS
1171 _mm_set1_epi16(short __w)
1173 return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
1176 static __inline__ __m128i __DEFAULT_FN_ATTRS
1177 _mm_set1_epi8(char __b)
1179 return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
1182 static __inline__ __m128i __DEFAULT_FN_ATTRS
1183 _mm_setr_epi64(__m64 q0, __m64 q1)
1185 return (__m128i){ (long long)q0, (long long)q1 };
1188 static __inline__ __m128i __DEFAULT_FN_ATTRS
1189 _mm_setr_epi32(int i0, int i1, int i2, int i3)
1191 return (__m128i)(__v4si){ i0, i1, i2, i3};
1194 static __inline__ __m128i __DEFAULT_FN_ATTRS
1195 _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7)
1197 return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
1200 static __inline__ __m128i __DEFAULT_FN_ATTRS
1201 _mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
1203 return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15 };
1206 static __inline__ __m128i __DEFAULT_FN_ATTRS
1207 _mm_setzero_si128(void)
1209 return (__m128i){ 0LL, 0LL };
1212 static __inline__ void __DEFAULT_FN_ATTRS
1213 _mm_store_si128(__m128i *__p, __m128i __b)
1218 static __inline__ void __DEFAULT_FN_ATTRS
1219 _mm_storeu_si128(__m128i *__p, __m128i __b)
1221 __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
1224 static __inline__ void __DEFAULT_FN_ATTRS
1225 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
1227 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
/* Stores element 0 of __a (a long long, per the __m128i typedef) to the
 * __u member of a local __packed__, __may_alias__ struct overlaid on __p. */
1230 static __inline__ void __DEFAULT_FN_ATTRS
1231 _mm_storel_epi64(__m128i *__p, __m128i __a)
1233 struct __mm_storel_epi64_struct {
1235 } __attribute__((__packed__, __may_alias__));
1236 ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
1239 static __inline__ void __DEFAULT_FN_ATTRS
1240 _mm_stream_pd(double *__p, __m128d __a)
1242 __builtin_ia32_movntpd(__p, __a);
1245 static __inline__ void __DEFAULT_FN_ATTRS
1246 _mm_stream_si128(__m128i *__p, __m128i __a)
1248 __builtin_ia32_movntdq(__p, __a);
1251 static __inline__ void __DEFAULT_FN_ATTRS
1252 _mm_stream_si32(int *__p, int __a)
1254 __builtin_ia32_movnti(__p, __a);
1258 static __inline__ void __DEFAULT_FN_ATTRS
1259 _mm_stream_si64(long long *__p, long long __a)
1261 __builtin_ia32_movnti64(__p, __a);
1265 static __inline__ void __DEFAULT_FN_ATTRS
1266 _mm_clflush(void const *__p)
1268 __builtin_ia32_clflush(__p);
1271 static __inline__ void __DEFAULT_FN_ATTRS
1274 __builtin_ia32_lfence();
1277 static __inline__ void __DEFAULT_FN_ATTRS
1280 __builtin_ia32_mfence();
1283 static __inline__ __m128i __DEFAULT_FN_ATTRS
1284 _mm_packs_epi16(__m128i __a, __m128i __b)
1286 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
1289 static __inline__ __m128i __DEFAULT_FN_ATTRS
1290 _mm_packs_epi32(__m128i __a, __m128i __b)
1292 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
1295 static __inline__ __m128i __DEFAULT_FN_ATTRS
1296 _mm_packus_epi16(__m128i __a, __m128i __b)
1298 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
1301 static __inline__ int __DEFAULT_FN_ATTRS
1302 _mm_extract_epi16(__m128i __a, int __imm)
1304 __v8hi __b = (__v8hi)__a;
1305 return (unsigned short)__b[__imm & 7];
/* Returns a copy of __a with one 16-bit element replaced by __b.
 * __a is copied into a local __v8hi (eight shorts); element __imm & 7 is
 * overwritten with __b, which is implicitly converted from int to the short
 * element type.  Only the low three bits of __imm select the element.  The
 * modified vector is returned cast back to __m128i. */
1308 static __inline__ __m128i __DEFAULT_FN_ATTRS
1309 _mm_insert_epi16(__m128i __a, int __b, int __imm)
1311 __v8hi __c = (__v8hi)__a;
1312 __c[__imm & 7] = __b;
1313 return (__m128i)__c;
/* Reinterprets __a as __v16qi (sixteen chars) and returns the int produced
 * by __builtin_ia32_pmovmskb128.
 * NOTE(review): presumably a 16-bit mask built from the most significant
 * bit of each byte -- confirm against the builtin's documentation. */
1316 static __inline__ int __DEFAULT_FN_ATTRS
1317 _mm_movemask_epi8(__m128i __a)
1319 return __builtin_ia32_pmovmskb128((__v16qi)__a);
/* Rearranges the four elements of (a), viewed as __v4si, under control of
 * (imm).  Result element k takes the source element whose index is the 2-bit
 * field of (imm) at bits [2k+1:2k].  The second shuffle operand is a zero
 * vector from _mm_set1_epi32(0), but every computed index is in 0..3, so
 * only elements of (a) are ever selected.  (imm) is expanded four times and
 * supplies __builtin_shufflevector indices, so it must be a constant
 * expression. */
1322 #define _mm_shuffle_epi32(a, imm) __extension__ ({ \
1323 (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
1324 (__v4si)_mm_set1_epi32(0), \
1325 (imm) & 0x3, ((imm) & 0xc) >> 2, \
1326 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
/* Shuffles (a), viewed as __v8hi, under control of (imm).  The first four
 * shuffle indices are the 2-bit fields of (imm) at bits [1:0], [3:2], [5:4]
 * and [7:6], each in 0..3, so result elements 0..3 are drawn from elements
 * 0..3 of (a).  The second shuffle operand is a zero vector from
 * _mm_set1_epi16(0).
 * NOTE(review): the final continuation line of this macro (the remaining
 * four indices and the closing tokens) is not present in this listing;
 * presumably it passes elements 4..7 through unchanged -- confirm. */
1328 #define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
1329 (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
1330 (__v8hi)_mm_set1_epi16(0), \
1331 (imm) & 0x3, ((imm) & 0xc) >> 2, \
1332 ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
/* Shuffles (a), viewed as __v8hi, under control of (imm).  The four visible
 * shuffle indices are 4 plus the 2-bit fields of (imm) at bits [1:0], [3:2],
 * [5:4] and [7:6], i.e. each is in 4..7 and selects from the upper four
 * elements of (a).  The second shuffle operand is a zero vector from
 * _mm_set1_epi16(0).
 * NOTE(review): one line of the index list (between the zero-vector operand
 * and the first visible index) is not present in this listing; presumably it
 * passes elements 0..3 through unchanged -- confirm. */
1335 #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
1336 (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
1337 (__v8hi)_mm_set1_epi16(0), \
1339 4 + (((imm) & 0x03) >> 0), \
1340 4 + (((imm) & 0x0c) >> 2), \
1341 4 + (((imm) & 0x30) >> 4), \
1342 4 + (((imm) & 0xc0) >> 6)); })
/* Interleaves the upper eight bytes of __a and __b.
 * Both operands are viewed as __v16qi.  The shuffle indices alternate k and
 * 16+k for k = 8..15, so the result is __a[8], __b[8], __a[9], __b[9], ...,
 * __a[15], __b[15] (indices of 16 and above select from the second
 * operand). */
1344 static __inline__ __m128i __DEFAULT_FN_ATTRS
1345 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
1347 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
/* Interleaves the upper four 16-bit elements of __a and __b.
 * Both operands are viewed as __v8hi.  The shuffle indices alternate k and
 * 8+k for k = 4..7, so the result is __a[4], __b[4], __a[5], __b[5],
 * __a[6], __b[6], __a[7], __b[7] (indices of 8 and above select from the
 * second operand). */
1350 static __inline__ __m128i __DEFAULT_FN_ATTRS
1351 _mm_unpackhi_epi16(__m128i __a, __m128i __b)
1353 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
/* Interleaves the upper two elements of __a and __b, both viewed as __v4si.
 * The shuffle indices are 2, 4+2, 3, 4+3, so the result is
 * __a[2], __b[2], __a[3], __b[3] (indices of 4 and above select from the
 * second operand). */
1356 static __inline__ __m128i __DEFAULT_FN_ATTRS
1357 _mm_unpackhi_epi32(__m128i __a, __m128i __b)
1359 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
/* Combines the upper 64-bit elements of __a and __b.
 * The operands are shuffled as two-element long long vectors with indices
 * 1 and 2+1, so the result is { __a[1], __b[1] }. */
1362 static __inline__ __m128i __DEFAULT_FN_ATTRS
1363 _mm_unpackhi_epi64(__m128i __a, __m128i __b)
1365 return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
/* Interleaves the lower eight bytes of __a and __b.
 * Both operands are viewed as __v16qi.  The shuffle indices alternate k and
 * 16+k for k = 0..7, so the result is __a[0], __b[0], __a[1], __b[1], ...,
 * __a[7], __b[7] (indices of 16 and above select from the second
 * operand). */
1368 static __inline__ __m128i __DEFAULT_FN_ATTRS
1369 _mm_unpacklo_epi8(__m128i __a, __m128i __b)
1371 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
/* Interleaves the lower four 16-bit elements of __a and __b.
 * Both operands are viewed as __v8hi.  The shuffle indices alternate k and
 * 8+k for k = 0..3, so the result is __a[0], __b[0], __a[1], __b[1],
 * __a[2], __b[2], __a[3], __b[3] (indices of 8 and above select from the
 * second operand). */
1374 static __inline__ __m128i __DEFAULT_FN_ATTRS
1375 _mm_unpacklo_epi16(__m128i __a, __m128i __b)
1377 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
/* Interleaves the lower two elements of __a and __b, both viewed as __v4si.
 * The shuffle indices are 0, 4+0, 1, 4+1, so the result is
 * __a[0], __b[0], __a[1], __b[1] (indices of 4 and above select from the
 * second operand). */
1380 static __inline__ __m128i __DEFAULT_FN_ATTRS
1381 _mm_unpacklo_epi32(__m128i __a, __m128i __b)
1383 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
/* Combines the lower 64-bit elements of __a and __b.
 * The operands are shuffled as two-element long long vectors with indices
 * 0 and 2+0, so the result is { __a[0], __b[0] }. */
1386 static __inline__ __m128i __DEFAULT_FN_ATTRS
1387 _mm_unpacklo_epi64(__m128i __a, __m128i __b)
1389 return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
/* Returns element 0 of __a (its first long long lane) cast to __m64.
 * Element 1 of __a is not used.
 * NOTE(review): __m64 is declared outside this listing; the cast is assumed
 * to be a same-size reinterpretation of the 64-bit value -- confirm. */
1392 static __inline__ __m64 __DEFAULT_FN_ATTRS
1393 _mm_movepi64_pi64(__m128i __a)
1395 return (__m64)__a[0];
/* Builds an __m128i whose element 0 is __a converted with a (long long)
 * cast and whose element 1 is 0.
 * NOTE(review): __m64 is declared outside this listing; the cast is assumed
 * to be a same-size reinterpretation of the 64-bit value -- confirm. */
1398 static __inline__ __m128i __DEFAULT_FN_ATTRS
1399 _mm_movpi64_epi64(__m64 __a)
1401 return (__m128i){ (long long)__a, 0 };
/* Returns __a with its upper 64-bit element cleared.
 * __a is shuffled against a zero vector, (__m128i){ 0 }, with indices 0 and
 * 2: index 0 selects __a[0] and index 2 selects element 0 of the zero
 * vector, so the result is { __a[0], 0 }. */
1404 static __inline__ __m128i __DEFAULT_FN_ATTRS
1405 _mm_move_epi64(__m128i __a)
1407 return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
/* Combines the upper doubles of __a and __b.
 * The shuffle indices are 1 and 2+1, so the result is { __a[1], __b[1] }. */
1410 static __inline__ __m128d __DEFAULT_FN_ATTRS
1411 _mm_unpackhi_pd(__m128d __a, __m128d __b)
1413 return __builtin_shufflevector(__a, __b, 1, 2+1);
/* Combines the lower doubles of __a and __b.
 * The shuffle indices are 0 and 2+0, so the result is { __a[0], __b[0] }. */
1416 static __inline__ __m128d __DEFAULT_FN_ATTRS
1417 _mm_unpacklo_pd(__m128d __a, __m128d __b)
1419 return __builtin_shufflevector(__a, __b, 0, 2+0);
/* Returns the int produced by passing __a unchanged to
 * __builtin_ia32_movmskpd.
 * NOTE(review): presumably a 2-bit mask built from the sign bits of the two
 * doubles -- confirm against the builtin's documentation. */
1422 static __inline__ int __DEFAULT_FN_ATTRS
1423 _mm_movemask_pd(__m128d __a)
1425 return __builtin_ia32_movmskpd(__a);
/* Builds a two-double vector from one element of (a) and one element of (b).
 * Result element 0 is (a)[bit 0 of (i)]; result element 1 is (b)[bit 1 of
 * (i)], since adding 2 to the index selects from the second shuffle operand.
 * (i) is expanded twice and supplies __builtin_shufflevector indices, so it
 * must be a constant expression. */
1428 #define _mm_shuffle_pd(a, b, i) __extension__ ({ \
1429 __builtin_shufflevector((__m128d)(a), (__m128d)(b), \
1430 (i) & 1, (((i) & 2) >> 1) + 2); })
/* Takes an __m128d and returns an __m128.
 * NOTE(review): the body of this function is not present in this listing;
 * presumably it is a plain vector cast of __a to __m128, like the sibling
 * cast helpers here -- confirm against the complete header. */
1432 static __inline__ __m128 __DEFAULT_FN_ATTRS
1433 _mm_castpd_ps(__m128d __a)
/* Returns __a cast from __m128d to __m128i.  Both are 16-byte vector types,
 * so the cast reinterprets the same 128 bits rather than converting the
 * double values to integers. */
1438 static __inline__ __m128i __DEFAULT_FN_ATTRS
1439 _mm_castpd_si128(__m128d __a)
1441 return (__m128i)__a;
/* Returns __a cast from __m128 to __m128d.
 * NOTE(review): __m128 is declared outside this listing; assuming it is also
 * a 16-byte vector type, the cast reinterprets the bits without any value
 * conversion -- confirm. */
1444 static __inline__ __m128d __DEFAULT_FN_ATTRS
1445 _mm_castps_pd(__m128 __a)
1447 return (__m128d)__a;
/* Returns __a cast from __m128 to __m128i.
 * NOTE(review): __m128 is declared outside this listing; assuming it is also
 * a 16-byte vector type, the cast reinterprets the bits without any value
 * conversion -- confirm. */
1450 static __inline__ __m128i __DEFAULT_FN_ATTRS
1451 _mm_castps_si128(__m128 __a)
1453 return (__m128i)__a;
/* Takes an __m128i and returns an __m128.
 * NOTE(review): the body of this function is not present in this listing;
 * presumably it is a plain vector cast of __a to __m128, like the sibling
 * cast helpers here -- confirm against the complete header. */
1456 static __inline__ __m128 __DEFAULT_FN_ATTRS
1457 _mm_castsi128_ps(__m128i __a)
/* Returns __a cast from __m128i to __m128d.  Both are 16-byte vector types,
 * so the cast reinterprets the same 128 bits rather than converting the
 * integer values to doubles. */
1462 static __inline__ __m128d __DEFAULT_FN_ATTRS
1463 _mm_castsi128_pd(__m128i __a)
1465 return (__m128d)__a;
/* Void inline wrapper whose visible body emits the "pause" instruction via a
 * volatile inline-asm statement with no operands and no clobbers.
 * NOTE(review): the declarator line (function name and parameter list) is
 * not present in this listing; presumably this is _mm_pause(void) --
 * confirm against the complete header. */
1468 static __inline__ void __DEFAULT_FN_ATTRS
1471 __asm__ volatile ("pause");
1474 #undef __DEFAULT_FN_ATTRS
/* Combines two values into one by placing (x) shifted left by one bit and
 * OR-ing in (y): with x and y each 0 or 1, x lands in bit 1 and y in bit 0.
 * NOTE(review): presumably intended to build the selector argument of
 * _mm_shuffle_pd, which reads exactly those two bits -- confirm. */
1476 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
1478 #endif /* __SSE2__ */
1480 #endif /* __EMMINTRIN_H */