/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */
#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the PowerPC target doesn't support a native 64-bit vector type, we
   typedef __m64 to 64-bit unsigned long long in the MMX intrinsics, which
   works well for _si64 and some _pi32 operations.

   For _pi16 and _pi8 operations, it's better to transfer __m64 into a
   128-bit PowerPC vector first. Power8 introduced direct register move
   instructions, which help make the implementation more efficient.

   It's the user's responsibility to determine whether the results of such
   a port are acceptable or whether further changes are needed. Please note
   that much code using Intel intrinsics CAN BE REWRITTEN in more portable
   and efficient standard C or GNU C extensions with 64-bit scalar
   operations, or with 128-bit SSE/Altivec operations, which are
   recommended instead. */
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;

typedef __attribute__((__aligned__(8))) union {
  __m64 as_m64;
  char as_char[8];
  signed char as_signed_char[8];
  short as_short[4];
  int as_int[2];
  long long as_long_long;
  float as_float[2];
  double as_double;
} __m64_union;
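
/* Illustrative sketch (not part of the original header): __m64_union gives
   per-lane views of one 64-bit value, which the scalar fallback paths below
   rely on.  Assuming a little-endian target, as_int[0] aliases the low word:

     __m64_union u;
     u.as_m64 = (__m64)0x1122334455667788ULL;
     // u.as_int[0] == 0x55667788, u.as_int[1] == 0x11223344 (LE layout).
*/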
/* Empty the multimedia state.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_empty(void) {
  /* nothing to do on PowerPC.  */
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_empty(void) {
  /* nothing to do on PowerPC.  */
}
/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si64(int __i) {
  return (__m64)(unsigned int)__i;
}
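
/* Illustrative sketch (not part of the original header): the cast through
   unsigned int gives zero-extension, so the sign bit is not propagated:

     __m64 v = _mm_cvtsi32_si64(-1);
     // v == 0x00000000FFFFFFFFULL, not 0xFFFFFFFFFFFFFFFFULL.
*/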
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int(int __i) {
  return _mm_cvtsi32_si64(__i);
}
/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si32(__m64 __i) {
  return ((int)__i);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int(__m64 __i) {
  return _mm_cvtsi64_si32(__i);
}
/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_m64(long long __i) {
  return (__m64)__i;
}

/* Microsoft intrinsic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi64x(long long __i) {
  return (__m64)__i;
}
/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int64(__m64 __i) {
  return (long long)__i;
}

extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtm64_si64(__m64 __i) {
  return (long long)__i;
}

/* Microsoft intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si64x(__m64 __i) {
  return (long long)__i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short vm1;
  __vector signed char vresult;

  vm1 = (__vector signed short)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  vresult = vec_packs(vm1, vm1);
  return (__m64)((__vector long long)vresult)[0];
}
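
/* Illustrative sketch (not part of the original header): packing saturates
   each 16-bit lane to the signed 8-bit range [-128, 127]:

     __m64 a = _mm_set_pi16(300, -300, 5, -5);
     __m64 r = _mm_packs_pi16(a, a);
     // Each lane saturates: 300 -> 127, -300 -> -128; 5 and -5 pass through.
*/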
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packsswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi16(__m1, __m2);
}
/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi32(__m64 __m1, __m64 __m2) {
  __vector signed int vm1;
  __vector signed short vresult;

  vm1 = (__vector signed int)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  vresult = vec_packs(vm1, vm1);
  return (__m64)((__vector long long)vresult)[0];
}
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packssdw(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi32(__m1, __m2);
}
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned char r;
  __vector signed short vm1 = (__vector signed short)(__vector long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  const __vector signed short __zero = {0};
  __vector __bool short __select = vec_cmplt(vm1, __zero);
  r = vec_packs((__vector unsigned short)vm1, (__vector unsigned short)vm1);
  __vector __bool char packsel = vec_pack(__select, __select);
  r = vec_sel(r, (const __vector unsigned char)__zero, packsel);
  return (__m64)((__vector long long)r)[0];
}
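
/* Illustrative sketch (not part of the original header): unsigned saturation
   clamps to [0, 255], so negative source words become zero:

     __m64 a = _mm_set_pi16(300, -5, 128, 7);
     __m64 r = _mm_packs_pu16(a, a);
     // Per lane: 300 -> 255, -5 -> 0; 128 and 7 pass through.
*/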
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packuswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pu16(__m1, __m2);
}
#endif /* end ARCH_PWR8 */
/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_mergel(a, b);
  return (__m64)((__vector long long)c)[1];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}
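
/* Illustrative sketch (not part of the original header): bytes from the high
   halves of the two operands are interleaved, M1 first:

     __m64 a = _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);        // byte i == i
     __m64 b = _mm_set_pi8(77, 66, 55, 44, 33, 22, 11, 0);
     __m64 r = _mm_unpackhi_pi8(a, b);
     // Result bytes, low to high: 4, 44, 5, 55, 6, 66, 7, 77.
*/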
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhbw(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi8(__m1, __m2);
}
/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhwd(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi16(__m1, __m2);
}
/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhdq(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi32(__m1, __m2);
}
/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_mergel(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklbw(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi8(__m1, __m2);
}
/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklwd(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi16(__m1, __m2);
}
/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64)res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckldq(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi32(__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}
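
/* Illustrative sketch (not part of the original header): lanes wrap modulo
   256 with no carry into neighboring lanes:

     __m64 r = _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(1));
     // Every lane is 0x80 (-128 as signed char); neighbors are unaffected.
*/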
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddb(__m64 __m1, __m64 __m2) {
  return _mm_add_pi8(__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddw(__m64 __m1, __m64 __m2) {
  return _mm_add_pi16(__m1, __m2);
}
/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = vec_add(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddd(__m64 __m1, __m64 __m2) {
  return _mm_add_pi32(__m1, __m2);
}
/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubb(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi8(__m1, __m2);
}
/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubw(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi16(__m1, __m2);
}
/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = vec_sub(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubd(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi32(__m1, __m2);
}
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_si64(__m64 __m1, __m64 __m2) {
  return (__m1 + __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_si64(__m64 __m1, __m64 __m2) {
  return (__m1 - __m2);
}
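
/* Illustrative sketch (not part of the original header): since __m64 is a
   plain unsigned long long here, the _si64 forms are ordinary 64-bit
   arithmetic with wraparound:

     __m64 r = _mm_add_si64((__m64)0xFFFFFFFFFFFFFFFFULL, (__m64)1);
     // r == 0 (unsigned 64-bit wrap).
*/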
/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_si64(__m64 __m, __m64 __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllq(__m64 __m, __m64 __count) {
  return _mm_sll_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_si64(__m64 __m, const int __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllqi(__m64 __m, const int __count) {
  return _mm_slli_si64(__m, __count);
}
/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_si64(__m64 __m, __m64 __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlq(__m64 __m, __m64 __count) {
  return _mm_srl_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_si64(__m64 __m, const int __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlqi(__m64 __m, const int __count) {
  return _mm_srli_si64(__m, __count);
}
/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si64(__m64 __m1, __m64 __m2) {
  return (__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pand(__m64 __m1, __m64 __m2) {
  return _mm_and_si64(__m1, __m2);
}
/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si64(__m64 __m1, __m64 __m2) {
  return (~__m1 & __m2);
}
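
/* Illustrative sketch (not part of the original header): note the operand
   order -- the FIRST argument is complemented, so _mm_andnot_si64(mask, v)
   clears exactly the bits of v that are set in mask:

     __m64 r = _mm_andnot_si64((__m64)0x00000000FFFFFFFFULL,
                               (__m64)0x1122334455667788ULL);
     // r == 0x1122334400000000ULL.
*/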
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pandn(__m64 __m1, __m64 __m2) {
  return _mm_andnot_si64(__m1, __m2);
}
/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si64(__m64 __m1, __m64 __m2) {
  return (__m1 | __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_por(__m64 __m1, __m64 __m2) {
  return _mm_or_si64(__m1, __m2);
}
/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si64(__m64 __m1, __m64 __m2) {
  return (__m1 ^ __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pxor(__m64 __m1, __m64 __m2) {
  return _mm_xor_si64(__m1, __m2);
}
/* Creates a 64-bit zero.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si64(void) {
  return (__m64)0;
}
/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 res;
  __asm__("cmpb %0,%1,%2;\n" : "=r"(res) : "r"(__m1), "r"(__m2) :);
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}
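
/* Illustrative sketch (not part of the original header): the all-ones /
   all-zeros lane masks combine with the logical ops to build a branchless
   per-byte select (key, probe, on, off are hypothetical __m64 values):

     __m64 mask = _mm_cmpeq_pi8(key, probe);
     __m64 blend = _mm_or_si64(_mm_and_si64(mask, on),
                               _mm_andnot_si64(mask, off));
     // Each byte of blend comes from `on` where key == probe, else `off`.
*/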
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqb(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi8(__m1, __m2);
}
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = (__vector signed char)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtb(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi8(__m1, __m2);
}
/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = (__vector signed short)vec_cmpeq(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqw(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi16(__m1, __m2);
}
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = (__vector signed short)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtw(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi16(__m1, __m2);
}
/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = (__vector signed int)vec_cmpeq(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqd(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi32(__m1, __m2);
}
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats(__m1);
  b = (__vector signed int)vec_splats(__m2);
  c = (__vector signed int)vec_cmpgt(a, b);
  return (__m64)((__vector long long)c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0;

  return (__m64)res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtd(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi32(__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}
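
/* Illustrative sketch (not part of the original header): saturating lanes
   clamp instead of wrapping:

     __m64 r = _mm_adds_pi8(_mm_set1_pi8(127), _mm_set1_pi8(1));
     // Every lane stays at 127 (contrast with _mm_add_pi8, which wraps).
*/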
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi8(__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi16(__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}
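
/* Illustrative sketch (not part of the original header): unsigned saturation
   clamps at 255, which is handy for brightness-style adjustments:

     __m64 r = _mm_adds_pu8(_mm_set1_pi8((signed char)250), _mm_set1_pi8(10));
     // Every byte lane is 255, not 4 (the cast keeps the 0xFA bit pattern).
*/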
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu8(__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats(__m1);
  b = (__vector unsigned short)vec_splats(__m2);
  c = vec_adds(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu16(__m1, __m2);
}
/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats(__m1);
  b = (__vector signed char)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi8(__m1, __m2);
}
/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi16(__m1, __m2);
}
/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats(__m1);
  b = (__vector unsigned char)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu8(__m1, __m2);
}
/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats(__m1);
  b = (__vector unsigned short)vec_splats(__m2);
  c = vec_subs(a, b);
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu16(__m1, __m2);
}
/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = vec_vmsumshm(a, b, zero);
  return (__m64)((__vector long long)c)[0];
}
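
/* Illustrative sketch (not part of the original header): a 4-element dot
   product falls out of one madd plus a final scalar add of the two lanes:

     __m64 prod = _mm_madd_pi16(_mm_setr_pi16(1, 2, 3, 4),
                                _mm_setr_pi16(10, 20, 30, 40));
     // prod holds {1*10 + 2*20, 3*30 + 4*40} == {50, 250} as 32-bit lanes.
*/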
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaddwd(__m64 __m1, __m64 __m2) {
  return _mm_madd_pi16(__m1, __m2);
}
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
  };

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);

  w0 = vec_vmulesh(a, b);
  w1 = vec_vmulosh(a, b);
  c = (__vector signed short)vec_perm(w0, w1, xform1);

  return (__m64)((__vector long long)c)[0];
}
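
/* Illustrative sketch (not part of the original header): only the high 16
   bits of each 32-bit product survive:

     __m64 r = _mm_mulhi_pi16(_mm_set1_pi16(0x4000), _mm_set1_pi16(0x4000));
     // 0x4000 * 0x4000 == 0x10000000, so every lane is 0x1000.
*/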
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmulhw(__m64 __m1, __m64 __m2) {
  return _mm_mulhi_pi16(__m1, __m2);
}
/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats(__m1);
  b = (__vector signed short)vec_splats(__m2);
  c = a * b;
  return (__m64)((__vector long long)c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmullw(__m64 __m1, __m64 __m2) {
  return _mm_mullo_pi16(__m1, __m2);
}
/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi16(__m64 __m, __m64 __count) {
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15) {
    m = (__vector signed short)vec_splats(__m);
    c = (__vector unsigned short)vec_splats((unsigned short)__count);
    r = vec_sl(m, (__vector unsigned short)c);
    return (__m64)((__vector long long)r)[0];
  } else
    return (0);
}
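
/* Illustrative sketch (not part of the original header): counts above 15
   clear all lanes, matching the x86 MMX behavior:

     __m64 r1 = _mm_sll_pi16((__m64)0x0001000100010001ULL, 4);  // lanes 0x0010
     __m64 r2 = _mm_sll_pi16((__m64)0x0001000100010001ULL, 16); // all zero
*/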
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllw(__m64 __m, __m64 __count) {
  return _mm_sll_pi16(__m, __count);
}
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllwi(__m64 __m, int __count) {
  return _mm_slli_pi16(__m, __count);
}
/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi32(__m64 __m, __m64 __count) {
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] << __count;
  res.as_int[1] = m.as_int[1] << __count;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslld(__m64 __m, __m64 __count) {
  return _mm_sll_pi32(__m, __count);
}
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslldi(__m64 __m, int __count) {
  return _mm_slli_pi32(__m, __count);
}
/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi16(__m64 __m, __m64 __count) {
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15) {
    m = (__vector signed short)vec_splats(__m);
    c = (__vector unsigned short)vec_splats((unsigned short)__count);
    r = vec_sra(m, (__vector unsigned short)c);
    return (__m64)((__vector long long)r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psraw(__m64 __m, __m64 __count) {
  return _mm_sra_pi16(__m, __count);
}
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrawi(__m64 __m, int __count) {
  return _mm_srai_pi16(__m, __count);
}
/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi32(__m64 __m, __m64 __count) {
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] >> __count;
  res.as_int[1] = m.as_int[1] >> __count;
  return (res.as_m64);
}
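
/* Illustrative sketch (not part of the original header): the arithmetic form
   replicates the sign bit; the logical form (_mm_srl_pi32 below) shifts in
   zeros:

     __m64 v = _mm_set_pi32(-8, 8);
     __m64 r = _mm_sra_pi32(v, 1);
     // Lanes become -4 and 4; _mm_srl_pi32(v, 1) would give 0x7FFFFFFC and 4.
*/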
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrad(__m64 __m, __m64 __count) {
  return _mm_sra_pi32(__m, __count);
}
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psradi(__m64 __m, int __count) {
  return _mm_srai_pi32(__m, __count);
}
/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi16(__m64 __m, __m64 __count) {
  __vector unsigned short m, r;
  __vector unsigned short c;

  if (__count <= 15) {
    m = (__vector unsigned short)vec_splats(__m);
    c = (__vector unsigned short)vec_splats((unsigned short)__count);
    r = vec_sr(m, (__vector unsigned short)c);
    return (__m64)((__vector long long)r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlw(__m64 __m, __m64 __count) {
  return _mm_srl_pi16(__m, __count);
}
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlwi(__m64 __m, int __count) {
  return _mm_srli_pi16(__m, __count);
}
/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi32(__m64 __m, __m64 __count) {
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrld(__m64 __m, __m64 __count) {
  return _mm_srl_pi32(__m, __count);
}
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrldi(__m64 __m, int __count) {
  return _mm_srli_pi32(__m, __count);
}
#endif /* _ARCH_PWR8 */
/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi32(int __i1, int __i0) {
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}
/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
  __m64_union res;

  res.as_short[0] = __w0;
  res.as_short[1] = __w1;
  res.as_short[2] = __w2;
  res.as_short[3] = __w3;
  return (res.as_m64);
}
/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
                char __b2, char __b1, char __b0) {
  __m64_union res;

  res.as_char[0] = __b0;
  res.as_char[1] = __b1;
  res.as_char[2] = __b2;
  res.as_char[3] = __b3;
  res.as_char[4] = __b4;
  res.as_char[5] = __b5;
  res.as_char[6] = __b6;
  res.as_char[7] = __b7;
  return (res.as_m64);
}
/* Similar, but with the arguments in reverse order.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi32(int __i0, int __i1) {
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
                 char __b5, char __b6, char __b7) {
  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
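
/* Illustrative sketch (not part of the original header): _mm_set_* takes the
   most significant element first, _mm_setr_* takes memory (low-to-high)
   order, so these build the same value:

     __m64 a = _mm_set_pi16(3, 2, 1, 0);
     __m64 b = _mm_setr_pi16(0, 1, 2, 3);
     // a == b; lane 0 (least significant) is 0 in both.
*/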
/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi32(int __i) {
  __m64_union res;

  res.as_int[0] = __i;
  res.as_int[1] = __i;
  return (res.as_m64);
}
/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi16(short __w) {
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats(__w);
  return (__m64)((__vector long long)w)[0];
#else
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}
/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi8(signed char __b) {
#if _ARCH_PWR8
  __vector signed char b;

  b = (__vector signed char)vec_splats(__b);
  return (__m64)((__vector long long)b)[0];
#else
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}

#endif /* _MMINTRIN_H_INCLUDED */