/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */
#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since PowerPC target doesn't support native 64-bit vector type, we
   typedef __m64 to 64-bit unsigned long long in MMX intrinsics, which
   works well for _si64 and some _pi32 operations.

   For _pi16 and _pi8 operations, it's better to transfer __m64 into
   128-bit PowerPC vector first. Power8 introduced direct register
   move instructions which helps for more efficient implementation.

   It's user's responsibility to determine if the results of such port
   are acceptable or further changes are needed. Please note that much
   code using Intel intrinsics CAN BE REWRITTEN in more portable and
   efficient standard C or GNU C extensions with 64-bit scalar
   operations, or 128-bit SSE/Altivec operations, which are more
   recommended.  */
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#if defined(__linux__) && defined(__ppc64__)

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;

/* Overlay union used by the scalar (pre-Power8) fallback paths to access
   the individual 8/16/32-bit lanes of an __m64 value.  */
typedef __attribute__((__aligned__(8))) union {
  __m64 as_m64;
  char as_char[8];
  signed char as_signed_char[8];
  short as_short[4];
  int as_int[2];
  long long as_long_long;
  float as_float[2];
  double as_double;
} __m64_union;
/* Empty the multimedia state.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_empty(void) {
  /* nothing to do on PowerPC.  */
}
/* MMX compatibility alias for _mm_empty.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_empty(void) {
  /* nothing to do on PowerPC.  */
}
69 /* Convert I to a __m64 object. The integer is zero-extended to 64-bits. */
71 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
72 _mm_cvtsi32_si64(int __i) {
73 return (__m64)(unsigned int)__i;
77 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
78 _m_from_int(int __i) {
79 return _mm_cvtsi32_si64(__i);
82 /* Convert the lower 32 bits of the __m64 object into an integer. */
84 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
85 _mm_cvtsi64_si32(__m64 __i) {
90 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
91 _m_to_int(__m64 __i) {
92 return _mm_cvtsi64_si32(__i);
95 /* Convert I to a __m64 object. */
97 /* Intel intrinsic. */
99 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
100 _m_from_int64(long long __i) {
104 extern __inline __m64
105 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
106 _mm_cvtsi64_m64(long long __i) {
110 /* Microsoft intrinsic. */
111 extern __inline __m64
112 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
113 _mm_cvtsi64x_si64(long long __i) {
117 extern __inline __m64
118 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
119 _mm_set_pi64x(long long __i) {
123 /* Convert the __m64 object to a 64bit integer. */
125 /* Intel intrinsic. */
126 extern __inline long long
127 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
128 _m_to_int64(__m64 __i) {
129 return (long long)__i;
132 extern __inline long long
133 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
134 _mm_cvtm64_si64(__m64 __i) {
135 return (long long)__i;
138 /* Microsoft intrinsic. */
139 extern __inline long long
140 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
141 _mm_cvtsi64_si64x(__m64 __i) {
142 return (long long)__i;
146 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
147 the result, and the four 16-bit values from M2 into the upper four 8-bit
148 values of the result, all with signed saturation. */
149 extern __inline __m64
150 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
151 _mm_packs_pi16(__m64 __m1, __m64 __m2) {
152 __vector signed short vm1;
153 __vector signed char vresult;
155 vm1 = (__vector signed short)(__vector unsigned long long)
156 #ifdef __LITTLE_ENDIAN__
161 vresult = vec_packs(vm1, vm1);
162 return (__m64)((__vector long long)vresult)[0];
165 extern __inline __m64
166 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
167 _m_packsswb(__m64 __m1, __m64 __m2) {
168 return _mm_packs_pi16(__m1, __m2);
171 /* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
172 the result, and the two 32-bit values from M2 into the upper two 16-bit
173 values of the result, all with signed saturation. */
174 extern __inline __m64
175 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
176 _mm_packs_pi32(__m64 __m1, __m64 __m2) {
177 __vector signed int vm1;
178 __vector signed short vresult;
180 vm1 = (__vector signed int)(__vector unsigned long long)
181 #ifdef __LITTLE_ENDIAN__
186 vresult = vec_packs(vm1, vm1);
187 return (__m64)((__vector long long)vresult)[0];
190 extern __inline __m64
191 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
192 _m_packssdw(__m64 __m1, __m64 __m2) {
193 return _mm_packs_pi32(__m1, __m2);
196 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
197 the result, and the four 16-bit values from M2 into the upper four 8-bit
198 values of the result, all with unsigned saturation. */
199 extern __inline __m64
200 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
201 _mm_packs_pu16(__m64 __m1, __m64 __m2) {
202 __vector unsigned char r;
203 __vector signed short vm1 = (__vector signed short)(__vector long long)
204 #ifdef __LITTLE_ENDIAN__
209 const __vector signed short __zero = {0};
210 __vector __bool short __select = vec_cmplt(vm1, __zero);
211 r = vec_packs((__vector unsigned short)vm1, (__vector unsigned short)vm1);
212 __vector __bool char packsel = vec_pack(__select, __select);
213 r = vec_sel(r, (const __vector unsigned char)__zero, packsel);
214 return (__m64)((__vector long long)r)[0];
217 extern __inline __m64
218 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
219 _m_packuswb(__m64 __m1, __m64 __m2) {
220 return _mm_packs_pu16(__m1, __m2);
222 #endif /* end ARCH_PWR8 */
224 /* Interleave the four 8-bit values from the high half of M1 with the four
225 8-bit values from the high half of M2. */
226 extern __inline __m64
227 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
228 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
230 __vector unsigned char a, b, c;
232 a = (__vector unsigned char)vec_splats(__m1);
233 b = (__vector unsigned char)vec_splats(__m2);
234 c = vec_mergel(a, b);
235 return (__m64)((__vector long long)c)[1];
237 __m64_union m1, m2, res;
242 res.as_char[0] = m1.as_char[4];
243 res.as_char[1] = m2.as_char[4];
244 res.as_char[2] = m1.as_char[5];
245 res.as_char[3] = m2.as_char[5];
246 res.as_char[4] = m1.as_char[6];
247 res.as_char[5] = m2.as_char[6];
248 res.as_char[6] = m1.as_char[7];
249 res.as_char[7] = m2.as_char[7];
251 return (__m64)res.as_m64;
255 extern __inline __m64
256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
257 _m_punpckhbw(__m64 __m1, __m64 __m2) {
258 return _mm_unpackhi_pi8(__m1, __m2);
261 /* Interleave the two 16-bit values from the high half of M1 with the two
262 16-bit values from the high half of M2. */
263 extern __inline __m64
264 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
265 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
266 __m64_union m1, m2, res;
271 res.as_short[0] = m1.as_short[2];
272 res.as_short[1] = m2.as_short[2];
273 res.as_short[2] = m1.as_short[3];
274 res.as_short[3] = m2.as_short[3];
276 return (__m64)res.as_m64;
279 extern __inline __m64
280 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
281 _m_punpckhwd(__m64 __m1, __m64 __m2) {
282 return _mm_unpackhi_pi16(__m1, __m2);
284 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
285 value from the high half of M2. */
286 extern __inline __m64
287 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
288 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
289 __m64_union m1, m2, res;
294 res.as_int[0] = m1.as_int[1];
295 res.as_int[1] = m2.as_int[1];
297 return (__m64)res.as_m64;
300 extern __inline __m64
301 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
302 _m_punpckhdq(__m64 __m1, __m64 __m2) {
303 return _mm_unpackhi_pi32(__m1, __m2);
305 /* Interleave the four 8-bit values from the low half of M1 with the four
306 8-bit values from the low half of M2. */
307 extern __inline __m64
308 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
309 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
311 __vector unsigned char a, b, c;
313 a = (__vector unsigned char)vec_splats(__m1);
314 b = (__vector unsigned char)vec_splats(__m2);
315 c = vec_mergel(a, b);
316 return (__m64)((__vector long long)c)[0];
318 __m64_union m1, m2, res;
323 res.as_char[0] = m1.as_char[0];
324 res.as_char[1] = m2.as_char[0];
325 res.as_char[2] = m1.as_char[1];
326 res.as_char[3] = m2.as_char[1];
327 res.as_char[4] = m1.as_char[2];
328 res.as_char[5] = m2.as_char[2];
329 res.as_char[6] = m1.as_char[3];
330 res.as_char[7] = m2.as_char[3];
332 return (__m64)res.as_m64;
336 extern __inline __m64
337 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
338 _m_punpcklbw(__m64 __m1, __m64 __m2) {
339 return _mm_unpacklo_pi8(__m1, __m2);
341 /* Interleave the two 16-bit values from the low half of M1 with the two
342 16-bit values from the low half of M2. */
343 extern __inline __m64
344 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
345 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
346 __m64_union m1, m2, res;
351 res.as_short[0] = m1.as_short[0];
352 res.as_short[1] = m2.as_short[0];
353 res.as_short[2] = m1.as_short[1];
354 res.as_short[3] = m2.as_short[1];
356 return (__m64)res.as_m64;
359 extern __inline __m64
360 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
361 _m_punpcklwd(__m64 __m1, __m64 __m2) {
362 return _mm_unpacklo_pi16(__m1, __m2);
365 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
366 value from the low half of M2. */
367 extern __inline __m64
368 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
369 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
370 __m64_union m1, m2, res;
375 res.as_int[0] = m1.as_int[0];
376 res.as_int[1] = m2.as_int[0];
378 return (__m64)res.as_m64;
381 extern __inline __m64
382 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
383 _m_punpckldq(__m64 __m1, __m64 __m2) {
384 return _mm_unpacklo_pi32(__m1, __m2);
387 /* Add the 8-bit values in M1 to the 8-bit values in M2. */
388 extern __inline __m64
389 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
390 _mm_add_pi8(__m64 __m1, __m64 __m2) {
392 __vector signed char a, b, c;
394 a = (__vector signed char)vec_splats(__m1);
395 b = (__vector signed char)vec_splats(__m2);
397 return (__m64)((__vector long long)c)[0];
399 __m64_union m1, m2, res;
404 res.as_char[0] = m1.as_char[0] + m2.as_char[0];
405 res.as_char[1] = m1.as_char[1] + m2.as_char[1];
406 res.as_char[2] = m1.as_char[2] + m2.as_char[2];
407 res.as_char[3] = m1.as_char[3] + m2.as_char[3];
408 res.as_char[4] = m1.as_char[4] + m2.as_char[4];
409 res.as_char[5] = m1.as_char[5] + m2.as_char[5];
410 res.as_char[6] = m1.as_char[6] + m2.as_char[6];
411 res.as_char[7] = m1.as_char[7] + m2.as_char[7];
413 return (__m64)res.as_m64;
417 extern __inline __m64
418 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
419 _m_paddb(__m64 __m1, __m64 __m2) {
420 return _mm_add_pi8(__m1, __m2);
423 /* Add the 16-bit values in M1 to the 16-bit values in M2. */
424 extern __inline __m64
425 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
426 _mm_add_pi16(__m64 __m1, __m64 __m2) {
428 __vector signed short a, b, c;
430 a = (__vector signed short)vec_splats(__m1);
431 b = (__vector signed short)vec_splats(__m2);
433 return (__m64)((__vector long long)c)[0];
435 __m64_union m1, m2, res;
440 res.as_short[0] = m1.as_short[0] + m2.as_short[0];
441 res.as_short[1] = m1.as_short[1] + m2.as_short[1];
442 res.as_short[2] = m1.as_short[2] + m2.as_short[2];
443 res.as_short[3] = m1.as_short[3] + m2.as_short[3];
445 return (__m64)res.as_m64;
449 extern __inline __m64
450 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
451 _m_paddw(__m64 __m1, __m64 __m2) {
452 return _mm_add_pi16(__m1, __m2);
455 /* Add the 32-bit values in M1 to the 32-bit values in M2. */
456 extern __inline __m64
457 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
458 _mm_add_pi32(__m64 __m1, __m64 __m2) {
460 __vector signed int a, b, c;
462 a = (__vector signed int)vec_splats(__m1);
463 b = (__vector signed int)vec_splats(__m2);
465 return (__m64)((__vector long long)c)[0];
467 __m64_union m1, m2, res;
472 res.as_int[0] = m1.as_int[0] + m2.as_int[0];
473 res.as_int[1] = m1.as_int[1] + m2.as_int[1];
475 return (__m64)res.as_m64;
479 extern __inline __m64
480 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
481 _m_paddd(__m64 __m1, __m64 __m2) {
482 return _mm_add_pi32(__m1, __m2);
485 /* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
486 extern __inline __m64
487 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
488 _mm_sub_pi8(__m64 __m1, __m64 __m2) {
490 __vector signed char a, b, c;
492 a = (__vector signed char)vec_splats(__m1);
493 b = (__vector signed char)vec_splats(__m2);
495 return (__m64)((__vector long long)c)[0];
497 __m64_union m1, m2, res;
502 res.as_char[0] = m1.as_char[0] - m2.as_char[0];
503 res.as_char[1] = m1.as_char[1] - m2.as_char[1];
504 res.as_char[2] = m1.as_char[2] - m2.as_char[2];
505 res.as_char[3] = m1.as_char[3] - m2.as_char[3];
506 res.as_char[4] = m1.as_char[4] - m2.as_char[4];
507 res.as_char[5] = m1.as_char[5] - m2.as_char[5];
508 res.as_char[6] = m1.as_char[6] - m2.as_char[6];
509 res.as_char[7] = m1.as_char[7] - m2.as_char[7];
511 return (__m64)res.as_m64;
515 extern __inline __m64
516 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
517 _m_psubb(__m64 __m1, __m64 __m2) {
518 return _mm_sub_pi8(__m1, __m2);
521 /* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
522 extern __inline __m64
523 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
524 _mm_sub_pi16(__m64 __m1, __m64 __m2) {
526 __vector signed short a, b, c;
528 a = (__vector signed short)vec_splats(__m1);
529 b = (__vector signed short)vec_splats(__m2);
531 return (__m64)((__vector long long)c)[0];
533 __m64_union m1, m2, res;
538 res.as_short[0] = m1.as_short[0] - m2.as_short[0];
539 res.as_short[1] = m1.as_short[1] - m2.as_short[1];
540 res.as_short[2] = m1.as_short[2] - m2.as_short[2];
541 res.as_short[3] = m1.as_short[3] - m2.as_short[3];
543 return (__m64)res.as_m64;
547 extern __inline __m64
548 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
549 _m_psubw(__m64 __m1, __m64 __m2) {
550 return _mm_sub_pi16(__m1, __m2);
553 /* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
554 extern __inline __m64
555 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
556 _mm_sub_pi32(__m64 __m1, __m64 __m2) {
558 __vector signed int a, b, c;
560 a = (__vector signed int)vec_splats(__m1);
561 b = (__vector signed int)vec_splats(__m2);
563 return (__m64)((__vector long long)c)[0];
565 __m64_union m1, m2, res;
570 res.as_int[0] = m1.as_int[0] - m2.as_int[0];
571 res.as_int[1] = m1.as_int[1] - m2.as_int[1];
573 return (__m64)res.as_m64;
577 extern __inline __m64
578 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
579 _m_psubd(__m64 __m1, __m64 __m2) {
580 return _mm_sub_pi32(__m1, __m2);
583 extern __inline __m64
584 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
585 _mm_add_si64(__m64 __m1, __m64 __m2) {
586 return (__m1 + __m2);
589 extern __inline __m64
590 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
591 _mm_sub_si64(__m64 __m1, __m64 __m2) {
592 return (__m1 - __m2);
595 /* Shift the 64-bit value in M left by COUNT. */
596 extern __inline __m64
597 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
598 _mm_sll_si64(__m64 __m, __m64 __count) {
599 return (__m << __count);
602 extern __inline __m64
603 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
604 _m_psllq(__m64 __m, __m64 __count) {
605 return _mm_sll_si64(__m, __count);
608 extern __inline __m64
609 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
610 _mm_slli_si64(__m64 __m, const int __count) {
611 return (__m << __count);
614 extern __inline __m64
615 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
616 _m_psllqi(__m64 __m, const int __count) {
617 return _mm_slli_si64(__m, __count);
620 /* Shift the 64-bit value in M left by COUNT; shift in zeros. */
621 extern __inline __m64
622 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
623 _mm_srl_si64(__m64 __m, __m64 __count) {
624 return (__m >> __count);
627 extern __inline __m64
628 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
629 _m_psrlq(__m64 __m, __m64 __count) {
630 return _mm_srl_si64(__m, __count);
633 extern __inline __m64
634 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
635 _mm_srli_si64(__m64 __m, const int __count) {
636 return (__m >> __count);
639 extern __inline __m64
640 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
641 _m_psrlqi(__m64 __m, const int __count) {
642 return _mm_srli_si64(__m, __count);
645 /* Bit-wise AND the 64-bit values in M1 and M2. */
646 extern __inline __m64
647 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
648 _mm_and_si64(__m64 __m1, __m64 __m2) {
649 return (__m1 & __m2);
652 extern __inline __m64
653 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
654 _m_pand(__m64 __m1, __m64 __m2) {
655 return _mm_and_si64(__m1, __m2);
658 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
659 64-bit value in M2. */
660 extern __inline __m64
661 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
662 _mm_andnot_si64(__m64 __m1, __m64 __m2) {
663 return (~__m1 & __m2);
666 extern __inline __m64
667 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
668 _m_pandn(__m64 __m1, __m64 __m2) {
669 return _mm_andnot_si64(__m1, __m2);
672 /* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
673 extern __inline __m64
674 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
675 _mm_or_si64(__m64 __m1, __m64 __m2) {
676 return (__m1 | __m2);
679 extern __inline __m64
680 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
681 _m_por(__m64 __m1, __m64 __m2) {
682 return _mm_or_si64(__m1, __m2);
685 /* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
686 extern __inline __m64
687 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
688 _mm_xor_si64(__m64 __m1, __m64 __m2) {
689 return (__m1 ^ __m2);
692 extern __inline __m64
693 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
694 _m_pxor(__m64 __m1, __m64 __m2) {
695 return _mm_xor_si64(__m1, __m2);
698 /* Creates a 64-bit zero. */
699 extern __inline __m64
700 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
701 _mm_setzero_si64(void) {
705 /* Compare eight 8-bit values. The result of the comparison is 0xFF if the
706 test is true and zero if false. */
707 extern __inline __m64
708 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
709 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
710 #if defined(_ARCH_PWR6) && defined(__powerpc64__)
712 __asm__("cmpb %0,%1,%2;\n" : "=r"(res) : "r"(__m1), "r"(__m2) :);
715 __m64_union m1, m2, res;
720 res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0;
721 res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0;
722 res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0;
723 res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0;
724 res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0;
725 res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0;
726 res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0;
727 res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0;
729 return (__m64)res.as_m64;
733 extern __inline __m64
734 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
735 _m_pcmpeqb(__m64 __m1, __m64 __m2) {
736 return _mm_cmpeq_pi8(__m1, __m2);
739 extern __inline __m64
740 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
741 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
743 __vector signed char a, b, c;
745 a = (__vector signed char)vec_splats(__m1);
746 b = (__vector signed char)vec_splats(__m2);
747 c = (__vector signed char)vec_cmpgt(a, b);
748 return (__m64)((__vector long long)c)[0];
750 __m64_union m1, m2, res;
755 res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0;
756 res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0;
757 res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0;
758 res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0;
759 res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0;
760 res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0;
761 res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0;
762 res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0;
764 return (__m64)res.as_m64;
768 extern __inline __m64
769 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
770 _m_pcmpgtb(__m64 __m1, __m64 __m2) {
771 return _mm_cmpgt_pi8(__m1, __m2);
774 /* Compare four 16-bit values. The result of the comparison is 0xFFFF if
775 the test is true and zero if false. */
776 extern __inline __m64
777 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
778 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
780 __vector signed short a, b, c;
782 a = (__vector signed short)vec_splats(__m1);
783 b = (__vector signed short)vec_splats(__m2);
784 c = (__vector signed short)vec_cmpeq(a, b);
785 return (__m64)((__vector long long)c)[0];
787 __m64_union m1, m2, res;
792 res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0;
793 res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0;
794 res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0;
795 res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0;
797 return (__m64)res.as_m64;
801 extern __inline __m64
802 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803 _m_pcmpeqw(__m64 __m1, __m64 __m2) {
804 return _mm_cmpeq_pi16(__m1, __m2);
807 extern __inline __m64
808 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
809 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
811 __vector signed short a, b, c;
813 a = (__vector signed short)vec_splats(__m1);
814 b = (__vector signed short)vec_splats(__m2);
815 c = (__vector signed short)vec_cmpgt(a, b);
816 return (__m64)((__vector long long)c)[0];
818 __m64_union m1, m2, res;
823 res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0;
824 res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0;
825 res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0;
826 res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0;
828 return (__m64)res.as_m64;
832 extern __inline __m64
833 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
834 _m_pcmpgtw(__m64 __m1, __m64 __m2) {
835 return _mm_cmpgt_pi16(__m1, __m2);
838 /* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
839 the test is true and zero if false. */
840 extern __inline __m64
841 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
842 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
844 __vector signed int a, b, c;
846 a = (__vector signed int)vec_splats(__m1);
847 b = (__vector signed int)vec_splats(__m2);
848 c = (__vector signed int)vec_cmpeq(a, b);
849 return (__m64)((__vector long long)c)[0];
851 __m64_union m1, m2, res;
856 res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0;
857 res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0;
859 return (__m64)res.as_m64;
863 extern __inline __m64
864 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
865 _m_pcmpeqd(__m64 __m1, __m64 __m2) {
866 return _mm_cmpeq_pi32(__m1, __m2);
869 extern __inline __m64
870 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
871 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
873 __vector signed int a, b, c;
875 a = (__vector signed int)vec_splats(__m1);
876 b = (__vector signed int)vec_splats(__m2);
877 c = (__vector signed int)vec_cmpgt(a, b);
878 return (__m64)((__vector long long)c)[0];
880 __m64_union m1, m2, res;
885 res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0;
886 res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0;
888 return (__m64)res.as_m64;
892 extern __inline __m64
893 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
894 _m_pcmpgtd(__m64 __m1, __m64 __m2) {
895 return _mm_cmpgt_pi32(__m1, __m2);
899 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
900 saturated arithmetic. */
901 extern __inline __m64
902 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
903 _mm_adds_pi8(__m64 __m1, __m64 __m2) {
904 __vector signed char a, b, c;
906 a = (__vector signed char)vec_splats(__m1);
907 b = (__vector signed char)vec_splats(__m2);
909 return (__m64)((__vector long long)c)[0];
912 extern __inline __m64
913 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
914 _m_paddsb(__m64 __m1, __m64 __m2) {
915 return _mm_adds_pi8(__m1, __m2);
917 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
918 saturated arithmetic. */
919 extern __inline __m64
920 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
921 _mm_adds_pi16(__m64 __m1, __m64 __m2) {
922 __vector signed short a, b, c;
924 a = (__vector signed short)vec_splats(__m1);
925 b = (__vector signed short)vec_splats(__m2);
927 return (__m64)((__vector long long)c)[0];
930 extern __inline __m64
931 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
932 _m_paddsw(__m64 __m1, __m64 __m2) {
933 return _mm_adds_pi16(__m1, __m2);
935 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
936 saturated arithmetic. */
937 extern __inline __m64
938 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
939 _mm_adds_pu8(__m64 __m1, __m64 __m2) {
940 __vector unsigned char a, b, c;
942 a = (__vector unsigned char)vec_splats(__m1);
943 b = (__vector unsigned char)vec_splats(__m2);
945 return (__m64)((__vector long long)c)[0];
948 extern __inline __m64
949 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
950 _m_paddusb(__m64 __m1, __m64 __m2) {
951 return _mm_adds_pu8(__m1, __m2);
954 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
955 saturated arithmetic. */
956 extern __inline __m64
957 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
958 _mm_adds_pu16(__m64 __m1, __m64 __m2) {
959 __vector unsigned short a, b, c;
961 a = (__vector unsigned short)vec_splats(__m1);
962 b = (__vector unsigned short)vec_splats(__m2);
964 return (__m64)((__vector long long)c)[0];
967 extern __inline __m64
968 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
969 _m_paddusw(__m64 __m1, __m64 __m2) {
970 return _mm_adds_pu16(__m1, __m2);
973 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
974 saturating arithmetic. */
975 extern __inline __m64
976 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
977 _mm_subs_pi8(__m64 __m1, __m64 __m2) {
978 __vector signed char a, b, c;
980 a = (__vector signed char)vec_splats(__m1);
981 b = (__vector signed char)vec_splats(__m2);
983 return (__m64)((__vector long long)c)[0];
986 extern __inline __m64
987 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
988 _m_psubsb(__m64 __m1, __m64 __m2) {
989 return _mm_subs_pi8(__m1, __m2);
992 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
993 signed saturating arithmetic. */
994 extern __inline __m64
995 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
996 _mm_subs_pi16(__m64 __m1, __m64 __m2) {
997 __vector signed short a, b, c;
999 a = (__vector signed short)vec_splats(__m1);
1000 b = (__vector signed short)vec_splats(__m2);
1002 return (__m64)((__vector long long)c)[0];
1005 extern __inline __m64
1006 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1007 _m_psubsw(__m64 __m1, __m64 __m2) {
1008 return _mm_subs_pi16(__m1, __m2);
1011 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
1012 unsigned saturating arithmetic. */
1013 extern __inline __m64
1014 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1015 _mm_subs_pu8(__m64 __m1, __m64 __m2) {
1016 __vector unsigned char a, b, c;
1018 a = (__vector unsigned char)vec_splats(__m1);
1019 b = (__vector unsigned char)vec_splats(__m2);
1021 return (__m64)((__vector long long)c)[0];
1024 extern __inline __m64
1025 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026 _m_psubusb(__m64 __m1, __m64 __m2) {
1027 return _mm_subs_pu8(__m1, __m2);
1030 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1031 unsigned saturating arithmetic. */
1032 extern __inline __m64
1033 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1034 _mm_subs_pu16(__m64 __m1, __m64 __m2) {
1035 __vector unsigned short a, b, c;
1037 a = (__vector unsigned short)vec_splats(__m1);
1038 b = (__vector unsigned short)vec_splats(__m2);
1040 return (__m64)((__vector long long)c)[0];
1043 extern __inline __m64
1044 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1045 _m_psubusw(__m64 __m1, __m64 __m2) {
1046 return _mm_subs_pu16(__m1, __m2);
1049 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
1050 four 32-bit intermediate results, which are then summed by pairs to
1051 produce two 32-bit results. */
1052 extern __inline __m64
1053 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1054 _mm_madd_pi16(__m64 __m1, __m64 __m2) {
1055 __vector signed short a, b;
1056 __vector signed int c;
1057 __vector signed int zero = {0, 0, 0, 0};
1059 a = (__vector signed short)vec_splats(__m1);
1060 b = (__vector signed short)vec_splats(__m2);
1061 c = vec_vmsumshm(a, b, zero);
1062 return (__m64)((__vector long long)c)[0];
1065 extern __inline __m64
1066 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1067 _m_pmaddwd(__m64 __m1, __m64 __m2) {
1068 return _mm_madd_pi16(__m1, __m2);
1070 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
1071 M2 and produce the high 16 bits of the 32-bit results. */
1072 extern __inline __m64
1073 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1074 _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
1075 __vector signed short a, b;
1076 __vector signed short c;
1077 __vector signed int w0, w1;
1078 __vector unsigned char xform1 = {
1079 #ifdef __LITTLE_ENDIAN__
1080 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1081 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1083 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
1084 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1088 a = (__vector signed short)vec_splats(__m1);
1089 b = (__vector signed short)vec_splats(__m2);
1091 w0 = vec_vmulesh(a, b);
1092 w1 = vec_vmulosh(a, b);
1093 c = (__vector signed short)vec_perm(w0, w1, xform1);
1095 return (__m64)((__vector long long)c)[0];
1098 extern __inline __m64
1099 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1100 _m_pmulhw(__m64 __m1, __m64 __m2) {
1101 return _mm_mulhi_pi16(__m1, __m2);
1104 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
1105 the low 16 bits of the results. */
1106 extern __inline __m64
1107 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1108 _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
1109 __vector signed short a, b, c;
1111 a = (__vector signed short)vec_splats(__m1);
1112 b = (__vector signed short)vec_splats(__m2);
1114 return (__m64)((__vector long long)c)[0];
1117 extern __inline __m64
1118 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1119 _m_pmullw(__m64 __m1, __m64 __m2) {
1120 return _mm_mullo_pi16(__m1, __m2);
1123 /* Shift four 16-bit values in M left by COUNT. */
1124 extern __inline __m64
1125 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1126 _mm_sll_pi16(__m64 __m, __m64 __count) {
1127 __vector signed short m, r;
1128 __vector unsigned short c;
1130 if (__count <= 15) {
1131 m = (__vector signed short)vec_splats(__m);
1132 c = (__vector unsigned short)vec_splats((unsigned short)__count);
1133 r = vec_sl(m, (__vector unsigned short)c);
1134 return (__m64)((__vector long long)r)[0];
1139 extern __inline __m64
1140 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1141 _m_psllw(__m64 __m, __m64 __count) {
1142 return _mm_sll_pi16(__m, __count);
1145 extern __inline __m64
1146 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1147 _mm_slli_pi16(__m64 __m, int __count) {
1148 /* Promote int to long then invoke mm_sll_pi16. */
1149 return _mm_sll_pi16(__m, __count);
1152 extern __inline __m64
1153 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1154 _m_psllwi(__m64 __m, int __count) {
1155 return _mm_slli_pi16(__m, __count);
1158 /* Shift two 32-bit values in M left by COUNT. */
1159 extern __inline __m64
1160 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1161 _mm_sll_pi32(__m64 __m, __m64 __count) {
1166 res.as_int[0] = m.as_int[0] << __count;
1167 res.as_int[1] = m.as_int[1] << __count;
1168 return (res.as_m64);
1171 extern __inline __m64
1172 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1173 _m_pslld(__m64 __m, __m64 __count) {
1174 return _mm_sll_pi32(__m, __count);
1177 extern __inline __m64
1178 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1179 _mm_slli_pi32(__m64 __m, int __count) {
1180 /* Promote int to long then invoke mm_sll_pi32. */
1181 return _mm_sll_pi32(__m, __count);
1184 extern __inline __m64
1185 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1186 _m_pslldi(__m64 __m, int __count) {
1187 return _mm_slli_pi32(__m, __count);
1190 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
1191 extern __inline __m64
1192 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1193 _mm_sra_pi16(__m64 __m, __m64 __count) {
1194 __vector signed short m, r;
1195 __vector unsigned short c;
1197 if (__count <= 15) {
1198 m = (__vector signed short)vec_splats(__m);
1199 c = (__vector unsigned short)vec_splats((unsigned short)__count);
1200 r = vec_sra(m, (__vector unsigned short)c);
1201 return (__m64)((__vector long long)r)[0];
1206 extern __inline __m64
1207 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1208 _m_psraw(__m64 __m, __m64 __count) {
1209 return _mm_sra_pi16(__m, __count);
1212 extern __inline __m64
1213 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1214 _mm_srai_pi16(__m64 __m, int __count) {
1215 /* Promote int to long then invoke mm_sra_pi32. */
1216 return _mm_sra_pi16(__m, __count);
1219 extern __inline __m64
1220 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1221 _m_psrawi(__m64 __m, int __count) {
1222 return _mm_srai_pi16(__m, __count);
1225 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
1226 extern __inline __m64
1227 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1228 _mm_sra_pi32(__m64 __m, __m64 __count) {
1233 res.as_int[0] = m.as_int[0] >> __count;
1234 res.as_int[1] = m.as_int[1] >> __count;
1235 return (res.as_m64);
1238 extern __inline __m64
1239 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1240 _m_psrad(__m64 __m, __m64 __count) {
1241 return _mm_sra_pi32(__m, __count);
1244 extern __inline __m64
1245 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1246 _mm_srai_pi32(__m64 __m, int __count) {
1247 /* Promote int to long then invoke mm_sra_pi32. */
1248 return _mm_sra_pi32(__m, __count);
1251 extern __inline __m64
1252 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1253 _m_psradi(__m64 __m, int __count) {
1254 return _mm_srai_pi32(__m, __count);
1257 /* Shift four 16-bit values in M right by COUNT; shift in zeros. */
1258 extern __inline __m64
1259 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1260 _mm_srl_pi16(__m64 __m, __m64 __count) {
1261 __vector unsigned short m, r;
1262 __vector unsigned short c;
1264 if (__count <= 15) {
1265 m = (__vector unsigned short)vec_splats(__m);
1266 c = (__vector unsigned short)vec_splats((unsigned short)__count);
1267 r = vec_sr(m, (__vector unsigned short)c);
1268 return (__m64)((__vector long long)r)[0];
1273 extern __inline __m64
1274 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1275 _m_psrlw(__m64 __m, __m64 __count) {
1276 return _mm_srl_pi16(__m, __count);
1279 extern __inline __m64
1280 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281 _mm_srli_pi16(__m64 __m, int __count) {
1282 /* Promote int to long then invoke mm_sra_pi32. */
1283 return _mm_srl_pi16(__m, __count);
1286 extern __inline __m64
1287 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1288 _m_psrlwi(__m64 __m, int __count) {
1289 return _mm_srli_pi16(__m, __count);
1292 /* Shift two 32-bit values in M right by COUNT; shift in zeros. */
1293 extern __inline __m64
1294 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1295 _mm_srl_pi32(__m64 __m, __m64 __count) {
1300 res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
1301 res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
1302 return (res.as_m64);
1305 extern __inline __m64
1306 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1307 _m_psrld(__m64 __m, __m64 __count) {
1308 return _mm_srl_pi32(__m, __count);
1311 extern __inline __m64
1312 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1313 _mm_srli_pi32(__m64 __m, int __count) {
1314 /* Promote int to long then invoke mm_srl_pi32. */
1315 return _mm_srl_pi32(__m, __count);
1318 extern __inline __m64
1319 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1320 _m_psrldi(__m64 __m, int __count) {
1321 return _mm_srli_pi32(__m, __count);
1323 #endif /* _ARCH_PWR8 */
1325 /* Creates a vector of two 32-bit values; I0 is least significant. */
1326 extern __inline __m64
1327 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1328 _mm_set_pi32(int __i1, int __i0) {
1331 res.as_int[0] = __i0;
1332 res.as_int[1] = __i1;
1333 return (res.as_m64);
1336 /* Creates a vector of four 16-bit values; W0 is least significant. */
1337 extern __inline __m64
1338 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1339 _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
1342 res.as_short[0] = __w0;
1343 res.as_short[1] = __w1;
1344 res.as_short[2] = __w2;
1345 res.as_short[3] = __w3;
1346 return (res.as_m64);
1349 /* Creates a vector of eight 8-bit values; B0 is least significant. */
1350 extern __inline __m64
1351 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1352 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
1353 char __b2, char __b1, char __b0) {
1356 res.as_char[0] = __b0;
1357 res.as_char[1] = __b1;
1358 res.as_char[2] = __b2;
1359 res.as_char[3] = __b3;
1360 res.as_char[4] = __b4;
1361 res.as_char[5] = __b5;
1362 res.as_char[6] = __b6;
1363 res.as_char[7] = __b7;
1364 return (res.as_m64);
1367 /* Similar, but with the arguments in reverse order. */
1368 extern __inline __m64
1369 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1370 _mm_setr_pi32(int __i0, int __i1) {
1373 res.as_int[0] = __i0;
1374 res.as_int[1] = __i1;
1375 return (res.as_m64);
1378 extern __inline __m64
1379 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1380 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
1381 return _mm_set_pi16(__w3, __w2, __w1, __w0);
1384 extern __inline __m64
1385 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1386 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
1387 char __b5, char __b6, char __b7) {
1388 return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1391 /* Creates a vector of two 32-bit values, both elements containing I. */
1392 extern __inline __m64
1393 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1394 _mm_set1_pi32(int __i) {
1397 res.as_int[0] = __i;
1398 res.as_int[1] = __i;
1399 return (res.as_m64);
1402 /* Creates a vector of four 16-bit values, all elements containing W. */
1403 extern __inline __m64
1404 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1405 _mm_set1_pi16(short __w) {
1407 __vector signed short w;
1409 w = (__vector signed short)vec_splats(__w);
1410 return (__m64)((__vector long long)w)[0];
1414 res.as_short[0] = __w;
1415 res.as_short[1] = __w;
1416 res.as_short[2] = __w;
1417 res.as_short[3] = __w;
1418 return (res.as_m64);
1422 /* Creates a vector of eight 8-bit values, all elements containing B. */
1423 extern __inline __m64
1424 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1425 _mm_set1_pi8(signed char __b) {
1427 __vector signed char b;
1429 b = (__vector signed char)vec_splats(__b);
1430 return (__m64)((__vector long long)b)[0];
1434 res.as_char[0] = __b;
1435 res.as_char[1] = __b;
1436 res.as_char[2] = __b;
1437 res.as_char[3] = __b;
1438 res.as_char[4] = __b;
1439 res.as_char[5] = __b;
1440 res.as_char[6] = __b;
1441 res.as_char[7] = __b;
1442 return (res.as_m64);
1447 #include_next <mmintrin.h>
1448 #endif /* defined(__linux__) && defined(__ppc64__) */
1450 #endif /* _MMINTRIN_H_INCLUDED */