1 /*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
10 /* Implemented from the specification included in the Intel C++ Compiler
11 User Guide and Reference, version 9.0. */
13 #ifndef NO_WARN_X86_INTRINSICS
14 /* This header file is to help porting code using Intel intrinsics
15 explicitly from x86_64 to powerpc64/powerpc64le.
Since X86 SSE2 intrinsics mainly handle the __m128i and __m128d types,
18 PowerPC VMX/VSX ISA is a good match for vector float SIMD operations.
19 However scalar float operations in vector (XMM) registers require
20 the POWER8 VSX ISA (2.07) level. There are differences for data
21 format and placement of float scalars in the vector register, which
22 require extra steps to match SSE2 scalar float semantics on POWER.
It should be noted that there are significant differences between X86_64's
MXCSR and PowerISA's FPSCR/VSCR registers. It's recommended to use
portable <fenv.h> instead of accessing MXCSR directly.
28 Most SSE2 scalar float intrinsic operations can be performed more
29 efficiently as C language float scalar operations or optimized to
30 use vector SIMD operations. We recommend this for new applications.
32 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
40 /* We need definitions from the SSE header files. */
41 #include <xmmintrin.h>
/* PowerPC VMX/VSX vector types used to implement the SSE2 operations.  */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same types.  */
typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
/* Define two value permute mask.  Packs two 1-bit lane selectors (x for
   the upper element, y for the lower) into the immediate consumed by
   _mm_shuffle_pd.  */
#define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))
66 /* Create a vector with element 0 as F and the rest zero. */
67 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
68 _mm_set_sd (double __F)
70 return __extension__ (__m128d){ __F, 0.0 };
73 /* Create a vector with both elements equal to F. */
74 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
75 _mm_set1_pd (double __F)
77 return __extension__ (__m128d){ __F, __F };
80 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
81 _mm_set_pd1 (double __F)
83 return _mm_set1_pd (__F);
86 /* Create a vector with the lower value X and upper value W. */
87 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
88 _mm_set_pd (double __W, double __X)
90 return __extension__ (__m128d){ __X, __W };
93 /* Create a vector with the lower value W and upper value X. */
94 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
95 _mm_setr_pd (double __W, double __X)
97 return __extension__ (__m128d){ __W, __X };
100 /* Create an undefined vector. */
101 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
102 _mm_undefined_pd (void)
108 /* Create a vector of zeros. */
109 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
110 _mm_setzero_pd (void)
112 return (__m128d) vec_splats (0);
115 /* Sets the low DPFP value of A from the low value of B. */
116 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
117 _mm_move_sd (__m128d __A, __m128d __B)
119 __v2df result = (__v2df) __A;
120 result [0] = ((__v2df) __B)[0];
121 return (__m128d) result;
124 /* Load two DPFP values from P. The address must be 16-byte aligned. */
125 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
126 _mm_load_pd (double const *__P)
128 return ((__m128d)vec_ld(0, (__v16qu*)__P));
131 /* Load two DPFP values from P. The address need not be 16-byte aligned. */
132 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
133 _mm_loadu_pd (double const *__P)
135 return (vec_vsx_ld(0, __P));
138 /* Create a vector with all two elements equal to *P. */
139 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
140 _mm_load1_pd (double const *__P)
142 return (vec_splats (*__P));
145 /* Create a vector with element 0 as *P and the rest zero. */
146 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
147 _mm_load_sd (double const *__P)
149 return _mm_set_sd (*__P);
152 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
153 _mm_load_pd1 (double const *__P)
155 return _mm_load1_pd (__P);
158 /* Load two DPFP values in reverse order. The address must be aligned. */
159 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
160 _mm_loadr_pd (double const *__P)
162 __v2df __tmp = _mm_load_pd (__P);
163 return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
166 /* Store two DPFP values. The address must be 16-byte aligned. */
167 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
168 _mm_store_pd (double *__P, __m128d __A)
170 vec_st((__v16qu)__A, 0, (__v16qu*)__P);
173 /* Store two DPFP values. The address need not be 16-byte aligned. */
174 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
175 _mm_storeu_pd (double *__P, __m128d __A)
177 *(__m128d_u *)__P = __A;
180 /* Stores the lower DPFP value. */
181 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
182 _mm_store_sd (double *__P, __m128d __A)
184 *__P = ((__v2df)__A)[0];
187 extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
188 _mm_cvtsd_f64 (__m128d __A)
190 return ((__v2df)__A)[0];
193 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
194 _mm_storel_pd (double *__P, __m128d __A)
196 _mm_store_sd (__P, __A);
199 /* Stores the upper DPFP value. */
200 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
201 _mm_storeh_pd (double *__P, __m128d __A)
203 *__P = ((__v2df)__A)[1];
205 /* Store the lower DPFP value across two words.
206 The address must be 16-byte aligned. */
207 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
208 _mm_store1_pd (double *__P, __m128d __A)
210 _mm_store_pd (__P, vec_splat (__A, 0));
213 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
214 _mm_store_pd1 (double *__P, __m128d __A)
216 _mm_store1_pd (__P, __A);
219 /* Store two DPFP values in reverse order. The address must be aligned. */
220 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221 _mm_storer_pd (double *__P, __m128d __A)
223 _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
226 /* Intel intrinsic. */
227 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
228 _mm_cvtsi128_si64 (__m128i __A)
230 return ((__v2di)__A)[0];
233 /* Microsoft intrinsic. */
234 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
235 _mm_cvtsi128_si64x (__m128i __A)
237 return ((__v2di)__A)[0];
240 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
241 _mm_add_pd (__m128d __A, __m128d __B)
243 return (__m128d) ((__v2df)__A + (__v2df)__B);
246 /* Add the lower double-precision (64-bit) floating-point element in
247 a and b, store the result in the lower element of dst, and copy
248 the upper element from a to the upper element of dst. */
249 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
250 _mm_add_sd (__m128d __A, __m128d __B)
252 __A[0] = __A[0] + __B[0];
256 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
257 _mm_sub_pd (__m128d __A, __m128d __B)
259 return (__m128d) ((__v2df)__A - (__v2df)__B);
262 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
263 _mm_sub_sd (__m128d __A, __m128d __B)
265 __A[0] = __A[0] - __B[0];
269 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
270 _mm_mul_pd (__m128d __A, __m128d __B)
272 return (__m128d) ((__v2df)__A * (__v2df)__B);
275 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
276 _mm_mul_sd (__m128d __A, __m128d __B)
278 __A[0] = __A[0] * __B[0];
282 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
283 _mm_div_pd (__m128d __A, __m128d __B)
285 return (__m128d) ((__v2df)__A / (__v2df)__B);
288 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
289 _mm_div_sd (__m128d __A, __m128d __B)
291 __A[0] = __A[0] / __B[0];
295 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
296 _mm_sqrt_pd (__m128d __A)
298 return (vec_sqrt (__A));
301 /* Return pair {sqrt (B[0]), A[1]}. */
302 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
303 _mm_sqrt_sd (__m128d __A, __m128d __B)
306 c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
307 return (__m128d) _mm_setr_pd (c[0], __A[1]);
310 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
311 _mm_min_pd (__m128d __A, __m128d __B)
313 return (vec_min (__A, __B));
316 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
317 _mm_min_sd (__m128d __A, __m128d __B)
320 a = vec_splats (__A[0]);
321 b = vec_splats (__B[0]);
323 return (__m128d) _mm_setr_pd (c[0], __A[1]);
326 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
327 _mm_max_pd (__m128d __A, __m128d __B)
329 return (vec_max (__A, __B));
332 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
333 _mm_max_sd (__m128d __A, __m128d __B)
336 a = vec_splats (__A[0]);
337 b = vec_splats (__B[0]);
339 return (__m128d) _mm_setr_pd (c[0], __A[1]);
342 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
343 _mm_cmpeq_pd (__m128d __A, __m128d __B)
345 return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
348 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
349 _mm_cmplt_pd (__m128d __A, __m128d __B)
351 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
354 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
355 _mm_cmple_pd (__m128d __A, __m128d __B)
357 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
360 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
361 _mm_cmpgt_pd (__m128d __A, __m128d __B)
363 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
366 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
367 _mm_cmpge_pd (__m128d __A, __m128d __B)
369 return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B));
372 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
373 _mm_cmpneq_pd (__m128d __A, __m128d __B)
375 __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
376 return ((__m128d)vec_nor (temp, temp));
379 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
380 _mm_cmpnlt_pd (__m128d __A, __m128d __B)
382 return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
385 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
386 _mm_cmpnle_pd (__m128d __A, __m128d __B)
388 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
391 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
392 _mm_cmpngt_pd (__m128d __A, __m128d __B)
394 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
397 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
398 _mm_cmpnge_pd (__m128d __A, __m128d __B)
400 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
403 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
404 _mm_cmpord_pd (__m128d __A, __m128d __B)
408 /* Compare against self will return false (0's) if NAN. */
409 c = (__v2du)vec_cmpeq (__A, __A);
410 d = (__v2du)vec_cmpeq (__B, __B);
414 const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
415 a = (__v2du)vec_abs ((__v2df)__A);
416 b = (__v2du)vec_abs ((__v2df)__B);
417 c = (__v2du)vec_cmpgt (double_exp_mask, a);
418 d = (__v2du)vec_cmpgt (double_exp_mask, b);
420 /* A != NAN and B != NAN. */
421 return ((__m128d)vec_and(c, d));
424 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
425 _mm_cmpunord_pd (__m128d __A, __m128d __B)
429 /* Compare against self will return false (0's) if NAN. */
430 c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
431 d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
432 /* A == NAN OR B == NAN converts too:
433 NOT(A != NAN) OR NOT(B != NAN). */
435 return ((__m128d)vec_orc(c, d));
438 /* Compare against self will return false (0's) if NAN. */
439 c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
440 d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
441 /* Convert the true ('1's) is NAN. */
444 return ((__m128d)vec_or(c, d));
448 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
449 _mm_cmpeq_sd(__m128d __A, __m128d __B)
452 /* PowerISA VSX does not allow partial (for just lower double)
453 results. So to insure we don't generate spurious exceptions
454 (from the upper double values) we splat the lower double
455 before we do the operation. */
456 a = vec_splats (__A[0]);
457 b = vec_splats (__B[0]);
458 c = (__v2df) vec_cmpeq(a, b);
459 /* Then we merge the lower double result with the original upper
461 return (__m128d) _mm_setr_pd (c[0], __A[1]);
464 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
465 _mm_cmplt_sd (__m128d __A, __m128d __B)
468 a = vec_splats (__A[0]);
469 b = vec_splats (__B[0]);
470 c = (__v2df) vec_cmplt(a, b);
471 return (__m128d) _mm_setr_pd (c[0], __A[1]);
474 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
475 _mm_cmple_sd (__m128d __A, __m128d __B)
478 a = vec_splats (__A[0]);
479 b = vec_splats (__B[0]);
480 c = (__v2df) vec_cmple(a, b);
481 return (__m128d) _mm_setr_pd (c[0], __A[1]);
484 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
485 _mm_cmpgt_sd (__m128d __A, __m128d __B)
488 a = vec_splats (__A[0]);
489 b = vec_splats (__B[0]);
490 c = (__v2df) vec_cmpgt(a, b);
491 return (__m128d) _mm_setr_pd (c[0], __A[1]);
494 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
495 _mm_cmpge_sd (__m128d __A, __m128d __B)
498 a = vec_splats (__A[0]);
499 b = vec_splats (__B[0]);
500 c = (__v2df) vec_cmpge(a, b);
501 return (__m128d) _mm_setr_pd (c[0], __A[1]);
504 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
505 _mm_cmpneq_sd (__m128d __A, __m128d __B)
508 a = vec_splats (__A[0]);
509 b = vec_splats (__B[0]);
510 c = (__v2df) vec_cmpeq(a, b);
512 return (__m128d) _mm_setr_pd (c[0], __A[1]);
515 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
516 _mm_cmpnlt_sd (__m128d __A, __m128d __B)
519 a = vec_splats (__A[0]);
520 b = vec_splats (__B[0]);
521 /* Not less than is just greater than or equal. */
522 c = (__v2df) vec_cmpge(a, b);
523 return (__m128d) _mm_setr_pd (c[0], __A[1]);
526 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
527 _mm_cmpnle_sd (__m128d __A, __m128d __B)
530 a = vec_splats (__A[0]);
531 b = vec_splats (__B[0]);
532 /* Not less than or equal is just greater than. */
533 c = (__v2df) vec_cmpge(a, b);
534 return (__m128d) _mm_setr_pd (c[0], __A[1]);
537 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
538 _mm_cmpngt_sd (__m128d __A, __m128d __B)
541 a = vec_splats (__A[0]);
542 b = vec_splats (__B[0]);
543 /* Not greater than is just less than or equal. */
544 c = (__v2df) vec_cmple(a, b);
545 return (__m128d) _mm_setr_pd (c[0], __A[1]);
548 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
549 _mm_cmpnge_sd (__m128d __A, __m128d __B)
552 a = vec_splats (__A[0]);
553 b = vec_splats (__B[0]);
554 /* Not greater than or equal is just less than. */
555 c = (__v2df) vec_cmplt(a, b);
556 return (__m128d) _mm_setr_pd (c[0], __A[1]);
559 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
560 _mm_cmpord_sd (__m128d __A, __m128d __B)
563 r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
564 return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
567 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
568 _mm_cmpunord_sd (__m128d __A, __m128d __B)
571 r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
572 return (__m128d) _mm_setr_pd (r[0], __A[1]);
576 The __mm_comi??_sd and __mm_ucomi??_sd implementations below are
577 exactly the same because GCC for PowerPC only generates unordered
578 compares (scalar and vector).
Technically _mm_comieq_sd et al. should be using the ordered
compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
be OK, as is.
582 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
583 _mm_comieq_sd (__m128d __A, __m128d __B)
585 return (__A[0] == __B[0]);
588 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
589 _mm_comilt_sd (__m128d __A, __m128d __B)
591 return (__A[0] < __B[0]);
594 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
595 _mm_comile_sd (__m128d __A, __m128d __B)
597 return (__A[0] <= __B[0]);
600 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
601 _mm_comigt_sd (__m128d __A, __m128d __B)
603 return (__A[0] > __B[0]);
606 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
607 _mm_comige_sd (__m128d __A, __m128d __B)
609 return (__A[0] >= __B[0]);
612 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
613 _mm_comineq_sd (__m128d __A, __m128d __B)
615 return (__A[0] != __B[0]);
618 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
619 _mm_ucomieq_sd (__m128d __A, __m128d __B)
621 return (__A[0] == __B[0]);
624 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
625 _mm_ucomilt_sd (__m128d __A, __m128d __B)
627 return (__A[0] < __B[0]);
630 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
631 _mm_ucomile_sd (__m128d __A, __m128d __B)
633 return (__A[0] <= __B[0]);
636 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
637 _mm_ucomigt_sd (__m128d __A, __m128d __B)
639 return (__A[0] > __B[0]);
642 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
643 _mm_ucomige_sd (__m128d __A, __m128d __B)
645 return (__A[0] >= __B[0]);
648 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
649 _mm_ucomineq_sd (__m128d __A, __m128d __B)
651 return (__A[0] != __B[0]);
654 /* Create a vector of Qi, where i is the element number. */
655 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
656 _mm_set_epi64x (long long __q1, long long __q0)
658 return __extension__ (__m128i)(__v2di){ __q0, __q1 };
661 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
662 _mm_set_epi64 (__m64 __q1, __m64 __q0)
664 return _mm_set_epi64x ((long long)__q1, (long long)__q0);
667 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
668 _mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
670 return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
673 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
674 _mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
675 short __q3, short __q2, short __q1, short __q0)
677 return __extension__ (__m128i)(__v8hi){
678 __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
681 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
682 _mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
683 char __q11, char __q10, char __q09, char __q08,
684 char __q07, char __q06, char __q05, char __q04,
685 char __q03, char __q02, char __q01, char __q00)
687 return __extension__ (__m128i)(__v16qi){
688 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
689 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
693 /* Set all of the elements of the vector to A. */
694 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
695 _mm_set1_epi64x (long long __A)
697 return _mm_set_epi64x (__A, __A);
700 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
701 _mm_set1_epi64 (__m64 __A)
703 return _mm_set_epi64 (__A, __A);
706 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
707 _mm_set1_epi32 (int __A)
709 return _mm_set_epi32 (__A, __A, __A, __A);
712 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
713 _mm_set1_epi16 (short __A)
715 return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
718 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
719 _mm_set1_epi8 (char __A)
721 return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
722 __A, __A, __A, __A, __A, __A, __A, __A);
725 /* Create a vector of Qi, where i is the element number.
726 The parameter order is reversed from the _mm_set_epi* functions. */
727 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
728 _mm_setr_epi64 (__m64 __q0, __m64 __q1)
730 return _mm_set_epi64 (__q1, __q0);
733 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
734 _mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
736 return _mm_set_epi32 (__q3, __q2, __q1, __q0);
739 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
740 _mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
741 short __q4, short __q5, short __q6, short __q7)
743 return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
746 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
747 _mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
748 char __q04, char __q05, char __q06, char __q07,
749 char __q08, char __q09, char __q10, char __q11,
750 char __q12, char __q13, char __q14, char __q15)
752 return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
753 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
756 /* Create a vector with element 0 as *P and the rest zero. */
757 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
758 _mm_load_si128 (__m128i const *__P)
763 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
764 _mm_loadu_si128 (__m128i_u const *__P)
766 return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
769 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
770 _mm_loadl_epi64 (__m128i_u const *__P)
772 return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
775 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
776 _mm_store_si128 (__m128i *__P, __m128i __B)
778 vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
781 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
782 _mm_storeu_si128 (__m128i_u *__P, __m128i __B)
787 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
788 _mm_storel_epi64 (__m128i_u *__P, __m128i __B)
790 *(long long *)__P = ((__v2di)__B)[0];
793 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
794 _mm_movepi64_pi64 (__m128i_u __B)
796 return (__m64) ((__v2di)__B)[0];
799 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
800 _mm_movpi64_epi64 (__m64 __A)
802 return _mm_set_epi64 ((__m64)0LL, __A);
805 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
806 _mm_move_epi64 (__m128i __A)
808 return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
811 /* Create an undefined vector. */
812 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
813 _mm_undefined_si128 (void)
819 /* Create a vector of zeros. */
820 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
821 _mm_setzero_si128 (void)
823 return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
827 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
828 _mm_cvtepi32_pd (__m128i __A)
831 /* For LE need to generate Vector Unpack Low Signed Word.
832 Which is generated from unpackh. */
833 val = (__v2di)vec_unpackh ((__v4si)__A);
835 return (__m128d)vec_ctf (val, 0);
839 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
840 _mm_cvtepi32_ps (__m128i __A)
842 return ((__m128)vec_ctf((__v4si)__A, 0));
/* Convert the two doubles in A to 32-bit ints (rounded to nearest) in the
   low 64 bits of the result; the high 64 bits are zeroed.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_epi32 (__m128d __A)
  /* Round both lanes to integral values first.  */
  __v2df rounded = vec_rint (__A);
  /* NOTE(review): lines are elided in this listing — the declarations of
     temp/result/vzero, the inline-asm double-to-word conversion, and the
     _ARCH_PWR8 / __LITTLE_ENDIAN__ preprocessor guards that select exactly
     one of the two packing strategies below.  Restore from upstream before
     compiling.  */
  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  */
  /* Packing path A: merge the converted words, then pack next to zeros.  */
  temp = vec_mergeo (temp, temp);
  result = (__v4si) vec_vpkudum ((__vector long long) temp,
                                 (__vector long long) vzero);
  /* Packing path B: the same selection done with a byte permute.  */
  const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
      0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
  result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
  return (__m128i) result;
875 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
876 _mm_cvtpd_pi32 (__m128d __A)
878 __m128i result = _mm_cvtpd_epi32(__A);
880 return (__m64) result[0];
/* Convert the two doubles in A to single-precision floats in the low 64
   bits of the result; the high 64 bits are zeroed.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_ps (__m128d __A)
  const __v4si vzero = { 0, 0, 0, 0 };
  /* NOTE(review): lines are elided in this listing — the declarations of
     temp/result, the inline-asm double-to-single conversion, and the
     _ARCH_PWR8 / __LITTLE_ENDIAN__ guards selecting exactly one of the two
     packing strategies below.  Restore from upstream before compiling.  */
  /* Packing path A: merge converted lanes, pack next to zeros.  */
  temp = vec_mergeo (temp, temp);
  result = (__v4sf) vec_vpkudum ((__vector long long) temp,
                                 (__vector long long) vzero);
  /* Packing path B: equivalent byte permute.  */
  const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
      0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
  result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
  return ((__m128)result);
/* Convert the two doubles in A to 32-bit ints with truncation in the low
   64 bits of the result; the high 64 bits are zeroed.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_epi32 (__m128d __A)
  const __v4si vzero = { 0, 0, 0, 0 };
  /* NOTE(review): lines are elided in this listing — the declarations of
     temp/result, the truncating inline-asm conversion, and the
     _ARCH_PWR8 / __LITTLE_ENDIAN__ guards selecting one packing path.
     Restore from upstream before compiling.  */
  /* VSX Vector truncate Double-Precision to integer and Convert to
     Signed Integer Word format with Saturate.  */
  temp = vec_mergeo (temp, temp);
  result = (__v4si) vec_vpkudum ((__vector long long) temp,
                                 (__vector long long) vzero);
  const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
      0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
  result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
  return ((__m128i) result);
940 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
941 _mm_cvttpd_pi32 (__m128d __A)
943 __m128i result = _mm_cvttpd_epi32 (__A);
945 return (__m64) result[0];
948 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
949 _mm_cvtsi128_si32 (__m128i __A)
951 return ((__v4si)__A)[0];
955 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
956 _mm_cvtpi32_pd (__m64 __A)
962 temp = (__v4si)vec_splats (__A);
963 tmp2 = (__v2di)vec_unpackl (temp);
964 result = vec_ctf ((__vector signed long long) tmp2, 0);
965 return (__m128d)result;
969 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
970 _mm_cvtps_epi32 (__m128 __A)
975 rounded = vec_rint((__v4sf) __A);
976 result = vec_cts (rounded, 0);
977 return (__m128i) result;
980 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
981 _mm_cvttps_epi32 (__m128 __A)
985 result = vec_cts ((__v4sf) __A, 0);
986 return (__m128i) result;
/* Convert the two low floats of A to doubles.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pd (__m128 __A)
  /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
  return (__m128d) vec_doubleh ((__v4sf)__A);
  /* NOTE(review): lines are elided in this listing — the feature-test
     preprocessor guard around the vec_doubleh path above, the declarations
     of temp/result, the inline-asm single-to-double conversion, and the
     #else/#endif of the endian split below.  Restore from upstream.  */
  /* Otherwise the compiler is not current and so need to generate the
     conversion by hand.  */
  __v4sf a = (__v4sf)__A;
#ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, So we use two
     shift left double vector word immediates to get the elements
     lined up.  */
  temp = __builtin_vsx_xxsldwi (a, a, 3);
  temp = __builtin_vsx_xxsldwi (a, temp, 2);
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, So we use two
     shift left double vector word immediates to get the elements
     lined up.  */
  temp = vec_vmrghw (a, a);
  return (__m128d) result;
1024 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1025 _mm_cvtsd_si32 (__m128d __A)
1027 __v2df rounded = vec_rint((__v2df) __A);
1028 int result = ((__v2df)rounded)[0];
1032 /* Intel intrinsic. */
1033 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1034 _mm_cvtsd_si64 (__m128d __A)
1036 __v2df rounded = vec_rint ((__v2df) __A );
1037 long long result = ((__v2df) rounded)[0];
1042 /* Microsoft intrinsic. */
1043 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1044 _mm_cvtsd_si64x (__m128d __A)
1046 return _mm_cvtsd_si64 ((__v2df)__A);
1049 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1050 _mm_cvttsd_si32 (__m128d __A)
1052 int result = ((__v2df)__A)[0];
1057 /* Intel intrinsic. */
1058 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1059 _mm_cvttsd_si64 (__m128d __A)
1061 long long result = ((__v2df)__A)[0];
1066 /* Microsoft intrinsic. */
1067 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1068 _mm_cvttsd_si64x (__m128d __A)
1070 return _mm_cvttsd_si64 (__A);
1073 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1074 _mm_cvtsd_ss (__m128 __A, __m128d __B)
1076 __v4sf result = (__v4sf)__A;
1078 #ifdef __LITTLE_ENDIAN__
1080 /* Copy double element[0] to element [1] for conversion. */
1081 __v2df temp_b = vec_splat((__v2df)__B, 0);
1083 /* Pre-rotate __A left 3 (logically right 1) elements. */
1084 result = __builtin_vsx_xxsldwi (result, result, 3);
1085 /* Convert double to single float scalar in a vector. */
1091 /* Shift the resulting scalar into vector element [0]. */
1092 result = __builtin_vsx_xxsldwi (result, temp_s, 1);
1094 result [0] = ((__v2df)__B)[0];
1096 return (__m128) result;
1099 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1100 _mm_cvtsi32_sd (__m128d __A, int __B)
1102 __v2df result = (__v2df)__A;
1105 return (__m128d)result;
1108 /* Intel intrinsic. */
1109 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1110 _mm_cvtsi64_sd (__m128d __A, long long __B)
1112 __v2df result = (__v2df)__A;
1115 return (__m128d)result;
1118 /* Microsoft intrinsic. */
1119 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1120 _mm_cvtsi64x_sd (__m128d __A, long long __B)
1122 return _mm_cvtsi64_sd (__A, __B);
1125 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1126 _mm_cvtss_sd (__m128d __A, __m128 __B)
1128 #ifdef __LITTLE_ENDIAN__
1129 /* Use splat to move element [0] into position for the convert. */
1130 __v4sf temp = vec_splat ((__v4sf)__B, 0);
1132 /* Convert single float scalar to double in a vector. */
1138 return (__m128d) vec_mergel (res, (__v2df)__A);
1140 __v2df res = (__v2df)__A;
1141 res [0] = ((__v4sf)__B) [0];
1142 return (__m128d) res;
1146 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1147 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
1149 __vector double result;
1150 const int litmsk = __mask & 0x3;
1153 result = vec_mergeh (__A, __B);
1155 else if (litmsk == 1)
1156 result = vec_xxpermdi (__B, __A, 2);
1157 else if (litmsk == 2)
1158 result = vec_xxpermdi (__B, __A, 1);
1160 else if (litmsk == 1)
1161 result = vec_xxpermdi (__A, __B, 2);
1162 else if (litmsk == 2)
1163 result = vec_xxpermdi (__A, __B, 1);
1166 result = vec_mergel (__A, __B);
1171 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1172 _mm_unpackhi_pd (__m128d __A, __m128d __B)
1174 return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
1177 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1178 _mm_unpacklo_pd (__m128d __A, __m128d __B)
1180 return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
1183 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1184 _mm_loadh_pd (__m128d __A, double const *__B)
1186 __v2df result = (__v2df)__A;
1188 return (__m128d)result;
1191 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1192 _mm_loadl_pd (__m128d __A, double const *__B)
1194 __v2df result = (__v2df)__A;
1196 return (__m128d)result;
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

#ifdef _ARCH_PWR8
/* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd (__m128d __A)
{
  __vector unsigned long long result;
  static const __vector unsigned int perm_mask =
    {
#ifdef __LITTLE_ENDIAN__
      0x80800040, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x80804000
#endif
    };

  /* vbpermq gathers the sign bits selected by perm_mask into one
     doubleword; the doubleword holding the result is endian-dependent.  */
  result = ((__vector unsigned long long)
	    vec_vbpermq ((__vector unsigned char) __A,
			 (__vector unsigned char) perm_mask));

#ifdef __LITTLE_ENDIAN__
  return result[1];
#else
  return result[0];
#endif
}
#endif /* _ARCH_PWR8 */
1228 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1229 _mm_packs_epi16 (__m128i __A, __m128i __B)
1231 return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
1234 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1235 _mm_packs_epi32 (__m128i __A, __m128i __B)
1237 return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
1240 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1241 _mm_packus_epi16 (__m128i __A, __m128i __B)
1243 return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
1246 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1247 _mm_unpackhi_epi8 (__m128i __A, __m128i __B)
1249 return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
1252 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1253 _mm_unpackhi_epi16 (__m128i __A, __m128i __B)
1255 return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
1258 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1259 _mm_unpackhi_epi32 (__m128i __A, __m128i __B)
1261 return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
1264 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1265 _mm_unpackhi_epi64 (__m128i __A, __m128i __B)
1267 return (__m128i) vec_mergel ((__vector long long) __A,
1268 (__vector long long) __B);
1271 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1272 _mm_unpacklo_epi8 (__m128i __A, __m128i __B)
1274 return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
1277 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1278 _mm_unpacklo_epi16 (__m128i __A, __m128i __B)
1280 return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
1283 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1284 _mm_unpacklo_epi32 (__m128i __A, __m128i __B)
1286 return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
1289 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1290 _mm_unpacklo_epi64 (__m128i __A, __m128i __B)
1292 return (__m128i) vec_mergeh ((__vector long long) __A,
1293 (__vector long long) __B);
1296 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1297 _mm_add_epi8 (__m128i __A, __m128i __B)
1299 return (__m128i) ((__v16qu)__A + (__v16qu)__B);
1302 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1303 _mm_add_epi16 (__m128i __A, __m128i __B)
1305 return (__m128i) ((__v8hu)__A + (__v8hu)__B);
1308 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1309 _mm_add_epi32 (__m128i __A, __m128i __B)
1311 return (__m128i) ((__v4su)__A + (__v4su)__B);
1314 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1315 _mm_add_epi64 (__m128i __A, __m128i __B)
1317 return (__m128i) ((__v2du)__A + (__v2du)__B);
1320 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1321 _mm_adds_epi8 (__m128i __A, __m128i __B)
1323 return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
1326 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1327 _mm_adds_epi16 (__m128i __A, __m128i __B)
1329 return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
1332 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1333 _mm_adds_epu8 (__m128i __A, __m128i __B)
1335 return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
1338 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1339 _mm_adds_epu16 (__m128i __A, __m128i __B)
1341 return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
1344 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1345 _mm_sub_epi8 (__m128i __A, __m128i __B)
1347 return (__m128i) ((__v16qu)__A - (__v16qu)__B);
1350 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1351 _mm_sub_epi16 (__m128i __A, __m128i __B)
1353 return (__m128i) ((__v8hu)__A - (__v8hu)__B);
1356 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1357 _mm_sub_epi32 (__m128i __A, __m128i __B)
1359 return (__m128i) ((__v4su)__A - (__v4su)__B);
1362 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1363 _mm_sub_epi64 (__m128i __A, __m128i __B)
1365 return (__m128i) ((__v2du)__A - (__v2du)__B);
1368 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1369 _mm_subs_epi8 (__m128i __A, __m128i __B)
1371 return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
1374 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1375 _mm_subs_epi16 (__m128i __A, __m128i __B)
1377 return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
1380 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1381 _mm_subs_epu8 (__m128i __A, __m128i __B)
1383 return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
1386 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1387 _mm_subs_epu16 (__m128i __A, __m128i __B)
1389 return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
1392 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1393 _mm_madd_epi16 (__m128i __A, __m128i __B)
1395 __vector signed int zero = {0, 0, 0, 0};
1397 return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero);
1400 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1401 _mm_mulhi_epi16 (__m128i __A, __m128i __B)
1403 __vector signed int w0, w1;
1405 __vector unsigned char xform1 = {
1406 #ifdef __LITTLE_ENDIAN__
1407 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1408 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1410 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1411 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1415 w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
1416 w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
1417 return (__m128i) vec_perm (w0, w1, xform1);
1420 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1421 _mm_mullo_epi16 (__m128i __A, __m128i __B)
1423 return (__m128i) ((__v8hi)__A * (__v8hi)__B);
1426 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1427 _mm_mul_su32 (__m64 __A, __m64 __B)
1429 unsigned int a = __A;
1430 unsigned int b = __B;
1432 return ((__m64)a * (__m64)b);
1435 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1436 _mm_mul_epu32 (__m128i __A, __m128i __B)
1441 #ifdef __LITTLE_ENDIAN__
1442 /* VMX Vector Multiply Odd Unsigned Word. */
1446 : "v" (__A), "v" (__B)
1449 /* VMX Vector Multiply Even Unsigned Word. */
1453 : "v" (__A), "v" (__B)
1456 return (__m128i) result;
1458 return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
1462 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1463 _mm_slli_epi16 (__m128i __A, int __B)
1466 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1468 if (__B >= 0 && __B < 16)
1470 if (__builtin_constant_p(__B))
1471 lshift = (__v8hu) vec_splat_s16(__B);
1473 lshift = vec_splats ((unsigned short) __B);
1475 result = vec_sl ((__v8hi) __A, lshift);
1478 return (__m128i) result;
1481 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1482 _mm_slli_epi32 (__m128i __A, int __B)
1485 __v4si result = { 0, 0, 0, 0 };
1487 if (__B >= 0 && __B < 32)
1489 if (__builtin_constant_p(__B) && __B < 16)
1490 lshift = (__v4su) vec_splat_s32(__B);
1492 lshift = vec_splats ((unsigned int) __B);
1494 result = vec_sl ((__v4si) __A, lshift);
1497 return (__m128i) result;
1501 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1502 _mm_slli_epi64 (__m128i __A, int __B)
1505 __v2di result = { 0, 0 };
1507 if (__B >= 0 && __B < 64)
1509 if (__builtin_constant_p(__B) && __B < 16)
1510 lshift = (__v2du) vec_splat_s32(__B);
1512 lshift = (__v2du) vec_splats ((unsigned int) __B);
1514 result = vec_sl ((__v2di) __A, lshift);
1517 return (__m128i) result;
1521 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1522 _mm_srai_epi16 (__m128i __A, int __B)
1524 __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
1529 if (__builtin_constant_p(__B))
1530 rshift = (__v8hu) vec_splat_s16(__B);
1532 rshift = vec_splats ((unsigned short) __B);
1534 result = vec_sra ((__v8hi) __A, rshift);
1536 return (__m128i) result;
1539 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1540 _mm_srai_epi32 (__m128i __A, int __B)
1542 __v4su rshift = { 31, 31, 31, 31 };
1547 if (__builtin_constant_p(__B))
1550 rshift = (__v4su) vec_splat_s32(__B);
1552 rshift = (__v4su) vec_splats((unsigned int)__B);
1555 rshift = vec_splats ((unsigned int) __B);
1557 result = vec_sra ((__v4si) __A, rshift);
1559 return (__m128i) result;
1562 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1563 _mm_bslli_si128 (__m128i __A, const int __N)
1566 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1569 result = vec_sld ((__v16qu) __A, zeros, __N);
1573 return (__m128i) result;
1576 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1577 _mm_bsrli_si128 (__m128i __A, const int __N)
1580 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1583 #ifdef __LITTLE_ENDIAN__
1584 if (__builtin_constant_p(__N))
1585 /* Would like to use Vector Shift Left Double by Octet
1586 Immediate here to use the immediate form and avoid
1587 load of __N * 8 value into a separate VR. */
1588 result = vec_sld (zeros, (__v16qu) __A, (16 - __N));
1592 __v16qu shift = vec_splats((unsigned char)(__N*8));
1593 #ifdef __LITTLE_ENDIAN__
1594 result = vec_sro ((__v16qu)__A, shift);
1596 result = vec_slo ((__v16qu)__A, shift);
1602 return (__m128i) result;
1605 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1606 _mm_srli_si128 (__m128i __A, const int __N)
1608 return _mm_bsrli_si128 (__A, __N);
1611 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1612 _mm_slli_si128 (__m128i __A, const int _imm5)
1615 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1618 #ifdef __LITTLE_ENDIAN__
1619 result = vec_sld ((__v16qu) __A, zeros, _imm5);
1621 result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5));
1626 return (__m128i) result;
1629 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1631 _mm_srli_epi16 (__m128i __A, int __B)
1634 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1638 if (__builtin_constant_p(__B))
1639 rshift = (__v8hu) vec_splat_s16(__B);
1641 rshift = vec_splats ((unsigned short) __B);
1643 result = vec_sr ((__v8hi) __A, rshift);
1646 return (__m128i) result;
1649 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1650 _mm_srli_epi32 (__m128i __A, int __B)
1653 __v4si result = { 0, 0, 0, 0 };
1657 if (__builtin_constant_p(__B))
1660 rshift = (__v4su) vec_splat_s32(__B);
1662 rshift = (__v4su) vec_splats((unsigned int)__B);
1665 rshift = vec_splats ((unsigned int) __B);
1667 result = vec_sr ((__v4si) __A, rshift);
1670 return (__m128i) result;
1674 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1675 _mm_srli_epi64 (__m128i __A, int __B)
1678 __v2di result = { 0, 0 };
1682 if (__builtin_constant_p(__B))
1685 rshift = (__v2du) vec_splat_s32(__B);
1687 rshift = (__v2du) vec_splats((unsigned long long)__B);
1690 rshift = (__v2du) vec_splats ((unsigned int) __B);
1692 result = vec_sr ((__v2di) __A, rshift);
1695 return (__m128i) result;
1699 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1700 _mm_sll_epi16 (__m128i __A, __m128i __B)
1703 __vector __bool short shmask;
1704 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1707 #ifdef __LITTLE_ENDIAN__
1708 lshift = vec_splat ((__v8hu) __B, 0);
1710 lshift = vec_splat ((__v8hu) __B, 3);
1712 shmask = vec_cmple (lshift, shmax);
1713 result = vec_sl ((__v8hu) __A, lshift);
1714 result = vec_sel ((__v8hu) shmask, result, shmask);
1716 return (__m128i) result;
1719 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1720 _mm_sll_epi32 (__m128i __A, __m128i __B)
1723 __vector __bool int shmask;
1724 const __v4su shmax = { 32, 32, 32, 32 };
1726 #ifdef __LITTLE_ENDIAN__
1727 lshift = vec_splat ((__v4su) __B, 0);
1729 lshift = vec_splat ((__v4su) __B, 1);
1731 shmask = vec_cmplt (lshift, shmax);
1732 result = vec_sl ((__v4su) __A, lshift);
1733 result = vec_sel ((__v4su) shmask, result, shmask);
1735 return (__m128i) result;
#ifdef _ARCH_PWR8
/* Shift each 64-bit element of __A left by the scalar count in __B;
   counts >= 64 zero the result.  Requires 64-bit vector compare,
   hence the PowerISA 2.07 guard.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_epi64 (__m128i __A, __m128i __B)
{
  __v2du lshift;
  __vector __bool long long shmask;
  const __v2du shmax = { 64, 64 };
  __v2du result;

  lshift = vec_splat ((__v2du) __B, 0);
  shmask = vec_cmplt (lshift, shmax);
  result = vec_sl ((__v2du) __A, lshift);
  result = vec_sel ((__v2du) shmask, result, shmask);

  return (__m128i) result;
}
#endif /* _ARCH_PWR8 */
1756 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1757 _mm_sra_epi16 (__m128i __A, __m128i __B)
1759 const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1763 #ifdef __LITTLE_ENDIAN__
1764 rshift = vec_splat ((__v8hu)__B, 0);
1766 rshift = vec_splat ((__v8hu)__B, 3);
1768 rshift = vec_min (rshift, rshmax);
1769 result = vec_sra ((__v8hi) __A, rshift);
1771 return (__m128i) result;
1774 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1775 _mm_sra_epi32 (__m128i __A, __m128i __B)
1777 const __v4su rshmax = { 31, 31, 31, 31 };
1781 #ifdef __LITTLE_ENDIAN__
1782 rshift = vec_splat ((__v4su)__B, 0);
1784 rshift = vec_splat ((__v4su)__B, 1);
1786 rshift = vec_min (rshift, rshmax);
1787 result = vec_sra ((__v4si) __A, rshift);
1789 return (__m128i) result;
1792 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1793 _mm_srl_epi16 (__m128i __A, __m128i __B)
1796 __vector __bool short shmask;
1797 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1800 #ifdef __LITTLE_ENDIAN__
1801 rshift = vec_splat ((__v8hu) __B, 0);
1803 rshift = vec_splat ((__v8hu) __B, 3);
1805 shmask = vec_cmple (rshift, shmax);
1806 result = vec_sr ((__v8hu) __A, rshift);
1807 result = vec_sel ((__v8hu) shmask, result, shmask);
1809 return (__m128i) result;
1812 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1813 _mm_srl_epi32 (__m128i __A, __m128i __B)
1816 __vector __bool int shmask;
1817 const __v4su shmax = { 32, 32, 32, 32 };
1820 #ifdef __LITTLE_ENDIAN__
1821 rshift = vec_splat ((__v4su) __B, 0);
1823 rshift = vec_splat ((__v4su) __B, 1);
1825 shmask = vec_cmplt (rshift, shmax);
1826 result = vec_sr ((__v4su) __A, rshift);
1827 result = vec_sel ((__v4su) shmask, result, shmask);
1829 return (__m128i) result;
#ifdef _ARCH_PWR8
/* Logical right shift of each 64-bit element by the scalar count in
   __B; counts >= 64 zero the result.  Requires 64-bit vector compare,
   hence the PowerISA 2.07 guard.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_epi64 (__m128i __A, __m128i __B)
{
  __v2du rshift;
  __vector __bool long long shmask;
  const __v2du shmax = { 64, 64 };
  __v2du result;

  rshift = vec_splat ((__v2du) __B, 0);
  shmask = vec_cmplt (rshift, shmax);
  result = vec_sr ((__v2du) __A, rshift);
  result = vec_sel ((__v2du) shmask, result, shmask);

  return (__m128i) result;
}
#endif /* _ARCH_PWR8 */
1850 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1851 _mm_and_pd (__m128d __A, __m128d __B)
1853 return (vec_and ((__v2df) __A, (__v2df) __B));
1856 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1857 _mm_andnot_pd (__m128d __A, __m128d __B)
1859 return (vec_andc ((__v2df) __B, (__v2df) __A));
1862 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1863 _mm_or_pd (__m128d __A, __m128d __B)
1865 return (vec_or ((__v2df) __A, (__v2df) __B));
1868 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1869 _mm_xor_pd (__m128d __A, __m128d __B)
1871 return (vec_xor ((__v2df) __A, (__v2df) __B));
1874 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1875 _mm_and_si128 (__m128i __A, __m128i __B)
1877 return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
1880 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1881 _mm_andnot_si128 (__m128i __A, __m128i __B)
1883 return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
1886 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1887 _mm_or_si128 (__m128i __A, __m128i __B)
1889 return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
1892 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1893 _mm_xor_si128 (__m128i __A, __m128i __B)
1895 return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
1898 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1899 _mm_cmpeq_epi8 (__m128i __A, __m128i __B)
1901 return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
1904 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1905 _mm_cmpeq_epi16 (__m128i __A, __m128i __B)
1907 return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
1910 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1911 _mm_cmpeq_epi32 (__m128i __A, __m128i __B)
1913 return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
1916 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1917 _mm_cmplt_epi8 (__m128i __A, __m128i __B)
1919 return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
1922 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1923 _mm_cmplt_epi16 (__m128i __A, __m128i __B)
1925 return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
1928 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1929 _mm_cmplt_epi32 (__m128i __A, __m128i __B)
1931 return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
1934 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1935 _mm_cmpgt_epi8 (__m128i __A, __m128i __B)
1937 return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
1940 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1941 _mm_cmpgt_epi16 (__m128i __A, __m128i __B)
1943 return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
1946 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1947 _mm_cmpgt_epi32 (__m128i __A, __m128i __B)
1949 return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
1952 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1953 _mm_extract_epi16 (__m128i const __A, int const __N)
1955 return (unsigned short) ((__v8hi)__A)[__N & 7];
1958 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1959 _mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
1961 __v8hi result = (__v8hi)__A;
1963 result [(__N & 7)] = __D;
1965 return (__m128i) result;
1968 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1969 _mm_max_epi16 (__m128i __A, __m128i __B)
1971 return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
1974 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1975 _mm_max_epu8 (__m128i __A, __m128i __B)
1977 return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
1980 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1981 _mm_min_epi16 (__m128i __A, __m128i __B)
1983 return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
1986 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1987 _mm_min_epu8 (__m128i __A, __m128i __B)
1989 return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

#ifdef _ARCH_PWR8
/* Creates a 16-bit mask from the most significant bit of each of the
   16 bytes of __A.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
  __vector unsigned long long result;
  static const __vector unsigned char perm_mask =
    {
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
    };

  /* vbpermq gathers the 16 sign bits into one doubleword; which
     doubleword holds the result is endian-dependent.  */
  result = ((__vector unsigned long long)
	    vec_vbpermq ((__vector unsigned char) __A,
			 (__vector unsigned char) perm_mask));

#ifdef __LITTLE_ENDIAN__
  return result[1];
#else
  return result[0];
#endif
}
#endif /* _ARCH_PWR8 */
2019 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2020 _mm_mulhi_epu16 (__m128i __A, __m128i __B)
2024 #ifdef __LITTLE_ENDIAN__
2025 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
2026 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
2028 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
2029 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
2033 w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
2034 w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
2035 return (__m128i) vec_perm (w0, w1, xform1);
2038 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2039 _mm_shufflehi_epi16 (__m128i __A, const int __mask)
2041 unsigned long element_selector_98 = __mask & 0x03;
2042 unsigned long element_selector_BA = (__mask >> 2) & 0x03;
2043 unsigned long element_selector_DC = (__mask >> 4) & 0x03;
2044 unsigned long element_selector_FE = (__mask >> 6) & 0x03;
2045 static const unsigned short permute_selectors[4] =
2047 #ifdef __LITTLE_ENDIAN__
2048 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2050 0x0809, 0x0A0B, 0x0C0D, 0x0E0F
2054 #ifdef __LITTLE_ENDIAN__
2055 { 0x1716151413121110UL, 0UL};
2057 { 0x1011121314151617UL, 0UL};
2062 t.as_short[0] = permute_selectors[element_selector_98];
2063 t.as_short[1] = permute_selectors[element_selector_BA];
2064 t.as_short[2] = permute_selectors[element_selector_DC];
2065 t.as_short[3] = permute_selectors[element_selector_FE];
2066 pmask[1] = t.as_m64;
2068 r = vec_perm (a, a, (__vector unsigned char)pmask);
2072 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2073 _mm_shufflelo_epi16 (__m128i __A, const int __mask)
2075 unsigned long element_selector_10 = __mask & 0x03;
2076 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
2077 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
2078 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
2079 static const unsigned short permute_selectors[4] =
2081 #ifdef __LITTLE_ENDIAN__
2082 0x0100, 0x0302, 0x0504, 0x0706
2084 0x0001, 0x0203, 0x0405, 0x0607
2088 #ifdef __LITTLE_ENDIAN__
2089 { 0UL, 0x1f1e1d1c1b1a1918UL};
2091 { 0UL, 0x18191a1b1c1d1e1fUL};
2095 t.as_short[0] = permute_selectors[element_selector_10];
2096 t.as_short[1] = permute_selectors[element_selector_32];
2097 t.as_short[2] = permute_selectors[element_selector_54];
2098 t.as_short[3] = permute_selectors[element_selector_76];
2099 pmask[0] = t.as_m64;
2101 r = vec_perm (a, a, (__vector unsigned char)pmask);
2105 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2106 _mm_shuffle_epi32 (__m128i __A, const int __mask)
2108 unsigned long element_selector_10 = __mask & 0x03;
2109 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
2110 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
2111 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
2112 static const unsigned int permute_selectors[4] =
2114 #ifdef __LITTLE_ENDIAN__
2115 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2117 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
2122 t[0] = permute_selectors[element_selector_10];
2123 t[1] = permute_selectors[element_selector_32];
2124 t[2] = permute_selectors[element_selector_54] + 0x10101010;
2125 t[3] = permute_selectors[element_selector_76] + 0x10101010;
2126 return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t);
2129 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2130 _mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
2132 __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
2134 __m128i_u *p = (__m128i_u*)__C;
2136 tmp = (__v16qu)_mm_loadu_si128(p);
2137 mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
2138 tmp = vec_sel (tmp, (__v16qu)__A, mask);
2139 _mm_storeu_si128 (p, (__m128i)tmp);
2142 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2143 _mm_avg_epu8 (__m128i __A, __m128i __B)
2145 return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
2148 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2149 _mm_avg_epu16 (__m128i __A, __m128i __B)
2151 return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
2155 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2156 _mm_sad_epu8 (__m128i __A, __m128i __B)
2159 __v16qu vmin, vmax, vabsdiff;
2161 const __v4su zero = { 0, 0, 0, 0 };
2166 vmin = vec_min (a, b);
2167 vmax = vec_max (a, b);
2168 vabsdiff = vec_sub (vmax, vmin);
2169 /* Sum four groups of bytes into integers. */
2170 vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
2171 /* Sum across four integers with two integer results. */
2172 result = vec_sum2s (vsum, (__vector signed int) zero);
2173 /* Rotate the sums into the correct position. */
2174 #ifdef __LITTLE_ENDIAN__
2175 result = vec_sld (result, result, 4);
2177 result = vec_sld (result, result, 6);
2179 /* Rotate the sums into the correct position. */
2180 return (__m128i) result;
2183 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2184 _mm_stream_si32 (int *__A, int __B)
2186 /* Use the data cache block touch for store transient. */
2196 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2197 _mm_stream_si64 (long long int *__A, long long int __B)
2199 /* Use the data cache block touch for store transient. */
2209 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2210 _mm_stream_si128 (__m128i *__A, __m128i __B)
2212 /* Use the data cache block touch for store transient. */
2222 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2223 _mm_stream_pd (double *__A, __m128d __B)
2225 /* Use the data cache block touch for store transient. */
2232 *(__m128d*)__A = __B;
2235 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2236 _mm_clflush (void const *__A)
2238 /* Use the data cache block flush. */
/* Load fence: a release fence (light weight sync) is sufficient for
   load-to-load ordering on Power.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  /* Use light weight sync for load to load ordering.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}

/* Full memory fence: heavy weight sync for any-to-any ordering.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
}
2261 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2262 _mm_cvtsi32_si128 (int __A)
2264 return _mm_set_epi32 (0, 0, 0, __A);
2267 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2268 _mm_cvtsi64_si128 (long long __A)
2270 return __extension__ (__m128i)(__v2di){ __A, 0LL };
2273 /* Microsoft intrinsic. */
2274 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2275 _mm_cvtsi64x_si128 (long long __A)
2277 return __extension__ (__m128i)(__v2di){ __A, 0LL };
2280 /* Casts between various SP, DP, INT vector types. Note that these do no
2281 conversion of values, they just change the type. */
2282 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2283 _mm_castpd_ps(__m128d __A)
2285 return (__m128) __A;
2288 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2289 _mm_castpd_si128(__m128d __A)
2291 return (__m128i) __A;
2294 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2295 _mm_castps_pd(__m128 __A)
2297 return (__m128d) __A;
2300 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2301 _mm_castps_si128(__m128 __A)
2303 return (__m128i) __A;
2306 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2307 _mm_castsi128_ps(__m128i __A)
2309 return (__m128) __A;
2312 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2313 _mm_castsi128_pd(__m128i __A)
2315 return (__m128d) __A;
2318 #endif /* EMMINTRIN_H_ */