1 /*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 *===-----------------------------------------------------------------------===
10 /* Implemented from the specification included in the Intel C++ Compiler
11 User Guide and Reference, version 9.0. */
13 #ifndef NO_WARN_X86_INTRINSICS
14 /* This header file is to help porting code using Intel intrinsics
15 explicitly from x86_64 to powerpc64/powerpc64le.
17 Since X86 SSE intrinsics mainly handle the __m128 type, the PowerPC
18 VMX/VSX ISA is a good match for vector float SIMD operations.
19 However scalar float operations in vector (XMM) registers require
20 the POWER8 VSX ISA (2.07) level. There are differences for data
21 format and placement of float scalars in the vector register, which
22 require extra steps to match SSE scalar float semantics on POWER.
24 It should be noted that there are significant differences between X86_64's
25 MXCSR and PowerISA's FPSCR/VSCR registers. It's recommended to use
26 portable <fenv.h> instead of accessing MXCSR directly.
28 Most SSE scalar float intrinsic operations can be performed more
29 efficiently as C language float scalar operations or optimized to
30 use vector SIMD operations. We recommend this for new applications. */
31 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
34 #ifndef _XMMINTRIN_H_INCLUDED
35 #define _XMMINTRIN_H_INCLUDED
37 #if defined(__linux__) && defined(__ppc64__)
39 /* Define four value permute mask */
40 #define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
44 /* Avoid collisions between altivec.h and strict adherence to C++ and
45 C11 standards. This should eventually be done inside altivec.h itself,
46 but only after testing a full distro build. */
47 #if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
48 (defined(__STDC_VERSION__) && \
49 __STDC_VERSION__ >= 201112L))
55 /* We need type definitions from the MMX header file. */
58 /* Get _mm_malloc () and _mm_free (). */
60 #include <mm_malloc.h>
63 /* The Intel API is flexible enough that we must allow aliasing with other
64 vector types, and their scalar components. */
65 typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
67 /* Unaligned version of the same type. */
68 typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
71 /* Internal data types for implementing the intrinsics. */
72 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
74 /* Create an undefined vector. */
75 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
76 _mm_undefined_ps (void)
82 /* Create a vector of zeros. */
83 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
86 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
89 /* Load four SPFP values from P. The address must be 16-byte aligned. */
90 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
91 _mm_load_ps (float const *__P)
93 return ((__m128)vec_ld(0, (__v4sf*)__P));
96 /* Load four SPFP values from P. The address need not be 16-byte aligned. */
97 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
98 _mm_loadu_ps (float const *__P)
100 return (vec_vsx_ld(0, __P));
103 /* Load four SPFP values in reverse order. The address must be aligned. */
104 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
105 _mm_loadr_ps (float const *__P)
109 static const __vector unsigned char permute_vector =
110 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
111 0x17, 0x10, 0x11, 0x12, 0x13 };
113 __tmp = vec_ld (0, (__v4sf *) __P);
114 result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
118 /* Create a vector with all four elements equal to F. */
119 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120 _mm_set1_ps (float __F)
122 return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
125 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
126 _mm_set_ps1 (float __F)
128 return _mm_set1_ps (__F);
131 /* Create the vector [Z Y X W]. */
132 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
133 _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
135 return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
138 /* Create the vector [W X Y Z]. */
139 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
140 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
142 return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
145 /* Store four SPFP values. The address must be 16-byte aligned. */
146 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
147 _mm_store_ps (float *__P, __m128 __A)
149 vec_st((__v4sf)__A, 0, (__v4sf*)__P);
152 /* Store four SPFP values. The address need not be 16-byte aligned. */
153 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
154 _mm_storeu_ps (float *__P, __m128 __A)
156 *(__m128_u *)__P = __A;
159 /* Store four SPFP values in reverse order. The address must be aligned. */
160 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
161 _mm_storer_ps (float *__P, __m128 __A)
164 static const __vector unsigned char permute_vector =
165 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
166 0x17, 0x10, 0x11, 0x12, 0x13 };
168 __tmp = (__m128) vec_perm (__A, __A, permute_vector);
170 _mm_store_ps (__P, __tmp);
173 /* Store the lower SPFP value across four words. */
174 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
175 _mm_store1_ps (float *__P, __m128 __A)
177 __v4sf __va = vec_splat((__v4sf)__A, 0);
178 _mm_store_ps (__P, __va);
181 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
182 _mm_store_ps1 (float *__P, __m128 __A)
184 _mm_store1_ps (__P, __A);
187 /* Create a vector with element 0 as F and the rest zero. */
188 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
189 _mm_set_ss (float __F)
191 return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
194 /* Sets the low SPFP value of A from the low value of B. */
195 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
196 _mm_move_ss (__m128 __A, __m128 __B)
198 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
200 return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
203 /* Create a vector with element 0 as *P and the rest zero. */
204 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
205 _mm_load_ss (float const *__P)
207 return _mm_set_ss (*__P);
210 /* Stores the lower SPFP value. */
211 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
212 _mm_store_ss (float *__P, __m128 __A)
214 *__P = ((__v4sf)__A)[0];
217 /* Perform the respective operation on the lower SPFP (single-precision
218 floating-point) values of A and B; the upper three SPFP values are
219 passed through from A. */
221 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
222 _mm_add_ss (__m128 __A, __m128 __B)
226 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
227 /* PowerISA VSX does not allow partial (for just the lower float)
228 results. So to ensure we don't generate spurious exceptions
229 (from the upper float values) we splat the lower float
230 before we do the operation. */
231 a = vec_splat (__A, 0);
232 b = vec_splat (__B, 0);
234 /* Then we merge the lower float result with the original upper
235 float elements from __A. */
236 return (vec_sel (__A, c, mask));
238 __A[0] = __A[0] + __B[0];
243 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
244 _mm_sub_ss (__m128 __A, __m128 __B)
248 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
249 /* PowerISA VSX does not allow partial (for just the lower float)
250 results. So to ensure we don't generate spurious exceptions
251 (from the upper float values) we splat the lower float
252 before we do the operation. */
253 a = vec_splat (__A, 0);
254 b = vec_splat (__B, 0);
256 /* Then we merge the lower float result with the original upper
257 float elements from __A. */
258 return (vec_sel (__A, c, mask));
260 __A[0] = __A[0] - __B[0];
265 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
266 _mm_mul_ss (__m128 __A, __m128 __B)
270 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
271 /* PowerISA VSX does not allow partial (for just the lower float)
272 results. So to ensure we don't generate spurious exceptions
273 (from the upper float values) we splat the lower float
274 before we do the operation. */
275 a = vec_splat (__A, 0);
276 b = vec_splat (__B, 0);
278 /* Then we merge the lower float result with the original upper
279 float elements from __A. */
280 return (vec_sel (__A, c, mask));
282 __A[0] = __A[0] * __B[0];
287 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
288 _mm_div_ss (__m128 __A, __m128 __B)
292 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
293 /* PowerISA VSX does not allow partial (for just the lower float)
294 results. So to ensure we don't generate spurious exceptions
295 (from the upper float values) we splat the lower float
296 before we do the operation. */
297 a = vec_splat (__A, 0);
298 b = vec_splat (__B, 0);
300 /* Then we merge the lower float result with the original upper
301 float elements from __A. */
302 return (vec_sel (__A, c, mask));
304 __A[0] = __A[0] / __B[0];
309 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
310 _mm_sqrt_ss (__m128 __A)
313 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
314 /* PowerISA VSX does not allow partial (for just the lower float)
315 * results. So to ensure we don't generate spurious exceptions
316 * (from the upper float values) we splat the lower float
317 * before we do the operation. */
318 a = vec_splat (__A, 0);
320 /* Then we merge the lower float result with the original upper
321 * float elements from __A. */
322 return (vec_sel (__A, c, mask));
325 /* Perform the respective operation on the four SPFP values in A and B. */
326 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
327 _mm_add_ps (__m128 __A, __m128 __B)
329 return (__m128) ((__v4sf)__A + (__v4sf)__B);
332 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
333 _mm_sub_ps (__m128 __A, __m128 __B)
335 return (__m128) ((__v4sf)__A - (__v4sf)__B);
338 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
339 _mm_mul_ps (__m128 __A, __m128 __B)
341 return (__m128) ((__v4sf)__A * (__v4sf)__B);
344 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
345 _mm_div_ps (__m128 __A, __m128 __B)
347 return (__m128) ((__v4sf)__A / (__v4sf)__B);
350 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
351 _mm_sqrt_ps (__m128 __A)
353 return (vec_sqrt ((__v4sf)__A));
356 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
357 _mm_rcp_ps (__m128 __A)
359 return (vec_re ((__v4sf)__A));
362 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
363 _mm_rsqrt_ps (__m128 __A)
365 return (vec_rsqrte (__A));
368 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
369 _mm_rcp_ss (__m128 __A)
372 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
373 /* PowerISA VSX does not allow partial (for just the lower float)
374 * results. So to ensure we don't generate spurious exceptions
375 * (from the upper float values) we splat the lower float
376 * before we do the operation. */
377 a = vec_splat (__A, 0);
379 /* Then we merge the lower float result with the original upper
380 * float elements from __A. */
381 return (vec_sel (__A, c, mask));
384 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
385 _mm_rsqrt_ss (__m128 __A)
388 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
389 /* PowerISA VSX does not allow partial (for just the lower float)
390 * results. So to ensure we don't generate spurious exceptions
391 * (from the upper float values) we splat the lower float
392 * before we do the operation. */
393 a = vec_splat (__A, 0);
395 /* Then we merge the lower float result with the original upper
396 * float elements from __A. */
397 return (vec_sel (__A, c, mask));
400 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
401 _mm_min_ss (__m128 __A, __m128 __B)
404 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
405 /* PowerISA VSX does not allow partial (for just lower float)
406 * results. So to ensure we don't generate spurious exceptions
407 * (from the upper float values) we splat the lower float
408 * before we do the operation. */
409 a = vec_splat ((__v4sf)__A, 0);
410 b = vec_splat ((__v4sf)__B, 0);
412 /* Then we merge the lower float result with the original upper
413 * float elements from __A. */
414 return (vec_sel ((__v4sf)__A, c, mask));
417 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
418 _mm_max_ss (__m128 __A, __m128 __B)
421 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
422 /* PowerISA VSX does not allow partial (for just lower float)
423 * results. So to ensure we don't generate spurious exceptions
424 * (from the upper float values) we splat the lower float
425 * before we do the operation. */
426 a = vec_splat (__A, 0);
427 b = vec_splat (__B, 0);
429 /* Then we merge the lower float result with the original upper
430 * float elements from __A. */
431 return (vec_sel ((__v4sf)__A, c, mask));
434 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
435 _mm_min_ps (__m128 __A, __m128 __B)
437 __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
438 return vec_sel (__B, __A, m);
441 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
442 _mm_max_ps (__m128 __A, __m128 __B)
444 __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
445 return vec_sel (__B, __A, m);
448 /* Perform logical bit-wise operations on 128-bit values. */
449 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
450 _mm_and_ps (__m128 __A, __m128 __B)
452 return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
453 // return __builtin_ia32_andps (__A, __B);
456 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
457 _mm_andnot_ps (__m128 __A, __m128 __B)
459 return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
462 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
463 _mm_or_ps (__m128 __A, __m128 __B)
465 return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
468 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
469 _mm_xor_ps (__m128 __A, __m128 __B)
471 return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
474 /* Perform a comparison on the four SPFP values of A and B. For each
475 element, if the comparison is true, place a mask of all ones in the
476 result, otherwise a mask of zeros. */
477 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
478 _mm_cmpeq_ps (__m128 __A, __m128 __B)
480 return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
483 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
484 _mm_cmplt_ps (__m128 __A, __m128 __B)
486 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
489 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
490 _mm_cmple_ps (__m128 __A, __m128 __B)
492 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
495 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
496 _mm_cmpgt_ps (__m128 __A, __m128 __B)
498 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
501 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
502 _mm_cmpge_ps (__m128 __A, __m128 __B)
504 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
507 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
508 _mm_cmpneq_ps (__m128 __A, __m128 __B)
510 __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
511 return ((__m128)vec_nor (temp, temp));
514 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
515 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
517 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
520 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
521 _mm_cmpnle_ps (__m128 __A, __m128 __B)
523 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
526 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
527 _mm_cmpngt_ps (__m128 __A, __m128 __B)
529 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
532 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
533 _mm_cmpnge_ps (__m128 __A, __m128 __B)
535 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
538 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
539 _mm_cmpord_ps (__m128 __A, __m128 __B)
541 __vector unsigned int a, b;
542 __vector unsigned int c, d;
543 static const __vector unsigned int float_exp_mask =
544 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
546 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
547 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
548 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
549 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
550 return ((__m128 ) vec_and (c, d));
553 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
554 _mm_cmpunord_ps (__m128 __A, __m128 __B)
556 __vector unsigned int a, b;
557 __vector unsigned int c, d;
558 static const __vector unsigned int float_exp_mask =
559 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
561 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
562 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
563 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
564 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
565 return ((__m128 ) vec_or (c, d));
568 /* Perform a comparison on the lower SPFP values of A and B. If the
569 comparison is true, place a mask of all ones in the result, otherwise a
570 mask of zeros. The upper three SPFP values are passed through from A. */
571 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
572 _mm_cmpeq_ss (__m128 __A, __m128 __B)
574 static const __vector unsigned int mask =
575 { 0xffffffff, 0, 0, 0 };
577 /* PowerISA VMX does not allow partial (for just element 0)
578 * results. So to ensure we don't generate spurious exceptions
579 * (from the upper elements) we splat the lower float
580 * before we do the operation. */
581 a = vec_splat ((__v4sf) __A, 0);
582 b = vec_splat ((__v4sf) __B, 0);
583 c = (__v4sf) vec_cmpeq(a, b);
584 /* Then we merge the lower float result with the original upper
585 * float elements from __A. */
586 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
589 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
590 _mm_cmplt_ss (__m128 __A, __m128 __B)
592 static const __vector unsigned int mask =
593 { 0xffffffff, 0, 0, 0 };
595 /* PowerISA VMX does not allow partial (for just element 0)
596 * results. So to ensure we don't generate spurious exceptions
597 * (from the upper elements) we splat the lower float
598 * before we do the operation. */
599 a = vec_splat ((__v4sf) __A, 0);
600 b = vec_splat ((__v4sf) __B, 0);
601 c = (__v4sf) vec_cmplt(a, b);
602 /* Then we merge the lower float result with the original upper
603 * float elements from __A. */
604 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
607 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
608 _mm_cmple_ss (__m128 __A, __m128 __B)
610 static const __vector unsigned int mask =
611 { 0xffffffff, 0, 0, 0 };
613 /* PowerISA VMX does not allow partial (for just element 0)
614 * results. So to ensure we don't generate spurious exceptions
615 * (from the upper elements) we splat the lower float
616 * before we do the operation. */
617 a = vec_splat ((__v4sf) __A, 0);
618 b = vec_splat ((__v4sf) __B, 0);
619 c = (__v4sf) vec_cmple(a, b);
620 /* Then we merge the lower float result with the original upper
621 * float elements from __A. */
622 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
625 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
626 _mm_cmpgt_ss (__m128 __A, __m128 __B)
628 static const __vector unsigned int mask =
629 { 0xffffffff, 0, 0, 0 };
631 /* PowerISA VMX does not allow partial (for just element 0)
632 * results. So to ensure we don't generate spurious exceptions
633 * (from the upper elements) we splat the lower float
634 * before we do the operation. */
635 a = vec_splat ((__v4sf) __A, 0);
636 b = vec_splat ((__v4sf) __B, 0);
637 c = (__v4sf) vec_cmpgt(a, b);
638 /* Then we merge the lower float result with the original upper
639 * float elements from __A. */
640 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
643 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
644 _mm_cmpge_ss (__m128 __A, __m128 __B)
646 static const __vector unsigned int mask =
647 { 0xffffffff, 0, 0, 0 };
649 /* PowerISA VMX does not allow partial (for just element 0)
650 * results. So to ensure we don't generate spurious exceptions
651 * (from the upper elements) we splat the lower float
652 * before we do the operation. */
653 a = vec_splat ((__v4sf) __A, 0);
654 b = vec_splat ((__v4sf) __B, 0);
655 c = (__v4sf) vec_cmpge(a, b);
656 /* Then we merge the lower float result with the original upper
657 * float elements from __A. */
658 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
661 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
662 _mm_cmpneq_ss (__m128 __A, __m128 __B)
664 static const __vector unsigned int mask =
665 { 0xffffffff, 0, 0, 0 };
667 /* PowerISA VMX does not allow partial (for just element 0)
668 * results. So to ensure we don't generate spurious exceptions
669 * (from the upper elements) we splat the lower float
670 * before we do the operation. */
671 a = vec_splat ((__v4sf) __A, 0);
672 b = vec_splat ((__v4sf) __B, 0);
673 c = (__v4sf) vec_cmpeq(a, b);
675 /* Then we merge the lower float result with the original upper
676 * float elements from __A. */
677 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
680 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
681 _mm_cmpnlt_ss (__m128 __A, __m128 __B)
683 static const __vector unsigned int mask =
684 { 0xffffffff, 0, 0, 0 };
686 /* PowerISA VMX does not allow partial (for just element 0)
687 * results. So to ensure we don't generate spurious exceptions
688 * (from the upper elements) we splat the lower float
689 * before we do the operation. */
690 a = vec_splat ((__v4sf) __A, 0);
691 b = vec_splat ((__v4sf) __B, 0);
692 c = (__v4sf) vec_cmpge(a, b);
693 /* Then we merge the lower float result with the original upper
694 * float elements from __A. */
695 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
698 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
699 _mm_cmpnle_ss (__m128 __A, __m128 __B)
701 static const __vector unsigned int mask =
702 { 0xffffffff, 0, 0, 0 };
704 /* PowerISA VMX does not allow partial (for just element 0)
705 * results. So to ensure we don't generate spurious exceptions
706 * (from the upper elements) we splat the lower float
707 * before we do the operation. */
708 a = vec_splat ((__v4sf) __A, 0);
709 b = vec_splat ((__v4sf) __B, 0);
710 c = (__v4sf) vec_cmpgt(a, b);
711 /* Then we merge the lower float result with the original upper
712 * float elements from __A. */
713 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
716 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
717 _mm_cmpngt_ss (__m128 __A, __m128 __B)
719 static const __vector unsigned int mask =
720 { 0xffffffff, 0, 0, 0 };
722 /* PowerISA VMX does not allow partial (for just element 0)
723 * results. So to ensure we don't generate spurious exceptions
724 * (from the upper elements) we splat the lower float
725 * before we do the operation. */
726 a = vec_splat ((__v4sf) __A, 0);
727 b = vec_splat ((__v4sf) __B, 0);
728 c = (__v4sf) vec_cmple(a, b);
729 /* Then we merge the lower float result with the original upper
730 * float elements from __A. */
731 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
734 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
735 _mm_cmpnge_ss (__m128 __A, __m128 __B)
737 static const __vector unsigned int mask =
738 { 0xffffffff, 0, 0, 0 };
740 /* PowerISA VMX does not allow partial (for just element 0)
741 * results. So to ensure we don't generate spurious exceptions
742 * (from the upper elements) we splat the lower float
743 * before we do the operation. */
744 a = vec_splat ((__v4sf) __A, 0);
745 b = vec_splat ((__v4sf) __B, 0);
746 c = (__v4sf) vec_cmplt(a, b);
747 /* Then we merge the lower float result with the original upper
748 * float elements from __A. */
749 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
752 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
753 _mm_cmpord_ss (__m128 __A, __m128 __B)
755 __vector unsigned int a, b;
756 __vector unsigned int c, d;
757 static const __vector unsigned int float_exp_mask =
758 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
759 static const __vector unsigned int mask =
760 { 0xffffffff, 0, 0, 0 };
762 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
763 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
764 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
765 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
767 /* Then we merge the lower float result with the original upper
768 * float elements from __A. */
769 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
772 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
773 _mm_cmpunord_ss (__m128 __A, __m128 __B)
775 __vector unsigned int a, b;
776 __vector unsigned int c, d;
777 static const __vector unsigned int float_exp_mask =
778 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
779 static const __vector unsigned int mask =
780 { 0xffffffff, 0, 0, 0 };
782 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
783 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
784 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
785 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
787 /* Then we merge the lower float result with the original upper
788 * float elements from __A. */
789 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
792 /* Compare the lower SPFP values of A and B and return 1 if true
794 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
795 _mm_comieq_ss (__m128 __A, __m128 __B)
797 return (__A[0] == __B[0]);
800 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
801 _mm_comilt_ss (__m128 __A, __m128 __B)
803 return (__A[0] < __B[0]);
806 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
807 _mm_comile_ss (__m128 __A, __m128 __B)
809 return (__A[0] <= __B[0]);
812 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
813 _mm_comigt_ss (__m128 __A, __m128 __B)
815 return (__A[0] > __B[0]);
818 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
819 _mm_comige_ss (__m128 __A, __m128 __B)
821 return (__A[0] >= __B[0]);
824 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
825 _mm_comineq_ss (__m128 __A, __m128 __B)
827 return (__A[0] != __B[0]);
831 * The _mm_ucomi??_ss implementations below are exactly the same as
832 * _mm_comi??_ss because GCC for PowerPC only generates unordered
833 * compares (scalar and vector).
834 * Technically _mm_comieq_ss et al. should be using the ordered
835 * compare and signal for QNaNs.
836 * The _mm_ucomieq_ss et al. should be OK as is.
838 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
839 _mm_ucomieq_ss (__m128 __A, __m128 __B)
841 return (__A[0] == __B[0]);
844 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
845 _mm_ucomilt_ss (__m128 __A, __m128 __B)
847 return (__A[0] < __B[0]);
850 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
851 _mm_ucomile_ss (__m128 __A, __m128 __B)
853 return (__A[0] <= __B[0]);
856 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
857 _mm_ucomigt_ss (__m128 __A, __m128 __B)
859 return (__A[0] > __B[0]);
862 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
863 _mm_ucomige_ss (__m128 __A, __m128 __B)
865 return (__A[0] >= __B[0]);
868 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
869 _mm_ucomineq_ss (__m128 __A, __m128 __B)
871 return (__A[0] != __B[0]);
874 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
875 _mm_cvtss_f32 (__m128 __A)
877 return ((__v4sf)__A)[0];
880 /* Convert the lower SPFP value to a 32-bit integer according to the current
882 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
883 _mm_cvtss_si32 (__m128 __A)
889 #ifdef __LITTLE_ENDIAN__
890 "xxsldwi %x0,%x0,%x0,3;\n"
892 "xscvspdp %x2,%x0;\n"
900 res = __builtin_rint(__A[0]);
905 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
906 _mm_cvt_ss2si (__m128 __A)
908 return _mm_cvtss_si32 (__A);
911 /* Convert the lower SPFP value to a 64-bit integer according to the
912 current rounding mode. */
914 /* Intel intrinsic. */
915 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
916 _mm_cvtss_si64 (__m128 __A)
922 #ifdef __LITTLE_ENDIAN__
923 "xxsldwi %x0,%x0,%x0,3;\n"
925 "xscvspdp %x2,%x0;\n"
933 res = __builtin_llrint(__A[0]);
938 /* Microsoft intrinsic. */
939 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
940 _mm_cvtss_si64x (__m128 __A)
942 return _mm_cvtss_si64 ((__v4sf) __A);
945 /* Constants for use with _mm_prefetch. */
948 /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
957 /* Loads one cache line from address P to a location "closer" to the
958 processor. The selector I specifies the type of prefetch operation. */
959 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
960 _mm_prefetch (const void *__P, enum _mm_hint __I)
962 /* The PowerPC implementation currently ignores the hint parameter. */
963 __builtin_prefetch (__P);
966 /* Convert the two lower SPFP values to 32-bit integers according to the
967 current rounding mode. Return the integers in packed form. */
968 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
969 _mm_cvtps_pi32 (__m128 __A)
971 /* Splat two lower SPFP values to both halves. */
972 __v4sf temp, rounded;
973 __vector unsigned long long result;
975 /* Splat two lower SPFP values to both halves. */
976 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
977 rounded = vec_rint(temp);
978 result = (__vector unsigned long long) vec_cts (rounded, 0);
980 return (__m64) ((__vector long long) result)[0];
983 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
984 _mm_cvt_ps2pi (__m128 __A)
986 return _mm_cvtps_pi32 (__A);
989 /* Truncate the lower SPFP value to a 32-bit integer. */
990 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
991 _mm_cvttss_si32 (__m128 __A)
993 /* Extract the lower float element. */
995 /* truncate to 32-bit integer and return. */
999 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1000 _mm_cvtt_ss2si (__m128 __A)
1002 return _mm_cvttss_si32 (__A);
1005 /* Intel intrinsic. */
1006 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1007 _mm_cvttss_si64 (__m128 __A)
1009 /* Extract the lower float element. */
1010 float temp = __A[0];
1011 /* truncate to 64-bit integer and return. */
1015 /* Microsoft intrinsic. */
1016 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1017 _mm_cvttss_si64x (__m128 __A)
1019 /* Extract the lower float element. */
1020 float temp = __A[0];
1021 /* truncate to 64-bit integer and return. */
1025 /* Truncate the two lower SPFP values to 32-bit integers. Return the
1026 integers in packed form. */
1027 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1028 _mm_cvttps_pi32 (__m128 __A)
1031 __vector unsigned long long result;
1033 /* Splat two lower SPFP values to both halves. */
1034 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1035 result = (__vector unsigned long long) vec_cts (temp, 0);
1037 return (__m64) ((__vector long long) result)[0];
1040 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1041 _mm_cvtt_ps2pi (__m128 __A)
1043 return _mm_cvttps_pi32 (__A);
1046 /* Convert B to a SPFP value and insert it as element zero in A. */
1047 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1048 _mm_cvtsi32_ss (__m128 __A, int __B)
1056 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1057 _mm_cvt_si2ss (__m128 __A, int __B)
1059 return _mm_cvtsi32_ss (__A, __B);
1062 /* Convert B to a SPFP value and insert it as element zero in A. */
1063 /* Intel intrinsic. */
1064 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1065 _mm_cvtsi64_ss (__m128 __A, long long __B)
1073 /* Microsoft intrinsic. */
1074 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1075 _mm_cvtsi64x_ss (__m128 __A, long long __B)
1077 return _mm_cvtsi64_ss (__A, __B);
1080 /* Convert the two 32-bit values in B to SPFP form and insert them
1081 as the two lower elements in A. */
1082 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1083 _mm_cvtpi32_ps (__m128 __A, __m64 __B)
1085 __vector signed int vm1;
1088 vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
1089 vf1 = (__vector float) vec_ctf (vm1, 0);
1091 return ((__m128) (__vector unsigned long long)
1092 { ((__vector unsigned long long)vf1) [0],
1093 ((__vector unsigned long long)__A) [1]});
1096 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1097 _mm_cvt_pi2ps (__m128 __A, __m64 __B)
1099 return _mm_cvtpi32_ps (__A, __B);
1102 /* Convert the four signed 16-bit values in A to SPFP form. */
1103 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1104 _mm_cvtpi16_ps (__m64 __A)
1106 __vector signed short vs8;
1107 __vector signed int vi4;
1110 vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
1111 vi4 = vec_vupklsh (vs8);
1112 vf1 = (__vector float) vec_ctf (vi4, 0);
1114 return (__m128) vf1;
1117 /* Convert the four unsigned 16-bit values in A to SPFP form. */
1118 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1119 _mm_cvtpu16_ps (__m64 __A)
1121 const __vector unsigned short zero =
1122 { 0, 0, 0, 0, 0, 0, 0, 0 };
1123 __vector unsigned short vs8;
1124 __vector unsigned int vi4;
1127 vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
1128 vi4 = (__vector unsigned int) vec_mergel
1129 #ifdef __LITTLE_ENDIAN__
1134 vf1 = (__vector float) vec_ctf (vi4, 0);
1136 return (__m128) vf1;
1139 /* Convert the low four signed 8-bit values in A to SPFP form. */
1140 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1141 _mm_cvtpi8_ps (__m64 __A)
1143 __vector signed char vc16;
1144 __vector signed short vs8;
1145 __vector signed int vi4;
1148 vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
1149 vs8 = vec_vupkhsb (vc16);
1150 vi4 = vec_vupkhsh (vs8);
1151 vf1 = (__vector float) vec_ctf (vi4, 0);
1153 return (__m128) vf1;
1156 /* Convert the low four unsigned 8-bit values in A to SPFP form. */
1157 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1159 _mm_cvtpu8_ps (__m64 __A)
1161 const __vector unsigned char zero =
1162 { 0, 0, 0, 0, 0, 0, 0, 0 };
1163 __vector unsigned char vc16;
1164 __vector unsigned short vs8;
1165 __vector unsigned int vi4;
1168 vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
1169 #ifdef __LITTLE_ENDIAN__
1170 vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
1171 vi4 = (__vector unsigned int) vec_mergeh (vs8,
1172 (__vector unsigned short) zero);
1174 vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
1175 vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
1178 vf1 = (__vector float) vec_ctf (vi4, 0);
1180 return (__m128) vf1;
1183 /* Convert the four signed 32-bit values in A and B to SPFP form. */
1184 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1185 _mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
1187 __vector signed int vi4;
1190 vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
1191 vf4 = (__vector float) vec_ctf (vi4, 0);
1192 return (__m128) vf4;
1195 /* Convert the four SPFP values in A to four signed 16-bit integers. */
1196 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1197 _mm_cvtps_pi16 (__m128 __A)
1200 __vector signed int temp;
1201 __vector unsigned long long result;
1203 rounded = vec_rint(__A);
1204 temp = vec_cts (rounded, 0);
1205 result = (__vector unsigned long long) vec_pack (temp, temp);
1207 return (__m64) ((__vector long long) result)[0];
1210 /* Convert the four SPFP values in A to four signed 8-bit integers. */
1211 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1212 _mm_cvtps_pi8 (__m128 __A)
1215 __vector signed int tmp_i;
1216 static const __vector signed int zero = {0, 0, 0, 0};
1217 __vector signed short tmp_s;
1218 __vector signed char res_v;
1220 rounded = vec_rint(__A);
1221 tmp_i = vec_cts (rounded, 0);
1222 tmp_s = vec_pack (tmp_i, zero);
1223 res_v = vec_pack (tmp_s, tmp_s);
1224 return (__m64) ((__vector long long) res_v)[0];
1227 /* Selects four specific SPFP values from A and B based on MASK. */
1228 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1230 _mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
1232 unsigned long element_selector_10 = __mask & 0x03;
1233 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
1234 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
1235 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
1236 static const unsigned int permute_selectors[4] =
1238 #ifdef __LITTLE_ENDIAN__
1239 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1241 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
1244 __vector unsigned int t;
1246 t[0] = permute_selectors[element_selector_10];
1247 t[1] = permute_selectors[element_selector_32];
1248 t[2] = permute_selectors[element_selector_54] + 0x10101010;
1249 t[3] = permute_selectors[element_selector_76] + 0x10101010;
1250 return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
1253 /* Selects and interleaves the upper two SPFP values from A and B. */
1254 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1255 _mm_unpackhi_ps (__m128 __A, __m128 __B)
1257 return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
1260 /* Selects and interleaves the lower two SPFP values from A and B. */
1261 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1262 _mm_unpacklo_ps (__m128 __A, __m128 __B)
1264 return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
1267 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
1268 the lower two values are passed through from A. */
1269 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1270 _mm_loadh_pi (__m128 __A, __m64 const *__P)
1272 __vector unsigned long long __a = (__vector unsigned long long)__A;
1273 __vector unsigned long long __p = vec_splats(*__P);
1279 /* Stores the upper two SPFP values of A into P. */
1280 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281 _mm_storeh_pi (__m64 *__P, __m128 __A)
1283 __vector unsigned long long __a = (__vector unsigned long long) __A;
1288 /* Moves the upper two values of B into the lower two values of A. */
1289 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1290 _mm_movehl_ps (__m128 __A, __m128 __B)
1292 return (__m128) vec_mergel ((__vector unsigned long long)__B,
1293 (__vector unsigned long long)__A);
1296 /* Moves the lower two values of B into the upper two values of A. */
1297 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1298 _mm_movelh_ps (__m128 __A, __m128 __B)
1300 return (__m128) vec_mergeh ((__vector unsigned long long)__A,
1301 (__vector unsigned long long)__B);
1304 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
1305 the upper two values are passed through from A. */
1306 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1307 _mm_loadl_pi (__m128 __A, __m64 const *__P)
1309 __vector unsigned long long __a = (__vector unsigned long long)__A;
1310 __vector unsigned long long __p = vec_splats(*__P);
1316 /* Stores the lower two SPFP values of A into P. */
1317 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1318 _mm_storel_pi (__m64 *__P, __m128 __A)
1320 __vector unsigned long long __a = (__vector unsigned long long) __A;
1326 /* Intrinsic functions that require PowerISA 2.07 minimum. */
1328 /* Creates a 4-bit mask from the most significant bits of the SPFP values. */
1329 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1330 _mm_movemask_ps (__m128 __A)
1332 __vector unsigned long long result;
1333 static const __vector unsigned int perm_mask =
1335 #ifdef __LITTLE_ENDIAN__
1336 0x00204060, 0x80808080, 0x80808080, 0x80808080
1338 0x80808080, 0x80808080, 0x80808080, 0x00204060
1342 result = ((__vector unsigned long long)
1343 vec_vbpermq ((__vector unsigned char) __A,
1344 (__vector unsigned char) perm_mask));
1346 #ifdef __LITTLE_ENDIAN__
1352 #endif /* _ARCH_PWR8 */
1354 /* Create a vector with all four elements equal to *P. */
1355 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1356 _mm_load1_ps (float const *__P)
1358 return _mm_set1_ps (*__P);
1361 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1362 _mm_load_ps1 (float const *__P)
1364 return _mm_load1_ps (__P);
1367 /* Extracts one of the four words of A. The selector N must be immediate. */
1368 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1369 _mm_extract_pi16 (__m64 const __A, int const __N)
1371 unsigned int shiftr = __N & 3;
1372 #ifdef __BIG_ENDIAN__
1373 shiftr = 3 - shiftr;
1376 return ((__A >> (shiftr * 16)) & 0xffff);
1379 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1380 _m_pextrw (__m64 const __A, int const __N)
1382 return _mm_extract_pi16 (__A, __N);
1385 /* Inserts word D into one of four words of A. The selector N must be
1387 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1388 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1390 const int shiftl = (__N & 3) * 16;
1391 const __m64 shiftD = (const __m64) __D << shiftl;
1392 const __m64 mask = 0xffffUL << shiftl;
1393 __m64 result = (__A & (~mask)) | (shiftD & mask);
1398 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1399 _m_pinsrw (__m64 const __A, int const __D, int const __N)
1401 return _mm_insert_pi16 (__A, __D, __N);
1404 /* Compute the element-wise maximum of signed 16-bit values. */
1405 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1407 _mm_max_pi16 (__m64 __A, __m64 __B)
1410 __vector signed short a, b, r;
1411 __vector __bool short c;
1413 a = (__vector signed short)vec_splats (__A);
1414 b = (__vector signed short)vec_splats (__B);
1415 c = (__vector __bool short)vec_cmpgt (a, b);
1416 r = vec_sel (b, a, c);
1417 return (__m64) ((__vector long long) r)[0];
1419 __m64_union m1, m2, res;
1425 (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1427 (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1429 (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1431 (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1433 return (__m64) res.as_m64;
1437 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1438 _m_pmaxsw (__m64 __A, __m64 __B)
1440 return _mm_max_pi16 (__A, __B);
1443 /* Compute the element-wise maximum of unsigned 8-bit values. */
1444 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1445 _mm_max_pu8 (__m64 __A, __m64 __B)
1448 __vector unsigned char a, b, r;
1449 __vector __bool char c;
1451 a = (__vector unsigned char)vec_splats (__A);
1452 b = (__vector unsigned char)vec_splats (__B);
1453 c = (__vector __bool char)vec_cmpgt (a, b);
1454 r = vec_sel (b, a, c);
1455 return (__m64) ((__vector long long) r)[0];
1457 __m64_union m1, m2, res;
1464 for (i = 0; i < 8; i++)
1466 ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1467 m1.as_char[i] : m2.as_char[i];
1469 return (__m64) res.as_m64;
1473 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1474 _m_pmaxub (__m64 __A, __m64 __B)
1476 return _mm_max_pu8 (__A, __B);
1479 /* Compute the element-wise minimum of signed 16-bit values. */
1480 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1481 _mm_min_pi16 (__m64 __A, __m64 __B)
1484 __vector signed short a, b, r;
1485 __vector __bool short c;
1487 a = (__vector signed short)vec_splats (__A);
1488 b = (__vector signed short)vec_splats (__B);
1489 c = (__vector __bool short)vec_cmplt (a, b);
1490 r = vec_sel (b, a, c);
1491 return (__m64) ((__vector long long) r)[0];
1493 __m64_union m1, m2, res;
1499 (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1501 (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1503 (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1505 (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1507 return (__m64) res.as_m64;
1511 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1512 _m_pminsw (__m64 __A, __m64 __B)
1514 return _mm_min_pi16 (__A, __B);
1517 /* Compute the element-wise minimum of unsigned 8-bit values. */
1518 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1519 _mm_min_pu8 (__m64 __A, __m64 __B)
1522 __vector unsigned char a, b, r;
1523 __vector __bool char c;
1525 a = (__vector unsigned char)vec_splats (__A);
1526 b = (__vector unsigned char)vec_splats (__B);
1527 c = (__vector __bool char)vec_cmplt (a, b);
1528 r = vec_sel (b, a, c);
1529 return (__m64) ((__vector long long) r)[0];
1531 __m64_union m1, m2, res;
1538 for (i = 0; i < 8; i++)
1540 ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
1541 m1.as_char[i] : m2.as_char[i];
1543 return (__m64) res.as_m64;
1547 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1548 _m_pminub (__m64 __A, __m64 __B)
1550 return _mm_min_pu8 (__A, __B);
1553 /* Create an 8-bit mask of the signs of 8-bit values. */
1554 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1555 _mm_movemask_pi8 (__m64 __A)
1557 unsigned long long p =
1558 #ifdef __LITTLE_ENDIAN__
1559 0x0008101820283038UL; // permute control for sign bits
1561 0x3830282018100800UL; // permute control for sign bits
1563 return __builtin_bpermd (p, __A);
1566 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1567 _m_pmovmskb (__m64 __A)
1569 return _mm_movemask_pi8 (__A);
1572 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1573 in B and produce the high 16 bits of the 32-bit results. */
1574 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1575 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
1577 __vector unsigned short a, b;
1578 __vector unsigned short c;
1579 __vector unsigned int w0, w1;
1580 __vector unsigned char xform1 = {
1581 #ifdef __LITTLE_ENDIAN__
1582 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1583 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1585 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1586 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1590 a = (__vector unsigned short)vec_splats (__A);
1591 b = (__vector unsigned short)vec_splats (__B);
1593 w0 = vec_vmuleuh (a, b);
1594 w1 = vec_vmulouh (a, b);
1595 c = (__vector unsigned short)vec_perm (w0, w1, xform1);
1597 return (__m64) ((__vector long long) c)[0];
1600 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1601 _m_pmulhuw (__m64 __A, __m64 __B)
1603 return _mm_mulhi_pu16 (__A, __B);
1606 /* Return a combination of the four 16-bit values in A. The selector
1607 must be an immediate. */
1608 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1609 _mm_shuffle_pi16 (__m64 __A, int const __N)
1611 unsigned long element_selector_10 = __N & 0x03;
1612 unsigned long element_selector_32 = (__N >> 2) & 0x03;
1613 unsigned long element_selector_54 = (__N >> 4) & 0x03;
1614 unsigned long element_selector_76 = (__N >> 6) & 0x03;
1615 static const unsigned short permute_selectors[4] =
1617 #ifdef __LITTLE_ENDIAN__
1618 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1620 0x0607, 0x0405, 0x0203, 0x0001
1624 __vector unsigned long long a, p, r;
1626 #ifdef __LITTLE_ENDIAN__
1627 t.as_short[0] = permute_selectors[element_selector_10];
1628 t.as_short[1] = permute_selectors[element_selector_32];
1629 t.as_short[2] = permute_selectors[element_selector_54];
1630 t.as_short[3] = permute_selectors[element_selector_76];
1632 t.as_short[3] = permute_selectors[element_selector_10];
1633 t.as_short[2] = permute_selectors[element_selector_32];
1634 t.as_short[1] = permute_selectors[element_selector_54];
1635 t.as_short[0] = permute_selectors[element_selector_76];
1637 p = vec_splats (t.as_m64);
1638 a = vec_splats (__A);
1639 r = vec_perm (a, a, (__vector unsigned char)p);
1640 return (__m64) ((__vector long long) r)[0];
1643 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1644 _m_pshufw (__m64 __A, int const __N)
1646 return _mm_shuffle_pi16 (__A, __N);
1649 /* Conditionally store byte elements of A into P. The high bit of each
1650 byte in the selector N determines whether the corresponding byte from
1652 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1653 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1655 __m64 hibit = 0x8080808080808080UL;
1657 __m64 *p = (__m64*)__P;
1660 mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
1661 tmp = (tmp & (~mask)) | (__A & mask);
1665 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1666 _m_maskmovq (__m64 __A, __m64 __N, char *__P)
1668 _mm_maskmove_si64 (__A, __N, __P);
1671 /* Compute the rounded averages of the unsigned 8-bit values in A and B. */
1672 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1673 _mm_avg_pu8 (__m64 __A, __m64 __B)
1675 __vector unsigned char a, b, c;
1677 a = (__vector unsigned char)vec_splats (__A);
1678 b = (__vector unsigned char)vec_splats (__B);
1680 return (__m64) ((__vector long long) c)[0];
1683 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1684 _m_pavgb (__m64 __A, __m64 __B)
1686 return _mm_avg_pu8 (__A, __B);
1689 /* Compute the rounded averages of the unsigned 16-bit values in A and B. */
1690 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1691 _mm_avg_pu16 (__m64 __A, __m64 __B)
1693 __vector unsigned short a, b, c;
1695 a = (__vector unsigned short)vec_splats (__A);
1696 b = (__vector unsigned short)vec_splats (__B);
1698 return (__m64) ((__vector long long) c)[0];
1701 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1702 _m_pavgw (__m64 __A, __m64 __B)
1704 return _mm_avg_pu16 (__A, __B);
1707 /* Compute the sum of the absolute differences of the unsigned 8-bit
1708 values in A and B. Return the value in the lower 16-bit word; the
1709 upper words are cleared. */
1710 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1711 _mm_sad_pu8 (__m64 __A, __m64 __B)
1713 __vector unsigned char a, b;
1714 __vector unsigned char vmin, vmax, vabsdiff;
1715 __vector signed int vsum;
1716 const __vector unsigned int zero =
1718 __m64_union result = {0};
1720 a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
1721 b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
1722 vmin = vec_min (a, b);
1723 vmax = vec_max (a, b);
1724 vabsdiff = vec_sub (vmax, vmin);
1725 /* Sum four groups of bytes into integers. */
1726 vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
1727 /* Sum across four integers with integer result. */
1728 vsum = vec_sums (vsum, (__vector signed int) zero);
1729 /* The sum is in the right most 32-bits of the vector result.
1730 Transfer to a GPR and truncate to 16 bits. */
1731 result.as_short[0] = vsum[3];
1732 return result.as_m64;
1735 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1736 _m_psadbw (__m64 __A, __m64 __B)
1738 return _mm_sad_pu8 (__A, __B);
1741 /* Stores the data in A to the address P without polluting the caches. */
1742 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1743 _mm_stream_pi (__m64 *__P, __m64 __A)
1745 /* Use the data cache block touch for store transient. */
1755 /* Likewise. The address must be 16-byte aligned. */
1756 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1757 _mm_stream_ps (float *__P, __m128 __A)
1759 /* Use the data cache block touch for store transient. */
1766 _mm_store_ps (__P, __A);
1769 /* Guarantees that every preceding store is globally visible before
1770 any subsequent store. */
1771 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1774 /* Generate a light weight sync. */
1775 __atomic_thread_fence (__ATOMIC_RELEASE);
1778 /* The execution of the next instruction is delayed by an implementation
1779 specific amount of time. The instruction does not modify the
1780 architectural state. This is after the pop_options pragma because
1781 it does not require SSE support in the processor--the encoding is a
1782 nop on processors that do not support it. */
1783 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1786 /* There is no exact match with this construct, but the following is
1787 close to the desired effect. */
1789 /* On power8 and later processors we can depend on Program Priority
1790 (PRI) and associated "very low" PPI setting. Since we don't know
1791 what PPI this thread is running at we: 1) save the current PRI
1792 from the PPR SPR into a local GPR, 2) set the PRI to "very low"
1793 via the special or 31,31,31 encoding. 3) issue an "isync" to
1794 ensure the PRI change takes effect before we execute any more
1796 Now we can execute a lwsync (release barrier) while we execute
1797 this thread at "very low" PRI. Finally we restore the original
1798 PRI and continue execution. */
1799 unsigned long __PPR;
1813 /* For older processor where we may not even have Program Priority
1814 controls we can only depend on Heavy Weight Sync. */
1815 __atomic_thread_fence (__ATOMIC_SEQ_CST);
/* Transpose the 4x4 matrix composed of row[0-3].  Each row argument is
   read exactly once into a local copy before any row is written, and each
   row is then assigned exactly once.  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  /* Snapshot the four input rows.  */					\
  __v4sf __r0 = (row0);							\
  __v4sf __r1 = (row1);							\
  __v4sf __r2 = (row2);							\
  __v4sf __r3 = (row3);							\
  /* Word-wise interleave of row pairs (0,1) and (2,3).  */		\
  __v4sf __t0 = vec_vmrghw (__r0, __r1);				\
  __v4sf __t1 = vec_vmrghw (__r2, __r3);				\
  __v4sf __t2 = vec_vmrglw (__r0, __r1);				\
  __v4sf __t3 = vec_vmrglw (__r2, __r3);				\
  /* Doubleword-wise merge of the interleaved pairs.  */		\
  (row0) = (__v4sf) vec_mergeh ((__vector long long) __t0,		\
				(__vector long long) __t1);		\
  (row1) = (__v4sf) vec_mergel ((__vector long long) __t0,		\
				(__vector long long) __t1);		\
  (row2) = (__v4sf) vec_mergeh ((__vector long long) __t2,		\
				(__vector long long) __t3);		\
  (row3) = (__v4sf) vec_mergel ((__vector long long) __t2,		\
				(__vector long long) __t3);		\
} while (0)
1837 /* For backward source compatibility. */
1838 //# include <emmintrin.h>
1841 #include_next <xmmintrin.h>
1842 #endif /* defined(__linux__) && defined(__ppc64__) */
1844 #endif /* _XMMINTRIN_H_INCLUDED */