contrib/llvm-project/clang/lib/Headers/ppc_wrappers/xmmintrin.h

   1 /*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
   2  *
   3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4  * See https://llvm.org/LICENSE.txt for license information.
   5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6  *
   7  *===-----------------------------------------------------------------------===
   8  */
   9
  10 /* Implemented from the specification included in the Intel C++ Compiler
  11    User Guide and Reference, version 9.0.  */
  12
  13 #ifndef NO_WARN_X86_INTRINSICS
  14 /* This header file is to help porting code using Intel intrinsics
  15    explicitly from x86_64 to powerpc64/powerpc64le.
  16
   Since X86 SSE intrinsics mainly handle the __m128 type, PowerPC
  18    VMX/VSX ISA is a good match for vector float SIMD operations.
  19    However scalar float operations in vector (XMM) registers require
  20    the POWER8 VSX ISA (2.07) level. There are differences for data
  21    format and placement of float scalars in the vector register, which
  22    require extra steps to match SSE scalar float semantics on POWER.
  23
   It should be noted that there are significant differences between
   X86_64's MXCSR and PowerISA's FPSCR/VSCR registers. It's recommended
   to use portable <fenv.h> instead of accessing MXCSR directly.
  27
  28    Most SSE scalar float intrinsic operations can be performed more
  29    efficiently as C language float scalar operations or optimized to
  30    use vector SIMD operations. We recommend this for new applications. */
  31 #error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
  32 #endif
  33
  34 #ifndef _XMMINTRIN_H_INCLUDED
  35 #define _XMMINTRIN_H_INCLUDED
  36
  37 #if defined(__linux__) && defined(__ppc64__)
  38
  39 /* Define four value permute mask */
  40 #define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
  41
  42 #include <altivec.h>
  43
  44 /* Avoid collisions between altivec.h and strict adherence to C++ and
  45    C11 standards.  This should eventually be done inside altivec.h itself,
  46    but only after testing a full distro build.  */
  47 #if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
  48                                  (defined(__STDC_VERSION__) &&  \
  49                                   __STDC_VERSION__ >= 201112L))
  50 #undef vector
  51 #undef pixel
  52 #undef bool
  53 #endif
  54
  55 /* We need type definitions from the MMX header file.  */
  56 #include <mmintrin.h>
  57
  58 /* Get _mm_malloc () and _mm_free ().  */
  59 #if __STDC_HOSTED__
  60 #include <mm_malloc.h>
  61 #endif
  62
  63 /* The Intel API is flexible enough that we must allow aliasing with other
  64    vector types, and their scalar components.  */
  65 typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
  66
  67 /* Unaligned version of the same type.  */
  68 typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
  69                                        __aligned__ (1)));
  70
  71 /* Internal data types for implementing the intrinsics.  */
  72 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
  73
  74 /* Create an undefined vector.  */
  75 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  76 _mm_undefined_ps (void)
  77 {
  78   __m128 __Y = __Y;
  79   return __Y;
  80 }
  81
  82 /* Create a vector of zeros.  */
  83 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  84 _mm_setzero_ps (void)
  85 {
  86   return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
  87 }
  88
  89 /* Load four SPFP values from P.  The address must be 16-byte aligned.  */
  90 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  91 _mm_load_ps (float const *__P)
  92 {
  93   return ((__m128)vec_ld(0, (__v4sf*)__P));
  94 }
  95
  96 /* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
  97 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  98 _mm_loadu_ps (float const *__P)
  99 {
 100   return (vec_vsx_ld(0, __P));
 101 }
 102
 103 /* Load four SPFP values in reverse order.  The address must be aligned.  */
 104 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 105 _mm_loadr_ps (float const *__P)
 106 {
 107   __v4sf   __tmp;
 108   __m128 result;
 109   static const __vector unsigned char permute_vector =
 110     { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
 111         0x17, 0x10, 0x11, 0x12, 0x13 };
 112
 113   __tmp = vec_ld (0, (__v4sf *) __P);
 114   result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
 115   return result;
 116 }
 117
 118 /* Create a vector with all four elements equal to F.  */
 119 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 120 _mm_set1_ps (float __F)
 121 {
 122   return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
 123 }
 124
 125 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 126 _mm_set_ps1 (float __F)
 127 {
 128   return _mm_set1_ps (__F);
 129 }
 130
 131 /* Create the vector [Z Y X W].  */
 132 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 133 _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
 134 {
 135   return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
 136 }
 137
 138 /* Create the vector [W X Y Z].  */
 139 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 140 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
 141 {
 142   return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
 143 }
 144
 145 /* Store four SPFP values.  The address must be 16-byte aligned.  */
 146 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 147 _mm_store_ps (float *__P, __m128 __A)
 148 {
 149   vec_st((__v4sf)__A, 0, (__v4sf*)__P);
 150 }
 151
 152 /* Store four SPFP values.  The address need not be 16-byte aligned.  */
 153 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 154 _mm_storeu_ps (float *__P, __m128 __A)
 155 {
 156   *(__m128_u *)__P = __A;
 157 }
 158
 159 /* Store four SPFP values in reverse order.  The address must be aligned.  */
 160 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 161 _mm_storer_ps (float *__P, __m128 __A)
 162 {
 163   __v4sf   __tmp;
 164   static const __vector unsigned char permute_vector =
 165     { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
 166         0x17, 0x10, 0x11, 0x12, 0x13 };
 167
 168   __tmp = (__m128) vec_perm (__A, __A, permute_vector);
 169
 170   _mm_store_ps (__P, __tmp);
 171 }
 172
 173 /* Store the lower SPFP value across four words.  */
 174 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 175 _mm_store1_ps (float *__P, __m128 __A)
 176 {
 177   __v4sf __va = vec_splat((__v4sf)__A, 0);
 178   _mm_store_ps (__P, __va);
 179 }
 180
 181 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 182 _mm_store_ps1 (float *__P, __m128 __A)
 183 {
 184   _mm_store1_ps (__P, __A);
 185 }
 186
 187 /* Create a vector with element 0 as F and the rest zero.  */
 188 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 189 _mm_set_ss (float __F)
 190 {
 191   return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
 192 }
 193
 194 /* Sets the low SPFP value of A from the low value of B.  */
 195 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 196 _mm_move_ss (__m128 __A, __m128 __B)
 197 {
 198   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
 199
 200   return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
 201 }
 202
 203 /* Create a vector with element 0 as *P and the rest zero.  */
 204 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 205 _mm_load_ss (float const *__P)
 206 {
 207   return _mm_set_ss (*__P);
 208 }
 209
 210 /* Stores the lower SPFP value.  */
 211 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 212 _mm_store_ss (float *__P, __m128 __A)
 213 {
 214   *__P = ((__v4sf)__A)[0];
 215 }
 216
 217 /* Perform the respective operation on the lower SPFP (single-precision
 218    floating-point) values of A and B; the upper three SPFP values are
 219    passed through from A.  */
 220
 221 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 222 _mm_add_ss (__m128 __A, __m128 __B)
 223 {
 224 #ifdef _ARCH_PWR7
 225   __m128 a, b, c;
 226   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
 227   /* PowerISA VSX does not allow partial (for just lower double)
 228      results. So to insure we don't generate spurious exceptions
 229      (from the upper double values) we splat the lower double
 230      before we to the operation.  */
 231   a = vec_splat (__A, 0);
 232   b = vec_splat (__B, 0);
 233   c = a + b;
 234   /* Then we merge the lower float result with the original upper
 235      float elements from __A.  */
 236   return (vec_sel (__A, c, mask));
 237 #else
 238   __A[0] = __A[0] + __B[0];
 239   return (__A);
 240 #endif
 241 }
 242
 243 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 244 _mm_sub_ss (__m128 __A, __m128 __B)
 245 {
 246 #ifdef _ARCH_PWR7
 247   __m128 a, b, c;
 248   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
 249   /* PowerISA VSX does not allow partial (for just lower double)
 250      results. So to insure we don't generate spurious exceptions
 251      (from the upper double values) we splat the lower double
 252      before we to the operation.  */
 253   a = vec_splat (__A, 0);
 254   b = vec_splat (__B, 0);
 255   c = a - b;
 256   /* Then we merge the lower float result with the original upper
 257      float elements from __A.  */
 258   return (vec_sel (__A, c, mask));
 259 #else
 260   __A[0] = __A[0] - __B[0];
 261   return (__A);
 262 #endif
 263 }
 264
 265 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 266 _mm_mul_ss (__m128 __A, __m128 __B)
 267 {
 268 #ifdef _ARCH_PWR7
 269   __m128 a, b, c;
 270   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
 271   /* PowerISA VSX does not allow partial (for just lower double)
 272      results. So to insure we don't generate spurious exceptions
 273      (from the upper double values) we splat the lower double
 274      before we to the operation.  */
 275   a = vec_splat (__A, 0);
 276   b = vec_splat (__B, 0);
 277   c = a * b;
 278   /* Then we merge the lower float result with the original upper
 279      float elements from __A.  */
 280   return (vec_sel (__A, c, mask));
 281 #else
 282   __A[0] = __A[0] * __B[0];
 283   return (__A);
 284 #endif
 285 }
 286
 287 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 288 _mm_div_ss (__m128 __A, __m128 __B)
 289 {
 290 #ifdef _ARCH_PWR7
 291   __m128 a, b, c;
 292   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
 293   /* PowerISA VSX does not allow partial (for just lower double)
 294      results. So to insure we don't generate spurious exceptions
 295      (from the upper double values) we splat the lower double
 296      before we to the operation.  */
 297   a = vec_splat (__A, 0);
 298   b = vec_splat (__B, 0);
 299   c = a / b;
 300   /* Then we merge the lower float result with the original upper
 301      float elements from __A.  */
 302   return (vec_sel (__A, c, mask));
 303 #else
 304   __A[0] = __A[0] / __B[0];
 305   return (__A);
 306 #endif
 307 }
 308
 309 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 310 _mm_sqrt_ss (__m128 __A)
 311 {
 312   __m128 a, c;
 313   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
 314   /* PowerISA VSX does not allow partial (for just lower double)
 315    * results. So to insure we don't generate spurious exceptions
 316    * (from the upper double values) we splat the lower double
 317    * before we to the operation. */
 318   a = vec_splat (__A, 0);
 319   c = vec_sqrt (a);
 320   /* Then we merge the lower float result with the original upper
 321    * float elements from __A.  */
 322   return (vec_sel (__A, c, mask));
 323 }
 324
 325 /* Perform the respective operation on the four SPFP values in A and B.  */
 326 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 327 _mm_add_ps (__m128 __A, __m128 __B)
 328 {
 329   return (__m128) ((__v4sf)__A + (__v4sf)__B);
 330 }
 331
 332 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 333 _mm_sub_ps (__m128 __A, __m128 __B)
 334 {
 335   return (__m128) ((__v4sf)__A - (__v4sf)__B);
 336 }
 337
 338 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 339 _mm_mul_ps (__m128 __A, __m128 __B)
 340 {
 341   return (__m128) ((__v4sf)__A * (__v4sf)__B);
 342 }
 343
 344 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 345 _mm_div_ps (__m128 __A, __m128 __B)
 346 {
 347   return (__m128) ((__v4sf)__A / (__v4sf)__B);
 348 }
 349
 350 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 351 _mm_sqrt_ps (__m128 __A)
 352 {
 353   return (vec_sqrt ((__v4sf)__A));
 354 }
 355
 356 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 357 _mm_rcp_ps (__m128 __A)
 358 {
 359   return (vec_re ((__v4sf)__A));
 360 }
 361
 362 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 363 _mm_rsqrt_ps (__m128 __A)
 364 {
 365   return (vec_rsqrte (__A));
 366 }
 367
 368 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 369 _mm_rcp_ss (__m128 __A)
 370 {
 371   __m128 a, c;
 372   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
 373   /* PowerISA VSX does not allow partial (for just lower double)
 374    * results. So to insure we don't generate spurious exceptions
 375    * (from the upper double values) we splat the lower double
 376    * before we to the operation. */
 377   a = vec_splat (__A, 0);
 378   c = _mm_rcp_ps (a);
 379   /* Then we merge the lower float result with the original upper
 380    * float elements from __A.  */
 381   return (vec_sel (__A, c, mask));
 382 }
 383
 384 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 385 _mm_rsqrt_ss (__m128 __A)
 386 {
 387   __m128 a, c;
 388   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
 389   /* PowerISA VSX does not allow partial (for just lower double)
 390    * results. So to insure we don't generate spurious exceptions
 391    * (from the upper double values) we splat the lower double
 392    * before we to the operation. */
 393   a = vec_splat (__A, 0);
 394   c = vec_rsqrte (a);
 395   /* Then we merge the lower float result with the original upper
 396    * float elements from __A.  */
 397   return (vec_sel (__A, c, mask));
 398 }
 399
 400 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 401 _mm_min_ss (__m128 __A, __m128 __B)
 402 {
 403   __v4sf a, b, c;
 404   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
 405   /* PowerISA VSX does not allow partial (for just lower float)
 406    * results. So to insure we don't generate spurious exceptions
 407    * (from the upper float values) we splat the lower float
 408    * before we to the operation. */
 409   a = vec_splat ((__v4sf)__A, 0);
 410   b = vec_splat ((__v4sf)__B, 0);
 411   c = vec_min (a, b);
 412   /* Then we merge the lower float result with the original upper
 413    * float elements from __A.  */
 414   return (vec_sel ((__v4sf)__A, c, mask));
 415 }
 416
 417 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 418 _mm_max_ss (__m128 __A, __m128 __B)
 419 {
 420   __v4sf a, b, c;
 421   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
 422   /* PowerISA VSX does not allow partial (for just lower float)
 423    * results. So to insure we don't generate spurious exceptions
 424    * (from the upper float values) we splat the lower float
 425    * before we to the operation. */
 426   a = vec_splat (__A, 0);
 427   b = vec_splat (__B, 0);
 428   c = vec_max (a, b);
 429   /* Then we merge the lower float result with the original upper
 430    * float elements from __A.  */
 431   return (vec_sel ((__v4sf)__A, c, mask));
 432 }
 433
 434 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 435 _mm_min_ps (__m128 __A, __m128 __B)
 436 {
 437   __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
 438   return vec_sel (__B, __A, m);
 439 }
 440
 441 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 442 _mm_max_ps (__m128 __A, __m128 __B)
 443 {
 444   __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
 445   return vec_sel (__B, __A, m);
 446 }
 447
 448 /* Perform logical bit-wise operations on 128-bit values.  */
 449 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 450 _mm_and_ps (__m128 __A, __m128 __B)
 451 {
 452   return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
 453 //  return __builtin_ia32_andps (__A, __B);
 454 }
 455
 456 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 457 _mm_andnot_ps (__m128 __A, __m128 __B)
 458 {
 459   return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
 460 }
 461
 462 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 463 _mm_or_ps (__m128 __A, __m128 __B)
 464 {
 465   return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
 466 }
 467
 468 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 469 _mm_xor_ps (__m128 __A, __m128 __B)
 470 {
 471   return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
 472 }
 473
 474 /* Perform a comparison on the four SPFP values of A and B.  For each
 475    element, if the comparison is true, place a mask of all ones in the
 476    result, otherwise a mask of zeros.  */
 477 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 478 _mm_cmpeq_ps (__m128 __A, __m128 __B)
 479 {
 480   return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
 481 }
 482
 483 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 484 _mm_cmplt_ps (__m128 __A, __m128 __B)
 485 {
 486   return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
 487 }
 488
 489 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 490 _mm_cmple_ps (__m128 __A, __m128 __B)
 491 {
 492   return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
 493 }
 494
 495 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 496 _mm_cmpgt_ps (__m128 __A, __m128 __B)
 497 {
 498   return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
 499 }
 500
 501 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 502 _mm_cmpge_ps (__m128 __A, __m128 __B)
 503 {
 504   return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
 505 }
 506
 507 extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 508 _mm_cmpneq_ps (__m128  __A, __m128  __B)
 509 {
 510   __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
 511   return ((__m128)vec_nor (temp, temp));
 512 }
 513
 514 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 515 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
 516 {
 517   return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
 518 }
 519
 520 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 521 _mm_cmpnle_ps (__m128 __A, __m128 __B)
 522 {
 523   return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
 524 }
 525
 526 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 527 _mm_cmpngt_ps (__m128 __A, __m128 __B)
 528 {
 529   return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
 530 }
 531
 532 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 533 _mm_cmpnge_ps (__m128 __A, __m128 __B)
 534 {
 535   return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
 536 }
 537
 538 extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 539 _mm_cmpord_ps (__m128  __A, __m128  __B)
 540 {
 541   __vector unsigned int a, b;
 542   __vector unsigned int c, d;
 543   static const __vector unsigned int float_exp_mask =
 544     { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
 545
 546   a = (__vector unsigned int) vec_abs ((__v4sf)__A);
 547   b = (__vector unsigned int) vec_abs ((__v4sf)__B);
 548   c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
 549   d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
 550   return ((__m128 ) vec_and (c, d));
 551 }
 552
 553 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 554 _mm_cmpunord_ps (__m128 __A, __m128 __B)
 555 {
 556   __vector unsigned int a, b;
 557   __vector unsigned int c, d;
 558   static const __vector unsigned int float_exp_mask =
 559     { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
 560
 561   a = (__vector unsigned int) vec_abs ((__v4sf)__A);
 562   b = (__vector unsigned int) vec_abs ((__v4sf)__B);
 563   c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
 564   d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
 565   return ((__m128 ) vec_or (c, d));
 566 }
 567
 568 /* Perform a comparison on the lower SPFP values of A and B.  If the
 569    comparison is true, place a mask of all ones in the result, otherwise a
 570    mask of zeros.  The upper three SPFP values are passed through from A.  */
 571 extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 572 _mm_cmpeq_ss (__m128  __A, __m128  __B)
 573 {
 574   static const __vector unsigned int mask =
 575     { 0xffffffff, 0, 0, 0 };
 576   __v4sf a, b, c;
 577   /* PowerISA VMX does not allow partial (for just element 0)
 578    * results. So to insure we don't generate spurious exceptions
 579    * (from the upper elements) we splat the lower float
 580    * before we to the operation. */
 581   a = vec_splat ((__v4sf) __A, 0);
 582   b = vec_splat ((__v4sf) __B, 0);
 583   c = (__v4sf) vec_cmpeq(a, b);
 584   /* Then we merge the lower float result with the original upper
 585    * float elements from __A.  */
 586   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
 587 }
 588
 589 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 590 _mm_cmplt_ss (__m128 __A, __m128 __B)
 591 {
 592   static const __vector unsigned int mask =
 593     { 0xffffffff, 0, 0, 0 };
 594   __v4sf a, b, c;
 595   /* PowerISA VMX does not allow partial (for just element 0)
 596    * results. So to insure we don't generate spurious exceptions
 597    * (from the upper elements) we splat the lower float
 598    * before we to the operation. */
 599   a = vec_splat ((__v4sf) __A, 0);
 600   b = vec_splat ((__v4sf) __B, 0);
 601   c = (__v4sf) vec_cmplt(a, b);
 602   /* Then we merge the lower float result with the original upper
 603    * float elements from __A.  */
 604   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
 605 }
 606
 607 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 608 _mm_cmple_ss (__m128 __A, __m128 __B)
 609 {
 610   static const __vector unsigned int mask =
 611     { 0xffffffff, 0, 0, 0 };
 612   __v4sf a, b, c;
 613   /* PowerISA VMX does not allow partial (for just element 0)
 614    * results. So to insure we don't generate spurious exceptions
 615    * (from the upper elements) we splat the lower float
 616    * before we to the operation. */
 617   a = vec_splat ((__v4sf) __A, 0);
 618   b = vec_splat ((__v4sf) __B, 0);
 619   c = (__v4sf) vec_cmple(a, b);
 620   /* Then we merge the lower float result with the original upper
 621    * float elements from __A.  */
 622   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
 623 }
 624
 625 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 626 _mm_cmpgt_ss (__m128 __A, __m128 __B)
 627 {
 628   static const __vector unsigned int mask =
 629     { 0xffffffff, 0, 0, 0 };
 630   __v4sf a, b, c;
 631   /* PowerISA VMX does not allow partial (for just element 0)
 632    * results. So to insure we don't generate spurious exceptions
 633    * (from the upper elements) we splat the lower float
 634    * before we to the operation. */
 635   a = vec_splat ((__v4sf) __A, 0);
 636   b = vec_splat ((__v4sf) __B, 0);
 637   c = (__v4sf) vec_cmpgt(a, b);
 638   /* Then we merge the lower float result with the original upper
 639    * float elements from __A.  */
 640   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
 641 }
 642
 643 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 644 _mm_cmpge_ss (__m128 __A, __m128 __B)
 645 {
 646   static const __vector unsigned int mask =
 647     { 0xffffffff, 0, 0, 0 };
 648   __v4sf a, b, c;
 649   /* PowerISA VMX does not allow partial (for just element 0)
 650    * results. So to insure we don't generate spurious exceptions
 651    * (from the upper elements) we splat the lower float
 652    * before we to the operation. */
 653   a = vec_splat ((__v4sf) __A, 0);
 654   b = vec_splat ((__v4sf) __B, 0);
 655   c = (__v4sf) vec_cmpge(a, b);
 656   /* Then we merge the lower float result with the original upper
 657    * float elements from __A.  */
 658   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
 659 }
 660
 661 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 662 _mm_cmpneq_ss (__m128 __A, __m128 __B)
 663 {
 664   static const __vector unsigned int mask =
 665     { 0xffffffff, 0, 0, 0 };
 666   __v4sf a, b, c;
 667   /* PowerISA VMX does not allow partial (for just element 0)
 668    * results. So to insure we don't generate spurious exceptions
 669    * (from the upper elements) we splat the lower float
 670    * before we to the operation. */
 671   a = vec_splat ((__v4sf) __A, 0);
 672   b = vec_splat ((__v4sf) __B, 0);
 673   c = (__v4sf) vec_cmpeq(a, b);
 674   c = vec_nor (c, c);
 675   /* Then we merge the lower float result with the original upper
 676    * float elements from __A.  */
 677   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
 678 }
 679
 680 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 681 _mm_cmpnlt_ss (__m128 __A, __m128 __B)
 682 {
 683   static const __vector unsigned int mask =
 684     { 0xffffffff, 0, 0, 0 };
 685   __v4sf a, b, c;
 686   /* PowerISA VMX does not allow partial (for just element 0)
 687    * results. So to insure we don't generate spurious exceptions
 688    * (from the upper elements) we splat the lower float
 689    * before we to the operation. */
 690   a = vec_splat ((__v4sf) __A, 0);
 691   b = vec_splat ((__v4sf) __B, 0);
 692   c = (__v4sf) vec_cmpge(a, b);
 693   /* Then we merge the lower float result with the original upper
 694    * float elements from __A.  */
 695   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
 696 }
 697
 698 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 699 _mm_cmpnle_ss (__m128 __A, __m128 __B)
 700 {
 701   static const __vector unsigned int mask =
 702     { 0xffffffff, 0, 0, 0 };
 703   __v4sf a, b, c;
 704   /* PowerISA VMX does not allow partial (for just element 0)
 705    * results. So to insure we don't generate spurious exceptions
 706    * (from the upper elements) we splat the lower float
 707    * before we to the operation. */
 708   a = vec_splat ((__v4sf) __A, 0);
 709   b = vec_splat ((__v4sf) __B, 0);
 710   c = (__v4sf) vec_cmpgt(a, b);
 711   /* Then we merge the lower float result with the original upper
 712    * float elements from __A.  */
 713   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
 714 }
 715
 716 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 717 _mm_cmpngt_ss (__m128 __A, __m128 __B)
 718 {
 719   static const __vector unsigned int mask =
 720     { 0xffffffff, 0, 0, 0 };
 721   __v4sf a, b, c;
 722   /* PowerISA VMX does not allow partial (for just element 0)
 723    * results. So to insure we don't generate spurious exceptions
 724    * (from the upper elements) we splat the lower float
 725    * before we to the operation. */
 726   a = vec_splat ((__v4sf) __A, 0);
 727   b = vec_splat ((__v4sf) __B, 0);
 728   c = (__v4sf) vec_cmple(a, b);
 729   /* Then we merge the lower float result with the original upper
 730    * float elements from __A.  */
 731   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
 732 }
 733
 734 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 735 _mm_cmpnge_ss (__m128 __A, __m128 __B)
 736 {
 737   static const __vector unsigned int mask =
 738     { 0xffffffff, 0, 0, 0 };
 739   __v4sf a, b, c;
 740   /* PowerISA VMX does not allow partial (for just element 0)
 741    * results. So to insure we don't generate spurious exceptions
 742    * (from the upper elements) we splat the lower float
 743    * before we do the operation. */
 744   a = vec_splat ((__v4sf) __A, 0);
 745   b = vec_splat ((__v4sf) __B, 0);
 746   c = (__v4sf) vec_cmplt(a, b);
 747   /* Then we merge the lower float result with the original upper
 748    * float elements from __A.  */
 749   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
 750 }
 751
 752 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 753 _mm_cmpord_ss (__m128 __A, __m128 __B)
 754 {
 755   __vector unsigned int a, b;
 756   __vector unsigned int c, d;
 757   static const __vector unsigned int float_exp_mask =
 758     { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
 759   static const __vector unsigned int mask =
 760     { 0xffffffff, 0, 0, 0 };
 761
 762   a = (__vector unsigned int) vec_abs ((__v4sf)__A);
 763   b = (__vector unsigned int) vec_abs ((__v4sf)__B);
 764   c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
 765   d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
 766   c = vec_and (c, d);
 767   /* Then we merge the lower float result with the original upper
 768    * float elements from __A.  */
 769   return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
 770 }
 771
 772 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 773 _mm_cmpunord_ss (__m128 __A, __m128 __B)
 774 {
 775   __vector unsigned int a, b;
 776   __vector unsigned int c, d;
 777   static const __vector unsigned int float_exp_mask =
 778     { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
 779   static const __vector unsigned int mask =
 780     { 0xffffffff, 0, 0, 0 };
 781
 782   a = (__vector unsigned int) vec_abs ((__v4sf)__A);
 783   b = (__vector unsigned int) vec_abs ((__v4sf)__B);
 784   c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
 785   d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
 786   c = vec_or (c, d);
 787   /* Then we merge the lower float result with the original upper
 788    * float elements from __A.  */
 789   return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
 790 }
 791
 792 /* Compare the lower SPFP values of A and B and return 1 if true
 793    and 0 if false.  */
 794 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 795 _mm_comieq_ss (__m128 __A, __m128 __B)
 796 {
 797   return (__A[0] == __B[0]);
 798 }
 799
 800 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 801 _mm_comilt_ss (__m128 __A, __m128 __B)
 802 {
 803   return (__A[0] < __B[0]);
 804 }
 805
 806 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 807 _mm_comile_ss (__m128 __A, __m128 __B)
 808 {
 809   return (__A[0] <= __B[0]);
 810 }
 811
 812 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 813 _mm_comigt_ss (__m128 __A, __m128 __B)
 814 {
 815   return (__A[0] > __B[0]);
 816 }
 817
 818 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 819 _mm_comige_ss (__m128 __A, __m128 __B)
 820 {
 821   return (__A[0] >= __B[0]);
 822 }
 823
 824 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 825 _mm_comineq_ss (__m128 __A, __m128 __B)
 826 {
 827   return (__A[0] != __B[0]);
 828 }
 829
/* FIXME
 * The _mm_ucomi??_ss implementations below are exactly the same as
 * _mm_comi??_ss because GCC for PowerPC only generates unordered
 * compares (scalar and vector).
 * Technically _mm_comieq_ss et al. should be using the ordered
 * compare and signal for QNaNs.
 * The _mm_ucomieq_ss et al. should be OK, as is.
 */
 838 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 839 _mm_ucomieq_ss (__m128 __A, __m128 __B)
 840 {
 841   return (__A[0] == __B[0]);
 842 }
 843
 844 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 845 _mm_ucomilt_ss (__m128 __A, __m128 __B)
 846 {
 847   return (__A[0] < __B[0]);
 848 }
 849
 850 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 851 _mm_ucomile_ss (__m128 __A, __m128 __B)
 852 {
 853   return (__A[0] <= __B[0]);
 854 }
 855
 856 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 857 _mm_ucomigt_ss (__m128 __A, __m128 __B)
 858 {
 859   return (__A[0] > __B[0]);
 860 }
 861
 862 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 863 _mm_ucomige_ss (__m128 __A, __m128 __B)
 864 {
 865   return (__A[0] >= __B[0]);
 866 }
 867
 868 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 869 _mm_ucomineq_ss (__m128 __A, __m128 __B)
 870 {
 871   return (__A[0] != __B[0]);
 872 }
 873
 874 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 875 _mm_cvtss_f32 (__m128 __A)
 876 {
 877   return ((__v4sf)__A)[0];
 878 }
 879
 880 /* Convert the lower SPFP value to a 32-bit integer according to the current
 881    rounding mode.  */
 882 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 883 _mm_cvtss_si32 (__m128 __A)
 884 {
 885   __m64 res = 0;
 886 #ifdef _ARCH_PWR8
 887   double dtmp;
 888   __asm__(
 889 #ifdef __LITTLE_ENDIAN__
 890       "xxsldwi %x0,%x0,%x0,3;\n"
 891 #endif
 892       "xscvspdp %x2,%x0;\n"
 893       "fctiw  %2,%2;\n"
 894       "mfvsrd  %1,%x2;\n"
 895       : "+wa" (__A),
 896         "=r" (res),
 897         "=f" (dtmp)
 898       : );
 899 #else
 900   res = __builtin_rint(__A[0]);
 901 #endif
 902   return (res);
 903 }
 904
 905 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 906 _mm_cvt_ss2si (__m128 __A)
 907 {
 908   return _mm_cvtss_si32 (__A);
 909 }
 910
 911 /* Convert the lower SPFP value to a 32-bit integer according to the
 912    current rounding mode.  */
 913
 914 /* Intel intrinsic.  */
 915 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 916 _mm_cvtss_si64 (__m128 __A)
 917 {
 918   __m64 res = 0;
 919 #ifdef _ARCH_PWR8
 920   double dtmp;
 921   __asm__(
 922 #ifdef __LITTLE_ENDIAN__
 923       "xxsldwi %x0,%x0,%x0,3;\n"
 924 #endif
 925       "xscvspdp %x2,%x0;\n"
 926       "fctid  %2,%2;\n"
 927       "mfvsrd  %1,%x2;\n"
 928       : "+wa" (__A),
 929         "=r" (res),
 930         "=f" (dtmp)
 931       : );
 932 #else
 933   res = __builtin_llrint(__A[0]);
 934 #endif
 935   return (res);
 936 }
 937
 938 /* Microsoft intrinsic.  */
 939 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 940 _mm_cvtss_si64x (__m128 __A)
 941 {
 942   return _mm_cvtss_si64 ((__v4sf) __A);
 943 }
 944
 945 /* Constants for use with _mm_prefetch.  */
 946 enum _mm_hint
 947 {
 948   /* _MM_HINT_ET is _MM_HINT_T with set 3rd bit.  */
 949   _MM_HINT_ET0 = 7,
 950   _MM_HINT_ET1 = 6,
 951   _MM_HINT_T0 = 3,
 952   _MM_HINT_T1 = 2,
 953   _MM_HINT_T2 = 1,
 954   _MM_HINT_NTA = 0
 955 };
 956
 957 /* Loads one cache line from address P to a location "closer" to the
 958    processor.  The selector I specifies the type of prefetch operation.  */
 959 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 960 _mm_prefetch (const void *__P, enum _mm_hint __I)
 961 {
 962   /* Current PowerPC will ignores the hint parameters.  */
 963   __builtin_prefetch (__P);
 964 }
 965
 966 /* Convert the two lower SPFP values to 32-bit integers according to the
 967    current rounding mode.  Return the integers in packed form.  */
 968 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 969 _mm_cvtps_pi32 (__m128 __A)
 970 {
 971   /* Splat two lower SPFP values to both halves.  */
 972   __v4sf temp, rounded;
 973   __vector unsigned long long result;
 974
 975   /* Splat two lower SPFP values to both halves.  */
 976   temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
 977   rounded = vec_rint(temp);
 978   result = (__vector unsigned long long) vec_cts (rounded, 0);
 979
 980   return (__m64) ((__vector long long) result)[0];
 981 }
 982
 983 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 984 _mm_cvt_ps2pi (__m128 __A)
 985 {
 986   return _mm_cvtps_pi32 (__A);
 987 }
 988
 989 /* Truncate the lower SPFP value to a 32-bit integer.  */
 990 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 991 _mm_cvttss_si32 (__m128 __A)
 992 {
 993   /* Extract the lower float element.  */
 994   float temp = __A[0];
 995   /* truncate to 32-bit integer and return.  */
 996   return temp;
 997 }
 998
 999 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1000 _mm_cvtt_ss2si (__m128 __A)
1001 {
1002   return _mm_cvttss_si32 (__A);
1003 }
1004
1005 /* Intel intrinsic.  */
1006 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1007 _mm_cvttss_si64 (__m128 __A)
1008 {
1009   /* Extract the lower float element.  */
1010   float temp = __A[0];
1011   /* truncate to 32-bit integer and return.  */
1012   return temp;
1013 }
1014
1015 /* Microsoft intrinsic.  */
1016 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1017 _mm_cvttss_si64x (__m128 __A)
1018 {
1019   /* Extract the lower float element.  */
1020   float temp = __A[0];
1021   /* truncate to 32-bit integer and return.  */
1022   return temp;
1023 }
1024
1025 /* Truncate the two lower SPFP values to 32-bit integers.  Return the
1026    integers in packed form.  */
1027 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1028 _mm_cvttps_pi32 (__m128 __A)
1029 {
1030   __v4sf temp;
1031   __vector unsigned long long result;
1032
1033   /* Splat two lower SPFP values to both halves.  */
1034   temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1035   result = (__vector unsigned long long) vec_cts (temp, 0);
1036
1037   return (__m64) ((__vector long long) result)[0];
1038 }
1039
1040 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1041 _mm_cvtt_ps2pi (__m128 __A)
1042 {
1043   return _mm_cvttps_pi32 (__A);
1044 }
1045
1046 /* Convert B to a SPFP value and insert it as element zero in A.  */
1047 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1048 _mm_cvtsi32_ss (__m128 __A, int __B)
1049 {
1050   float temp = __B;
1051   __A[0] = temp;
1052
1053   return __A;
1054 }
1055
1056 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1057 _mm_cvt_si2ss (__m128 __A, int __B)
1058 {
1059   return _mm_cvtsi32_ss (__A, __B);
1060 }
1061
1062 /* Convert B to a SPFP value and insert it as element zero in A.  */
1063 /* Intel intrinsic.  */
1064 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1065 _mm_cvtsi64_ss (__m128 __A, long long __B)
1066 {
1067   float temp = __B;
1068   __A[0] = temp;
1069
1070   return __A;
1071 }
1072
1073 /* Microsoft intrinsic.  */
1074 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1075 _mm_cvtsi64x_ss (__m128 __A, long long __B)
1076 {
1077   return _mm_cvtsi64_ss (__A, __B);
1078 }
1079
1080 /* Convert the two 32-bit values in B to SPFP form and insert them
1081    as the two lower elements in A.  */
1082 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1083 _mm_cvtpi32_ps (__m128        __A, __m64        __B)
1084 {
1085   __vector signed int vm1;
1086   __vector float vf1;
1087
1088   vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
1089   vf1 = (__vector float) vec_ctf (vm1, 0);
1090
1091   return ((__m128) (__vector unsigned long long)
1092     { ((__vector unsigned long long)vf1) [0],
1093         ((__vector unsigned long long)__A) [1]});
1094 }
1095
1096 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1097 _mm_cvt_pi2ps (__m128 __A, __m64 __B)
1098 {
1099   return _mm_cvtpi32_ps (__A, __B);
1100 }
1101
1102 /* Convert the four signed 16-bit values in A to SPFP form.  */
1103 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1104 _mm_cvtpi16_ps (__m64 __A)
1105 {
1106   __vector signed short vs8;
1107   __vector signed int vi4;
1108   __vector float vf1;
1109
1110   vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
1111   vi4 = vec_vupklsh (vs8);
1112   vf1 = (__vector float) vec_ctf (vi4, 0);
1113
1114   return (__m128) vf1;
1115 }
1116
1117 /* Convert the four unsigned 16-bit values in A to SPFP form.  */
1118 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1119 _mm_cvtpu16_ps (__m64 __A)
1120 {
1121   const __vector unsigned short zero =
1122     { 0, 0, 0, 0, 0, 0, 0, 0 };
1123   __vector unsigned short vs8;
1124   __vector unsigned int vi4;
1125   __vector float vf1;
1126
1127   vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
1128   vi4 = (__vector unsigned int) vec_mergel
1129 #ifdef __LITTLE_ENDIAN__
1130                                            (vs8, zero);
1131 #else
1132                                            (zero, vs8);
1133 #endif
1134   vf1 = (__vector float) vec_ctf (vi4, 0);
1135
1136   return (__m128) vf1;
1137 }
1138
1139 /* Convert the low four signed 8-bit values in A to SPFP form.  */
1140 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1141 _mm_cvtpi8_ps (__m64 __A)
1142 {
1143   __vector signed char vc16;
1144   __vector signed short vs8;
1145   __vector signed int vi4;
1146   __vector float vf1;
1147
1148   vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
1149   vs8 = vec_vupkhsb (vc16);
1150   vi4 = vec_vupkhsh (vs8);
1151   vf1 = (__vector float) vec_ctf (vi4, 0);
1152
1153   return (__m128) vf1;
1154 }
1155
1156 /* Convert the low four unsigned 8-bit values in A to SPFP form.  */
1157 extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1158
1159 _mm_cvtpu8_ps (__m64  __A)
1160 {
1161   const __vector unsigned char zero =
1162     { 0, 0, 0, 0, 0, 0, 0, 0 };
1163   __vector unsigned char vc16;
1164   __vector unsigned short vs8;
1165   __vector unsigned int vi4;
1166   __vector float vf1;
1167
1168   vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
1169 #ifdef __LITTLE_ENDIAN__
1170   vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
1171   vi4 = (__vector unsigned int) vec_mergeh (vs8,
1172                                             (__vector unsigned short) zero);
1173 #else
1174   vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
1175   vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
1176                                             vs8);
1177 #endif
1178   vf1 = (__vector float) vec_ctf (vi4, 0);
1179
1180   return (__m128) vf1;
1181 }
1182
1183 /* Convert the four signed 32-bit values in A and B to SPFP form.  */
1184 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1185 _mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
1186 {
1187   __vector signed int vi4;
1188   __vector float vf4;
1189
1190   vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
1191   vf4 = (__vector float) vec_ctf (vi4, 0);
1192   return (__m128) vf4;
1193 }
1194
1195 /* Convert the four SPFP values in A to four signed 16-bit integers.  */
1196 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1197 _mm_cvtps_pi16 (__m128 __A)
1198 {
1199   __v4sf rounded;
1200   __vector signed int temp;
1201   __vector unsigned long long result;
1202
1203   rounded = vec_rint(__A);
1204   temp = vec_cts (rounded, 0);
1205   result = (__vector unsigned long long) vec_pack (temp, temp);
1206
1207   return (__m64) ((__vector long long) result)[0];
1208 }
1209
1210 /* Convert the four SPFP values in A to four signed 8-bit integers.  */
1211 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1212 _mm_cvtps_pi8 (__m128 __A)
1213 {
1214   __v4sf rounded;
1215   __vector signed int tmp_i;
1216   static const __vector signed int zero = {0, 0, 0, 0};
1217   __vector signed short tmp_s;
1218   __vector signed char res_v;
1219
1220   rounded = vec_rint(__A);
1221   tmp_i = vec_cts (rounded, 0);
1222   tmp_s = vec_pack (tmp_i, zero);
1223   res_v = vec_pack (tmp_s, tmp_s);
1224   return (__m64) ((__vector long long) res_v)[0];
1225 }
1226
1227 /* Selects four specific SPFP values from A and B based on MASK.  */
1228 extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1229
1230 _mm_shuffle_ps (__m128  __A, __m128  __B, int const __mask)
1231 {
1232   unsigned long element_selector_10 = __mask & 0x03;
1233   unsigned long element_selector_32 = (__mask >> 2) & 0x03;
1234   unsigned long element_selector_54 = (__mask >> 4) & 0x03;
1235   unsigned long element_selector_76 = (__mask >> 6) & 0x03;
1236   static const unsigned int permute_selectors[4] =
1237     {
1238 #ifdef __LITTLE_ENDIAN__
1239       0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1240 #else
1241       0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
1242 #endif
1243     };
1244   __vector unsigned int t;
1245
1246   t[0] = permute_selectors[element_selector_10];
1247   t[1] = permute_selectors[element_selector_32];
1248   t[2] = permute_selectors[element_selector_54] + 0x10101010;
1249   t[3] = permute_selectors[element_selector_76] + 0x10101010;
1250   return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
1251 }
1252
1253 /* Selects and interleaves the upper two SPFP values from A and B.  */
1254 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1255 _mm_unpackhi_ps (__m128 __A, __m128 __B)
1256 {
1257   return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
1258 }
1259
1260 /* Selects and interleaves the lower two SPFP values from A and B.  */
1261 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1262 _mm_unpacklo_ps (__m128 __A, __m128 __B)
1263 {
1264   return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
1265 }
1266
1267 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
1268    the lower two values are passed through from A.  */
1269 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1270 _mm_loadh_pi (__m128 __A, __m64 const *__P)
1271 {
1272   __vector unsigned long long __a = (__vector unsigned long long)__A;
1273   __vector unsigned long long __p = vec_splats(*__P);
1274   __a [1] = __p [1];
1275
1276   return (__m128)__a;
1277 }
1278
1279 /* Stores the upper two SPFP values of A into P.  */
1280 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281 _mm_storeh_pi (__m64 *__P, __m128 __A)
1282 {
1283   __vector unsigned long long __a = (__vector unsigned long long) __A;
1284
1285   *__P = __a[1];
1286 }
1287
1288 /* Moves the upper two values of B into the lower two values of A.  */
1289 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1290 _mm_movehl_ps (__m128 __A, __m128 __B)
1291 {
1292   return (__m128) vec_mergel ((__vector unsigned long long)__B,
1293                               (__vector unsigned long long)__A);
1294 }
1295
1296 /* Moves the lower two values of B into the upper two values of A.  */
1297 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1298 _mm_movelh_ps (__m128 __A, __m128 __B)
1299 {
1300   return (__m128) vec_mergeh ((__vector unsigned long long)__A,
1301                               (__vector unsigned long long)__B);
1302 }
1303
1304 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
1305    the upper two values are passed through from A.  */
1306 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1307 _mm_loadl_pi (__m128 __A, __m64 const *__P)
1308 {
1309   __vector unsigned long long __a = (__vector unsigned long long)__A;
1310   __vector unsigned long long __p = vec_splats(*__P);
1311   __a [0] = __p [0];
1312
1313   return (__m128)__a;
1314 }
1315
1316 /* Stores the lower two SPFP values of A into P.  */
1317 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1318 _mm_storel_pi (__m64 *__P, __m128 __A)
1319 {
1320   __vector unsigned long long __a = (__vector unsigned long long) __A;
1321
1322   *__P = __a[0];
1323 }
1324
1325 #ifdef _ARCH_PWR8
1326 /* Intrinsic functions that require PowerISA 2.07 minimum.  */
1327
1328 /* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
1329 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1330 _mm_movemask_ps (__m128  __A)
1331 {
1332   __vector unsigned long long result;
1333   static const __vector unsigned int perm_mask =
1334     {
1335 #ifdef __LITTLE_ENDIAN__
1336         0x00204060, 0x80808080, 0x80808080, 0x80808080
1337 #else
1338       0x80808080, 0x80808080, 0x80808080, 0x00204060
1339 #endif
1340     };
1341
1342   result = ((__vector unsigned long long)
1343             vec_vbpermq ((__vector unsigned char) __A,
1344                          (__vector unsigned char) perm_mask));
1345
1346 #ifdef __LITTLE_ENDIAN__
1347   return result[1];
1348 #else
1349   return result[0];
1350 #endif
1351 }
1352 #endif /* _ARCH_PWR8 */
1353
1354 /* Create a vector with all four elements equal to *P.  */
1355 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1356 _mm_load1_ps (float const *__P)
1357 {
1358   return _mm_set1_ps (*__P);
1359 }
1360
1361 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1362 _mm_load_ps1 (float const *__P)
1363 {
1364   return _mm_load1_ps (__P);
1365 }
1366
1367 /* Extracts one of the four words of A.  The selector N must be immediate.  */
1368 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1369 _mm_extract_pi16 (__m64 const __A, int const __N)
1370 {
1371   unsigned int shiftr = __N & 3;
1372 #ifdef __BIG_ENDIAN__
1373   shiftr = 3 - shiftr;
1374 #endif
1375
1376   return ((__A >> (shiftr * 16)) & 0xffff);
1377 }
1378
1379 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1380 _m_pextrw (__m64 const __A, int const __N)
1381 {
1382   return _mm_extract_pi16 (__A, __N);
1383 }
1384
1385 /* Inserts word D into one of four words of A.  The selector N must be
1386    immediate.  */
1387 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1388 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1389 {
1390   const int shiftl = (__N & 3) * 16;
1391   const __m64 shiftD = (const __m64) __D << shiftl;
1392   const __m64 mask = 0xffffUL << shiftl;
1393   __m64 result = (__A & (~mask)) | (shiftD & mask);
1394
1395   return (result);
1396 }
1397
1398 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1399 _m_pinsrw (__m64 const __A, int const __D, int const __N)
1400 {
1401   return _mm_insert_pi16 (__A, __D, __N);
1402 }
1403
1404 /* Compute the element-wise maximum of signed 16-bit values.  */
1405 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1406
1407 _mm_max_pi16 (__m64 __A, __m64 __B)
1408 {
1409 #if _ARCH_PWR8
1410   __vector signed short a, b, r;
1411   __vector __bool short c;
1412
1413   a = (__vector signed short)vec_splats (__A);
1414   b = (__vector signed short)vec_splats (__B);
1415   c = (__vector __bool short)vec_cmpgt (a, b);
1416   r = vec_sel (b, a, c);
1417   return (__m64) ((__vector long long) r)[0];
1418 #else
1419   __m64_union m1, m2, res;
1420
1421   m1.as_m64 = __A;
1422   m2.as_m64 = __B;
1423
1424   res.as_short[0] =
1425       (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1426   res.as_short[1] =
1427       (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1428   res.as_short[2] =
1429       (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1430   res.as_short[3] =
1431       (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1432
1433   return (__m64) res.as_m64;
1434 #endif
1435 }
1436
1437 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1438 _m_pmaxsw (__m64 __A, __m64 __B)
1439 {
1440   return _mm_max_pi16 (__A, __B);
1441 }
1442
1443 /* Compute the element-wise maximum of unsigned 8-bit values.  */
1444 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1445 _mm_max_pu8 (__m64 __A, __m64 __B)
1446 {
1447 #if _ARCH_PWR8
1448   __vector unsigned char a, b, r;
1449   __vector __bool char c;
1450
1451   a = (__vector unsigned char)vec_splats (__A);
1452   b = (__vector unsigned char)vec_splats (__B);
1453   c = (__vector __bool char)vec_cmpgt (a, b);
1454   r = vec_sel (b, a, c);
1455   return (__m64) ((__vector long long) r)[0];
1456 #else
1457   __m64_union m1, m2, res;
1458   long i;
1459
1460   m1.as_m64 = __A;
1461   m2.as_m64 = __B;
1462
1463
1464   for (i = 0; i < 8; i++)
1465   res.as_char[i] =
1466       ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1467           m1.as_char[i] : m2.as_char[i];
1468
1469   return (__m64) res.as_m64;
1470 #endif
1471 }
1472
1473 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1474 _m_pmaxub (__m64 __A, __m64 __B)
1475 {
1476   return _mm_max_pu8 (__A, __B);
1477 }
1478
1479 /* Compute the element-wise minimum of signed 16-bit values.  */
1480 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1481 _mm_min_pi16 (__m64 __A, __m64 __B)
1482 {
1483 #if _ARCH_PWR8
1484   __vector signed short a, b, r;
1485   __vector __bool short c;
1486
1487   a = (__vector signed short)vec_splats (__A);
1488   b = (__vector signed short)vec_splats (__B);
1489   c = (__vector __bool short)vec_cmplt (a, b);
1490   r = vec_sel (b, a, c);
1491   return (__m64) ((__vector long long) r)[0];
1492 #else
1493   __m64_union m1, m2, res;
1494
1495   m1.as_m64 = __A;
1496   m2.as_m64 = __B;
1497
1498   res.as_short[0] =
1499       (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1500   res.as_short[1] =
1501       (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1502   res.as_short[2] =
1503       (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1504   res.as_short[3] =
1505       (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1506
1507   return (__m64) res.as_m64;
1508 #endif
1509 }
1510
1511 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1512 _m_pminsw (__m64 __A, __m64 __B)
1513 {
1514   return _mm_min_pi16 (__A, __B);
1515 }
1516
1517 /* Compute the element-wise minimum of unsigned 8-bit values.  */
1518 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1519 _mm_min_pu8 (__m64 __A, __m64 __B)
1520 {
1521 #if _ARCH_PWR8
1522   __vector unsigned char a, b, r;
1523   __vector __bool char c;
1524
1525   a = (__vector unsigned char)vec_splats (__A);
1526   b = (__vector unsigned char)vec_splats (__B);
1527   c = (__vector __bool char)vec_cmplt (a, b);
1528   r = vec_sel (b, a, c);
1529   return (__m64) ((__vector long long) r)[0];
1530 #else
1531   __m64_union m1, m2, res;
1532   long i;
1533
1534   m1.as_m64 = __A;
1535   m2.as_m64 = __B;
1536
1537
1538   for (i = 0; i < 8; i++)
1539   res.as_char[i] =
1540       ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
1541           m1.as_char[i] : m2.as_char[i];
1542
1543   return (__m64) res.as_m64;
1544 #endif
1545 }
1546
1547 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1548 _m_pminub (__m64 __A, __m64 __B)
1549 {
1550   return _mm_min_pu8 (__A, __B);
1551 }
1552
1553 /* Create an 8-bit mask of the signs of 8-bit values.  */
1554 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1555 _mm_movemask_pi8 (__m64 __A)
1556 {
1557   unsigned long long p =
1558 #ifdef __LITTLE_ENDIAN__
1559                          0x0008101820283038UL; // permute control for sign bits
1560 #else
1561                          0x3830282018100800UL; // permute control for sign bits
1562 #endif
1563   return __builtin_bpermd (p, __A);
1564 }
1565
1566 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1567 _m_pmovmskb (__m64 __A)
1568 {
1569   return _mm_movemask_pi8 (__A);
1570 }
1571
1572 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1573    in B and produce the high 16 bits of the 32-bit results.  */
1574 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1575 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
1576 {
1577   __vector unsigned short a, b;
1578   __vector unsigned short c;
1579   __vector unsigned int w0, w1;
1580   __vector unsigned char xform1 = {
1581 #ifdef __LITTLE_ENDIAN__
1582       0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
1583       0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
1584 #else
1585       0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
1586       0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15
1587 #endif
1588     };
1589
1590   a = (__vector unsigned short)vec_splats (__A);
1591   b = (__vector unsigned short)vec_splats (__B);
1592
1593   w0 = vec_vmuleuh (a, b);
1594   w1 = vec_vmulouh (a, b);
1595   c = (__vector unsigned short)vec_perm (w0, w1, xform1);
1596
1597   return (__m64) ((__vector long long) c)[0];
1598 }
1599
1600 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1601 _m_pmulhuw (__m64 __A, __m64 __B)
1602 {
1603   return _mm_mulhi_pu16 (__A, __B);
1604 }
1605
1606 /* Return a combination of the four 16-bit values in A.  The selector
1607    must be an immediate.  */
1608 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1609 _mm_shuffle_pi16 (__m64 __A, int const __N)
1610 {
1611   unsigned long element_selector_10 = __N & 0x03;
1612   unsigned long element_selector_32 = (__N >> 2) & 0x03;
1613   unsigned long element_selector_54 = (__N >> 4) & 0x03;
1614   unsigned long element_selector_76 = (__N >> 6) & 0x03;
1615   static const unsigned short permute_selectors[4] =
1616     {
1617 #ifdef __LITTLE_ENDIAN__
1618               0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1619 #else
1620               0x0607, 0x0405, 0x0203, 0x0001
1621 #endif
1622     };
1623   __m64_union t;
1624   __vector unsigned long long a, p, r;
1625
1626 #ifdef __LITTLE_ENDIAN__
1627   t.as_short[0] = permute_selectors[element_selector_10];
1628   t.as_short[1] = permute_selectors[element_selector_32];
1629   t.as_short[2] = permute_selectors[element_selector_54];
1630   t.as_short[3] = permute_selectors[element_selector_76];
1631 #else
1632   t.as_short[3] = permute_selectors[element_selector_10];
1633   t.as_short[2] = permute_selectors[element_selector_32];
1634   t.as_short[1] = permute_selectors[element_selector_54];
1635   t.as_short[0] = permute_selectors[element_selector_76];
1636 #endif
1637   p = vec_splats (t.as_m64);
1638   a = vec_splats (__A);
1639   r = vec_perm (a, a, (__vector unsigned char)p);
1640   return (__m64) ((__vector long long) r)[0];
1641 }
1642
1643 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1644 _m_pshufw (__m64 __A, int const __N)
1645 {
1646   return _mm_shuffle_pi16 (__A, __N);
1647 }
1648
1649 /* Conditionally store byte elements of A into P.  The high bit of each
1650    byte in the selector N determines whether the corresponding byte from
1651    A is stored.  */
1652 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1653 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1654 {
1655   __m64 hibit = 0x8080808080808080UL;
1656   __m64 mask, tmp;
1657   __m64 *p = (__m64*)__P;
1658
1659   tmp = *p;
1660   mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
1661   tmp = (tmp & (~mask)) | (__A & mask);
1662   *p = tmp;
1663 }
1664
1665 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1666 _m_maskmovq (__m64 __A, __m64 __N, char *__P)
1667 {
1668   _mm_maskmove_si64 (__A, __N, __P);
1669 }
1670
1671 /* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
1672 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1673 _mm_avg_pu8 (__m64 __A, __m64 __B)
1674 {
1675   __vector unsigned char a, b, c;
1676
1677   a = (__vector unsigned char)vec_splats (__A);
1678   b = (__vector unsigned char)vec_splats (__B);
1679   c = vec_avg (a, b);
1680   return (__m64) ((__vector long long) c)[0];
1681 }
1682
1683 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1684 _m_pavgb (__m64 __A, __m64 __B)
1685 {
1686   return _mm_avg_pu8 (__A, __B);
1687 }
1688
1689 /* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
1690 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1691 _mm_avg_pu16 (__m64 __A, __m64 __B)
1692 {
1693   __vector unsigned short __a, __b, __c;  /* Reserved names: immune to user macros.  */
1694
1695   __a = (__vector unsigned short)vec_splats (__A);  /* A replicated across the vector.  */
1696   __b = (__vector unsigned short)vec_splats (__B);  /* B replicated across the vector.  */
1697   __c = vec_avg (__a, __b);  /* Halfword-wise average of the two vectors.  */
1698   return (__m64) ((__vector long long) __c)[0];  /* One 64-bit element is the result.  */
1699 }
1700
1701 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1702 _m_pavgw (__m64 __A, __m64 __B)  /* Alternate name for _mm_avg_pu16.  */
1703 {
1704   return _mm_avg_pu16 (__A, __B);  /* Arguments and result pass through unchanged.  */
1705 }
1706
1707 /* Compute the sum of the absolute differences of the unsigned 8-bit
1708    values in A and B.  Return the value in the lower 16-bit word; the
1709    upper words are cleared.  */
1710 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1711 _mm_sad_pu8 (__m64 __A, __m64 __B)
1712 {
1713   __vector unsigned char __a, __b;  /* Reserved names: immune to user macros.  */
1714   __vector unsigned char __vmin, __vmax, __vabsdiff;
1715   __vector signed int __vsum;
1716   const __vector unsigned int __zero =
1717     { 0, 0, 0, 0 };
1718   __m64_union __result = {0};  /* Zeroed first, so untouched halfwords stay clear.  */
1719
1720   __a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
1721   __b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
1722   __vmin = vec_min (__a, __b);
1723   __vmax = vec_max (__a, __b);
1724   __vabsdiff = vec_sub (__vmax, __vmin);  /* max - min: |a - b| without unsigned wrap.  */
1725   /* Sum four groups of bytes into integers.  */
1726   __vsum = (__vector signed int) vec_sum4s (__vabsdiff, __zero);
1727   /* Sum across four integers with integer result.  */
1728   __vsum = vec_sums (__vsum, (__vector signed int) __zero);
1729   /* The sum is in the right most 32-bits of the vector result.
1730      Transfer to a GPR and truncate to 16 bits.  */
1731   __result.as_short[0] = __vsum[3];
1732   return __result.as_m64;
1733 }
1734
1735 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1736 _m_psadbw (__m64 __A, __m64 __B)  /* Alternate name for _mm_sad_pu8.  */
1737 {
1738   return _mm_sad_pu8 (__A, __B);  /* Arguments and result pass through unchanged.  */
1739 }
1740
1741 /* Stores the data in A to the address P without polluting the caches (best effort: a hint, then a plain store).  */
1742 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1743 _mm_stream_pi (__m64 *__P, __m64 __A)
1744 {
1745   /* Hint only: use the data cache block touch for store transient.  */
1746   __asm__ (
1747     "   dcbtstt 0,%0"
1748     :
1749     : "b" (__P)  /* Target address, supplied in a base register.  */
1750     : "memory"   /* Compiler must not move memory accesses across the hint.  */
1751   );
1752   *__P = __A;  /* The store itself is an ordinary C assignment.  */
1753 }
1754
1755 /* Like _mm_stream_pi, but stores the __m128 A at P via _mm_store_ps.  The address must be 16-byte aligned.  */
1756 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1757 _mm_stream_ps (float *__P, __m128 __A)
1758 {
1759   /* Hint only: use the data cache block touch for store transient.  */
1760   __asm__ (
1761     "   dcbtstt 0,%0"
1762     :
1763     : "b" (__P)  /* Target address, supplied in a base register.  */
1764     : "memory"   /* Compiler must not move memory accesses across the hint.  */
1765   );
1766   _mm_store_ps (__P, __A);  /* The store itself is the regular aligned store.  */
1767 }
1768
1769 /* Guarantees that every preceding store is globally visible before
1770    any subsequent store.  Implemented as a release thread fence.  */
1771 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1772 _mm_sfence (void)
1773 {
1774   /* Generate a light weight sync (release ordering is all that is requested).  */
1775   __atomic_thread_fence (__ATOMIC_RELEASE);
1776 }
1777
1778 /* The execution of the next instruction is delayed by an implementation
1779    specific amount of time.  The instruction does not modify the
1780    architectural state.  This is after the pop_options pragma because
1781    it does not require SSE support in the processor--the encoding is a
1782    nop on processors that do not support it.  */
1783 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1784 _mm_pause (void)
1785 {
1786   /* There is no exact match with this construct, but the following is
1787      close to the desired effect.  */
1788 #if _ARCH_PWR8
1789   /* On power8 and later processors we can depend on Program Priority
1790      (PRI) and associated "very low" PPI setting.  Since we don't know
1791      what PPI this thread is running at we: 1) save the current PRI
1792      from the PPR SPR into a local GPR, 2) set the PRI to "very low"
1793      via the special or 31,31,31 encoding, 3) issue an "isync" to
1794      ensure the PRI change takes effect before we execute any more
1795      instructions.
1796      Now we can execute a lwsync (release barrier) while we execute
1797      this thread at "very low" PRI.  Finally we restore the original
1798      PRI and continue execution.  */
1799   unsigned long __PPR;  /* Holds the saved PPR value across the sequence.  */
1800
1801   __asm__ volatile (
1802     "   mfppr   %0;"      /* 1) Save the current PPR.  */
1803     "   or 31,31,31;"     /* 2) Drop to "very low" priority.  */
1804     "   isync;"           /* 3) Make the priority change take effect.  */
1805     "   lwsync;"          /* Release barrier, run at "very low" priority.  */
1806     "   isync;"
1807     "   mtppr   %0;"      /* Restore the saved PPR.  */
1808     : "=r" (__PPR)
1809     :
1810     : "memory"
1811   );
1812 #else
1813   /* For older processor where we may not even have Program Priority
1814      controls we can only depend on Heavy Weight Sync.  */
1815   __atomic_thread_fence (__ATOMIC_SEQ_CST);
1816 #endif
1817 }
1818
1819 /* Transpose, in place, the 4x4 matrix composed of row[0-3]; each row is read once, then assigned, so it must be a modifiable lvalue.  */
1820 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                       \
1821 do {                                                                    \
1822   __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);    \
1823   __v4sf __t0 = vec_vmrghw (__r0, __r1);                        \
1824   __v4sf __t1 = vec_vmrghw (__r2, __r3);                        \
1825   __v4sf __t2 = vec_vmrglw (__r0, __r1);                        \
1826   __v4sf __t3 = vec_vmrglw (__r2, __r3);                        \
1827   (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0,        \
1828                                (__vector long long)__t1);       \
1829   (row1) = (__v4sf)vec_mergel ((__vector long long)__t0,        \
1830                                (__vector long long)__t1);       \
1831   (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2,        \
1832                                (__vector long long)__t3);       \
1833   (row3) = (__v4sf)vec_mergel ((__vector long long)__t2,        \
1834                                (__vector long long)__t3);       \
1835 } while (0)
1836
1837 /* For backward source compatibility.  */
1838 //# include <emmintrin.h>
1839
1840 #else
1841 #include_next <xmmintrin.h>
1842 #endif /* defined(__linux__) && defined(__ppc64__) */
1843
1844 #endif /* _XMMINTRIN_H_INCLUDED */