contrib/llvm/tools/clang/lib/Headers/tmmintrin.h

   1 /*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
   2  *
   3  * Permission is hereby granted, free of charge, to any person obtaining a copy
   4  * of this software and associated documentation files (the "Software"), to deal
   5  * in the Software without restriction, including without limitation the rights
   6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   7  * copies of the Software, and to permit persons to whom the Software is
   8  * furnished to do so, subject to the following conditions:
   9  *
  10  * The above copyright notice and this permission notice shall be included in
  11  * all copies or substantial portions of the Software.
  12  *
  13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19  * THE SOFTWARE.
  20  *
  21  *===-----------------------------------------------------------------------===
  22  */
  23
  24 #ifndef __TMMINTRIN_H
  25 #define __TMMINTRIN_H
  26
  27 #include <pmmintrin.h>
  28
  29 /* Define the default attributes for the functions in this file. */
  30 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), __min_vector_width__(64)))
  31 #define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), __min_vector_width__(64)))
  32
  33 /// Computes the absolute value of each of the packed 8-bit signed
  34 ///    integers in the source operand and stores the 8-bit unsigned integer
  35 ///    results in the destination.
  36 ///
  37 /// \headerfile <x86intrin.h>
  38 ///
  39 /// This intrinsic corresponds to the \c PABSB instruction.
  40 ///
  41 /// \param __a
  42 ///    A 64-bit vector of [8 x i8].
  43 /// \returns A 64-bit integer vector containing the absolute values of the
  44 ///    elements in the operand.
  45 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  46 _mm_abs_pi8(__m64 __a)
  47 {
  48     return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
  49 }
  50
  51 /// Computes the absolute value of each of the packed 8-bit signed
  52 ///    integers in the source operand and stores the 8-bit unsigned integer
  53 ///    results in the destination.
  54 ///
  55 /// \headerfile <x86intrin.h>
  56 ///
  57 /// This intrinsic corresponds to the \c VPABSB instruction.
  58 ///
  59 /// \param __a
  60 ///    A 128-bit vector of [16 x i8].
  61 /// \returns A 128-bit integer vector containing the absolute values of the
  62 ///    elements in the operand.
  63 static __inline__ __m128i __DEFAULT_FN_ATTRS
  64 _mm_abs_epi8(__m128i __a)
  65 {
  66     return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
  67 }
  68
  69 /// Computes the absolute value of each of the packed 16-bit signed
  70 ///    integers in the source operand and stores the 16-bit unsigned integer
  71 ///    results in the destination.
  72 ///
  73 /// \headerfile <x86intrin.h>
  74 ///
  75 /// This intrinsic corresponds to the \c PABSW instruction.
  76 ///
  77 /// \param __a
  78 ///    A 64-bit vector of [4 x i16].
  79 /// \returns A 64-bit integer vector containing the absolute values of the
  80 ///    elements in the operand.
  81 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
  82 _mm_abs_pi16(__m64 __a)
  83 {
  84     return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
  85 }
  86
  87 /// Computes the absolute value of each of the packed 16-bit signed
  88 ///    integers in the source operand and stores the 16-bit unsigned integer
  89 ///    results in the destination.
  90 ///
  91 /// \headerfile <x86intrin.h>
  92 ///
  93 /// This intrinsic corresponds to the \c VPABSW instruction.
  94 ///
  95 /// \param __a
  96 ///    A 128-bit vector of [8 x i16].
  97 /// \returns A 128-bit integer vector containing the absolute values of the
  98 ///    elements in the operand.
  99 static __inline__ __m128i __DEFAULT_FN_ATTRS
 100 _mm_abs_epi16(__m128i __a)
 101 {
 102     return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
 103 }
 104
 105 /// Computes the absolute value of each of the packed 32-bit signed
 106 ///    integers in the source operand and stores the 32-bit unsigned integer
 107 ///    results in the destination.
 108 ///
 109 /// \headerfile <x86intrin.h>
 110 ///
 111 /// This intrinsic corresponds to the \c PABSD instruction.
 112 ///
 113 /// \param __a
 114 ///    A 64-bit vector of [2 x i32].
 115 /// \returns A 64-bit integer vector containing the absolute values of the
 116 ///    elements in the operand.
 117 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 118 _mm_abs_pi32(__m64 __a)
 119 {
 120     return (__m64)__builtin_ia32_pabsd((__v2si)__a);
 121 }
 122
 123 /// Computes the absolute value of each of the packed 32-bit signed
 124 ///    integers in the source operand and stores the 32-bit unsigned integer
 125 ///    results in the destination.
 126 ///
 127 /// \headerfile <x86intrin.h>
 128 ///
 129 /// This intrinsic corresponds to the \c VPABSD instruction.
 130 ///
 131 /// \param __a
 132 ///    A 128-bit vector of [4 x i32].
 133 /// \returns A 128-bit integer vector containing the absolute values of the
 134 ///    elements in the operand.
 135 static __inline__ __m128i __DEFAULT_FN_ATTRS
 136 _mm_abs_epi32(__m128i __a)
 137 {
 138     return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
 139 }
 140
 141 /// Concatenates the two 128-bit integer vector operands, and
 142 ///    right-shifts the result by the number of bytes specified in the immediate
 143 ///    operand.
 144 ///
 145 /// \headerfile <x86intrin.h>
 146 ///
 147 /// \code
 148 /// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
 149 /// \endcode
 150 ///
 151 /// This intrinsic corresponds to the \c PALIGNR instruction.
 152 ///
 153 /// \param a
 154 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
 155 /// \param b
 156 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
 157 /// \param n
 158 ///    An immediate operand specifying how many bytes to right-shift the result.
 159 /// \returns A 128-bit integer vector containing the concatenated right-shifted
 160 ///    value.
 161 #define _mm_alignr_epi8(a, b, n) \
 162   (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
 163                                      (__v16qi)(__m128i)(b), (n))
 164
 165 /// Concatenates the two 64-bit integer vector operands, and right-shifts
 166 ///    the result by the number of bytes specified in the immediate operand.
 167 ///
 168 /// \headerfile <x86intrin.h>
 169 ///
 170 /// \code
 171 /// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
 172 /// \endcode
 173 ///
 174 /// This intrinsic corresponds to the \c PALIGNR instruction.
 175 ///
 176 /// \param a
 177 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
 178 /// \param b
 179 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
 180 /// \param n
 181 ///    An immediate operand specifying how many bytes to right-shift the result.
 182 /// \returns A 64-bit integer vector containing the concatenated right-shifted
 183 ///    value.
 184 #define _mm_alignr_pi8(a, b, n) \
 185   (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))
 186
 187 /// Horizontally adds the adjacent pairs of values contained in 2 packed
 188 ///    128-bit vectors of [8 x i16].
 189 ///
 190 /// \headerfile <x86intrin.h>
 191 ///
 192 /// This intrinsic corresponds to the \c VPHADDW instruction.
 193 ///
 194 /// \param __a
 195 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 196 ///    horizontal sums of the values are stored in the lower bits of the
 197 ///    destination.
 198 /// \param __b
 199 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 200 ///    horizontal sums of the values are stored in the upper bits of the
 201 ///    destination.
 202 /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
 203 ///    both operands.
 204 static __inline__ __m128i __DEFAULT_FN_ATTRS
 205 _mm_hadd_epi16(__m128i __a, __m128i __b)
 206 {
 207     return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
 208 }
 209
 210 /// Horizontally adds the adjacent pairs of values contained in 2 packed
 211 ///    128-bit vectors of [4 x i32].
 212 ///
 213 /// \headerfile <x86intrin.h>
 214 ///
 215 /// This intrinsic corresponds to the \c VPHADDD instruction.
 216 ///
 217 /// \param __a
 218 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 219 ///    horizontal sums of the values are stored in the lower bits of the
 220 ///    destination.
 221 /// \param __b
 222 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 223 ///    horizontal sums of the values are stored in the upper bits of the
 224 ///    destination.
 225 /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
 226 ///    both operands.
 227 static __inline__ __m128i __DEFAULT_FN_ATTRS
 228 _mm_hadd_epi32(__m128i __a, __m128i __b)
 229 {
 230     return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
 231 }
 232
 233 /// Horizontally adds the adjacent pairs of values contained in 2 packed
 234 ///    64-bit vectors of [4 x i16].
 235 ///
 236 /// \headerfile <x86intrin.h>
 237 ///
 238 /// This intrinsic corresponds to the \c PHADDW instruction.
 239 ///
 240 /// \param __a
 241 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 242 ///    horizontal sums of the values are stored in the lower bits of the
 243 ///    destination.
 244 /// \param __b
 245 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 246 ///    horizontal sums of the values are stored in the upper bits of the
 247 ///    destination.
 248 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
 249 ///    operands.
 250 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 251 _mm_hadd_pi16(__m64 __a, __m64 __b)
 252 {
 253     return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
 254 }
 255
 256 /// Horizontally adds the adjacent pairs of values contained in 2 packed
 257 ///    64-bit vectors of [2 x i32].
 258 ///
 259 /// \headerfile <x86intrin.h>
 260 ///
 261 /// This intrinsic corresponds to the \c PHADDD instruction.
 262 ///
 263 /// \param __a
 264 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 265 ///    horizontal sums of the values are stored in the lower bits of the
 266 ///    destination.
 267 /// \param __b
 268 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 269 ///    horizontal sums of the values are stored in the upper bits of the
 270 ///    destination.
 271 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
 272 ///    operands.
 273 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 274 _mm_hadd_pi32(__m64 __a, __m64 __b)
 275 {
 276     return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
 277 }
 278
 279 /// Horizontally adds the adjacent pairs of values contained in 2 packed
 280 ///    128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are
 281 ///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
 282 ///    0x8000.
 283 ///
 284 /// \headerfile <x86intrin.h>
 285 ///
 286 /// This intrinsic corresponds to the \c VPHADDSW instruction.
 287 ///
 288 /// \param __a
 289 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 290 ///    horizontal sums of the values are stored in the lower bits of the
 291 ///    destination.
 292 /// \param __b
 293 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 294 ///    horizontal sums of the values are stored in the upper bits of the
 295 ///    destination.
 296 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
 297 ///    sums of both operands.
 298 static __inline__ __m128i __DEFAULT_FN_ATTRS
 299 _mm_hadds_epi16(__m128i __a, __m128i __b)
 300 {
 301     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
 302 }
 303
 304 /// Horizontally adds the adjacent pairs of values contained in 2 packed
 305 ///    64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are
 306 ///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
 307 ///    0x8000.
 308 ///
 309 /// \headerfile <x86intrin.h>
 310 ///
 311 /// This intrinsic corresponds to the \c PHADDSW instruction.
 312 ///
 313 /// \param __a
 314 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 315 ///    horizontal sums of the values are stored in the lower bits of the
 316 ///    destination.
 317 /// \param __b
 318 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 319 ///    horizontal sums of the values are stored in the upper bits of the
 320 ///    destination.
 321 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 322 ///    sums of both operands.
 323 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 324 _mm_hadds_pi16(__m64 __a, __m64 __b)
 325 {
 326     return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
 327 }
 328
 329 /// Horizontally subtracts the adjacent pairs of values contained in 2
 330 ///    packed 128-bit vectors of [8 x i16].
 331 ///
 332 /// \headerfile <x86intrin.h>
 333 ///
 334 /// This intrinsic corresponds to the \c VPHSUBW instruction.
 335 ///
 336 /// \param __a
 337 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 338 ///    horizontal differences between the values are stored in the lower bits of
 339 ///    the destination.
 340 /// \param __b
 341 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 342 ///    horizontal differences between the values are stored in the upper bits of
 343 ///    the destination.
 344 /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
 345 ///    of both operands.
 346 static __inline__ __m128i __DEFAULT_FN_ATTRS
 347 _mm_hsub_epi16(__m128i __a, __m128i __b)
 348 {
 349     return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
 350 }
 351
 352 /// Horizontally subtracts the adjacent pairs of values contained in 2
 353 ///    packed 128-bit vectors of [4 x i32].
 354 ///
 355 /// \headerfile <x86intrin.h>
 356 ///
 357 /// This intrinsic corresponds to the \c VPHSUBD instruction.
 358 ///
 359 /// \param __a
 360 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 361 ///    horizontal differences between the values are stored in the lower bits of
 362 ///    the destination.
 363 /// \param __b
 364 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 365 ///    horizontal differences between the values are stored in the upper bits of
 366 ///    the destination.
 367 /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
 368 ///    of both operands.
 369 static __inline__ __m128i __DEFAULT_FN_ATTRS
 370 _mm_hsub_epi32(__m128i __a, __m128i __b)
 371 {
 372     return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
 373 }
 374
 375 /// Horizontally subtracts the adjacent pairs of values contained in 2
 376 ///    packed 64-bit vectors of [4 x i16].
 377 ///
 378 /// \headerfile <x86intrin.h>
 379 ///
 380 /// This intrinsic corresponds to the \c PHSUBW instruction.
 381 ///
 382 /// \param __a
 383 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 384 ///    horizontal differences between the values are stored in the lower bits of
 385 ///    the destination.
 386 /// \param __b
 387 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 388 ///    horizontal differences between the values are stored in the upper bits of
 389 ///    the destination.
 390 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
 391 ///    of both operands.
 392 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 393 _mm_hsub_pi16(__m64 __a, __m64 __b)
 394 {
 395     return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
 396 }
 397
 398 /// Horizontally subtracts the adjacent pairs of values contained in 2
 399 ///    packed 64-bit vectors of [2 x i32].
 400 ///
 401 /// \headerfile <x86intrin.h>
 402 ///
 403 /// This intrinsic corresponds to the \c PHSUBD instruction.
 404 ///
 405 /// \param __a
 406 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 407 ///    horizontal differences between the values are stored in the lower bits of
 408 ///    the destination.
 409 /// \param __b
 410 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 411 ///    horizontal differences between the values are stored in the upper bits of
 412 ///    the destination.
 413 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
 414 ///    of both operands.
 415 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 416 _mm_hsub_pi32(__m64 __a, __m64 __b)
 417 {
 418     return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
 419 }
 420
 421 /// Horizontally subtracts the adjacent pairs of values contained in 2
 422 ///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
 423 ///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
 424 ///    saturated to 0x8000.
 425 ///
 426 /// \headerfile <x86intrin.h>
 427 ///
 428 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
 429 ///
 430 /// \param __a
 431 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 432 ///    horizontal differences between the values are stored in the lower bits of
 433 ///    the destination.
 434 /// \param __b
 435 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 436 ///    horizontal differences between the values are stored in the upper bits of
 437 ///    the destination.
 438 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
 439 ///    differences of both operands.
 440 static __inline__ __m128i __DEFAULT_FN_ATTRS
 441 _mm_hsubs_epi16(__m128i __a, __m128i __b)
 442 {
 443     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
 444 }
 445
 446 /// Horizontally subtracts the adjacent pairs of values contained in 2
 447 ///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
 448 ///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
 449 ///    saturated to 0x8000.
 450 ///
 451 /// \headerfile <x86intrin.h>
 452 ///
 453 /// This intrinsic corresponds to the \c PHSUBSW instruction.
 454 ///
 455 /// \param __a
 456 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 457 ///    horizontal differences between the values are stored in the lower bits of
 458 ///    the destination.
 459 /// \param __b
 460 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 461 ///    horizontal differences between the values are stored in the upper bits of
 462 ///    the destination.
 463 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 464 ///    differences of both operands.
 465 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 466 _mm_hsubs_pi16(__m64 __a, __m64 __b)
 467 {
 468     return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
 469 }
 470
 471 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
 472 ///    values contained in the first source operand and packed 8-bit signed
 473 ///    integer values contained in the second source operand, adds pairs of
 474 ///    contiguous products with signed saturation, and writes the 16-bit sums to
 475 ///    the corresponding bits in the destination.
 476 ///
 477 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
 478 ///    both operands are multiplied, and the sum of both results is written to
 479 ///    bits [15:0] of the destination.
 480 ///
 481 /// \headerfile <x86intrin.h>
 482 ///
 483 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
 484 ///
 485 /// \param __a
 486 ///    A 128-bit integer vector containing the first source operand.
 487 /// \param __b
 488 ///    A 128-bit integer vector containing the second source operand.
 489 /// \returns A 128-bit integer vector containing the sums of products of both
 490 ///    operands: \n
 491 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
 492 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 493 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 494 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
 495 ///    \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
 496 ///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
 497 ///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
 498 ///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
 499 static __inline__ __m128i __DEFAULT_FN_ATTRS
 500 _mm_maddubs_epi16(__m128i __a, __m128i __b)
 501 {
 502     return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
 503 }
 504
 505 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
 506 ///    values contained in the first source operand and packed 8-bit signed
 507 ///    integer values contained in the second source operand, adds pairs of
 508 ///    contiguous products with signed saturation, and writes the 16-bit sums to
 509 ///    the corresponding bits in the destination.
 510 ///
 511 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
 512 ///    both operands are multiplied, and the sum of both results is written to
 513 ///    bits [15:0] of the destination.
 514 ///
 515 /// \headerfile <x86intrin.h>
 516 ///
 517 /// This intrinsic corresponds to the \c PMADDUBSW instruction.
 518 ///
 519 /// \param __a
 520 ///    A 64-bit integer vector containing the first source operand.
 521 /// \param __b
 522 ///    A 64-bit integer vector containing the second source operand.
 523 /// \returns A 64-bit integer vector containing the sums of products of both
 524 ///    operands: \n
 525 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
 526 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 527 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 528 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
 529 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 530 _mm_maddubs_pi16(__m64 __a, __m64 __b)
 531 {
 532     return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
 533 }
 534
 535 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
 536 ///    products to the 18 most significant bits by right-shifting, rounds the
 537 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
 538 ///
 539 /// \headerfile <x86intrin.h>
 540 ///
 541 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
 542 ///
 543 /// \param __a
 544 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
 545 /// \param __b
 546 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
 547 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
 548 ///    products of both operands.
 549 static __inline__ __m128i __DEFAULT_FN_ATTRS
 550 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
 551 {
 552     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
 553 }
 554
 555 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
 556 ///    products to the 18 most significant bits by right-shifting, rounds the
 557 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
 558 ///
 559 /// \headerfile <x86intrin.h>
 560 ///
 561 /// This intrinsic corresponds to the \c PMULHRSW instruction.
 562 ///
 563 /// \param __a
 564 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
 565 /// \param __b
 566 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
 567 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
 568 ///    products of both operands.
 569 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 570 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
 571 {
 572     return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
 573 }
 574
 575 /// Copies the 8-bit integers from a 128-bit integer vector to the
 576 ///    destination or clears 8-bit values in the destination, as specified by
 577 ///    the second source operand.
 578 ///
 579 /// \headerfile <x86intrin.h>
 580 ///
 581 /// This intrinsic corresponds to the \c VPSHUFB instruction.
 582 ///
 583 /// \param __a
 584 ///    A 128-bit integer vector containing the values to be copied.
 585 /// \param __b
 586 ///    A 128-bit integer vector containing control bytes corresponding to
 587 ///    positions in the destination:
 588 ///    Bit 7: \n
 589 ///    1: Clear the corresponding byte in the destination. \n
 590 ///    0: Copy the selected source byte to the corresponding byte in the
 591 ///    destination. \n
 592 ///    Bits [6:4] Reserved.  \n
 593 ///    Bits [3:0] select the source byte to be copied.
 594 /// \returns A 128-bit integer vector containing the copied or cleared values.
 595 static __inline__ __m128i __DEFAULT_FN_ATTRS
 596 _mm_shuffle_epi8(__m128i __a, __m128i __b)
 597 {
 598     return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
 599 }
 600
 601 /// Copies the 8-bit integers from a 64-bit integer vector to the
 602 ///    destination or clears 8-bit values in the destination, as specified by
 603 ///    the second source operand.
 604 ///
 605 /// \headerfile <x86intrin.h>
 606 ///
 607 /// This intrinsic corresponds to the \c PSHUFB instruction.
 608 ///
 609 /// \param __a
 610 ///    A 64-bit integer vector containing the values to be copied.
 611 /// \param __b
 612 ///    A 64-bit integer vector containing control bytes corresponding to
 613 ///    positions in the destination:
 614 ///    Bit 7: \n
 615 ///    1: Clear the corresponding byte in the destination. \n
 616 ///    0: Copy the selected source byte to the corresponding byte in the
 617 ///    destination. \n
///    Bits [2:0] select the source byte to be copied.
 619 /// \returns A 64-bit integer vector containing the copied or cleared values.
 620 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 621 _mm_shuffle_pi8(__m64 __a, __m64 __b)
 622 {
 623     return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
 624 }
 625
 626 /// For each 8-bit integer in the first source operand, perform one of
 627 ///    the following actions as specified by the second source operand.
 628 ///
 629 ///    If the byte in the second source is negative, calculate the two's
 630 ///    complement of the corresponding byte in the first source, and write that
 631 ///    value to the destination. If the byte in the second source is positive,
 632 ///    copy the corresponding byte from the first source to the destination. If
 633 ///    the byte in the second source is zero, clear the corresponding byte in
 634 ///    the destination.
 635 ///
 636 /// \headerfile <x86intrin.h>
 637 ///
 638 /// This intrinsic corresponds to the \c VPSIGNB instruction.
 639 ///
 640 /// \param __a
 641 ///    A 128-bit integer vector containing the values to be copied.
 642 /// \param __b
 643 ///    A 128-bit integer vector containing control bytes corresponding to
 644 ///    positions in the destination.
 645 /// \returns A 128-bit integer vector containing the resultant values.
 646 static __inline__ __m128i __DEFAULT_FN_ATTRS
 647 _mm_sign_epi8(__m128i __a, __m128i __b)
 648 {
 649     return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
 650 }
 651
 652 /// For each 16-bit integer in the first source operand, perform one of
 653 ///    the following actions as specified by the second source operand.
 654 ///
 655 ///    If the word in the second source is negative, calculate the two's
 656 ///    complement of the corresponding word in the first source, and write that
 657 ///    value to the destination. If the word in the second source is positive,
 658 ///    copy the corresponding word from the first source to the destination. If
 659 ///    the word in the second source is zero, clear the corresponding word in
 660 ///    the destination.
 661 ///
 662 /// \headerfile <x86intrin.h>
 663 ///
 664 /// This intrinsic corresponds to the \c VPSIGNW instruction.
 665 ///
 666 /// \param __a
 667 ///    A 128-bit integer vector containing the values to be copied.
 668 /// \param __b
 669 ///    A 128-bit integer vector containing control words corresponding to
 670 ///    positions in the destination.
 671 /// \returns A 128-bit integer vector containing the resultant values.
 672 static __inline__ __m128i __DEFAULT_FN_ATTRS
 673 _mm_sign_epi16(__m128i __a, __m128i __b)
 674 {
 675     return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
 676 }
 677
 678 /// For each 32-bit integer in the first source operand, perform one of
 679 ///    the following actions as specified by the second source operand.
 680 ///
///    If the doubleword in the second source is negative, calculate the two's
///    complement of the corresponding doubleword in the first source, and
///    write that value to the destination. If the doubleword in the second
///    source is positive, copy the corresponding doubleword from the first
///    source to the destination. If the doubleword in the second source is
///    zero, clear the corresponding doubleword in the destination.
 687 ///
 688 /// \headerfile <x86intrin.h>
 689 ///
 690 /// This intrinsic corresponds to the \c VPSIGND instruction.
 691 ///
 692 /// \param __a
 693 ///    A 128-bit integer vector containing the values to be copied.
 694 /// \param __b
 695 ///    A 128-bit integer vector containing control doublewords corresponding to
 696 ///    positions in the destination.
 697 /// \returns A 128-bit integer vector containing the resultant values.
 698 static __inline__ __m128i __DEFAULT_FN_ATTRS
 699 _mm_sign_epi32(__m128i __a, __m128i __b)
 700 {
 701     return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
 702 }
 703
 704 /// For each 8-bit integer in the first source operand, perform one of
 705 ///    the following actions as specified by the second source operand.
 706 ///
 707 ///    If the byte in the second source is negative, calculate the two's
 708 ///    complement of the corresponding byte in the first source, and write that
 709 ///    value to the destination. If the byte in the second source is positive,
 710 ///    copy the corresponding byte from the first source to the destination. If
 711 ///    the byte in the second source is zero, clear the corresponding byte in
 712 ///    the destination.
 713 ///
 714 /// \headerfile <x86intrin.h>
 715 ///
 716 /// This intrinsic corresponds to the \c PSIGNB instruction.
 717 ///
 718 /// \param __a
 719 ///    A 64-bit integer vector containing the values to be copied.
 720 /// \param __b
 721 ///    A 64-bit integer vector containing control bytes corresponding to
 722 ///    positions in the destination.
 723 /// \returns A 64-bit integer vector containing the resultant values.
 724 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 725 _mm_sign_pi8(__m64 __a, __m64 __b)
 726 {
 727     return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
 728 }
 729
 730 /// For each 16-bit integer in the first source operand, perform one of
 731 ///    the following actions as specified by the second source operand.
 732 ///
 733 ///    If the word in the second source is negative, calculate the two's
 734 ///    complement of the corresponding word in the first source, and write that
 735 ///    value to the destination. If the word in the second source is positive,
 736 ///    copy the corresponding word from the first source to the destination. If
 737 ///    the word in the second source is zero, clear the corresponding word in
 738 ///    the destination.
 739 ///
 740 /// \headerfile <x86intrin.h>
 741 ///
 742 /// This intrinsic corresponds to the \c PSIGNW instruction.
 743 ///
 744 /// \param __a
 745 ///    A 64-bit integer vector containing the values to be copied.
 746 /// \param __b
 747 ///    A 64-bit integer vector containing control words corresponding to
 748 ///    positions in the destination.
 749 /// \returns A 64-bit integer vector containing the resultant values.
 750 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 751 _mm_sign_pi16(__m64 __a, __m64 __b)
 752 {
 753     return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
 754 }
 755
 756 /// For each 32-bit integer in the first source operand, perform one of
 757 ///    the following actions as specified by the second source operand.
 758 ///
 759 ///    If the doubleword in the second source is negative, calculate the two's
 760 ///    complement of the corresponding doubleword in the first source, and
 761 ///    write that value to the destination. If the doubleword in the second
 762 ///    source is positive, copy the corresponding doubleword from the first
 763 ///    source to the destination. If the doubleword in the second source is
 764 ///    zero, clear the corresponding doubleword in the destination.
 765 ///
 766 /// \headerfile <x86intrin.h>
 767 ///
 768 /// This intrinsic corresponds to the \c PSIGND instruction.
 769 ///
 770 /// \param __a
 771 ///    A 64-bit integer vector containing the values to be copied.
 772 /// \param __b
 773 ///    A 64-bit integer vector containing two control doublewords corresponding
 774 ///    to positions in the destination.
 775 /// \returns A 64-bit integer vector containing the resultant values.
 776 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 777 _mm_sign_pi32(__m64 __a, __m64 __b)
 778 {
 779     return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
 780 }
 781
 782 #undef __DEFAULT_FN_ATTRS
 783 #undef __DEFAULT_FN_ATTRS_MMX
 784
 785 #endif /* __TMMINTRIN_H */