contrib/llvm/tools/clang/lib/Headers/tmmintrin.h

   1 /*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
   2  *
   3  * Permission is hereby granted, free of charge, to any person obtaining a copy
   4  * of this software and associated documentation files (the "Software"), to deal
   5  * in the Software without restriction, including without limitation the rights
   6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   7  * copies of the Software, and to permit persons to whom the Software is
   8  * furnished to do so, subject to the following conditions:
   9  *
  10  * The above copyright notice and this permission notice shall be included in
  11  * all copies or substantial portions of the Software.
  12  *
  13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19  * THE SOFTWARE.
  20  *
  21  *===-----------------------------------------------------------------------===
  22  */
  23
  24 #ifndef __TMMINTRIN_H
  25 #define __TMMINTRIN_H
  26
  27 #include <pmmintrin.h>
  28
  29 /* Define the default attributes for the functions in this file. */
  30 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
  31
  32 /// \brief Computes the absolute value of each of the packed 8-bit signed
  33 ///    integers in the source operand and stores the 8-bit unsigned integer
  34 ///    results in the destination.
  35 ///
  36 /// \headerfile <x86intrin.h>
  37 ///
  38 /// This intrinsic corresponds to the \c PABSB instruction.
  39 ///
  40 /// \param __a
  41 ///    A 64-bit vector of [8 x i8].
  42 /// \returns A 64-bit integer vector containing the absolute values of the
  43 ///    elements in the operand.
  44 static __inline__ __m64 __DEFAULT_FN_ATTRS
  45 _mm_abs_pi8(__m64 __a)
  46 {
  47     return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
  48 }
  49
  50 /// \brief Computes the absolute value of each of the packed 8-bit signed
  51 ///    integers in the source operand and stores the 8-bit unsigned integer
  52 ///    results in the destination.
  53 ///
  54 /// \headerfile <x86intrin.h>
  55 ///
  56 /// This intrinsic corresponds to the \c VPABSB instruction.
  57 ///
  58 /// \param __a
  59 ///    A 128-bit vector of [16 x i8].
  60 /// \returns A 128-bit integer vector containing the absolute values of the
  61 ///    elements in the operand.
  62 static __inline__ __m128i __DEFAULT_FN_ATTRS
  63 _mm_abs_epi8(__m128i __a)
  64 {
  65     return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
  66 }
  67
  68 /// \brief Computes the absolute value of each of the packed 16-bit signed
  69 ///    integers in the source operand and stores the 16-bit unsigned integer
  70 ///    results in the destination.
  71 ///
  72 /// \headerfile <x86intrin.h>
  73 ///
  74 /// This intrinsic corresponds to the \c PABSW instruction.
  75 ///
  76 /// \param __a
  77 ///    A 64-bit vector of [4 x i16].
  78 /// \returns A 64-bit integer vector containing the absolute values of the
  79 ///    elements in the operand.
  80 static __inline__ __m64 __DEFAULT_FN_ATTRS
  81 _mm_abs_pi16(__m64 __a)
  82 {
  83     return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
  84 }
  85
  86 /// \brief Computes the absolute value of each of the packed 16-bit signed
  87 ///    integers in the source operand and stores the 16-bit unsigned integer
  88 ///    results in the destination.
  89 ///
  90 /// \headerfile <x86intrin.h>
  91 ///
  92 /// This intrinsic corresponds to the \c VPABSW instruction.
  93 ///
  94 /// \param __a
  95 ///    A 128-bit vector of [8 x i16].
  96 /// \returns A 128-bit integer vector containing the absolute values of the
  97 ///    elements in the operand.
  98 static __inline__ __m128i __DEFAULT_FN_ATTRS
  99 _mm_abs_epi16(__m128i __a)
 100 {
 101     return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
 102 }
 103
 104 /// \brief Computes the absolute value of each of the packed 32-bit signed
 105 ///    integers in the source operand and stores the 32-bit unsigned integer
 106 ///    results in the destination.
 107 ///
 108 /// \headerfile <x86intrin.h>
 109 ///
 110 /// This intrinsic corresponds to the \c PABSD instruction.
 111 ///
 112 /// \param __a
 113 ///    A 64-bit vector of [2 x i32].
 114 /// \returns A 64-bit integer vector containing the absolute values of the
 115 ///    elements in the operand.
 116 static __inline__ __m64 __DEFAULT_FN_ATTRS
 117 _mm_abs_pi32(__m64 __a)
 118 {
 119     return (__m64)__builtin_ia32_pabsd((__v2si)__a);
 120 }
 121
 122 /// \brief Computes the absolute value of each of the packed 32-bit signed
 123 ///    integers in the source operand and stores the 32-bit unsigned integer
 124 ///    results in the destination.
 125 ///
 126 /// \headerfile <x86intrin.h>
 127 ///
 128 /// This intrinsic corresponds to the \c VPABSD instruction.
 129 ///
 130 /// \param __a
 131 ///    A 128-bit vector of [4 x i32].
 132 /// \returns A 128-bit integer vector containing the absolute values of the
 133 ///    elements in the operand.
 134 static __inline__ __m128i __DEFAULT_FN_ATTRS
 135 _mm_abs_epi32(__m128i __a)
 136 {
 137     return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
 138 }
 139
 140 /// \brief Concatenates the two 128-bit integer vector operands, and
 141 ///    right-shifts the result by the number of bytes specified in the immediate
 142 ///    operand.
 143 ///
 144 /// \headerfile <x86intrin.h>
 145 ///
 146 /// \code
 147 /// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
 148 /// \endcode
 149 ///
 150 /// This intrinsic corresponds to the \c PALIGNR instruction.
 151 ///
 152 /// \param a
 153 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
 154 /// \param b
 155 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
 156 /// \param n
 157 ///    An immediate operand specifying how many bytes to right-shift the result.
 158 /// \returns A 128-bit integer vector containing the concatenated right-shifted
 159 ///    value.
 160 #define _mm_alignr_epi8(a, b, n) __extension__ ({ \
 161   (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
 162                                      (__v16qi)(__m128i)(b), (n)); })
 163
 164 /// \brief Concatenates the two 64-bit integer vector operands, and right-shifts
 165 ///    the result by the number of bytes specified in the immediate operand.
 166 ///
 167 /// \headerfile <x86intrin.h>
 168 ///
 169 /// \code
 170 /// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
 171 /// \endcode
 172 ///
 173 /// This intrinsic corresponds to the \c PALIGNR instruction.
 174 ///
 175 /// \param a
 176 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
 177 /// \param b
 178 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
 179 /// \param n
 180 ///    An immediate operand specifying how many bytes to right-shift the result.
 181 /// \returns A 64-bit integer vector containing the concatenated right-shifted
 182 ///    value.
 183 #define _mm_alignr_pi8(a, b, n) __extension__ ({ \
 184   (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); })
 185
 186 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
 187 ///    128-bit vectors of [8 x i16].
 188 ///
 189 /// \headerfile <x86intrin.h>
 190 ///
 191 /// This intrinsic corresponds to the \c VPHADDW instruction.
 192 ///
 193 /// \param __a
 194 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 195 ///    horizontal sums of the values are stored in the lower bits of the
 196 ///    destination.
 197 /// \param __b
 198 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 199 ///    horizontal sums of the values are stored in the upper bits of the
 200 ///    destination.
 201 /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
 202 ///    both operands.
 203 static __inline__ __m128i __DEFAULT_FN_ATTRS
 204 _mm_hadd_epi16(__m128i __a, __m128i __b)
 205 {
 206     return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
 207 }
 208
 209 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
 210 ///    128-bit vectors of [4 x i32].
 211 ///
 212 /// \headerfile <x86intrin.h>
 213 ///
 214 /// This intrinsic corresponds to the \c VPHADDD instruction.
 215 ///
 216 /// \param __a
 217 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 218 ///    horizontal sums of the values are stored in the lower bits of the
 219 ///    destination.
 220 /// \param __b
 221 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 222 ///    horizontal sums of the values are stored in the upper bits of the
 223 ///    destination.
 224 /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
 225 ///    both operands.
 226 static __inline__ __m128i __DEFAULT_FN_ATTRS
 227 _mm_hadd_epi32(__m128i __a, __m128i __b)
 228 {
 229     return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
 230 }
 231
 232 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
 233 ///    64-bit vectors of [4 x i16].
 234 ///
 235 /// \headerfile <x86intrin.h>
 236 ///
 237 /// This intrinsic corresponds to the \c PHADDW instruction.
 238 ///
 239 /// \param __a
 240 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 241 ///    horizontal sums of the values are stored in the lower bits of the
 242 ///    destination.
 243 /// \param __b
 244 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 245 ///    horizontal sums of the values are stored in the upper bits of the
 246 ///    destination.
 247 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
 248 ///    operands.
 249 static __inline__ __m64 __DEFAULT_FN_ATTRS
 250 _mm_hadd_pi16(__m64 __a, __m64 __b)
 251 {
 252     return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
 253 }
 254
 255 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
 256 ///    64-bit vectors of [2 x i32].
 257 ///
 258 /// \headerfile <x86intrin.h>
 259 ///
 260 /// This intrinsic corresponds to the \c PHADDD instruction.
 261 ///
 262 /// \param __a
 263 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 264 ///    horizontal sums of the values are stored in the lower bits of the
 265 ///    destination.
 266 /// \param __b
 267 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 268 ///    horizontal sums of the values are stored in the upper bits of the
 269 ///    destination.
 270 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
 271 ///    operands.
 272 static __inline__ __m64 __DEFAULT_FN_ATTRS
 273 _mm_hadd_pi32(__m64 __a, __m64 __b)
 274 {
 275     return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
 276 }
 277
 278 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
 279 ///    128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are
 280 ///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
 281 ///
 282 /// \headerfile <x86intrin.h>
 283 ///
 284 /// This intrinsic corresponds to the \c VPHADDSW instruction.
 285 ///
 286 /// \param __a
 287 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 288 ///    horizontal sums of the values are stored in the lower bits of the
 289 ///    destination.
 290 /// \param __b
 291 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 292 ///    horizontal sums of the values are stored in the upper bits of the
 293 ///    destination.
 294 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
 295 ///    sums of both operands.
 296 static __inline__ __m128i __DEFAULT_FN_ATTRS
 297 _mm_hadds_epi16(__m128i __a, __m128i __b)
 298 {
 299     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
 300 }
 301
 302 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
 303 ///    64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are
 304 ///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
 305 ///
 306 /// \headerfile <x86intrin.h>
 307 ///
 308 /// This intrinsic corresponds to the \c PHADDSW instruction.
 309 ///
 310 /// \param __a
 311 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 312 ///    horizontal sums of the values are stored in the lower bits of the
 313 ///    destination.
 314 /// \param __b
 315 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 316 ///    horizontal sums of the values are stored in the upper bits of the
 317 ///    destination.
 318 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 319 ///    sums of both operands.
 320 static __inline__ __m64 __DEFAULT_FN_ATTRS
 321 _mm_hadds_pi16(__m64 __a, __m64 __b)
 322 {
 323     return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
 324 }
 325
 326 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
 327 ///    packed 128-bit vectors of [8 x i16].
 328 ///
 329 /// \headerfile <x86intrin.h>
 330 ///
 331 /// This intrinsic corresponds to the \c VPHSUBW instruction.
 332 ///
 333 /// \param __a
 334 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 335 ///    horizontal differences between the values are stored in the lower bits of
 336 ///    the destination.
 337 /// \param __b
 338 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 339 ///    horizontal differences between the values are stored in the upper bits of
 340 ///    the destination.
 341 /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
 342 ///    of both operands.
 343 static __inline__ __m128i __DEFAULT_FN_ATTRS
 344 _mm_hsub_epi16(__m128i __a, __m128i __b)
 345 {
 346     return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
 347 }
 348
 349 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
 350 ///    packed 128-bit vectors of [4 x i32].
 351 ///
 352 /// \headerfile <x86intrin.h>
 353 ///
 354 /// This intrinsic corresponds to the \c VPHSUBD instruction.
 355 ///
 356 /// \param __a
 357 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 358 ///    horizontal differences between the values are stored in the lower bits of
 359 ///    the destination.
 360 /// \param __b
 361 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 362 ///    horizontal differences between the values are stored in the upper bits of
 363 ///    the destination.
 364 /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
 365 ///    of both operands.
 366 static __inline__ __m128i __DEFAULT_FN_ATTRS
 367 _mm_hsub_epi32(__m128i __a, __m128i __b)
 368 {
 369     return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
 370 }
 371
 372 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
 373 ///    packed 64-bit vectors of [4 x i16].
 374 ///
 375 /// \headerfile <x86intrin.h>
 376 ///
 377 /// This intrinsic corresponds to the \c PHSUBW instruction.
 378 ///
 379 /// \param __a
 380 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 381 ///    horizontal differences between the values are stored in the lower bits of
 382 ///    the destination.
 383 /// \param __b
 384 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 385 ///    horizontal differences between the values are stored in the upper bits of
 386 ///    the destination.
 387 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
 388 ///    of both operands.
 389 static __inline__ __m64 __DEFAULT_FN_ATTRS
 390 _mm_hsub_pi16(__m64 __a, __m64 __b)
 391 {
 392     return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
 393 }
 394
 395 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
 396 ///    packed 64-bit vectors of [2 x i32].
 397 ///
 398 /// \headerfile <x86intrin.h>
 399 ///
 400 /// This intrinsic corresponds to the \c PHSUBD instruction.
 401 ///
 402 /// \param __a
 403 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 404 ///    horizontal differences between the values are stored in the lower bits of
 405 ///    the destination.
 406 /// \param __b
 407 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 408 ///    horizontal differences between the values are stored in the upper bits of
 409 ///    the destination.
 410 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
 411 ///    of both operands.
 412 static __inline__ __m64 __DEFAULT_FN_ATTRS
 413 _mm_hsub_pi32(__m64 __a, __m64 __b)
 414 {
 415     return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
 416 }
 417
 418 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
 419 ///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
 420 ///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
 421 ///    saturated to 8000h.
 422 ///
 423 /// \headerfile <x86intrin.h>
 424 ///
 425 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
 426 ///
 427 /// \param __a
 428 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 429 ///    horizontal differences between the values are stored in the lower bits of
 430 ///    the destination.
 431 /// \param __b
 432 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 433 ///    horizontal differences between the values are stored in the upper bits of
 434 ///    the destination.
 435 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
 436 ///    differences of both operands.
 437 static __inline__ __m128i __DEFAULT_FN_ATTRS
 438 _mm_hsubs_epi16(__m128i __a, __m128i __b)
 439 {
 440     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
 441 }
 442
 443 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
 444 ///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
 445 ///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
 446 ///    saturated to 8000h.
 447 ///
 448 /// \headerfile <x86intrin.h>
 449 ///
 450 /// This intrinsic corresponds to the \c PHSUBSW instruction.
 451 ///
 452 /// \param __a
 453 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 454 ///    horizontal differences between the values are stored in the lower bits of
 455 ///    the destination.
 456 /// \param __b
 457 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 458 ///    horizontal differences between the values are stored in the upper bits of
 459 ///    the destination.
 460 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 461 ///    differences of both operands.
 462 static __inline__ __m64 __DEFAULT_FN_ATTRS
 463 _mm_hsubs_pi16(__m64 __a, __m64 __b)
 464 {
 465     return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
 466 }
 467
 468 /// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
 469 ///    values contained in the first source operand and packed 8-bit signed
 470 ///    integer values contained in the second source operand, adds pairs of
 471 ///    contiguous products with signed saturation, and writes the 16-bit sums to
 472 ///    the corresponding bits in the destination. For example, bits [7:0] of
 473 ///    both operands are multiplied, bits [15:8] of both operands are
 474 ///    multiplied, and the sum of both results is written to bits [15:0] of the
 475 ///    destination.
 476 ///
 477 /// \headerfile <x86intrin.h>
 478 ///
 479 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
 480 ///
 481 /// \param __a
 482 ///    A 128-bit integer vector containing the first source operand.
 483 /// \param __b
 484 ///    A 128-bit integer vector containing the second source operand.
 485 /// \returns A 128-bit integer vector containing the sums of products of both
 486 ///    operands: \n
 487 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
 488 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 489 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 490 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
 491 ///    \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
 492 ///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
 493 ///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
 494 ///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
 495 static __inline__ __m128i __DEFAULT_FN_ATTRS
 496 _mm_maddubs_epi16(__m128i __a, __m128i __b)
 497 {
 498     return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
 499 }
 500
 501 /// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
 502 ///    values contained in the first source operand and packed 8-bit signed
 503 ///    integer values contained in the second source operand, adds pairs of
 504 ///    contiguous products with signed saturation, and writes the 16-bit sums to
 505 ///    the corresponding bits in the destination. For example, bits [7:0] of
 506 ///    both operands are multiplied, bits [15:8] of both operands are
 507 ///    multiplied, and the sum of both results is written to bits [15:0] of the
 508 ///    destination.
 509 ///
 510 /// \headerfile <x86intrin.h>
 511 ///
 512 /// This intrinsic corresponds to the \c PMADDUBSW instruction.
 513 ///
 514 /// \param __a
 515 ///    A 64-bit integer vector containing the first source operand.
 516 /// \param __b
 517 ///    A 64-bit integer vector containing the second source operand.
 518 /// \returns A 64-bit integer vector containing the sums of products of both
 519 ///    operands: \n
 520 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
 521 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 522 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 523 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
 524 static __inline__ __m64 __DEFAULT_FN_ATTRS
 525 _mm_maddubs_pi16(__m64 __a, __m64 __b)
 526 {
 527     return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
 528 }
 529
 530 /// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
 531 ///    products to the 18 most significant bits by right-shifting, rounds the
 532 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
 533 ///
 534 /// \headerfile <x86intrin.h>
 535 ///
 536 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
 537 ///
 538 /// \param __a
 539 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
 540 /// \param __b
 541 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
 542 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
 543 ///    products of both operands.
 544 static __inline__ __m128i __DEFAULT_FN_ATTRS
 545 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
 546 {
 547     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
 548 }
 549
 550 /// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
 551 ///    products to the 18 most significant bits by right-shifting, rounds the
 552 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
 553 ///
 554 /// \headerfile <x86intrin.h>
 555 ///
 556 /// This intrinsic corresponds to the \c PMULHRSW instruction.
 557 ///
 558 /// \param __a
 559 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
 560 /// \param __b
 561 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
 562 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
 563 ///    products of both operands.
 564 static __inline__ __m64 __DEFAULT_FN_ATTRS
 565 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
 566 {
 567     return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
 568 }
 569
 570 /// \brief Copies the 8-bit integers from a 128-bit integer vector to the
 571 ///    destination or clears 8-bit values in the destination, as specified by
 572 ///    the second source operand.
 573 ///
 574 /// \headerfile <x86intrin.h>
 575 ///
 576 /// This intrinsic corresponds to the \c VPSHUFB instruction.
 577 ///
 578 /// \param __a
 579 ///    A 128-bit integer vector containing the values to be copied.
 580 /// \param __b
 581 ///    A 128-bit integer vector containing control bytes corresponding to
 582 ///    positions in the destination:
 583 ///    Bit 7: \n
 584 ///    1: Clear the corresponding byte in the destination. \n
 585 ///    0: Copy the selected source byte to the corresponding byte in the
 586 ///    destination. \n
 587 ///    Bits [6:4] Reserved.  \n
 588 ///    Bits [3:0] select the source byte to be copied.
 589 /// \returns A 128-bit integer vector containing the copied or cleared values.
 590 static __inline__ __m128i __DEFAULT_FN_ATTRS
 591 _mm_shuffle_epi8(__m128i __a, __m128i __b)
 592 {
 593     return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
 594 }
 595
 596 /// \brief Copies the 8-bit integers from a 64-bit integer vector to the
 597 ///    destination or clears 8-bit values in the destination, as specified by
 598 ///    the second source operand.
 599 ///
 600 /// \headerfile <x86intrin.h>
 601 ///
 602 /// This intrinsic corresponds to the \c PSHUFB instruction.
 603 ///
 604 /// \param __a
 605 ///    A 64-bit integer vector containing the values to be copied.
 606 /// \param __b
 607 ///    A 64-bit integer vector containing control bytes corresponding to
 608 ///    positions in the destination:
 609 ///    Bit 7: \n
 610 ///    1: Clear the corresponding byte in the destination. \n
 611 ///    0: Copy the selected source byte to the corresponding byte in the
 612 ///    destination. \n
 613 ///    Bits [3:0] select the source byte to be copied.
 614 /// \returns A 64-bit integer vector containing the copied or cleared values.
 615 static __inline__ __m64 __DEFAULT_FN_ATTRS
 616 _mm_shuffle_pi8(__m64 __a, __m64 __b)
 617 {
 618     return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
 619 }
 620
 621 /// \brief For each 8-bit integer in the first source operand, perform one of
 622 ///    the following actions as specified by the second source operand: If the
 623 ///    byte in the second source is negative, calculate the two's complement of
 624 ///    the corresponding byte in the first source, and write that value to the
 625 ///    destination. If the byte in the second source is positive, copy the
 626 ///    corresponding byte from the first source to the destination. If the byte
 627 ///    in the second source is zero, clear the corresponding byte in the
 628 ///    destination.
 629 ///
 630 /// \headerfile <x86intrin.h>
 631 ///
 632 /// This intrinsic corresponds to the \c VPSIGNB instruction.
 633 ///
 634 /// \param __a
 635 ///    A 128-bit integer vector containing the values to be copied.
 636 /// \param __b
 637 ///    A 128-bit integer vector containing control bytes corresponding to
 638 ///    positions in the destination.
 639 /// \returns A 128-bit integer vector containing the resultant values.
 640 static __inline__ __m128i __DEFAULT_FN_ATTRS
 641 _mm_sign_epi8(__m128i __a, __m128i __b)
 642 {
 643     return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
 644 }
 645
 646 /// \brief For each 16-bit integer in the first source operand, perform one of
 647 ///    the following actions as specified by the second source operand: If the
 648 ///    word in the second source is negative, calculate the two's complement of
 649 ///    the corresponding word in the first source, and write that value to the
 650 ///    destination. If the word in the second source is positive, copy the
 651 ///    corresponding word from the first source to the destination. If the word
 652 ///    in the second source is zero, clear the corresponding word in the
 653 ///    destination.
 654 ///
 655 /// \headerfile <x86intrin.h>
 656 ///
 657 /// This intrinsic corresponds to the \c VPSIGNW instruction.
 658 ///
 659 /// \param __a
 660 ///    A 128-bit integer vector containing the values to be copied.
 661 /// \param __b
 662 ///    A 128-bit integer vector containing control words corresponding to
 663 ///    positions in the destination.
 664 /// \returns A 128-bit integer vector containing the resultant values.
 665 static __inline__ __m128i __DEFAULT_FN_ATTRS
 666 _mm_sign_epi16(__m128i __a, __m128i __b)
 667 {
 668     return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
 669 }
 670
 671 /// \brief For each 32-bit integer in the first source operand, perform one of
 672 ///    the following actions as specified by the second source operand: If the
 673 ///    doubleword in the second source is negative, calculate the two's
 674 ///    complement of the corresponding word in the first source, and write that
 675 ///    value to the destination. If the doubleword in the second source is
 676 ///    positive, copy the corresponding word from the first source to the
 677 ///    destination. If the doubleword in the second source is zero, clear the
 678 ///    corresponding word in the destination.
 679 ///
 680 /// \headerfile <x86intrin.h>
 681 ///
 682 /// This intrinsic corresponds to the \c VPSIGND instruction.
 683 ///
 684 /// \param __a
 685 ///    A 128-bit integer vector containing the values to be copied.
 686 /// \param __b
 687 ///    A 128-bit integer vector containing control doublewords corresponding to
 688 ///    positions in the destination.
 689 /// \returns A 128-bit integer vector containing the resultant values.
 690 static __inline__ __m128i __DEFAULT_FN_ATTRS
 691 _mm_sign_epi32(__m128i __a, __m128i __b)
 692 {
 693     return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
 694 }
 695
 696 /// \brief For each 8-bit integer in the first source operand, perform one of
 697 ///    the following actions as specified by the second source operand: If the
 698 ///    byte in the second source is negative, calculate the two's complement of
 699 ///    the corresponding byte in the first source, and write that value to the
 700 ///    destination. If the byte in the second source is positive, copy the
 701 ///    corresponding byte from the first source to the destination. If the byte
 702 ///    in the second source is zero, clear the corresponding byte in the
 703 ///    destination.
 704 ///
 705 /// \headerfile <x86intrin.h>
 706 ///
 707 /// This intrinsic corresponds to the \c PSIGNB instruction.
 708 ///
 709 /// \param __a
 710 ///    A 64-bit integer vector containing the values to be copied.
 711 /// \param __b
 712 ///    A 64-bit integer vector containing control bytes corresponding to
 713 ///    positions in the destination.
 714 /// \returns A 64-bit integer vector containing the resultant values.
 715 static __inline__ __m64 __DEFAULT_FN_ATTRS
 716 _mm_sign_pi8(__m64 __a, __m64 __b)
 717 {
 718     return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
 719 }
 720
 721 /// \brief For each 16-bit integer in the first source operand, perform one of
 722 ///    the following actions as specified by the second source operand: If the
 723 ///    word in the second source is negative, calculate the two's complement of
 724 ///    the corresponding word in the first source, and write that value to the
 725 ///    destination. If the word in the second source is positive, copy the
 726 ///    corresponding word from the first source to the destination. If the word
 727 ///    in the second source is zero, clear the corresponding word in the
 728 ///    destination.
 729 ///
 730 /// \headerfile <x86intrin.h>
 731 ///
 732 /// This intrinsic corresponds to the \c PSIGNW instruction.
 733 ///
 734 /// \param __a
 735 ///    A 64-bit integer vector containing the values to be copied.
 736 /// \param __b
 737 ///    A 64-bit integer vector containing control words corresponding to
 738 ///    positions in the destination.
 739 /// \returns A 64-bit integer vector containing the resultant values.
 740 static __inline__ __m64 __DEFAULT_FN_ATTRS
 741 _mm_sign_pi16(__m64 __a, __m64 __b)
 742 {
 743     return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
 744 }
 745
 746 /// \brief For each 32-bit integer in the first source operand, perform one of
 747 ///    the following actions as specified by the second source operand: If the
 748 ///    doubleword in the second source is negative, calculate the two's
 749 ///    complement of the corresponding doubleword in the first source, and
 750 ///    write that value to the destination. If the doubleword in the second
 751 ///    source is positive, copy the corresponding doubleword from the first
 752 ///    source to the destination. If the doubleword in the second source is
 753 ///    zero, clear the corresponding doubleword in the destination.
 754 ///
 755 /// \headerfile <x86intrin.h>
 756 ///
 757 /// This intrinsic corresponds to the \c PSIGND instruction.
 758 ///
 759 /// \param __a
 760 ///    A 64-bit integer vector containing the values to be copied.
 761 /// \param __b
 762 ///    A 64-bit integer vector containing two control doublewords corresponding
 763 ///    to positions in the destination.
 764 /// \returns A 64-bit integer vector containing the resultant values.
 765 static __inline__ __m64 __DEFAULT_FN_ATTRS
 766 _mm_sign_pi32(__m64 __a, __m64 __b)
 767 {
 768     return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
 769 }
 770
 771 #undef __DEFAULT_FN_ATTRS
 772
 773 #endif /* __TMMINTRIN_H */