contrib/llvm/tools/clang/lib/Headers/tmmintrin.h

   1 /*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
   2  *
   3  * Permission is hereby granted, free of charge, to any person obtaining a copy
   4  * of this software and associated documentation files (the "Software"), to deal
   5  * in the Software without restriction, including without limitation the rights
   6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   7  * copies of the Software, and to permit persons to whom the Software is
   8  * furnished to do so, subject to the following conditions:
   9  *
  10  * The above copyright notice and this permission notice shall be included in
  11  * all copies or substantial portions of the Software.
  12  *
  13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19  * THE SOFTWARE.
  20  *
  21  *===-----------------------------------------------------------------------===
  22  */
  23
  24 #ifndef __TMMINTRIN_H
  25 #define __TMMINTRIN_H
  26
  27 #include <pmmintrin.h>
  28
  29 /* Define the default attributes for the functions in this file. */
  30 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
  31
  32 /// \brief Computes the absolute value of each of the packed 8-bit signed
  33 ///    integers in the source operand and stores the 8-bit unsigned integer
  34 ///    results in the destination.
  35 ///
  36 /// \headerfile <x86intrin.h>
  37 ///
  38 /// This intrinsic corresponds to the \c PABSB instruction.
  39 ///
  40 /// \param __a
  41 ///    A 64-bit vector of [8 x i8].
  42 /// \returns A 64-bit integer vector containing the absolute values of the
  43 ///    elements in the operand.
  44 static __inline__ __m64 __DEFAULT_FN_ATTRS
  45 _mm_abs_pi8(__m64 __a)
  46 {
  47     return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
  48 }
  49
  50 /// \brief Computes the absolute value of each of the packed 8-bit signed
  51 ///    integers in the source operand and stores the 8-bit unsigned integer
  52 ///    results in the destination.
  53 ///
  54 /// \headerfile <x86intrin.h>
  55 ///
  56 /// This intrinsic corresponds to the \c VPABSB instruction.
  57 ///
  58 /// \param __a
  59 ///    A 128-bit vector of [16 x i8].
  60 /// \returns A 128-bit integer vector containing the absolute values of the
  61 ///    elements in the operand.
  62 static __inline__ __m128i __DEFAULT_FN_ATTRS
  63 _mm_abs_epi8(__m128i __a)
  64 {
  65     return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
  66 }
  67
  68 /// \brief Computes the absolute value of each of the packed 16-bit signed
  69 ///    integers in the source operand and stores the 16-bit unsigned integer
  70 ///    results in the destination.
  71 ///
  72 /// \headerfile <x86intrin.h>
  73 ///
  74 /// This intrinsic corresponds to the \c PABSW instruction.
  75 ///
  76 /// \param __a
  77 ///    A 64-bit vector of [4 x i16].
  78 /// \returns A 64-bit integer vector containing the absolute values of the
  79 ///    elements in the operand.
  80 static __inline__ __m64 __DEFAULT_FN_ATTRS
  81 _mm_abs_pi16(__m64 __a)
  82 {
  83     return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
  84 }
  85
  86 /// \brief Computes the absolute value of each of the packed 16-bit signed
  87 ///    integers in the source operand and stores the 16-bit unsigned integer
  88 ///    results in the destination.
  89 ///
  90 /// \headerfile <x86intrin.h>
  91 ///
  92 /// This intrinsic corresponds to the \c VPABSW instruction.
  93 ///
  94 /// \param __a
  95 ///    A 128-bit vector of [8 x i16].
  96 /// \returns A 128-bit integer vector containing the absolute values of the
  97 ///    elements in the operand.
  98 static __inline__ __m128i __DEFAULT_FN_ATTRS
  99 _mm_abs_epi16(__m128i __a)
 100 {
 101     return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
 102 }
 103
 104 /// \brief Computes the absolute value of each of the packed 32-bit signed
 105 ///    integers in the source operand and stores the 32-bit unsigned integer
 106 ///    results in the destination.
 107 ///
 108 /// \headerfile <x86intrin.h>
 109 ///
 110 /// This intrinsic corresponds to the \c PABSD instruction.
 111 ///
 112 /// \param __a
 113 ///    A 64-bit vector of [2 x i32].
 114 /// \returns A 64-bit integer vector containing the absolute values of the
 115 ///    elements in the operand.
 116 static __inline__ __m64 __DEFAULT_FN_ATTRS
 117 _mm_abs_pi32(__m64 __a)
 118 {
 119     return (__m64)__builtin_ia32_pabsd((__v2si)__a);
 120 }
 121
 122 /// \brief Computes the absolute value of each of the packed 32-bit signed
 123 ///    integers in the source operand and stores the 32-bit unsigned integer
 124 ///    results in the destination.
 125 ///
 126 /// \headerfile <x86intrin.h>
 127 ///
 128 /// This intrinsic corresponds to the \c VPABSD instruction.
 129 ///
 130 /// \param __a
 131 ///    A 128-bit vector of [4 x i32].
 132 /// \returns A 128-bit integer vector containing the absolute values of the
 133 ///    elements in the operand.
 134 static __inline__ __m128i __DEFAULT_FN_ATTRS
 135 _mm_abs_epi32(__m128i __a)
 136 {
 137     return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
 138 }
 139
 140 /// \brief Concatenates the two 128-bit integer vector operands, and
 141 ///    right-shifts the result by the number of bytes specified in the immediate
 142 ///    operand.
 143 ///
 144 /// \headerfile <x86intrin.h>
 145 ///
 146 /// \code
 147 /// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
 148 /// \endcode
 149 ///
 150 /// This intrinsic corresponds to the \c PALIGNR instruction.
 151 ///
 152 /// \param a
 153 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
 154 /// \param b
 155 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
 156 /// \param n
 157 ///    An immediate operand specifying how many bytes to right-shift the result.
 158 /// \returns A 128-bit integer vector containing the concatenated right-shifted
 159 ///    value.
 160 #define _mm_alignr_epi8(a, b, n) __extension__ ({ \
 161   (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
 162                                      (__v16qi)(__m128i)(b), (n)); })
 163
 164 /// \brief Concatenates the two 64-bit integer vector operands, and right-shifts
 165 ///    the result by the number of bytes specified in the immediate operand.
 166 ///
 167 /// \headerfile <x86intrin.h>
 168 ///
 169 /// \code
 170 /// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
 171 /// \endcode
 172 ///
 173 /// This intrinsic corresponds to the \c PALIGNR instruction.
 174 ///
 175 /// \param a
 176 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
 177 /// \param b
 178 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
 179 /// \param n
 180 ///    An immediate operand specifying how many bytes to right-shift the result.
 181 /// \returns A 64-bit integer vector containing the concatenated right-shifted
 182 ///    value.
 183 #define _mm_alignr_pi8(a, b, n) __extension__ ({ \
 184   (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); })
 185
 186 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
 187 ///    128-bit vectors of [8 x i16].
 188 ///
 189 /// \headerfile <x86intrin.h>
 190 ///
 191 /// This intrinsic corresponds to the \c VPHADDW instruction.
 192 ///
 193 /// \param __a
 194 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 195 ///    horizontal sums of the values are stored in the lower bits of the
 196 ///    destination.
 197 /// \param __b
 198 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 199 ///    horizontal sums of the values are stored in the upper bits of the
 200 ///    destination.
 201 /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
 202 ///    both operands.
 203 static __inline__ __m128i __DEFAULT_FN_ATTRS
 204 _mm_hadd_epi16(__m128i __a, __m128i __b)
 205 {
 206     return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
 207 }
 208
 209 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
 210 ///    128-bit vectors of [4 x i32].
 211 ///
 212 /// \headerfile <x86intrin.h>
 213 ///
 214 /// This intrinsic corresponds to the \c VPHADDD instruction.
 215 ///
 216 /// \param __a
 217 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 218 ///    horizontal sums of the values are stored in the lower bits of the
 219 ///    destination.
 220 /// \param __b
 221 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 222 ///    horizontal sums of the values are stored in the upper bits of the
 223 ///    destination.
 224 /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
 225 ///    both operands.
 226 static __inline__ __m128i __DEFAULT_FN_ATTRS
 227 _mm_hadd_epi32(__m128i __a, __m128i __b)
 228 {
 229     return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
 230 }
 231
 232 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
 233 ///    64-bit vectors of [4 x i16].
 234 ///
 235 /// \headerfile <x86intrin.h>
 236 ///
 237 /// This intrinsic corresponds to the \c PHADDW instruction.
 238 ///
 239 /// \param __a
 240 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 241 ///    horizontal sums of the values are stored in the lower bits of the
 242 ///    destination.
 243 /// \param __b
 244 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 245 ///    horizontal sums of the values are stored in the upper bits of the
 246 ///    destination.
 247 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
 248 ///    operands.
 249 static __inline__ __m64 __DEFAULT_FN_ATTRS
 250 _mm_hadd_pi16(__m64 __a, __m64 __b)
 251 {
 252     return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
 253 }
 254
 255 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
 256 ///    64-bit vectors of [2 x i32].
 257 ///
 258 /// \headerfile <x86intrin.h>
 259 ///
 260 /// This intrinsic corresponds to the \c PHADDD instruction.
 261 ///
 262 /// \param __a
 263 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 264 ///    horizontal sums of the values are stored in the lower bits of the
 265 ///    destination.
 266 /// \param __b
 267 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 268 ///    horizontal sums of the values are stored in the upper bits of the
 269 ///    destination.
 270 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
 271 ///    operands.
 272 static __inline__ __m64 __DEFAULT_FN_ATTRS
 273 _mm_hadd_pi32(__m64 __a, __m64 __b)
 274 {
 275     return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
 276 }
 277
 278 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
 279 ///    128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are
 280 ///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
 281 ///
 282 /// \headerfile <x86intrin.h>
 283 ///
 284 /// This intrinsic corresponds to the \c VPHADDSW instruction.
 285 ///
 286 /// \param __a
 287 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 288 ///    horizontal sums of the values are stored in the lower bits of the
 289 ///    destination.
 290 /// \param __b
 291 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 292 ///    horizontal sums of the values are stored in the upper bits of the
 293 ///    destination.
 294 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
 295 ///    sums of both operands.
 296 static __inline__ __m128i __DEFAULT_FN_ATTRS
 297 _mm_hadds_epi16(__m128i __a, __m128i __b)
 298 {
 299     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
 300 }
 301
 302 /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
 303 ///    64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are
 304 ///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
 305 ///
 306 /// \headerfile <x86intrin.h>
 307 ///
 308 /// This intrinsic corresponds to the \c PHADDSW instruction.
 309 ///
 310 /// \param __a
 311 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 312 ///    horizontal sums of the values are stored in the lower bits of the
 313 ///    destination.
 314 /// \param __b
 315 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 316 ///    horizontal sums of the values are stored in the upper bits of the
 317 ///    destination.
 318 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 319 ///    sums of both operands.
 320 static __inline__ __m64 __DEFAULT_FN_ATTRS
 321 _mm_hadds_pi16(__m64 __a, __m64 __b)
 322 {
 323     return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
 324 }
 325
 326 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
 327 ///    packed 128-bit vectors of [8 x i16].
 328 ///
 329 /// \headerfile <x86intrin.h>
 330 ///
 331 /// This intrinsic corresponds to the \c VPHSUBW instruction.
 332 ///
 333 /// \param __a
 334 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 335 ///    horizontal differences between the values are stored in the lower bits of
 336 ///    the destination.
 337 /// \param __b
 338 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 339 ///    horizontal differences between the values are stored in the upper bits of
 340 ///    the destination.
 341 /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
 342 ///    of both operands.
 343 static __inline__ __m128i __DEFAULT_FN_ATTRS
 344 _mm_hsub_epi16(__m128i __a, __m128i __b)
 345 {
 346     return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
 347 }
 348
 349 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
 350 ///    packed 128-bit vectors of [4 x i32].
 351 ///
 352 /// \headerfile <x86intrin.h>
 353 ///
 354 /// This intrinsic corresponds to the \c VPHSUBD instruction.
 355 ///
 356 /// \param __a
 357 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 358 ///    horizontal differences between the values are stored in the lower bits of
 359 ///    the destination.
 360 /// \param __b
 361 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
 362 ///    horizontal differences between the values are stored in the upper bits of
 363 ///    the destination.
 364 /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
 365 ///    of both operands.
 366 static __inline__ __m128i __DEFAULT_FN_ATTRS
 367 _mm_hsub_epi32(__m128i __a, __m128i __b)
 368 {
 369     return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
 370 }
 371
 372 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
 373 ///    packed 64-bit vectors of [4 x i16].
 374 ///
 375 /// \headerfile <x86intrin.h>
 376 ///
 377 /// This intrinsic corresponds to the \c PHSUBW instruction.
 378 ///
 379 /// \param __a
 380 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 381 ///    horizontal differences between the values are stored in the lower bits of
 382 ///    the destination.
 383 /// \param __b
 384 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 385 ///    horizontal differences between the values are stored in the upper bits of
 386 ///    the destination.
 387 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
 388 ///    of both operands.
 389 static __inline__ __m64 __DEFAULT_FN_ATTRS
 390 _mm_hsub_pi16(__m64 __a, __m64 __b)
 391 {
 392     return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
 393 }
 394
 395 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
 396 ///    packed 64-bit vectors of [2 x i32].
 397 ///
 398 /// \headerfile <x86intrin.h>
 399 ///
 400 /// This intrinsic corresponds to the \c PHSUBD instruction.
 401 ///
 402 /// \param __a
 403 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 404 ///    horizontal differences between the values are stored in the lower bits of
 405 ///    the destination.
 406 /// \param __b
 407 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
 408 ///    horizontal differences between the values are stored in the upper bits of
 409 ///    the destination.
 410 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
 411 ///    of both operands.
 412 static __inline__ __m64 __DEFAULT_FN_ATTRS
 413 _mm_hsub_pi32(__m64 __a, __m64 __b)
 414 {
 415     return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
 416 }
 417
 418 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
 419 ///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
 420 ///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
 421 ///    saturated to 8000h.
 422 ///
 423 /// \headerfile <x86intrin.h>
 424 ///
 425 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
 426 ///
 427 /// \param __a
 428 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 429 ///    horizontal differences between the values are stored in the lower bits of
 430 ///    the destination.
 431 /// \param __b
 432 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
 433 ///    horizontal differences between the values are stored in the upper bits of
 434 ///    the destination.
 435 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
 436 ///    differences of both operands.
 437 static __inline__ __m128i __DEFAULT_FN_ATTRS
 438 _mm_hsubs_epi16(__m128i __a, __m128i __b)
 439 {
 440     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
 441 }
 442
 443 /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
 444 ///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
 445 ///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
 446 ///    saturated to 8000h.
 447 ///
 448 /// \headerfile <x86intrin.h>
 449 ///
 450 /// This intrinsic corresponds to the \c PHSUBSW instruction.
 451 ///
 452 /// \param __a
 453 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 454 ///    horizontal differences between the values are stored in the lower bits of
 455 ///    the destination.
 456 /// \param __b
 457 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
 458 ///    horizontal differences between the values are stored in the upper bits of
 459 ///    the destination.
 460 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 461 ///    differences of both operands.
 462 static __inline__ __m64 __DEFAULT_FN_ATTRS
 463 _mm_hsubs_pi16(__m64 __a, __m64 __b)
 464 {
 465     return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
 466 }
 467
 468 /// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
 469 ///    values contained in the first source operand and packed 8-bit signed
 470 ///    integer values contained in the second source operand, adds pairs of
 471 ///    contiguous products with signed saturation, and writes the 16-bit sums to
 472 ///    the corresponding bits in the destination.
 473 ///
 474 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
 475 ///    both operands are multiplied, and the sum of both results is written to
 476 ///    bits [15:0] of the destination.
 477 ///
 478 /// \headerfile <x86intrin.h>
 479 ///
 480 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
 481 ///
 482 /// \param __a
 483 ///    A 128-bit integer vector containing the first source operand.
 484 /// \param __b
 485 ///    A 128-bit integer vector containing the second source operand.
 486 /// \returns A 128-bit integer vector containing the sums of products of both
 487 ///    operands: \n
 488 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
 489 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 490 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 491 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
 492 ///    \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
 493 ///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
 494 ///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
 495 ///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
 496 static __inline__ __m128i __DEFAULT_FN_ATTRS
 497 _mm_maddubs_epi16(__m128i __a, __m128i __b)
 498 {
 499     return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
 500 }
 501
 502 /// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
 503 ///    values contained in the first source operand and packed 8-bit signed
 504 ///    integer values contained in the second source operand, adds pairs of
 505 ///    contiguous products with signed saturation, and writes the 16-bit sums to
 506 ///    the corresponding bits in the destination.
 507 ///
 508 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
 509 ///    both operands are multiplied, and the sum of both results is written to
 510 ///    bits [15:0] of the destination.
 511 ///
 512 /// \headerfile <x86intrin.h>
 513 ///
 514 /// This intrinsic corresponds to the \c PMADDUBSW instruction.
 515 ///
 516 /// \param __a
 517 ///    A 64-bit integer vector containing the first source operand.
 518 /// \param __b
 519 ///    A 64-bit integer vector containing the second source operand.
 520 /// \returns A 64-bit integer vector containing the sums of products of both
 521 ///    operands: \n
 522 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
 523 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 524 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 525 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
 526 static __inline__ __m64 __DEFAULT_FN_ATTRS
 527 _mm_maddubs_pi16(__m64 __a, __m64 __b)
 528 {
 529     return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
 530 }
 531
 532 /// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
 533 ///    products to the 18 most significant bits by right-shifting, rounds the
 534 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
 535 ///
 536 /// \headerfile <x86intrin.h>
 537 ///
 538 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
 539 ///
 540 /// \param __a
 541 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
 542 /// \param __b
 543 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
 544 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
 545 ///    products of both operands.
 546 static __inline__ __m128i __DEFAULT_FN_ATTRS
 547 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
 548 {
 549     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
 550 }
 551
 552 /// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
 553 ///    products to the 18 most significant bits by right-shifting, rounds the
 554 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
 555 ///
 556 /// \headerfile <x86intrin.h>
 557 ///
 558 /// This intrinsic corresponds to the \c PMULHRSW instruction.
 559 ///
 560 /// \param __a
 561 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
 562 /// \param __b
 563 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
 564 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
 565 ///    products of both operands.
 566 static __inline__ __m64 __DEFAULT_FN_ATTRS
 567 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
 568 {
 569     return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
 570 }
 571
 572 /// \brief Copies the 8-bit integers from a 128-bit integer vector to the
 573 ///    destination or clears 8-bit values in the destination, as specified by
 574 ///    the second source operand.
 575 ///
 576 /// \headerfile <x86intrin.h>
 577 ///
 578 /// This intrinsic corresponds to the \c VPSHUFB instruction.
 579 ///
 580 /// \param __a
 581 ///    A 128-bit integer vector containing the values to be copied.
 582 /// \param __b
 583 ///    A 128-bit integer vector containing control bytes corresponding to
 584 ///    positions in the destination:
 585 ///    Bit 7: \n
 586 ///    1: Clear the corresponding byte in the destination. \n
 587 ///    0: Copy the selected source byte to the corresponding byte in the
 588 ///    destination. \n
 589 ///    Bits [6:4] Reserved.  \n
 590 ///    Bits [3:0] select the source byte to be copied.
 591 /// \returns A 128-bit integer vector containing the copied or cleared values.
 592 static __inline__ __m128i __DEFAULT_FN_ATTRS
 593 _mm_shuffle_epi8(__m128i __a, __m128i __b)
 594 {
 595     return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
 596 }
 597
 598 /// \brief Copies the 8-bit integers from a 64-bit integer vector to the
 599 ///    destination or clears 8-bit values in the destination, as specified by
 600 ///    the second source operand.
 601 ///
 602 /// \headerfile <x86intrin.h>
 603 ///
 604 /// This intrinsic corresponds to the \c PSHUFB instruction.
 605 ///
 606 /// \param __a
 607 ///    A 64-bit integer vector containing the values to be copied.
 608 /// \param __b
 609 ///    A 64-bit integer vector containing control bytes corresponding to
 610 ///    positions in the destination:
 611 ///    Bit 7: \n
 612 ///    1: Clear the corresponding byte in the destination. \n
 613 ///    0: Copy the selected source byte to the corresponding byte in the
 614 ///    destination. \n
 615 ///    Bits [3:0] select the source byte to be copied.
 616 /// \returns A 64-bit integer vector containing the copied or cleared values.
 617 static __inline__ __m64 __DEFAULT_FN_ATTRS
 618 _mm_shuffle_pi8(__m64 __a, __m64 __b)
 619 {
 620     return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
 621 }
 622
 623 /// \brief For each 8-bit integer in the first source operand, perform one of
 624 ///    the following actions as specified by the second source operand.
 625 ///
 626 ///    If the byte in the second source is negative, calculate the two's
 627 ///    complement of the corresponding byte in the first source, and write that
 628 ///    value to the destination. If the byte in the second source is positive,
 629 ///    copy the corresponding byte from the first source to the destination. If
 630 ///    the byte in the second source is zero, clear the corresponding byte in
 631 ///    the destination.
 632 ///
 633 /// \headerfile <x86intrin.h>
 634 ///
 635 /// This intrinsic corresponds to the \c VPSIGNB instruction.
 636 ///
 637 /// \param __a
 638 ///    A 128-bit integer vector containing the values to be copied.
 639 /// \param __b
 640 ///    A 128-bit integer vector containing control bytes corresponding to
 641 ///    positions in the destination.
 642 /// \returns A 128-bit integer vector containing the resultant values.
 643 static __inline__ __m128i __DEFAULT_FN_ATTRS
 644 _mm_sign_epi8(__m128i __a, __m128i __b)
 645 {
 646     return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
 647 }
 648
 649 /// \brief For each 16-bit integer in the first source operand, perform one of
 650 ///    the following actions as specified by the second source operand.
 651 ///
 652 ///    If the word in the second source is negative, calculate the two's
 653 ///    complement of the corresponding word in the first source, and write that
 654 ///    value to the destination. If the word in the second source is positive,
 655 ///    copy the corresponding word from the first source to the destination. If
 656 ///    the word in the second source is zero, clear the corresponding word in
 657 ///    the destination.
 658 ///
 659 /// \headerfile <x86intrin.h>
 660 ///
 661 /// This intrinsic corresponds to the \c VPSIGNW instruction.
 662 ///
 663 /// \param __a
 664 ///    A 128-bit integer vector containing the values to be copied.
 665 /// \param __b
 666 ///    A 128-bit integer vector containing control words corresponding to
 667 ///    positions in the destination.
 668 /// \returns A 128-bit integer vector containing the resultant values.
 669 static __inline__ __m128i __DEFAULT_FN_ATTRS
 670 _mm_sign_epi16(__m128i __a, __m128i __b)
 671 {
 672     return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
 673 }
 674
 675 /// \brief For each 32-bit integer in the first source operand, perform one of
 676 ///    the following actions as specified by the second source operand.
 677 ///
 678 ///    If the doubleword in the second source is negative, calculate the two's
 679 ///    complement of the corresponding word in the first source, and write that
 680 ///    value to the destination. If the doubleword in the second source is
 681 ///    positive, copy the corresponding word from the first source to the
 682 ///    destination. If the doubleword in the second source is zero, clear the
 683 ///    corresponding word in the destination.
 684 ///
 685 /// \headerfile <x86intrin.h>
 686 ///
 687 /// This intrinsic corresponds to the \c VPSIGND instruction.
 688 ///
 689 /// \param __a
 690 ///    A 128-bit integer vector containing the values to be copied.
 691 /// \param __b
 692 ///    A 128-bit integer vector containing control doublewords corresponding to
 693 ///    positions in the destination.
 694 /// \returns A 128-bit integer vector containing the resultant values.
 695 static __inline__ __m128i __DEFAULT_FN_ATTRS
 696 _mm_sign_epi32(__m128i __a, __m128i __b)
 697 {
 698     return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
 699 }
 700
 701 /// \brief For each 8-bit integer in the first source operand, perform one of
 702 ///    the following actions as specified by the second source operand.
 703 ///
 704 ///    If the byte in the second source is negative, calculate the two's
 705 ///    complement of the corresponding byte in the first source, and write that
 706 ///    value to the destination. If the byte in the second source is positive,
 707 ///    copy the corresponding byte from the first source to the destination. If
 708 ///    the byte in the second source is zero, clear the corresponding byte in
 709 ///    the destination.
 710 ///
 711 /// \headerfile <x86intrin.h>
 712 ///
 713 /// This intrinsic corresponds to the \c PSIGNB instruction.
 714 ///
 715 /// \param __a
 716 ///    A 64-bit integer vector containing the values to be copied.
 717 /// \param __b
 718 ///    A 64-bit integer vector containing control bytes corresponding to
 719 ///    positions in the destination.
 720 /// \returns A 64-bit integer vector containing the resultant values.
 721 static __inline__ __m64 __DEFAULT_FN_ATTRS
 722 _mm_sign_pi8(__m64 __a, __m64 __b)
 723 {
 724     return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
 725 }
 726
 727 /// \brief For each 16-bit integer in the first source operand, perform one of
 728 ///    the following actions as specified by the second source operand.
 729 ///
 730 ///    If the word in the second source is negative, calculate the two's
 731 ///    complement of the corresponding word in the first source, and write that
 732 ///    value to the destination. If the word in the second source is positive,
 733 ///    copy the corresponding word from the first source to the destination. If
 734 ///    the word in the second source is zero, clear the corresponding word in
 735 ///    the destination.
 736 ///
 737 /// \headerfile <x86intrin.h>
 738 ///
 739 /// This intrinsic corresponds to the \c PSIGNW instruction.
 740 ///
 741 /// \param __a
 742 ///    A 64-bit integer vector containing the values to be copied.
 743 /// \param __b
 744 ///    A 64-bit integer vector containing control words corresponding to
 745 ///    positions in the destination.
 746 /// \returns A 64-bit integer vector containing the resultant values.
 747 static __inline__ __m64 __DEFAULT_FN_ATTRS
 748 _mm_sign_pi16(__m64 __a, __m64 __b)
 749 {
 750     return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
 751 }
 752
 753 /// \brief For each 32-bit integer in the first source operand, perform one of
 754 ///    the following actions as specified by the second source operand.
 755 ///
 756 ///    If the doubleword in the second source is negative, calculate the two's
 757 ///    complement of the corresponding doubleword in the first source, and
 758 ///    write that value to the destination. If the doubleword in the second
 759 ///    source is positive, copy the corresponding doubleword from the first
 760 ///    source to the destination. If the doubleword in the second source is
 761 ///    zero, clear the corresponding doubleword in the destination.
 762 ///
 763 /// \headerfile <x86intrin.h>
 764 ///
 765 /// This intrinsic corresponds to the \c PSIGND instruction.
 766 ///
 767 /// \param __a
 768 ///    A 64-bit integer vector containing the values to be copied.
 769 /// \param __b
 770 ///    A 64-bit integer vector containing two control doublewords corresponding
 771 ///    to positions in the destination.
 772 /// \returns A 64-bit integer vector containing the resultant values.
 773 static __inline__ __m64 __DEFAULT_FN_ATTRS
 774 _mm_sign_pi32(__m64 __a, __m64 __b)
 775 {
 776     return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
 777 }
 778
 779 #undef __DEFAULT_FN_ATTRS
 780
 781 #endif /* __TMMINTRIN_H */