contrib/llvm/tools/clang/lib/Headers/mmintrin.h

   1 /*===---- mmintrin.h - MMX intrinsics --------------------------------------===
   2  *
   3  * Permission is hereby granted, free of charge, to any person obtaining a copy
   4  * of this software and associated documentation files (the "Software"), to deal
   5  * in the Software without restriction, including without limitation the rights
   6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   7  * copies of the Software, and to permit persons to whom the Software is
   8  * furnished to do so, subject to the following conditions:
   9  *
  10  * The above copyright notice and this permission notice shall be included in
  11  * all copies or substantial portions of the Software.
  12  *
  13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19  * THE SOFTWARE.
  20  *
  21  *===-----------------------------------------------------------------------===
  22  */
  23
  24 #ifndef __MMINTRIN_H
  25 #define __MMINTRIN_H
  26
   27 typedef long long __m64 __attribute__((__vector_size__(8))); /* 8-byte vector type taken and returned by the intrinsics below */
  28
   29 typedef long long __v1di __attribute__((__vector_size__(8))); /* 8-byte vector of long long elements */
   30 typedef int __v2si __attribute__((__vector_size__(8))); /* cast target for builtins working on [2 x i32] */
   31 typedef short __v4hi __attribute__((__vector_size__(8))); /* cast target for builtins working on [4 x i16] */
   32 typedef char __v8qi __attribute__((__vector_size__(8))); /* cast target for builtins working on [8 x i8] */
  33
   34 /* Default attribute set for the functions in this file: always-inline, nodebug, the "mmx" target feature, and a minimum vector width of 64. _mm_empty lists its attributes explicitly instead of using this macro. */
   35 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"), __min_vector_width__(64)))
  36
   37 /// Clears the MMX state by setting the state of the x87 stack registers
   38 ///    to empty. Takes no arguments and returns no value.
   39 ///
   40 /// \headerfile <x86intrin.h>
   41 ///
   42 /// This intrinsic corresponds to the <c> EMMS </c> instruction.
   43 ///
   44 static __inline__ void  __attribute__((__always_inline__, __nodebug__, __target__("mmx"))) /* spelled out rather than __DEFAULT_FN_ATTRS, so no __min_vector_width__ is applied */
   45 _mm_empty(void)
   46 {
   47     __builtin_ia32_emms();
   48 }
  49
   50 /// Constructs a 64-bit integer vector, setting the lower 32 bits to the
   51 ///    value of the 32-bit integer parameter and setting the upper 32 bits to 0.
   52 ///
   53 /// \headerfile <x86intrin.h>
   54 ///
   55 /// This intrinsic corresponds to the <c> MOVD </c> instruction.
   56 ///
   57 /// \param __i
   58 ///    A 32-bit integer value that becomes the lower 32 bits of the result.
   59 /// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
   60 ///    parameter. The upper 32 bits are set to 0.
   61 static __inline__ __m64 __DEFAULT_FN_ATTRS
   62 _mm_cvtsi32_si64(int __i)
   63 {
   64     return (__m64)__builtin_ia32_vec_init_v2si(__i, 0); /* element 0 = __i, element 1 = 0 */
   65 }
  66
   67 /// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
   68 ///    signed integer.
   69 ///
   70 /// \headerfile <x86intrin.h>
   71 ///
   72 /// This intrinsic corresponds to the <c> MOVD </c> instruction.
   73 ///
   74 /// \param __m
   75 ///    A 64-bit integer vector. Only its lower 32 bits are read.
   76 /// \returns A 32-bit signed integer value containing the lower 32 bits of the
   77 ///    parameter.
   78 static __inline__ int __DEFAULT_FN_ATTRS
   79 _mm_cvtsi64_si32(__m64 __m)
   80 {
   81     return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0); /* element 0 of the [2 x i32] view */
   82 }
  83
   84 /// Casts a 64-bit signed integer value into a 64-bit integer vector.
   85 ///
   86 /// \headerfile <x86intrin.h>
   87 ///
   88 /// This intrinsic corresponds to the <c> MOVQ </c> instruction.
   89 ///
   90 /// \param __i
   91 ///    A 64-bit signed integer.
   92 /// \returns A 64-bit integer vector containing the same bitwise pattern as the
   93 ///    parameter; no numeric conversion takes place.
   94 static __inline__ __m64 __DEFAULT_FN_ATTRS
   95 _mm_cvtsi64_m64(long long __i)
   96 {
   97     return (__m64)__i; /* scalar-to-vector cast: the 64 bits are carried over unchanged */
   98 }
  99
  100 /// Casts a 64-bit integer vector into a 64-bit signed integer value.
  101 ///
  102 /// \headerfile <x86intrin.h>
  103 ///
  104 /// This intrinsic corresponds to the <c> MOVQ </c> instruction.
  105 ///
  106 /// \param __m
  107 ///    A 64-bit integer vector.
  108 /// \returns A 64-bit signed integer containing the same bitwise pattern as the
  109 ///    parameter; no numeric conversion takes place.
  110 static __inline__ long long __DEFAULT_FN_ATTRS
  111 _mm_cvtm64_si64(__m64 __m)
  112 {
  113     return (long long)__m; /* vector-to-scalar cast: the 64 bits are carried over unchanged */
  114 }
 115
  116 /// Converts 16-bit signed integers from both 64-bit integer vector
  117 ///    parameters of [4 x i16] into 8-bit signed integer values, and constructs
  118 ///    a 64-bit integer vector of [8 x i8] as the result. Values greater than
  119 ///    0x7F (127) are saturated to 0x7F. Values less than 0x80 (-128 as a
  120 ///    signed 8-bit integer) are saturated to 0x80.
  121 ///
  122 /// \headerfile <x86intrin.h>
  123 ///
  124 /// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
  125 ///
  126 /// \param __m1
  127 ///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
  128 ///    16-bit signed integer and is converted to an 8-bit signed integer with
  129 ///    saturation. Values greater than 0x7F (127) are saturated to 0x7F.
  130 ///    Values less than 0x80 (-128) are saturated to 0x80. The converted
  131 ///    [4 x i8] values are written to the lower 32 bits of the result.
  132 /// \param __m2
  133 ///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
  134 ///    16-bit signed integer and is converted to an 8-bit signed integer with
  135 ///    saturation. Values greater than 0x7F (127) are saturated to 0x7F.
  136 ///    Values less than 0x80 (-128) are saturated to 0x80. The converted
  137 ///    [4 x i8] values are written to the upper 32 bits of the result.
  138 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
  139 ///    values.
  140 static __inline__ __m64 __DEFAULT_FN_ATTRS
  141 _mm_packs_pi16(__m64 __m1, __m64 __m2)
  142 {
  143     return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
  144 }
 145
  146 /// Converts 32-bit signed integers from both 64-bit integer vector
  147 ///    parameters of [2 x i32] into 16-bit signed integer values, and constructs
  148 ///    a 64-bit integer vector of [4 x i16] as the result. Values greater than
  149 ///    0x7FFF (32767) are saturated to 0x7FFF. Values less than 0x8000 (-32768
  150 ///    as a signed 16-bit integer) are saturated to 0x8000.
  151 ///
  152 /// \headerfile <x86intrin.h>
  153 ///
  154 /// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
  155 ///
  156 /// \param __m1
  157 ///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
  158 ///    32-bit signed integer and is converted to a 16-bit signed integer with
  159 ///    saturation. Values greater than 0x7FFF (32767) are saturated to 0x7FFF.
  160 ///    Values less than 0x8000 (-32768) are saturated to 0x8000. The converted
  161 ///    [2 x i16] values are written to the lower 32 bits of the result.
  162 /// \param __m2
  163 ///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
  164 ///    32-bit signed integer and is converted to a 16-bit signed integer with
  165 ///    saturation. Values greater than 0x7FFF (32767) are saturated to 0x7FFF.
  166 ///    Values less than 0x8000 (-32768) are saturated to 0x8000. The converted
  167 ///    [2 x i16] values are written to the upper 32 bits of the result.
  168 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
  169 ///    values.
  170 static __inline__ __m64 __DEFAULT_FN_ATTRS
  171 _mm_packs_pi32(__m64 __m1, __m64 __m2)
  172 {
  173     return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
  174 }
 175
  176 /// Converts 16-bit signed integers from both 64-bit integer vector
  177 ///    parameters of [4 x i16] into 8-bit unsigned integer values, and
  178 ///    constructs a 64-bit integer vector of [8 x i8] as the result. Values
  179 ///    greater than 0xFF (255) are saturated to 0xFF. Values less than 0 are
  180 ///    saturated to 0.
  181 ///
  182 /// \headerfile <x86intrin.h>
  183 ///
  184 /// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
  185 ///
  186 /// \param __m1
  187 ///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
  188 ///    16-bit signed integer and is converted to an 8-bit unsigned integer with
  189 ///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
  190 ///    than 0 are saturated to 0. The converted [4 x i8] values are written to
  191 ///    the lower 32 bits of the result.
  192 /// \param __m2
  193 ///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
  194 ///    16-bit signed integer and is converted to an 8-bit unsigned integer with
  195 ///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
  196 ///    than 0 are saturated to 0. The converted [4 x i8] values are written to
  197 ///    the upper 32 bits of the result.
  198 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
  199 ///    values.
  200 static __inline__ __m64 __DEFAULT_FN_ATTRS
  201 _mm_packs_pu16(__m64 __m1, __m64 __m2)
  202 {
  203     return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2); /* __m1 fills the lower four bytes, __m2 the upper four */
  204 }
 205
  206 /// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
  207 ///    and interleaves them byte by byte into a 64-bit integer vector of [8 x i8].
  208 ///
  209 /// \headerfile <x86intrin.h>
  210 ///
  211 /// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
  212 ///
  213 /// \param __m1
  214 ///    A 64-bit integer vector of [8 x i8]. \n
  215 ///    Bits [39:32] are written to bits [7:0] of the result. \n
  216 ///    Bits [47:40] are written to bits [23:16] of the result. \n
  217 ///    Bits [55:48] are written to bits [39:32] of the result. \n
  218 ///    Bits [63:56] are written to bits [55:48] of the result.
  219 /// \param __m2
  220 ///    A 64-bit integer vector of [8 x i8]. \n
  221 ///    Bits [39:32] are written to bits [15:8] of the result. \n
  222 ///    Bits [47:40] are written to bits [31:24] of the result. \n
  223 ///    Bits [55:48] are written to bits [47:40] of the result. \n
  224 ///    Bits [63:56] are written to bits [63:56] of the result.
  225 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
  226 ///    values.
  227 static __inline__ __m64 __DEFAULT_FN_ATTRS
  228 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
  229 {
  230     return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
  231 }
 232
  233 /// Unpacks the upper 32 bits from two 64-bit integer vectors of
  234 ///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
  235 ///
  236 /// \headerfile <x86intrin.h>
  237 ///
  238 /// This intrinsic corresponds to the <c> PUNPCKHWD </c> instruction.
  239 ///
  240 /// \param __m1
  241 ///    A 64-bit integer vector of [4 x i16]. \n
  242 ///    Bits [47:32] are written to bits [15:0] of the result. \n
  243 ///    Bits [63:48] are written to bits [47:32] of the result.
  244 /// \param __m2
  245 ///    A 64-bit integer vector of [4 x i16]. \n
  246 ///    Bits [47:32] are written to bits [31:16] of the result. \n
  247 ///    Bits [63:48] are written to bits [63:48] of the result.
  248 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
  249 ///    values.
  250 static __inline__ __m64 __DEFAULT_FN_ATTRS
  251 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
  252 {
  253     return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
  254 }
 255
  256 /// Unpacks the upper 32 bits from two 64-bit integer vectors of
  257 ///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
  258 ///
  259 /// \headerfile <x86intrin.h>
  260 ///
  261 /// This intrinsic corresponds to the <c> PUNPCKHDQ </c> instruction.
  262 ///
  263 /// \param __m1
  264 ///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
  265 ///    the lower 32 bits of the result.
  266 /// \param __m2
  267 ///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
  268 ///    the upper 32 bits of the result.
  269 /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
  270 ///    values.
  271 static __inline__ __m64 __DEFAULT_FN_ATTRS
  272 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
  273 {
  274     return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2); /* result = { upper half of __m1, upper half of __m2 } */
  275 }
 276
  277 /// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
  278 ///    and interleaves them byte by byte into a 64-bit integer vector of [8 x i8].
  279 ///
  280 /// \headerfile <x86intrin.h>
  281 ///
  282 /// This intrinsic corresponds to the <c> PUNPCKLBW </c> instruction.
  283 ///
  284 /// \param __m1
  285 ///    A 64-bit integer vector of [8 x i8]. \n
  286 ///    Bits [7:0] are written to bits [7:0] of the result. \n
  287 ///    Bits [15:8] are written to bits [23:16] of the result. \n
  288 ///    Bits [23:16] are written to bits [39:32] of the result. \n
  289 ///    Bits [31:24] are written to bits [55:48] of the result.
  290 /// \param __m2
  291 ///    A 64-bit integer vector of [8 x i8]. \n
  292 ///    Bits [7:0] are written to bits [15:8] of the result. \n
  293 ///    Bits [15:8] are written to bits [31:24] of the result. \n
  294 ///    Bits [23:16] are written to bits [47:40] of the result. \n
  295 ///    Bits [31:24] are written to bits [63:56] of the result.
  296 /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
  297 ///    values.
  298 static __inline__ __m64 __DEFAULT_FN_ATTRS
  299 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
  300 {
  301     return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
  302 }
 303
  304 /// Unpacks the lower 32 bits from two 64-bit integer vectors of
  305 ///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
  306 ///
  307 /// \headerfile <x86intrin.h>
  308 ///
  309 /// This intrinsic corresponds to the <c> PUNPCKLWD </c> instruction.
  310 ///
  311 /// \param __m1
  312 ///    A 64-bit integer vector of [4 x i16]. \n
  313 ///    Bits [15:0] are written to bits [15:0] of the result. \n
  314 ///    Bits [31:16] are written to bits [47:32] of the result.
  315 /// \param __m2
  316 ///    A 64-bit integer vector of [4 x i16]. \n
  317 ///    Bits [15:0] are written to bits [31:16] of the result. \n
  318 ///    Bits [31:16] are written to bits [63:48] of the result.
  319 /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
  320 ///    values.
  321 static __inline__ __m64 __DEFAULT_FN_ATTRS
  322 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
  323 {
  324     return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
  325 }
 326
  327 /// Unpacks the lower 32 bits from two 64-bit integer vectors of
  328 ///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
  329 ///
  330 /// \headerfile <x86intrin.h>
  331 ///
  332 /// This intrinsic corresponds to the <c> PUNPCKLDQ </c> instruction.
  333 ///
  334 /// \param __m1
  335 ///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
  336 ///    the lower 32 bits of the result.
  337 /// \param __m2
  338 ///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
  339 ///    the upper 32 bits of the result.
  340 /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
  341 ///    values.
  342 static __inline__ __m64 __DEFAULT_FN_ATTRS
  343 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
  344 {
  345     return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2); /* result = { lower half of __m1, lower half of __m2 } */
  346 }
 347
  348 /// Adds each 8-bit integer element of the first 64-bit integer vector
  349 ///    of [8 x i8] to the corresponding 8-bit integer element of the second
  350 ///    64-bit integer vector of [8 x i8]. Only the lower 8 bits of each sum are
  351 ///    kept (no saturation) and packed into a 64-bit integer vector of [8 x i8].
  352 ///
  353 /// \headerfile <x86intrin.h>
  354 ///
  355 /// This intrinsic corresponds to the <c> PADDB </c> instruction.
  356 ///
  357 /// \param __m1
  358 ///    A 64-bit integer vector of [8 x i8].
  359 /// \param __m2
  360 ///    A 64-bit integer vector of [8 x i8].
  361 /// \returns A 64-bit integer vector of [8 x i8] containing the element-wise
  362 ///    sums of both parameters.
  363 static __inline__ __m64 __DEFAULT_FN_ATTRS
  364 _mm_add_pi8(__m64 __m1, __m64 __m2)
  365 {
  366     return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
  367 }
 368
  369 /// Adds each 16-bit integer element of the first 64-bit integer vector
  370 ///    of [4 x i16] to the corresponding 16-bit integer element of the second
  371 ///    64-bit integer vector of [4 x i16]. Only the lower 16 bits of each sum are
  372 ///    kept (no saturation) and packed into a 64-bit integer vector of [4 x i16].
  373 ///
  374 /// \headerfile <x86intrin.h>
  375 ///
  376 /// This intrinsic corresponds to the <c> PADDW </c> instruction.
  377 ///
  378 /// \param __m1
  379 ///    A 64-bit integer vector of [4 x i16].
  380 /// \param __m2
  381 ///    A 64-bit integer vector of [4 x i16].
  382 /// \returns A 64-bit integer vector of [4 x i16] containing the element-wise
  383 ///    sums of both parameters.
  384 static __inline__ __m64 __DEFAULT_FN_ATTRS
  385 _mm_add_pi16(__m64 __m1, __m64 __m2)
  386 {
  387     return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
  388 }
 389
  390 /// Adds each 32-bit integer element of the first 64-bit integer vector
  391 ///    of [2 x i32] to the corresponding 32-bit integer element of the second
  392 ///    64-bit integer vector of [2 x i32]. Only the lower 32 bits of each sum are
  393 ///    kept (no saturation) and packed into a 64-bit integer vector of [2 x i32].
  394 ///
  395 /// \headerfile <x86intrin.h>
  396 ///
  397 /// This intrinsic corresponds to the <c> PADDD </c> instruction.
  398 ///
  399 /// \param __m1
  400 ///    A 64-bit integer vector of [2 x i32].
  401 /// \param __m2
  402 ///    A 64-bit integer vector of [2 x i32].
  403 /// \returns A 64-bit integer vector of [2 x i32] containing the element-wise
  404 ///    sums of both parameters.
  405 static __inline__ __m64 __DEFAULT_FN_ATTRS
  406 _mm_add_pi32(__m64 __m1, __m64 __m2)
  407 {
  408     return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
  409 }
 410
  411 /// Adds each 8-bit signed integer element of the first 64-bit integer
  412 ///    vector of [8 x i8] to the corresponding 8-bit signed integer element of
  413 ///    the second 64-bit integer vector of [8 x i8]. Sums greater than 0x7F
  414 ///    (127) are saturated to 0x7F. Sums less than 0x80 (-128) are saturated to
  415 ///    0x80. The results are packed into a 64-bit integer vector of [8 x i8].
  416 ///
  417 /// \headerfile <x86intrin.h>
  418 ///
  419 /// This intrinsic corresponds to the <c> PADDSB </c> instruction.
  420 ///
  421 /// \param __m1
  422 ///    A 64-bit integer vector of [8 x i8].
  423 /// \param __m2
  424 ///    A 64-bit integer vector of [8 x i8].
  425 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
  426 ///    of both parameters.
  427 static __inline__ __m64 __DEFAULT_FN_ATTRS
  428 _mm_adds_pi8(__m64 __m1, __m64 __m2)
  429 {
  430     return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
  431 }
 432
  433 /// Adds each 16-bit signed integer element of the first 64-bit integer
  434 ///    vector of [4 x i16] to the corresponding 16-bit signed integer element of
  435 ///    the second 64-bit integer vector of [4 x i16]. Sums greater than 0x7FFF
  436 ///    (32767) are saturated to 0x7FFF. Sums less than 0x8000 (-32768) are
  437 ///    saturated to 0x8000. The results are packed into a 64-bit integer vector
  438 ///    of [4 x i16].
  439 ///
  440 /// \headerfile <x86intrin.h>
  441 ///
  442 /// This intrinsic corresponds to the <c> PADDSW </c> instruction.
  443 ///
  444 /// \param __m1
  445 ///    A 64-bit integer vector of [4 x i16].
  446 /// \param __m2
  447 ///    A 64-bit integer vector of [4 x i16].
  448 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
  449 ///    of both parameters.
  450 static __inline__ __m64 __DEFAULT_FN_ATTRS
  451 _mm_adds_pi16(__m64 __m1, __m64 __m2)
  452 {
  453     return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
  454 }
 455
  456 /// Adds each 8-bit unsigned integer element of the first 64-bit integer
  457 ///    vector of [8 x i8] to the corresponding 8-bit unsigned integer element of
  458 ///    the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF
  459 ///    (255) are saturated to 0xFF. The results are packed into a 64-bit integer
  460 ///    vector of [8 x i8].
  461 ///
  462 /// \headerfile <x86intrin.h>
  463 ///
  464 /// This intrinsic corresponds to the <c> PADDUSB </c> instruction.
  465 ///
  466 /// \param __m1
  467 ///    A 64-bit integer vector of [8 x i8].
  468 /// \param __m2
  469 ///    A 64-bit integer vector of [8 x i8].
  470 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
  471 ///    unsigned sums of both parameters.
  472 static __inline__ __m64 __DEFAULT_FN_ATTRS
  473 _mm_adds_pu8(__m64 __m1, __m64 __m2)
  474 {
  475     return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2); /* the cast only selects the [8 x i8] lane layout; the elements are added as unsigned values */
  476 }
 477
  478 /// Adds each 16-bit unsigned integer element of the first 64-bit integer
  479 ///    vector of [4 x i16] to the corresponding 16-bit unsigned integer element
  480 ///    of the second 64-bit integer vector of [4 x i16]. Sums greater than
  481 ///    0xFFFF (65535) are saturated to 0xFFFF. The results are packed into a
  482 ///    64-bit integer vector of [4 x i16].
  483 ///
  484 /// \headerfile <x86intrin.h>
  485 ///
  486 /// This intrinsic corresponds to the <c> PADDUSW </c> instruction.
  487 ///
  488 /// \param __m1
  489 ///    A 64-bit integer vector of [4 x i16].
  490 /// \param __m2
  491 ///    A 64-bit integer vector of [4 x i16].
  492 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
  493 ///    unsigned sums of both parameters.
  494 static __inline__ __m64 __DEFAULT_FN_ATTRS
  495 _mm_adds_pu16(__m64 __m1, __m64 __m2)
  496 {
  497     return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2); /* the cast only selects the [4 x i16] lane layout; the elements are added as unsigned values */
  498 }
 499
  500 /// Subtracts each 8-bit integer element of the second 64-bit integer
  501 ///    vector of [8 x i8] from the corresponding 8-bit integer element of the
  502 ///    first 64-bit integer vector of [8 x i8]. Only the lower 8 bits of each
  503 ///    difference are kept (no saturation) and packed into a vector of [8 x i8].
  504 ///
  505 /// \headerfile <x86intrin.h>
  506 ///
  507 /// This intrinsic corresponds to the <c> PSUBB </c> instruction.
  508 ///
  509 /// \param __m1
  510 ///    A 64-bit integer vector of [8 x i8] containing the minuends.
  511 /// \param __m2
  512 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
  513 /// \returns A 64-bit integer vector of [8 x i8] containing the element-wise
  514 ///    differences (__m1 minus __m2).
  515 static __inline__ __m64 __DEFAULT_FN_ATTRS
  516 _mm_sub_pi8(__m64 __m1, __m64 __m2)
  517 {
  518     return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
  519 }
 520
  521 /// Subtracts each 16-bit integer element of the second 64-bit integer
  522 ///    vector of [4 x i16] from the corresponding 16-bit integer element of the
  523 ///    first 64-bit integer vector of [4 x i16]. Only the lower 16 bits of each
  524 ///    difference are kept (no saturation) and packed into a vector of [4 x i16].
  525 ///
  526 /// \headerfile <x86intrin.h>
  527 ///
  528 /// This intrinsic corresponds to the <c> PSUBW </c> instruction.
  529 ///
  530 /// \param __m1
  531 ///    A 64-bit integer vector of [4 x i16] containing the minuends.
  532 /// \param __m2
  533 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
  534 /// \returns A 64-bit integer vector of [4 x i16] containing the element-wise
  535 ///    differences (__m1 minus __m2).
  536 static __inline__ __m64 __DEFAULT_FN_ATTRS
  537 _mm_sub_pi16(__m64 __m1, __m64 __m2)
  538 {
  539     return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
  540 }
 541
  542 /// Subtracts each 32-bit integer element of the second 64-bit integer
  543 ///    vector of [2 x i32] from the corresponding 32-bit integer element of the
  544 ///    first 64-bit integer vector of [2 x i32]. Only the lower 32 bits of each
  545 ///    difference are kept (no saturation) and packed into a vector of [2 x i32].
  546 ///
  547 /// \headerfile <x86intrin.h>
  548 ///
  549 /// This intrinsic corresponds to the <c> PSUBD </c> instruction.
  550 ///
  551 /// \param __m1
  552 ///    A 64-bit integer vector of [2 x i32] containing the minuends.
  553 /// \param __m2
  554 ///    A 64-bit integer vector of [2 x i32] containing the subtrahends.
  555 /// \returns A 64-bit integer vector of [2 x i32] containing the element-wise
  556 ///    differences (__m1 minus __m2).
  557 static __inline__ __m64 __DEFAULT_FN_ATTRS
  558 _mm_sub_pi32(__m64 __m1, __m64 __m2)
  559 {
  560     return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
  561 }
 562
  563 /// Subtracts each 8-bit signed integer element of the second 64-bit
  564 ///    integer vector of [8 x i8] from the corresponding 8-bit signed integer
  565 ///    element of the first 64-bit integer vector of [8 x i8]. Results greater
  566 ///    than 0x7F (127) are saturated to 0x7F. Results less than 0x80 (-128)
  567 ///    are saturated to 0x80. The results are packed into a 64-bit integer
  568 ///    vector of [8 x i8].
  569 ///
  570 /// \headerfile <x86intrin.h>
  571 ///
  572 /// This intrinsic corresponds to the <c> PSUBSB </c> instruction.
  573 ///
  574 /// \param __m1
  575 ///    A 64-bit integer vector of [8 x i8] containing the minuends.
  576 /// \param __m2
  577 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
  578 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
  579 ///    differences of both parameters.
  580 static __inline__ __m64 __DEFAULT_FN_ATTRS
  581 _mm_subs_pi8(__m64 __m1, __m64 __m2)
  582 {
  583     return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
  584 }
 585
  586 /// Subtracts each 16-bit signed integer element of the second 64-bit
  587 ///    integer vector of [4 x i16] from the corresponding 16-bit signed integer
  588 ///    element of the first 64-bit integer vector of [4 x i16]. Results greater
  589 ///    than 0x7FFF (32767) are saturated to 0x7FFF. Results less than 0x8000
  590 ///    (-32768) are saturated to 0x8000. The results are packed into a 64-bit
  591 ///    integer vector of [4 x i16].
  592 ///
  593 /// \headerfile <x86intrin.h>
  594 ///
  595 /// This intrinsic corresponds to the <c> PSUBSW </c> instruction.
  596 ///
  597 /// \param __m1
  598 ///    A 64-bit integer vector of [4 x i16] containing the minuends.
  599 /// \param __m2
  600 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
  601 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
  602 ///    differences of both parameters.
  603 static __inline__ __m64 __DEFAULT_FN_ATTRS
  604 _mm_subs_pi16(__m64 __m1, __m64 __m2)
  605 {
  606     return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
  607 }
 608
  609 /// Subtracts each 8-bit unsigned integer element of the second 64-bit
  610 ///    integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
  611 ///    element of the first 64-bit integer vector of [8 x i8].
  612 ///
  613 ///    If an element of the first vector is less than the corresponding element
  614 ///    of the second vector, the result is saturated to 0. The results are
  615 ///    packed into a 64-bit integer vector of [8 x i8].
  616 ///
  617 /// \headerfile <x86intrin.h>
  618 ///
  619 /// This intrinsic corresponds to the <c> PSUBUSB </c> instruction.
  620 ///
  621 /// \param __m1
  622 ///    A 64-bit integer vector of [8 x i8] containing the minuends.
  623 /// \param __m2
  624 ///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
  625 /// \returns A 64-bit integer vector of [8 x i8] containing the saturated
  626 ///    differences (__m1 minus __m2, never below 0).
  627 static __inline__ __m64 __DEFAULT_FN_ATTRS
  628 _mm_subs_pu8(__m64 __m1, __m64 __m2)
  629 {
  630     return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2); /* the cast only selects the [8 x i8] lane layout; the elements are subtracted as unsigned values */
  631 }
 632
  633 /// Subtracts each 16-bit unsigned integer element of the second 64-bit
  634 ///    integer vector of [4 x i16] from the corresponding 16-bit unsigned
  635 ///    integer element of the first 64-bit integer vector of [4 x i16].
  636 ///
  637 ///    If an element of the first vector is less than the corresponding element
  638 ///    of the second vector, the result is saturated to 0. The results are
  639 ///    packed into a 64-bit integer vector of [4 x i16].
  640 ///
  641 /// \headerfile <x86intrin.h>
  642 ///
  643 /// This intrinsic corresponds to the <c> PSUBUSW </c> instruction.
  644 ///
  645 /// \param __m1
  646 ///    A 64-bit integer vector of [4 x i16] containing the minuends.
  647 /// \param __m2
  648 ///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
  649 /// \returns A 64-bit integer vector of [4 x i16] containing the saturated
  650 ///    differences (__m1 minus __m2, never below 0).
  651 static __inline__ __m64 __DEFAULT_FN_ATTRS
  652 _mm_subs_pu16(__m64 __m1, __m64 __m2)
  653 {
  654     return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2); /* the cast only selects the [4 x i16] lane layout; the elements are subtracted as unsigned values */
  655 }
 656
  657 /// Multiplies each 16-bit signed integer element of the first 64-bit
  658 ///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
  659 ///    element of the second 64-bit integer vector of [4 x i16] to get four
  660 ///    32-bit products. Adds adjacent pairs of products to get two 32-bit sums.
  661 ///    The lower 32 bits of these two sums are packed into a 64-bit integer
  662 ///    vector of [2 x i32].
  663 ///
  664 ///    For example, bits [15:0] of both parameters are multiplied, bits [31:16]
  665 ///    of both parameters are multiplied, and the sum of both products is
  666 ///    written to bits [31:0] of the result.
  667 ///
  668 /// \headerfile <x86intrin.h>
  669 ///
  670 /// This intrinsic corresponds to the <c> PMADDWD </c> instruction.
  671 ///
  672 /// \param __m1
  673 ///    A 64-bit integer vector of [4 x i16].
  674 /// \param __m2
  675 ///    A 64-bit integer vector of [4 x i16].
  676 /// \returns A 64-bit integer vector of [2 x i32] containing the sums of
  677 ///    products of both parameters.
  678 static __inline__ __m64 __DEFAULT_FN_ATTRS
  679 _mm_madd_pi16(__m64 __m1, __m64 __m2)
  680 {
  681     return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
  682 }
 683
  684 /// Multiplies each 16-bit signed integer element of the first 64-bit
  685 ///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
  686 ///    element of the second 64-bit integer vector of [4 x i16]. Packs the upper
  687 ///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
  688 ///
  689 /// \headerfile <x86intrin.h>
  690 ///
  691 /// This intrinsic corresponds to the <c> PMULHW </c> instruction.
  692 ///
  693 /// \param __m1
  694 ///    A 64-bit integer vector of [4 x i16].
  695 /// \param __m2
  696 ///    A 64-bit integer vector of [4 x i16].
  697 /// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
  698 ///    of the products of both parameters.
  699 static __inline__ __m64 __DEFAULT_FN_ATTRS
  700 _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
  701 {
  702     return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2); /* keeps bits [31:16] of each 32-bit product */
  703 }
 704
  705 /// Multiplies each 16-bit signed integer element of the first 64-bit
  706 ///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
  707 ///    element of the second 64-bit integer vector of [4 x i16]. Packs the lower
  708 ///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
  709 ///
  710 /// \headerfile <x86intrin.h>
  711 ///
  712 /// This intrinsic corresponds to the <c> PMULLW </c> instruction.
  713 ///
  714 /// \param __m1
  715 ///    A 64-bit integer vector of [4 x i16].
  716 /// \param __m2
  717 ///    A 64-bit integer vector of [4 x i16].
  718 /// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
  719 ///    of the products of both parameters.
  720 static __inline__ __m64 __DEFAULT_FN_ATTRS
  721 _mm_mullo_pi16(__m64 __m1, __m64 __m2)
  722 {
  723     return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2); /* keeps bits [15:0] of each 32-bit product */
  724 }
 725
 726 /// Left-shifts each 16-bit signed integer element of the first
 727 ///    parameter, which is a 64-bit integer vector of [4 x i16], by the number
 728 ///    of bits specified by the second parameter, which is a 64-bit integer. The
 729 ///    lower 16 bits of the results are packed into a 64-bit integer vector of
 730 ///    [4 x i16].
 731 ///
 732 /// \headerfile <x86intrin.h>
 733 ///
 734 /// This intrinsic corresponds to the <c> PSLLW </c> instruction.
 735 ///
 736 /// \param __m
 737 ///    A 64-bit integer vector of [4 x i16].
 738 /// \param __count
 739 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 740 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
 741 ///    values. If \a __count is greater or equal to 16, the result is set to all
 742 ///    0.
 743 static __inline__ __m64 __DEFAULT_FN_ATTRS
 744 _mm_sll_pi16(__m64 __m, __m64 __count)
 745 {
 746     return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
 747 }
 748
 749 /// Left-shifts each 16-bit signed integer element of a 64-bit integer
 750 ///    vector of [4 x i16] by the number of bits specified by a 32-bit integer.
 751 ///    The lower 16 bits of the results are packed into a 64-bit integer vector
 752 ///    of [4 x i16].
 753 ///
 754 /// \headerfile <x86intrin.h>
 755 ///
 756 /// This intrinsic corresponds to the <c> PSLLW </c> instruction.
 757 ///
 758 /// \param __m
 759 ///    A 64-bit integer vector of [4 x i16].
 760 /// \param __count
 761 ///    A 32-bit integer value.
 762 /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
 763 ///    values. If \a __count is greater or equal to 16, the result is set to all
 764 ///    0.
 765 static __inline__ __m64 __DEFAULT_FN_ATTRS
 766 _mm_slli_pi16(__m64 __m, int __count)
 767 {
 768     return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
 769 }
 770
 771 /// Left-shifts each 32-bit signed integer element of the first
 772 ///    parameter, which is a 64-bit integer vector of [2 x i32], by the number
 773 ///    of bits specified by the second parameter, which is a 64-bit integer. The
 774 ///    lower 32 bits of the results are packed into a 64-bit integer vector of
 775 ///    [2 x i32].
 776 ///
 777 /// \headerfile <x86intrin.h>
 778 ///
 779 /// This intrinsic corresponds to the <c> PSLLD </c> instruction.
 780 ///
 781 /// \param __m
 782 ///    A 64-bit integer vector of [2 x i32].
 783 /// \param __count
 784 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 785 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
 786 ///    values. If \a __count is greater or equal to 32, the result is set to all
 787 ///    0.
 788 static __inline__ __m64 __DEFAULT_FN_ATTRS
 789 _mm_sll_pi32(__m64 __m, __m64 __count)
 790 {
 791     return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
 792 }
 793
 794 /// Left-shifts each 32-bit signed integer element of a 64-bit integer
 795 ///    vector of [2 x i32] by the number of bits specified by a 32-bit integer.
 796 ///    The lower 32 bits of the results are packed into a 64-bit integer vector
 797 ///    of [2 x i32].
 798 ///
 799 /// \headerfile <x86intrin.h>
 800 ///
 801 /// This intrinsic corresponds to the <c> PSLLD </c> instruction.
 802 ///
 803 /// \param __m
 804 ///    A 64-bit integer vector of [2 x i32].
 805 /// \param __count
 806 ///    A 32-bit integer value.
 807 /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
 808 ///    values. If \a __count is greater or equal to 32, the result is set to all
 809 ///    0.
 810 static __inline__ __m64 __DEFAULT_FN_ATTRS
 811 _mm_slli_pi32(__m64 __m, int __count)
 812 {
 813     return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
 814 }
 815
 816 /// Left-shifts the first 64-bit integer parameter by the number of bits
 817 ///    specified by the second 64-bit integer parameter. The lower 64 bits of
 818 ///    result are returned.
 819 ///
 820 /// \headerfile <x86intrin.h>
 821 ///
 822 /// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
 823 ///
 824 /// \param __m
 825 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 826 /// \param __count
 827 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 828 /// \returns A 64-bit integer vector containing the left-shifted value. If
 829 ///     \a __count is greater or equal to 64, the result is set to 0.
 830 static __inline__ __m64 __DEFAULT_FN_ATTRS
 831 _mm_sll_si64(__m64 __m, __m64 __count)
 832 {
 833     return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
 834 }
 835
 836 /// Left-shifts the first parameter, which is a 64-bit integer, by the
 837 ///    number of bits specified by the second parameter, which is a 32-bit
 838 ///    integer. The lower 64 bits of result are returned.
 839 ///
 840 /// \headerfile <x86intrin.h>
 841 ///
 842 /// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
 843 ///
 844 /// \param __m
 845 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 846 /// \param __count
 847 ///    A 32-bit integer value.
 848 /// \returns A 64-bit integer vector containing the left-shifted value. If
 849 ///     \a __count is greater or equal to 64, the result is set to 0.
 850 static __inline__ __m64 __DEFAULT_FN_ATTRS
 851 _mm_slli_si64(__m64 __m, int __count)
 852 {
 853     return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
 854 }
 855
 856 /// Right-shifts each 16-bit integer element of the first parameter,
 857 ///    which is a 64-bit integer vector of [4 x i16], by the number of bits
 858 ///    specified by the second parameter, which is a 64-bit integer.
 859 ///
 860 ///    High-order bits are filled with the sign bit of the initial value of each
 861 ///    16-bit element. The 16-bit results are packed into a 64-bit integer
 862 ///    vector of [4 x i16].
 863 ///
 864 /// \headerfile <x86intrin.h>
 865 ///
 866 /// This intrinsic corresponds to the <c> PSRAW </c> instruction.
 867 ///
 868 /// \param __m
 869 ///    A 64-bit integer vector of [4 x i16].
 870 /// \param __count
 871 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 872 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 873 ///    values.
 874 static __inline__ __m64 __DEFAULT_FN_ATTRS
 875 _mm_sra_pi16(__m64 __m, __m64 __count)
 876 {
 877     return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
 878 }
 879
 880 /// Right-shifts each 16-bit integer element of a 64-bit integer vector
 881 ///    of [4 x i16] by the number of bits specified by a 32-bit integer.
 882 ///
 883 ///    High-order bits are filled with the sign bit of the initial value of each
 884 ///    16-bit element. The 16-bit results are packed into a 64-bit integer
 885 ///    vector of [4 x i16].
 886 ///
 887 /// \headerfile <x86intrin.h>
 888 ///
 889 /// This intrinsic corresponds to the <c> PSRAW </c> instruction.
 890 ///
 891 /// \param __m
 892 ///    A 64-bit integer vector of [4 x i16].
 893 /// \param __count
 894 ///    A 32-bit integer value.
 895 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 896 ///    values.
 897 static __inline__ __m64 __DEFAULT_FN_ATTRS
 898 _mm_srai_pi16(__m64 __m, int __count)
 899 {
 900     return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
 901 }
 902
 903 /// Right-shifts each 32-bit integer element of the first parameter,
 904 ///    which is a 64-bit integer vector of [2 x i32], by the number of bits
 905 ///    specified by the second parameter, which is a 64-bit integer.
 906 ///
 907 ///    High-order bits are filled with the sign bit of the initial value of each
 908 ///    32-bit element. The 32-bit results are packed into a 64-bit integer
 909 ///    vector of [2 x i32].
 910 ///
 911 /// \headerfile <x86intrin.h>
 912 ///
 913 /// This intrinsic corresponds to the <c> PSRAD </c> instruction.
 914 ///
 915 /// \param __m
 916 ///    A 64-bit integer vector of [2 x i32].
 917 /// \param __count
 918 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 919 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 920 ///    values.
 921 static __inline__ __m64 __DEFAULT_FN_ATTRS
 922 _mm_sra_pi32(__m64 __m, __m64 __count)
 923 {
 924     return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
 925 }
 926
 927 /// Right-shifts each 32-bit integer element of a 64-bit integer vector
 928 ///    of [2 x i32] by the number of bits specified by a 32-bit integer.
 929 ///
 930 ///    High-order bits are filled with the sign bit of the initial value of each
 931 ///    32-bit element. The 32-bit results are packed into a 64-bit integer
 932 ///    vector of [2 x i32].
 933 ///
 934 /// \headerfile <x86intrin.h>
 935 ///
 936 /// This intrinsic corresponds to the <c> PSRAD </c> instruction.
 937 ///
 938 /// \param __m
 939 ///    A 64-bit integer vector of [2 x i32].
 940 /// \param __count
 941 ///    A 32-bit integer value.
 942 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
 943 ///    values.
 944 static __inline__ __m64 __DEFAULT_FN_ATTRS
 945 _mm_srai_pi32(__m64 __m, int __count)
 946 {
 947     return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
 948 }
 949
 950 /// Right-shifts each 16-bit integer element of the first parameter,
 951 ///    which is a 64-bit integer vector of [4 x i16], by the number of bits
 952 ///    specified by the second parameter, which is a 64-bit integer.
 953 ///
 954 ///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
 955 ///    integer vector of [4 x i16].
 956 ///
 957 /// \headerfile <x86intrin.h>
 958 ///
 959 /// This intrinsic corresponds to the <c> PSRLW </c> instruction.
 960 ///
 961 /// \param __m
 962 ///    A 64-bit integer vector of [4 x i16].
 963 /// \param __count
 964 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
 965 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 966 ///    values.
 967 static __inline__ __m64 __DEFAULT_FN_ATTRS
 968 _mm_srl_pi16(__m64 __m, __m64 __count)
 969 {
 970     return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
 971 }
 972
 973 /// Right-shifts each 16-bit integer element of a 64-bit integer vector
 974 ///    of [4 x i16] by the number of bits specified by a 32-bit integer.
 975 ///
 976 ///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
 977 ///    integer vector of [4 x i16].
 978 ///
 979 /// \headerfile <x86intrin.h>
 980 ///
 981 /// This intrinsic corresponds to the <c> PSRLW </c> instruction.
 982 ///
 983 /// \param __m
 984 ///    A 64-bit integer vector of [4 x i16].
 985 /// \param __count
 986 ///    A 32-bit integer value.
 987 /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
 988 ///    values.
 989 static __inline__ __m64 __DEFAULT_FN_ATTRS
 990 _mm_srli_pi16(__m64 __m, int __count)
 991 {
 992     return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
 993 }
 994
 995 /// Right-shifts each 32-bit integer element of the first parameter,
 996 ///    which is a 64-bit integer vector of [2 x i32], by the number of bits
 997 ///    specified by the second parameter, which is a 64-bit integer.
 998 ///
 999 ///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
1000 ///    integer vector of [2 x i32].
1001 ///
1002 /// \headerfile <x86intrin.h>
1003 ///
1004 /// This intrinsic corresponds to the <c> PSRLD </c> instruction.
1005 ///
1006 /// \param __m
1007 ///    A 64-bit integer vector of [2 x i32].
1008 /// \param __count
1009 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
1010 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
1011 ///    values.
1012 static __inline__ __m64 __DEFAULT_FN_ATTRS
1013 _mm_srl_pi32(__m64 __m, __m64 __count)
1014 {
1015     return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
1016 }
1017
1018 /// Right-shifts each 32-bit integer element of a 64-bit integer vector
1019 ///    of [2 x i32] by the number of bits specified by a 32-bit integer.
1020 ///
1021 ///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
1022 ///    integer vector of [2 x i32].
1023 ///
1024 /// \headerfile <x86intrin.h>
1025 ///
1026 /// This intrinsic corresponds to the <c> PSRLD </c> instruction.
1027 ///
1028 /// \param __m
1029 ///    A 64-bit integer vector of [2 x i32].
1030 /// \param __count
1031 ///    A 32-bit integer value.
1032 /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
1033 ///    values.
1034 static __inline__ __m64 __DEFAULT_FN_ATTRS
1035 _mm_srli_pi32(__m64 __m, int __count)
1036 {
1037     return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
1038 }
1039
1040 /// Right-shifts the first 64-bit integer parameter by the number of bits
1041 ///    specified by the second 64-bit integer parameter.
1042 ///
1043 ///    High-order bits are cleared.
1044 ///
1045 /// \headerfile <x86intrin.h>
1046 ///
1047 /// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
1048 ///
1049 /// \param __m
1050 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
1051 /// \param __count
1052 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
1053 /// \returns A 64-bit integer vector containing the right-shifted value.
1054 static __inline__ __m64 __DEFAULT_FN_ATTRS
1055 _mm_srl_si64(__m64 __m, __m64 __count)
1056 {
1057     return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
1058 }
1059
1060 /// Right-shifts the first parameter, which is a 64-bit integer, by the
1061 ///    number of bits specified by the second parameter, which is a 32-bit
1062 ///    integer.
1063 ///
1064 ///    High-order bits are cleared.
1065 ///
1066 /// \headerfile <x86intrin.h>
1067 ///
1068 /// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
1069 ///
1070 /// \param __m
1071 ///    A 64-bit integer vector interpreted as a single 64-bit integer.
1072 /// \param __count
1073 ///    A 32-bit integer value.
1074 /// \returns A 64-bit integer vector containing the right-shifted value.
1075 static __inline__ __m64 __DEFAULT_FN_ATTRS
1076 _mm_srli_si64(__m64 __m, int __count)
1077 {
1078     return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
1079 }
1080
1081 /// Performs a bitwise AND of two 64-bit integer vectors.
1082 ///
1083 /// \headerfile <x86intrin.h>
1084 ///
1085 /// This intrinsic corresponds to the <c> PAND </c> instruction.
1086 ///
1087 /// \param __m1
1088 ///    A 64-bit integer vector.
1089 /// \param __m2
1090 ///    A 64-bit integer vector.
1091 /// \returns A 64-bit integer vector containing the bitwise AND of both
1092 ///    parameters.
1093 static __inline__ __m64 __DEFAULT_FN_ATTRS
1094 _mm_and_si64(__m64 __m1, __m64 __m2)
1095 {
1096     return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
1097 }
1098
1099 /// Performs a bitwise NOT of the first 64-bit integer vector, and then
1100 ///    performs a bitwise AND of the intermediate result and the second 64-bit
1101 ///    integer vector.
1102 ///
1103 /// \headerfile <x86intrin.h>
1104 ///
1105 /// This intrinsic corresponds to the <c> PANDN </c> instruction.
1106 ///
1107 /// \param __m1
1108 ///    A 64-bit integer vector. The one's complement of this parameter is used
1109 ///    in the bitwise AND.
1110 /// \param __m2
1111 ///    A 64-bit integer vector.
1112 /// \returns A 64-bit integer vector containing the bitwise AND of the second
1113 ///    parameter and the one's complement of the first parameter.
1114 static __inline__ __m64 __DEFAULT_FN_ATTRS
1115 _mm_andnot_si64(__m64 __m1, __m64 __m2)
1116 {
1117     return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
1118 }
1119
1120 /// Performs a bitwise OR of two 64-bit integer vectors.
1121 ///
1122 /// \headerfile <x86intrin.h>
1123 ///
1124 /// This intrinsic corresponds to the <c> POR </c> instruction.
1125 ///
1126 /// \param __m1
1127 ///    A 64-bit integer vector.
1128 /// \param __m2
1129 ///    A 64-bit integer vector.
1130 /// \returns A 64-bit integer vector containing the bitwise OR of both
1131 ///    parameters.
1132 static __inline__ __m64 __DEFAULT_FN_ATTRS
1133 _mm_or_si64(__m64 __m1, __m64 __m2)
1134 {
1135     return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
1136 }
1137
1138 /// Performs a bitwise exclusive OR of two 64-bit integer vectors.
1139 ///
1140 /// \headerfile <x86intrin.h>
1141 ///
1142 /// This intrinsic corresponds to the <c> PXOR </c> instruction.
1143 ///
1144 /// \param __m1
1145 ///    A 64-bit integer vector.
1146 /// \param __m2
1147 ///    A 64-bit integer vector.
1148 /// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
1149 ///    parameters.
1150 static __inline__ __m64 __DEFAULT_FN_ATTRS
1151 _mm_xor_si64(__m64 __m1, __m64 __m2)
1152 {
1153     return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
1154 }
1155
1156 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
1157 ///    [8 x i8] to determine if the element of the first vector is equal to the
1158 ///    corresponding element of the second vector.
1159 ///
1160 ///    The comparison yields 0 for false, 0xFF for true.
1161 ///
1162 /// \headerfile <x86intrin.h>
1163 ///
1164 /// This intrinsic corresponds to the <c> PCMPEQB </c> instruction.
1165 ///
1166 /// \param __m1
1167 ///    A 64-bit integer vector of [8 x i8].
1168 /// \param __m2
1169 ///    A 64-bit integer vector of [8 x i8].
1170 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
1171 ///    results.
1172 static __inline__ __m64 __DEFAULT_FN_ATTRS
1173 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
1174 {
1175     return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
1176 }
1177
1178 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
1179 ///    [4 x i16] to determine if the element of the first vector is equal to the
1180 ///    corresponding element of the second vector.
1181 ///
1182 ///    The comparison yields 0 for false, 0xFFFF for true.
1183 ///
1184 /// \headerfile <x86intrin.h>
1185 ///
1186 /// This intrinsic corresponds to the <c> PCMPEQW </c> instruction.
1187 ///
1188 /// \param __m1
1189 ///    A 64-bit integer vector of [4 x i16].
1190 /// \param __m2
1191 ///    A 64-bit integer vector of [4 x i16].
1192 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
1193 ///    results.
1194 static __inline__ __m64 __DEFAULT_FN_ATTRS
1195 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
1196 {
1197     return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
1198 }
1199
1200 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
1201 ///    [2 x i32] to determine if the element of the first vector is equal to the
1202 ///    corresponding element of the second vector.
1203 ///
1204 ///    The comparison yields 0 for false, 0xFFFFFFFF for true.
1205 ///
1206 /// \headerfile <x86intrin.h>
1207 ///
1208 /// This intrinsic corresponds to the <c> PCMPEQD </c> instruction.
1209 ///
1210 /// \param __m1
1211 ///    A 64-bit integer vector of [2 x i32].
1212 /// \param __m2
1213 ///    A 64-bit integer vector of [2 x i32].
1214 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
1215 ///    results.
1216 static __inline__ __m64 __DEFAULT_FN_ATTRS
1217 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
1218 {
1219     return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
1220 }
1221
1222 /// Compares the 8-bit integer elements of two 64-bit integer vectors of
1223 ///    [8 x i8] to determine if the element of the first vector is greater than
1224 ///    the corresponding element of the second vector.
1225 ///
1226 ///    The comparison yields 0 for false, 0xFF for true.
1227 ///
1228 /// \headerfile <x86intrin.h>
1229 ///
1230 /// This intrinsic corresponds to the <c> PCMPGTB </c> instruction.
1231 ///
1232 /// \param __m1
1233 ///    A 64-bit integer vector of [8 x i8].
1234 /// \param __m2
1235 ///    A 64-bit integer vector of [8 x i8].
1236 /// \returns A 64-bit integer vector of [8 x i8] containing the comparison
1237 ///    results.
1238 static __inline__ __m64 __DEFAULT_FN_ATTRS
1239 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
1240 {
1241     return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
1242 }
1243
1244 /// Compares the 16-bit integer elements of two 64-bit integer vectors of
1245 ///    [4 x i16] to determine if the element of the first vector is greater than
1246 ///    the corresponding element of the second vector.
1247 ///
1248 ///    The comparison yields 0 for false, 0xFFFF for true.
1249 ///
1250 /// \headerfile <x86intrin.h>
1251 ///
1252 /// This intrinsic corresponds to the <c> PCMPGTW </c> instruction.
1253 ///
1254 /// \param __m1
1255 ///    A 64-bit integer vector of [4 x i16].
1256 /// \param __m2
1257 ///    A 64-bit integer vector of [4 x i16].
1258 /// \returns A 64-bit integer vector of [4 x i16] containing the comparison
1259 ///    results.
1260 static __inline__ __m64 __DEFAULT_FN_ATTRS
1261 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
1262 {
1263     return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
1264 }
1265
1266 /// Compares the 32-bit integer elements of two 64-bit integer vectors of
1267 ///    [2 x i32] to determine if the element of the first vector is greater than
1268 ///    the corresponding element of the second vector.
1269 ///
1270 ///    The comparison yields 0 for false, 0xFFFFFFFF for true.
1271 ///
1272 /// \headerfile <x86intrin.h>
1273 ///
1274 /// This intrinsic corresponds to the <c> PCMPGTD </c> instruction.
1275 ///
1276 /// \param __m1
1277 ///    A 64-bit integer vector of [2 x i32].
1278 /// \param __m2
1279 ///    A 64-bit integer vector of [2 x i32].
1280 /// \returns A 64-bit integer vector of [2 x i32] containing the comparison
1281 ///    results.
1282 static __inline__ __m64 __DEFAULT_FN_ATTRS
1283 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
1284 {
1285     return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
1286 }
1287
1288 /// Constructs a 64-bit integer vector initialized to zero.
1289 ///
1290 /// \headerfile <x86intrin.h>
1291 ///
1292 /// This intrinsic corresponds to the <c> PXOR </c> instruction.
1293 ///
1294 /// \returns An initialized 64-bit integer vector with all elements set to zero.
1295 static __inline__ __m64 __DEFAULT_FN_ATTRS
1296 _mm_setzero_si64(void)
1297 {
1298     return __extension__ (__m64){ 0LL };
1299 }
1300
1301 /// Constructs a 64-bit integer vector initialized with the specified
1302 ///    32-bit integer values.
1303 ///
1304 /// \headerfile <x86intrin.h>
1305 ///
1306 /// This intrinsic is a utility function and does not correspond to a specific
1307 ///    instruction.
1308 ///
1309 /// \param __i1
1310 ///    A 32-bit integer value used to initialize the upper 32 bits of the
1311 ///    result.
1312 /// \param __i0
1313 ///    A 32-bit integer value used to initialize the lower 32 bits of the
1314 ///    result.
1315 /// \returns An initialized 64-bit integer vector.
1316 static __inline__ __m64 __DEFAULT_FN_ATTRS
1317 _mm_set_pi32(int __i1, int __i0)
1318 {
1319     return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
1320 }
1321
1322 /// Constructs a 64-bit integer vector initialized with the specified
1323 ///    16-bit integer values.
1324 ///
1325 /// \headerfile <x86intrin.h>
1326 ///
1327 /// This intrinsic is a utility function and does not correspond to a specific
1328 ///    instruction.
1329 ///
1330 /// \param __s3
1331 ///    A 16-bit integer value used to initialize bits [63:48] of the result.
1332 /// \param __s2
1333 ///    A 16-bit integer value used to initialize bits [47:32] of the result.
1334 /// \param __s1
1335 ///    A 16-bit integer value used to initialize bits [31:16] of the result.
1336 /// \param __s0
1337 ///    A 16-bit integer value used to initialize bits [15:0] of the result.
1338 /// \returns An initialized 64-bit integer vector.
1339 static __inline__ __m64 __DEFAULT_FN_ATTRS
1340 _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
1341 {
1342     return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
1343 }
1344
1345 /// Constructs a 64-bit integer vector initialized with the specified
1346 ///    8-bit integer values.
1347 ///
1348 /// \headerfile <x86intrin.h>
1349 ///
1350 /// This intrinsic is a utility function and does not correspond to a specific
1351 ///    instruction.
1352 ///
1353 /// \param __b7
1354 ///    An 8-bit integer value used to initialize bits [63:56] of the result.
1355 /// \param __b6
1356 ///    An 8-bit integer value used to initialize bits [55:48] of the result.
1357 /// \param __b5
1358 ///    An 8-bit integer value used to initialize bits [47:40] of the result.
1359 /// \param __b4
1360 ///    An 8-bit integer value used to initialize bits [39:32] of the result.
1361 /// \param __b3
1362 ///    An 8-bit integer value used to initialize bits [31:24] of the result.
1363 /// \param __b2
1364 ///    An 8-bit integer value used to initialize bits [23:16] of the result.
1365 /// \param __b1
1366 ///    An 8-bit integer value used to initialize bits [15:8] of the result.
1367 /// \param __b0
1368 ///    An 8-bit integer value used to initialize bits [7:0] of the result.
1369 /// \returns An initialized 64-bit integer vector.
1370 static __inline__ __m64 __DEFAULT_FN_ATTRS
1371 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
1372             char __b1, char __b0)
1373 {
1374     return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
1375                                                __b4, __b5, __b6, __b7);
1376 }
1377
1378 /// Constructs a 64-bit integer vector of [2 x i32], with each of the
1379 ///    32-bit integer vector elements set to the specified 32-bit integer
1380 ///    value.
1381 ///
1382 /// \headerfile <x86intrin.h>
1383 ///
1384 /// This intrinsic is a utility function and does not correspond to a specific
1385 ///    instruction.
1386 ///
1387 /// \param __i
1388 ///    A 32-bit integer value used to initialize each vector element of the
1389 ///    result.
1390 /// \returns An initialized 64-bit integer vector of [2 x i32].
1391 static __inline__ __m64 __DEFAULT_FN_ATTRS
1392 _mm_set1_pi32(int __i)
1393 {
1394     return _mm_set_pi32(__i, __i);
1395 }
1396
1397 /// Constructs a 64-bit integer vector of [4 x i16], with each of the
1398 ///    16-bit integer vector elements set to the specified 16-bit integer
1399 ///    value.
1400 ///
1401 /// \headerfile <x86intrin.h>
1402 ///
1403 /// This intrinsic is a utility function and does not correspond to a specific
1404 ///    instruction.
1405 ///
1406 /// \param __w
1407 ///    A 16-bit integer value used to initialize each vector element of the
1408 ///    result.
1409 /// \returns An initialized 64-bit integer vector of [4 x i16].
1410 static __inline__ __m64 __DEFAULT_FN_ATTRS
1411 _mm_set1_pi16(short __w)
1412 {
1413     return _mm_set_pi16(__w, __w, __w, __w);
1414 }
1415
1416 /// Constructs a 64-bit integer vector of [8 x i8], with each of the
1417 ///    8-bit integer vector elements set to the specified 8-bit integer value.
1418 ///
1419 /// \headerfile <x86intrin.h>
1420 ///
1421 /// This intrinsic is a utility function and does not correspond to a specific
1422 ///    instruction.
1423 ///
1424 /// \param __b
1425 ///    An 8-bit integer value used to initialize each vector element of the
1426 ///    result.
1427 /// \returns An initialized 64-bit integer vector of [8 x i8].
1428 static __inline__ __m64 __DEFAULT_FN_ATTRS
1429 _mm_set1_pi8(char __b)
1430 {
1431     return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
1432 }
1433
1434 /// Constructs a 64-bit integer vector, initialized in reverse order with
1435 ///    the specified 32-bit integer values.
1436 ///
1437 /// \headerfile <x86intrin.h>
1438 ///
1439 /// This intrinsic is a utility function and does not correspond to a specific
1440 ///    instruction.
1441 ///
1442 /// \param __i0
1443 ///    A 32-bit integer value used to initialize the lower 32 bits of the
1444 ///    result.
1445 /// \param __i1
1446 ///    A 32-bit integer value used to initialize the upper 32 bits of the
1447 ///    result.
1448 /// \returns An initialized 64-bit integer vector.
1449 static __inline__ __m64 __DEFAULT_FN_ATTRS
1450 _mm_setr_pi32(int __i0, int __i1)
1451 {
1452     return _mm_set_pi32(__i1, __i0);
1453 }
1454
1455 /// Constructs a 64-bit integer vector, initialized in reverse order with
1456 ///    the specified 16-bit integer values.
1457 ///
1458 /// \headerfile <x86intrin.h>
1459 ///
1460 /// This intrinsic is a utility function and does not correspond to a specific
1461 ///    instruction.
1462 ///
1463 /// \param __w0
1464 ///    A 16-bit integer value used to initialize bits [15:0] of the result.
1465 /// \param __w1
1466 ///    A 16-bit integer value used to initialize bits [31:16] of the result.
1467 /// \param __w2
1468 ///    A 16-bit integer value used to initialize bits [47:32] of the result.
1469 /// \param __w3
1470 ///    A 16-bit integer value used to initialize bits [63:48] of the result.
1471 /// \returns An initialized 64-bit integer vector.
1472 static __inline__ __m64 __DEFAULT_FN_ATTRS
1473 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
1474 {
1475     return _mm_set_pi16(__w3, __w2, __w1, __w0);
1476 }
1477
1478 /// Constructs a 64-bit integer vector, initialized in reverse order with
1479 ///    the specified 8-bit integer values.
1480 ///
1481 /// \headerfile <x86intrin.h>
1482 ///
1483 /// This intrinsic is a utility function and does not correspond to a specific
1484 ///    instruction.
1485 ///
1486 /// \param __b0
1487 ///    An 8-bit integer value used to initialize bits [7:0] of the result.
1488 /// \param __b1
1489 ///    An 8-bit integer value used to initialize bits [15:8] of the result.
1490 /// \param __b2
1491 ///    An 8-bit integer value used to initialize bits [23:16] of the result.
1492 /// \param __b3
1493 ///    An 8-bit integer value used to initialize bits [31:24] of the result.
1494 /// \param __b4
1495 ///    An 8-bit integer value used to initialize bits [39:32] of the result.
1496 /// \param __b5
1497 ///    An 8-bit integer value used to initialize bits [47:40] of the result.
1498 /// \param __b6
1499 ///    An 8-bit integer value used to initialize bits [55:48] of the result.
1500 /// \param __b7
1501 ///    An 8-bit integer value used to initialize bits [63:56] of the result.
1502 /// \returns An initialized 64-bit integer vector.
1503 static __inline__ __m64 __DEFAULT_FN_ATTRS
1504 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
1505              char __b6, char __b7)
1506 {
1507     return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1508 }
1509
/* The attribute macro is private to this header; drop it once the last
 * intrinsic has been defined. */
#undef __DEFAULT_FN_ATTRS

/* Aliases for compatibility: each _m_* name expands to the _mm_* intrinsic
 * listed next to it. */

/* State management and conversions. */
#define _m_empty      _mm_empty
#define _m_from_int   _mm_cvtsi32_si64
#define _m_from_int64 _mm_cvtsi64_m64
#define _m_to_int     _mm_cvtsi64_si32
#define _m_to_int64   _mm_cvtm64_si64

/* Pack and unpack. */
#define _m_packsswb   _mm_packs_pi16
#define _m_packssdw   _mm_packs_pi32
#define _m_packuswb   _mm_packs_pu16
#define _m_punpckhbw  _mm_unpackhi_pi8
#define _m_punpckhwd  _mm_unpackhi_pi16
#define _m_punpckhdq  _mm_unpackhi_pi32
#define _m_punpcklbw  _mm_unpacklo_pi8
#define _m_punpcklwd  _mm_unpacklo_pi16
#define _m_punpckldq  _mm_unpacklo_pi32

/* Addition. */
#define _m_paddb      _mm_add_pi8
#define _m_paddw      _mm_add_pi16
#define _m_paddd      _mm_add_pi32
#define _m_paddsb     _mm_adds_pi8
#define _m_paddsw     _mm_adds_pi16
#define _m_paddusb    _mm_adds_pu8
#define _m_paddusw    _mm_adds_pu16

/* Subtraction. */
#define _m_psubb      _mm_sub_pi8
#define _m_psubw      _mm_sub_pi16
#define _m_psubd      _mm_sub_pi32
#define _m_psubsb     _mm_subs_pi8
#define _m_psubsw     _mm_subs_pi16
#define _m_psubusb    _mm_subs_pu8
#define _m_psubusw    _mm_subs_pu16

/* Multiplication. */
#define _m_pmaddwd    _mm_madd_pi16
#define _m_pmulhw     _mm_mulhi_pi16
#define _m_pmullw     _mm_mullo_pi16

/* Left shifts. */
#define _m_psllw      _mm_sll_pi16
#define _m_psllwi     _mm_slli_pi16
#define _m_pslld      _mm_sll_pi32
#define _m_pslldi     _mm_slli_pi32
#define _m_psllq      _mm_sll_si64
#define _m_psllqi     _mm_slli_si64

/* Arithmetic right shifts. */
#define _m_psraw      _mm_sra_pi16
#define _m_psrawi     _mm_srai_pi16
#define _m_psrad      _mm_sra_pi32
#define _m_psradi     _mm_srai_pi32

/* Logical right shifts. */
#define _m_psrlw      _mm_srl_pi16
#define _m_psrlwi     _mm_srli_pi16
#define _m_psrld      _mm_srl_pi32
#define _m_psrldi     _mm_srli_pi32
#define _m_psrlq      _mm_srl_si64
#define _m_psrlqi     _mm_srli_si64

/* Bitwise logic. */
#define _m_pand       _mm_and_si64
#define _m_pandn      _mm_andnot_si64
#define _m_por        _mm_or_si64
#define _m_pxor       _mm_xor_si64

/* Comparisons. */
#define _m_pcmpeqb    _mm_cmpeq_pi8
#define _m_pcmpeqw    _mm_cmpeq_pi16
#define _m_pcmpeqd    _mm_cmpeq_pi32
#define _m_pcmpgtb    _mm_cmpgt_pi8
#define _m_pcmpgtw    _mm_cmpgt_pi16
#define _m_pcmpgtd    _mm_cmpgt_pi32
1570
1571 #endif /* __MMINTRIN_H */
1572