contrib/llvm/tools/clang/lib/Headers/pmmintrin.h

   1 /*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
   2  *
   3  * Permission is hereby granted, free of charge, to any person obtaining a copy
   4  * of this software and associated documentation files (the "Software"), to deal
   5  * in the Software without restriction, including without limitation the rights
   6  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   7  * copies of the Software, and to permit persons to whom the Software is
   8  * furnished to do so, subject to the following conditions:
   9  *
  10  * The above copyright notice and this permission notice shall be included in
  11  * all copies or substantial portions of the Software.
  12  *
  13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19  * THE SOFTWARE.
  20  *
  21  *===-----------------------------------------------------------------------===
  22  */
  23
  24 #ifndef __PMMINTRIN_H
  25 #define __PMMINTRIN_H
  26
  27 #include <emmintrin.h>
  28
  29 /* Define the default attributes for the functions in this file. */
  30 #define __DEFAULT_FN_ATTRS \
  31   __attribute__((__always_inline__, __nodebug__, __target__("sse3"), __min_vector_width__(128)))
  32
  33 /// Loads data from an unaligned memory location to elements in a 128-bit
  34 ///    vector.
  35 ///
  36 ///    If the address of the data is not 16-byte aligned, the instruction may
  37 ///    read two adjacent aligned blocks of memory to retrieve the requested
  38 ///    data.
  39 ///
  40 /// \headerfile <x86intrin.h>
  41 ///
  42 /// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
  43 ///
  44 /// \param __p
  45 ///    A pointer to a 128-bit integer vector containing integer values.
  46 /// \returns A 128-bit vector containing the moved values.
  47 static __inline__ __m128i __DEFAULT_FN_ATTRS
  48 _mm_lddqu_si128(__m128i const *__p)
  49 {
  50   return (__m128i)__builtin_ia32_lddqu((char const *)__p);
  51 }
  52
  53 /// Adds the even-indexed values and subtracts the odd-indexed values of
  54 ///    two 128-bit vectors of [4 x float].
  55 ///
  56 /// \headerfile <x86intrin.h>
  57 ///
  58 /// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
  59 ///
  60 /// \param __a
  61 ///    A 128-bit vector of [4 x float] containing the left source operand.
  62 /// \param __b
  63 ///    A 128-bit vector of [4 x float] containing the right source operand.
  64 /// \returns A 128-bit vector of [4 x float] containing the alternating sums and
  65 ///    differences of both operands.
  66 static __inline__ __m128 __DEFAULT_FN_ATTRS
  67 _mm_addsub_ps(__m128 __a, __m128 __b)
  68 {
  69   return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
  70 }
  71
  72 /// Horizontally adds the adjacent pairs of values contained in two
  73 ///    128-bit vectors of [4 x float].
  74 ///
  75 /// \headerfile <x86intrin.h>
  76 ///
  77 /// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
  78 ///
  79 /// \param __a
  80 ///    A 128-bit vector of [4 x float] containing one of the source operands.
  81 ///    The horizontal sums of the values are stored in the lower bits of the
  82 ///    destination.
  83 /// \param __b
  84 ///    A 128-bit vector of [4 x float] containing one of the source operands.
  85 ///    The horizontal sums of the values are stored in the upper bits of the
  86 ///    destination.
  87 /// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
  88 ///    both operands.
  89 static __inline__ __m128 __DEFAULT_FN_ATTRS
  90 _mm_hadd_ps(__m128 __a, __m128 __b)
  91 {
  92   return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
  93 }
  94
  95 /// Horizontally subtracts the adjacent pairs of values contained in two
  96 ///    128-bit vectors of [4 x float].
  97 ///
  98 /// \headerfile <x86intrin.h>
  99 ///
 100 /// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
 101 ///
 102 /// \param __a
 103 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 104 ///    The horizontal differences between the values are stored in the lower
 105 ///    bits of the destination.
 106 /// \param __b
 107 ///    A 128-bit vector of [4 x float] containing one of the source operands.
 108 ///    The horizontal differences between the values are stored in the upper
 109 ///    bits of the destination.
 110 /// \returns A 128-bit vector of [4 x float] containing the horizontal
 111 ///    differences of both operands.
 112 static __inline__ __m128 __DEFAULT_FN_ATTRS
 113 _mm_hsub_ps(__m128 __a, __m128 __b)
 114 {
 115   return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
 116 }
 117
 118 /// Moves and duplicates odd-indexed values from a 128-bit vector
 119 ///    of [4 x float] to float values stored in a 128-bit vector of
 120 ///    [4 x float].
 121 ///
 122 /// \headerfile <x86intrin.h>
 123 ///
 124 /// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
 125 ///
 126 /// \param __a
 127 ///    A 128-bit vector of [4 x float]. \n
 128 ///    Bits [127:96] of the source are written to bits [127:96] and [95:64] of
 129 ///    the destination. \n
 130 ///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
 131 ///    destination.
 132 /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
 133 ///    values.
 134 static __inline__ __m128 __DEFAULT_FN_ATTRS
 135 _mm_movehdup_ps(__m128 __a)
 136 {
 137   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
 138 }
 139
 140 /// Duplicates even-indexed values from a 128-bit vector of
 141 ///    [4 x float] to float values stored in a 128-bit vector of [4 x float].
 142 ///
 143 /// \headerfile <x86intrin.h>
 144 ///
 145 /// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
 146 ///
 147 /// \param __a
 148 ///    A 128-bit vector of [4 x float] \n
 149 ///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
 150 ///    the destination. \n
 151 ///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
 152 ///    destination.
 153 /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
 154 ///    values.
 155 static __inline__ __m128 __DEFAULT_FN_ATTRS
 156 _mm_moveldup_ps(__m128 __a)
 157 {
 158   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
 159 }
 160
 161 /// Adds the even-indexed values and subtracts the odd-indexed values of
 162 ///    two 128-bit vectors of [2 x double].
 163 ///
 164 /// \headerfile <x86intrin.h>
 165 ///
 166 /// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
 167 ///
 168 /// \param __a
 169 ///    A 128-bit vector of [2 x double] containing the left source operand.
 170 /// \param __b
 171 ///    A 128-bit vector of [2 x double] containing the right source operand.
 172 /// \returns A 128-bit vector of [2 x double] containing the alternating sums
 173 ///    and differences of both operands.
 174 static __inline__ __m128d __DEFAULT_FN_ATTRS
 175 _mm_addsub_pd(__m128d __a, __m128d __b)
 176 {
 177   return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
 178 }
 179
 180 /// Horizontally adds the pairs of values contained in two 128-bit
 181 ///    vectors of [2 x double].
 182 ///
 183 /// \headerfile <x86intrin.h>
 184 ///
 185 /// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
 186 ///
 187 /// \param __a
 188 ///    A 128-bit vector of [2 x double] containing one of the source operands.
 189 ///    The horizontal sum of the values is stored in the lower bits of the
 190 ///    destination.
 191 /// \param __b
 192 ///    A 128-bit vector of [2 x double] containing one of the source operands.
 193 ///    The horizontal sum of the values is stored in the upper bits of the
 194 ///    destination.
 195 /// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
 196 ///    both operands.
 197 static __inline__ __m128d __DEFAULT_FN_ATTRS
 198 _mm_hadd_pd(__m128d __a, __m128d __b)
 199 {
 200   return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
 201 }
 202
 203 /// Horizontally subtracts the pairs of values contained in two 128-bit
 204 ///    vectors of [2 x double].
 205 ///
 206 /// \headerfile <x86intrin.h>
 207 ///
 208 /// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
 209 ///
 210 /// \param __a
 211 ///    A 128-bit vector of [2 x double] containing one of the source operands.
 212 ///    The horizontal difference of the values is stored in the lower bits of
 213 ///    the destination.
 214 /// \param __b
 215 ///    A 128-bit vector of [2 x double] containing one of the source operands.
 216 ///    The horizontal difference of the values is stored in the upper bits of
 217 ///    the destination.
 218 /// \returns A 128-bit vector of [2 x double] containing the horizontal
 219 ///    differences of both operands.
 220 static __inline__ __m128d __DEFAULT_FN_ATTRS
 221 _mm_hsub_pd(__m128d __a, __m128d __b)
 222 {
 223   return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
 224 }
 225
 226 /// Moves and duplicates one double-precision value to double-precision
 227 ///    values stored in a 128-bit vector of [2 x double].
 228 ///
 229 /// \headerfile <x86intrin.h>
 230 ///
 231 /// \code
 232 /// __m128d _mm_loaddup_pd(double const *dp);
 233 /// \endcode
 234 ///
 235 /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
 236 ///
 237 /// \param dp
 238 ///    A pointer to a double-precision value to be moved and duplicated.
 239 /// \returns A 128-bit vector of [2 x double] containing the moved and
 240 ///    duplicated values.
 241 #define        _mm_loaddup_pd(dp)        _mm_load1_pd(dp)
 242
 243 /// Moves and duplicates the double-precision value in the lower bits of
 244 ///    a 128-bit vector of [2 x double] to double-precision values stored in a
 245 ///    128-bit vector of [2 x double].
 246 ///
 247 /// \headerfile <x86intrin.h>
 248 ///
 249 /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
 250 ///
 251 /// \param __a
 252 ///    A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
 253 ///    [127:64] and [63:0] of the destination.
 254 /// \returns A 128-bit vector of [2 x double] containing the moved and
 255 ///    duplicated values.
 256 static __inline__ __m128d __DEFAULT_FN_ATTRS
 257 _mm_movedup_pd(__m128d __a)
 258 {
 259   return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
 260 }
 261
 262 /// Establishes a linear address memory range to be monitored and puts
 263 ///    the processor in the monitor event pending state. Data stored in the
 264 ///    monitored address range causes the processor to exit the pending state.
 265 ///
 266 /// \headerfile <x86intrin.h>
 267 ///
 268 /// This intrinsic corresponds to the <c> MONITOR </c> instruction.
 269 ///
 270 /// \param __p
 271 ///    The memory range to be monitored. The size of the range is determined by
 272 ///    CPUID function 0000_0005h.
 273 /// \param __extensions
 274 ///    Optional extensions for the monitoring state.
 275 /// \param __hints
 276 ///    Optional hints for the monitoring state.
 277 static __inline__ void __DEFAULT_FN_ATTRS
 278 _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
 279 {
 280   __builtin_ia32_monitor((void *)__p, __extensions, __hints);
 281 }
 282
 283 /// Used with the MONITOR instruction to wait while the processor is in
 284 ///    the monitor event pending state. Data stored in the monitored address
 285 ///    range causes the processor to exit the pending state.
 286 ///
 287 /// \headerfile <x86intrin.h>
 288 ///
 289 /// This intrinsic corresponds to the <c> MWAIT </c> instruction.
 290 ///
 291 /// \param __extensions
 292 ///    Optional extensions for the monitoring state, which may vary by
 293 ///    processor.
 294 /// \param __hints
 295 ///    Optional hints for the monitoring state, which may vary by processor.
 296 static __inline__ void __DEFAULT_FN_ATTRS
 297 _mm_mwait(unsigned __extensions, unsigned __hints)
 298 {
 299   __builtin_ia32_mwait(__extensions, __hints);
 300 }
 301
 302 #undef __DEFAULT_FN_ATTRS
 303
 304 #endif /* __PMMINTRIN_H */