1 /*===------------- avx512bwintrin.h - AVX512BW intrinsics ------------------===
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to deal
6 * in the Software without restriction, including without limitation the rights
7 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 * copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 *===-----------------------------------------------------------------------===
25 #error "Never use <avx512bwintrin.h> directly; include <immintrin.h> instead."
28 #ifndef __AVX512BWINTRIN_H
29 #define __AVX512BWINTRIN_H
/* Mask and vector types used by the intrinsics below.
 * __mmask32 / __mmask64 are unsigned int / unsigned long long and serve as the
 * return and mask-argument types of the 16-bit and 8-bit element intrinsics.
 * __v64qi and __v32hi are 64-byte vectors of char and short.
 * NOTE(review): __v64qi uses plain char, whose signedness is
 * implementation-defined -- confirm this is intended. */
31 typedef unsigned int __mmask32;
32 typedef unsigned long long __mmask64;
33 typedef char __v64qi __attribute__ ((__vector_size__ (64)));
34 typedef short __v32hi __attribute__ ((__vector_size__ (64)));
/* Returns a __v64qi compound literal whose 64 elements are all 0.
 * NOTE(review): the closing brace of the function is not visible in this
 * view -- confirm against the full header. */
36 static __inline __v64qi __attribute__ ((__always_inline__, __nodebug__))
37 _mm512_setzero_qi (void) {
38 return (__v64qi){ 0, 0, 0, 0, 0, 0, 0, 0,
39 0, 0, 0, 0, 0, 0, 0, 0,
40 0, 0, 0, 0, 0, 0, 0, 0,
41 0, 0, 0, 0, 0, 0, 0, 0,
42 0, 0, 0, 0, 0, 0, 0, 0,
43 0, 0, 0, 0, 0, 0, 0, 0,
44 0, 0, 0, 0, 0, 0, 0, 0,
45 0, 0, 0, 0, 0, 0, 0, 0 };
/* Returns a __v32hi compound literal whose 32 elements are all 0.
 * NOTE(review): the closing brace of the function is not visible in this
 * view -- confirm against the full header. */
48 static __inline __v32hi __attribute__ ((__always_inline__, __nodebug__))
49 _mm512_setzero_hi (void) {
50 return (__v32hi){ 0, 0, 0, 0, 0, 0, 0, 0,
51 0, 0, 0, 0, 0, 0, 0, 0,
52 0, 0, 0, 0, 0, 0, 0, 0,
53 0, 0, 0, 0, 0, 0, 0, 0 };
/* Passes __a and __b, reinterpreted as __v64qi, to
 * __builtin_ia32_pcmpeqb512_mask and returns the result cast to __mmask64.
 * NOTE(review): the rest of the call (after the second operand) and the
 * closing brace are cut off in this view -- confirm against the full header. */
58 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
59 _mm512_cmpeq_epi8_mask(__m512i __a, __m512i __b) {
60 return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
/* Takes a caller-supplied __mmask64 __u in addition to __a and __b, which are
 * reinterpreted as __v64qi and passed to __builtin_ia32_pcmpeqb512_mask;
 * returns __mmask64.
 * NOTE(review): the use of __u is cut off in this view (presumably the
 * builtin's trailing mask operand) -- confirm. */
64 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
65 _mm512_mask_cmpeq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
66 return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b,
/* Passes __a and __b, reinterpreted as __v64qi, to
 * __builtin_ia32_ucmpb512_mask with predicate immediate 0 and returns the
 * result cast to __mmask64.
 * NOTE(review): 0 is presumably the "equal" predicate; the final call
 * argument is cut off in this view -- confirm. */
70 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
71 _mm512_cmpeq_epu8_mask(__m512i __a, __m512i __b) {
72 return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
/* Takes a caller-supplied __mmask64 __u; passes __a and __b, reinterpreted as
 * __v64qi, to __builtin_ia32_ucmpb512_mask with predicate immediate 0 and
 * returns __mmask64.
 * NOTE(review): 0 is presumably the "equal" predicate and __u presumably the
 * trailing mask operand (cut off in this view) -- confirm. */
76 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
77 _mm512_mask_cmpeq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
78 return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0,
/* Passes __a and __b, reinterpreted as __v32hi, to
 * __builtin_ia32_pcmpeqw512_mask and returns the result cast to __mmask32.
 * NOTE(review): the rest of the call (after the second operand) and the
 * closing brace are cut off in this view -- confirm against the full header. */
82 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
83 _mm512_cmpeq_epi16_mask(__m512i __a, __m512i __b) {
84 return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
/* Takes a caller-supplied __mmask32 __u in addition to __a and __b, which are
 * reinterpreted as __v32hi and passed to __builtin_ia32_pcmpeqw512_mask;
 * returns __mmask32.
 * NOTE(review): the use of __u is cut off in this view (presumably the
 * builtin's trailing mask operand) -- confirm. */
88 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
89 _mm512_mask_cmpeq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
90 return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b,
/* Passes __a and __b, reinterpreted as __v32hi, to
 * __builtin_ia32_ucmpw512_mask with predicate immediate 0 and returns the
 * result cast to __mmask32.
 * NOTE(review): 0 is presumably the "equal" predicate; the final call
 * argument is cut off in this view -- confirm. */
94 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
95 _mm512_cmpeq_epu16_mask(__m512i __a, __m512i __b) {
96 return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
/* Takes a caller-supplied __mmask32 __u; passes __a and __b, reinterpreted as
 * __v32hi, to __builtin_ia32_ucmpw512_mask with predicate immediate 0 and
 * returns __mmask32.
 * NOTE(review): 0 is presumably the "equal" predicate and __u presumably the
 * trailing mask operand (cut off in this view) -- confirm. */
100 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
101 _mm512_mask_cmpeq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
102 return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0,
/* Passes __a and __b, reinterpreted as __v64qi, to
 * __builtin_ia32_cmpb512_mask with predicate immediate 5 and returns the
 * result cast to __mmask64.
 * NOTE(review): 5 is presumably the "not less than" (>=) predicate; the final
 * call argument is cut off in this view -- confirm. */
106 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
107 _mm512_cmpge_epi8_mask(__m512i __a, __m512i __b) {
108 return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
/* Takes a caller-supplied __mmask64 __u; passes __a and __b, reinterpreted as
 * __v64qi, to __builtin_ia32_cmpb512_mask with predicate immediate 5 and
 * returns __mmask64.
 * NOTE(review): 5 is presumably the "not less than" (>=) predicate and __u
 * presumably the trailing mask operand (cut off in this view) -- confirm. */
112 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
113 _mm512_mask_cmpge_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
114 return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
/* Passes __a and __b, reinterpreted as __v64qi, to
 * __builtin_ia32_ucmpb512_mask with predicate immediate 5 and returns the
 * result cast to __mmask64.
 * NOTE(review): 5 is presumably the "not less than" (>=) predicate; the final
 * call argument is cut off in this view -- confirm. */
118 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
119 _mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) {
120 return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
/* Takes a caller-supplied __mmask64 __u; passes __a and __b, reinterpreted as
 * __v64qi, to __builtin_ia32_ucmpb512_mask with predicate immediate 5 and
 * returns __mmask64.
 * NOTE(review): 5 is presumably the "not less than" (>=) predicate and __u
 * presumably the trailing mask operand (cut off in this view) -- confirm. */
124 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
125 _mm512_mask_cmpge_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
126 return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5,
/* Passes __a and __b, reinterpreted as __v32hi, to
 * __builtin_ia32_cmpw512_mask with predicate immediate 5 and returns the
 * result cast to __mmask32.
 * NOTE(review): 5 is presumably the "not less than" (>=) predicate; the final
 * call argument is cut off in this view -- confirm. */
130 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
131 _mm512_cmpge_epi16_mask(__m512i __a, __m512i __b) {
132 return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
/* Takes a caller-supplied __mmask32 __u; passes __a and __b, reinterpreted as
 * __v32hi, to __builtin_ia32_cmpw512_mask with predicate immediate 5 and
 * returns __mmask32.
 * NOTE(review): 5 is presumably the "not less than" (>=) predicate and __u
 * presumably the trailing mask operand (cut off in this view) -- confirm. */
136 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
137 _mm512_mask_cmpge_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
138 return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
/* Passes __a and __b, reinterpreted as __v32hi, to
 * __builtin_ia32_ucmpw512_mask with predicate immediate 5 and returns the
 * result cast to __mmask32.
 * NOTE(review): 5 is presumably the "not less than" (>=) predicate; the final
 * call argument is cut off in this view -- confirm. */
142 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
143 _mm512_cmpge_epu16_mask(__m512i __a, __m512i __b) {
144 return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
/* Takes a caller-supplied __mmask32 __u; passes __a and __b, reinterpreted as
 * __v32hi, to __builtin_ia32_ucmpw512_mask with predicate immediate 5 and
 * returns __mmask32.
 * NOTE(review): 5 is presumably the "not less than" (>=) predicate and __u
 * presumably the trailing mask operand (cut off in this view) -- confirm. */
148 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
149 _mm512_mask_cmpge_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
150 return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5,
/* Passes __a and __b, reinterpreted as __v64qi, to
 * __builtin_ia32_pcmpgtb512_mask and returns the result cast to __mmask64.
 * NOTE(review): the rest of the call (after the second operand) and the
 * closing brace are cut off in this view -- confirm against the full header. */
154 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
155 _mm512_cmpgt_epi8_mask(__m512i __a, __m512i __b) {
156 return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
/* Takes a caller-supplied __mmask64 __u in addition to __a and __b, which are
 * reinterpreted as __v64qi and passed to __builtin_ia32_pcmpgtb512_mask;
 * returns __mmask64.
 * NOTE(review): the use of __u is cut off in this view (presumably the
 * builtin's trailing mask operand) -- confirm. */
160 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
161 _mm512_mask_cmpgt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
162 return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b,
/* Passes __a and __b, reinterpreted as __v64qi, to
 * __builtin_ia32_ucmpb512_mask with predicate immediate 6 and returns the
 * result cast to __mmask64.
 * NOTE(review): 6 is presumably the "not less or equal" (>) predicate; the
 * final call argument is cut off in this view -- confirm. */
166 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
167 _mm512_cmpgt_epu8_mask(__m512i __a, __m512i __b) {
168 return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
/* Takes a caller-supplied __mmask64 __u; passes __a and __b, reinterpreted as
 * __v64qi, to __builtin_ia32_ucmpb512_mask with predicate immediate 6 and
 * returns __mmask64.
 * NOTE(review): 6 is presumably the "not less or equal" (>) predicate and __u
 * presumably the trailing mask operand (cut off in this view) -- confirm. */
172 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
173 _mm512_mask_cmpgt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
174 return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6,
/* Passes __a and __b, reinterpreted as __v32hi, to
 * __builtin_ia32_pcmpgtw512_mask and returns the result cast to __mmask32.
 * NOTE(review): the rest of the call (after the second operand) and the
 * closing brace are cut off in this view -- confirm against the full header. */
178 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
179 _mm512_cmpgt_epi16_mask(__m512i __a, __m512i __b) {
180 return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
/* Takes a caller-supplied __mmask32 __u in addition to __a and __b, which are
 * reinterpreted as __v32hi and passed to __builtin_ia32_pcmpgtw512_mask;
 * returns __mmask32.
 * NOTE(review): the use of __u is cut off in this view (presumably the
 * builtin's trailing mask operand) -- confirm. */
184 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
185 _mm512_mask_cmpgt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
186 return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b,
/* Passes __a and __b, reinterpreted as __v32hi, to
 * __builtin_ia32_ucmpw512_mask with predicate immediate 6 and returns the
 * result cast to __mmask32.
 * NOTE(review): 6 is presumably the "not less or equal" (>) predicate; the
 * final call argument is cut off in this view -- confirm. */
190 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
191 _mm512_cmpgt_epu16_mask(__m512i __a, __m512i __b) {
192 return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
/* Takes a caller-supplied __mmask32 __u; passes __a and __b, reinterpreted as
 * __v32hi, to __builtin_ia32_ucmpw512_mask with predicate immediate 6 and
 * returns __mmask32.
 * NOTE(review): 6 is presumably the "not less or equal" (>) predicate and __u
 * presumably the trailing mask operand (cut off in this view) -- confirm. */
196 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
197 _mm512_mask_cmpgt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
198 return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6,
/* Passes __a and __b, reinterpreted as __v64qi, to
 * __builtin_ia32_cmpb512_mask with predicate immediate 2 and returns the
 * result cast to __mmask64.
 * NOTE(review): 2 is presumably the "less or equal" predicate; the final call
 * argument is cut off in this view -- confirm. */
202 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
203 _mm512_cmple_epi8_mask(__m512i __a, __m512i __b) {
204 return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
/* Takes a caller-supplied __mmask64 __u; passes __a and __b, reinterpreted as
 * __v64qi, to __builtin_ia32_cmpb512_mask with predicate immediate 2 and
 * returns __mmask64.
 * NOTE(review): 2 is presumably the "less or equal" predicate and __u
 * presumably the trailing mask operand (cut off in this view) -- confirm. */
208 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
209 _mm512_mask_cmple_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
210 return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
/* Passes __a and __b, reinterpreted as __v64qi, to
 * __builtin_ia32_ucmpb512_mask with predicate immediate 2 and returns the
 * result cast to __mmask64.
 * NOTE(review): 2 is presumably the "less or equal" predicate; the final call
 * argument is cut off in this view -- confirm. */
214 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
215 _mm512_cmple_epu8_mask(__m512i __a, __m512i __b) {
216 return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
/* Takes a caller-supplied __mmask64 __u; passes __a and __b, reinterpreted as
 * __v64qi, to __builtin_ia32_ucmpb512_mask with predicate immediate 2 and
 * returns __mmask64.
 * NOTE(review): 2 is presumably the "less or equal" predicate and __u
 * presumably the trailing mask operand (cut off in this view) -- confirm. */
220 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
221 _mm512_mask_cmple_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
222 return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2,
/* Passes __a and __b, reinterpreted as __v32hi, to
 * __builtin_ia32_cmpw512_mask with predicate immediate 2 and returns the
 * result cast to __mmask32.
 * NOTE(review): 2 is presumably the "less or equal" predicate; the final call
 * argument is cut off in this view -- confirm. */
226 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
227 _mm512_cmple_epi16_mask(__m512i __a, __m512i __b) {
228 return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
/* Takes a caller-supplied __mmask32 __u; passes __a and __b, reinterpreted as
 * __v32hi, to __builtin_ia32_cmpw512_mask with predicate immediate 2 and
 * returns __mmask32.
 * NOTE(review): 2 is presumably the "less or equal" predicate and __u
 * presumably the trailing mask operand (cut off in this view) -- confirm. */
232 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
233 _mm512_mask_cmple_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
234 return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
/* Passes __a and __b, reinterpreted as __v32hi, to
 * __builtin_ia32_ucmpw512_mask with predicate immediate 2 and returns the
 * result cast to __mmask32.
 * NOTE(review): 2 is presumably the "less or equal" predicate; the final call
 * argument is cut off in this view -- confirm. */
238 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
239 _mm512_cmple_epu16_mask(__m512i __a, __m512i __b) {
240 return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
/* Takes a caller-supplied __mmask32 __u; passes __a and __b, reinterpreted as
 * __v32hi, to __builtin_ia32_ucmpw512_mask with predicate immediate 2 and
 * returns __mmask32.
 * NOTE(review): 2 is presumably the "less or equal" predicate and __u
 * presumably the trailing mask operand (cut off in this view) -- confirm. */
244 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
245 _mm512_mask_cmple_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
246 return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2,
/* Passes __a and __b, reinterpreted as __v64qi, to
 * __builtin_ia32_cmpb512_mask with predicate immediate 1 and returns the
 * result cast to __mmask64.
 * NOTE(review): 1 is presumably the "less than" predicate; the final call
 * argument is cut off in this view -- confirm. */
250 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
251 _mm512_cmplt_epi8_mask(__m512i __a, __m512i __b) {
252 return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
/* Takes a caller-supplied __mmask64 __u; passes __a and __b, reinterpreted as
 * __v64qi, to __builtin_ia32_cmpb512_mask with predicate immediate 1 and
 * returns __mmask64.
 * NOTE(review): 1 is presumably the "less than" predicate and __u presumably
 * the trailing mask operand (cut off in this view) -- confirm. */
256 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
257 _mm512_mask_cmplt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
258 return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
/* Passes __a and __b, reinterpreted as __v64qi, to
 * __builtin_ia32_ucmpb512_mask with predicate immediate 1 and returns the
 * result cast to __mmask64.
 * NOTE(review): 1 is presumably the "less than" predicate; the final call
 * argument is cut off in this view -- confirm. */
262 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
263 _mm512_cmplt_epu8_mask(__m512i __a, __m512i __b) {
264 return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
/* Takes a caller-supplied __mmask64 __u; passes __a and __b, reinterpreted as
 * __v64qi, to __builtin_ia32_ucmpb512_mask with predicate immediate 1 and
 * returns __mmask64.
 * NOTE(review): 1 is presumably the "less than" predicate and __u presumably
 * the trailing mask operand (cut off in this view) -- confirm. */
268 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
269 _mm512_mask_cmplt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
270 return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1,
/* Passes __a and __b, reinterpreted as __v32hi, to
 * __builtin_ia32_cmpw512_mask with predicate immediate 1 and returns the
 * result cast to __mmask32.
 * NOTE(review): 1 is presumably the "less than" predicate; the final call
 * argument is cut off in this view -- confirm. */
274 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
275 _mm512_cmplt_epi16_mask(__m512i __a, __m512i __b) {
276 return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
/* Takes a caller-supplied __mmask32 __u; passes __a and __b, reinterpreted as
 * __v32hi, to __builtin_ia32_cmpw512_mask with predicate immediate 1 and
 * returns __mmask32.
 * NOTE(review): 1 is presumably the "less than" predicate and __u presumably
 * the trailing mask operand (cut off in this view) -- confirm. */
280 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
281 _mm512_mask_cmplt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
282 return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
/* Passes __a and __b, reinterpreted as __v32hi, to
 * __builtin_ia32_ucmpw512_mask with predicate immediate 1 and returns the
 * result cast to __mmask32.
 * NOTE(review): 1 is presumably the "less than" predicate; the final call
 * argument is cut off in this view -- confirm. */
286 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
287 _mm512_cmplt_epu16_mask(__m512i __a, __m512i __b) {
288 return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
/* Takes a caller-supplied __mmask32 __u; passes __a and __b, reinterpreted as
 * __v32hi, to __builtin_ia32_ucmpw512_mask with predicate immediate 1 and
 * returns __mmask32.
 * NOTE(review): 1 is presumably the "less than" predicate and __u presumably
 * the trailing mask operand (cut off in this view) -- confirm. */
292 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
293 _mm512_mask_cmplt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
294 return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1,
/* Passes __a and __b, reinterpreted as __v64qi, to
 * __builtin_ia32_cmpb512_mask with predicate immediate 4 and returns the
 * result cast to __mmask64.
 * NOTE(review): 4 is presumably the "not equal" predicate; the final call
 * argument is cut off in this view -- confirm. */
298 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
299 _mm512_cmpneq_epi8_mask(__m512i __a, __m512i __b) {
300 return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
/* Takes a caller-supplied __mmask64 __u; passes __a and __b, reinterpreted as
 * __v64qi, to __builtin_ia32_cmpb512_mask with predicate immediate 4 and
 * returns __mmask64.
 * NOTE(review): 4 is presumably the "not equal" predicate and __u presumably
 * the trailing mask operand (cut off in this view) -- confirm. */
304 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
305 _mm512_mask_cmpneq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
306 return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
/* Passes __a and __b, reinterpreted as __v64qi, to
 * __builtin_ia32_ucmpb512_mask with predicate immediate 4 and returns the
 * result cast to __mmask64.
 * NOTE(review): 4 is presumably the "not equal" predicate; the final call
 * argument is cut off in this view -- confirm. */
310 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
311 _mm512_cmpneq_epu8_mask(__m512i __a, __m512i __b) {
312 return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
/* Takes a caller-supplied __mmask64 __u; passes __a and __b, reinterpreted as
 * __v64qi, to __builtin_ia32_ucmpb512_mask with predicate immediate 4 and
 * returns __mmask64.
 * NOTE(review): 4 is presumably the "not equal" predicate and __u presumably
 * the trailing mask operand (cut off in this view) -- confirm. */
316 static __inline__ __mmask64 __attribute__((__always_inline__, __nodebug__))
317 _mm512_mask_cmpneq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
318 return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4,
/* Passes __a and __b, reinterpreted as __v32hi, to
 * __builtin_ia32_cmpw512_mask with predicate immediate 4 and returns the
 * result cast to __mmask32.
 * NOTE(review): 4 is presumably the "not equal" predicate; the final call
 * argument is cut off in this view -- confirm. */
322 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
323 _mm512_cmpneq_epi16_mask(__m512i __a, __m512i __b) {
324 return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
/* Takes a caller-supplied __mmask32 __u; passes __a and __b, reinterpreted as
 * __v32hi, to __builtin_ia32_cmpw512_mask with predicate immediate 4 and
 * returns __mmask32.
 * NOTE(review): 4 is presumably the "not equal" predicate and __u presumably
 * the trailing mask operand (cut off in this view) -- confirm. */
328 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
329 _mm512_mask_cmpneq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
330 return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
/* Passes __a and __b, reinterpreted as __v32hi, to
 * __builtin_ia32_ucmpw512_mask with predicate immediate 4 and returns the
 * result cast to __mmask32.
 * NOTE(review): 4 is presumably the "not equal" predicate; the final call
 * argument is cut off in this view -- confirm. */
334 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
335 _mm512_cmpneq_epu16_mask(__m512i __a, __m512i __b) {
336 return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
/* Takes a caller-supplied __mmask32 __u; passes __a and __b, reinterpreted as
 * __v32hi, to __builtin_ia32_ucmpw512_mask with predicate immediate 4 and
 * returns __mmask32.
 * NOTE(review): 4 is presumably the "not equal" predicate and __u presumably
 * the trailing mask operand (cut off in this view) -- confirm. */
340 static __inline__ __mmask32 __attribute__((__always_inline__, __nodebug__))
341 _mm512_mask_cmpneq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
342 return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4,
/* Element-wise sum of __A and __B computed with the vector + operator on
 * their __v64qi views; the result is reinterpreted as __m512i.
 * NOTE(review): __v64qi has element type char; whether overflow of a signed
 * vector element is guaranteed to wrap is compiler-specific -- confirm.  The
 * closing brace is not visible in this view. */
346 static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
347 _mm512_add_epi8 (__m512i __A, __m512i __B) {
348 return (__m512i) ((__v64qi) __A + (__v64qi) __B);
/* Takes a vector __W, a __mmask64 __U and operands __A, __B; passes __A (as
 * __v64qi) as the first operand of __builtin_ia32_paddb512_mask and returns
 * the result as __m512i.
 * NOTE(review): the remaining operands are cut off in this view (presumably
 * __B, then __W as pass-through source and __U as write mask) -- confirm. */
351 static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
352 _mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
353 return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
/* Takes a __mmask64 __U and operands __A, __B; passes __A (as __v64qi) as the
 * first operand of __builtin_ia32_paddb512_mask and the all-zero vector from
 * _mm512_setzero_qi() as a later operand; returns the result as __m512i.
 * NOTE(review): the other operands are cut off in this view (presumably __B
 * before the zero vector and __U after it) -- confirm. */
359 static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
360 _mm512_maskz_add_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
361 return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
364 _mm512_setzero_qi (),
/* Element-wise difference __A - __B computed with the vector - operator on
 * their __v64qi views; the result is reinterpreted as __m512i.
 * NOTE(review): __v64qi has element type char; whether overflow of a signed
 * vector element is guaranteed to wrap is compiler-specific -- confirm.  The
 * closing brace is not visible in this view. */
368 static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
369 _mm512_sub_epi8 (__m512i __A, __m512i __B) {
370 return (__m512i) ((__v64qi) __A - (__v64qi) __B);
/* Takes a vector __W, a __mmask64 __U and operands __A, __B; passes __A (as
 * __v64qi) as the first operand of __builtin_ia32_psubb512_mask and returns
 * the result as __m512i.
 * NOTE(review): the remaining operands are cut off in this view (presumably
 * __B, then __W as pass-through source and __U as write mask) -- confirm. */
373 static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
374 _mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
375 return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
/* Takes a __mmask64 __U and operands __A, __B; passes __A (as __v64qi) as the
 * first operand of __builtin_ia32_psubb512_mask and the all-zero vector from
 * _mm512_setzero_qi() as a later operand; returns the result as __m512i.
 * NOTE(review): the other operands are cut off in this view (presumably __B
 * before the zero vector and __U after it) -- confirm. */
381 static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
382 _mm512_maskz_sub_epi8 (__mmask64 __U, __m512i __A, __m512i __B) {
383 return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
386 _mm512_setzero_qi (),
/* Element-wise sum of __A and __B computed with the vector + operator on
 * their __v32hi views; the result is reinterpreted as __m512i.
 * NOTE(review): __v32hi has signed element type short; whether overflow of a
 * signed vector element is guaranteed to wrap is compiler-specific --
 * confirm.  The closing brace is not visible in this view. */
390 static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
391 _mm512_add_epi16 (__m512i __A, __m512i __B) {
392 return (__m512i) ((__v32hi) __A + (__v32hi) __B);
/* Takes a vector __W, a __mmask32 __U and operands __A, __B; passes __A (as
 * __v32hi) as the first operand of __builtin_ia32_paddw512_mask and returns
 * the result as __m512i.
 * NOTE(review): the remaining operands are cut off in this view (presumably
 * __B, then __W as pass-through source and __U as write mask) -- confirm. */
395 static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
396 _mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
397 return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
/* Takes a __mmask32 __U and operands __A, __B; passes __A (as __v32hi) as the
 * first operand of __builtin_ia32_paddw512_mask and the all-zero vector from
 * _mm512_setzero_hi() as a later operand; returns the result as __m512i.
 * NOTE(review): the other operands are cut off in this view (presumably __B
 * before the zero vector and __U after it) -- confirm. */
403 static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
404 _mm512_maskz_add_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
405 return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
408 _mm512_setzero_hi (),
/* Element-wise difference __A - __B computed with the vector - operator on
 * their __v32hi views; the result is reinterpreted as __m512i.
 * NOTE(review): __v32hi has signed element type short; whether overflow of a
 * signed vector element is guaranteed to wrap is compiler-specific --
 * confirm.  The closing brace is not visible in this view. */
412 static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
413 _mm512_sub_epi16 (__m512i __A, __m512i __B) {
414 return (__m512i) ((__v32hi) __A - (__v32hi) __B);
/* Takes a vector __W, a __mmask32 __U and operands __A, __B; passes __A (as
 * __v32hi) as the first operand of __builtin_ia32_psubw512_mask and returns
 * the result as __m512i.
 * NOTE(review): the remaining operands are cut off in this view (presumably
 * __B, then __W as pass-through source and __U as write mask) -- confirm. */
417 static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
418 _mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
419 return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
/* Takes a __mmask32 __U and operands __A, __B; passes __A (as __v32hi) as the
 * first operand of __builtin_ia32_psubw512_mask and the all-zero vector from
 * _mm512_setzero_hi() as a later operand; returns the result as __m512i.
 * NOTE(review): the other operands are cut off in this view (presumably __B
 * before the zero vector and __U after it) -- confirm. */
425 static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
426 _mm512_maskz_sub_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
427 return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
430 _mm512_setzero_hi (),
/* Element-wise product of __A and __B computed with the vector * operator on
 * their __v32hi views; each result element has the 16-bit element type, and
 * the vector is reinterpreted as __m512i.
 * NOTE(review): __v32hi has signed element type short; whether overflow of a
 * signed vector element is guaranteed to wrap is compiler-specific --
 * confirm.  The closing brace is not visible in this view. */
434 static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
435 _mm512_mullo_epi16 (__m512i __A, __m512i __B) {
436 return (__m512i) ((__v32hi) __A * (__v32hi) __B);
/* Takes a vector __W, a __mmask32 __U and operands __A, __B; passes __A (as
 * __v32hi) as the first operand of __builtin_ia32_pmullw512_mask and returns
 * the result as __m512i.
 * NOTE(review): the remaining operands are cut off in this view (presumably
 * __B, then __W as pass-through source and __U as write mask) -- confirm. */
439 static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
440 _mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
441 return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
/* Takes a __mmask32 __U and operands __A, __B; passes __A (as __v32hi) as the
 * first operand of __builtin_ia32_pmullw512_mask and the all-zero vector from
 * _mm512_setzero_hi() as a later operand; returns the result as __m512i.
 * NOTE(review): the other operands are cut off in this view (presumably __B
 * before the zero vector and __U after it) -- confirm. */
447 static __inline__ __m512i __attribute__((__always_inline__, __nodebug__))
448 _mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
449 return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
452 _mm512_setzero_hi (),
/* _mm512_cmp_epi8_mask(a, b, p)
 *
 * Generic compare of the 8-bit elements of a and b (epi8 / cmpb variant).
 *
 *   a, b  operands; each is converted to __m512i and reinterpreted as __v64qi
 *   p     comparison predicate, forwarded unchanged to the builtin
 *
 * The builtin is called with an all-ones __mmask64, so no lane is masked
 * off.  The result is a __mmask64.  It must not be narrowed to __mmask16:
 * that would silently discard the upper 48 result bits, and the fixed-
 * predicate wrappers around the same builtin in this header return
 * __mmask64. */
#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \
  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
                                         (__v64qi)(__m512i)(b), \
                                         (p), (__mmask64)-1); })
/* _mm512_mask_cmp_epi8_mask(m, a, b, p)
 *
 * Generic compare of the 8-bit elements of a and b (epi8 / cmpb variant)
 * with a caller-supplied mask.
 *
 *   m     mask, converted to __mmask64 and passed as the builtin's last operand
 *   a, b  operands; each is converted to __m512i and reinterpreted as __v64qi
 *   p     comparison predicate, forwarded unchanged to the builtin
 *
 * The result is a __mmask64.  It must not be narrowed to __mmask16: that
 * would silently discard the upper 48 result bits, and the fixed-predicate
 * wrappers around the same builtin in this header return __mmask64. */
#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
  (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
                                         (__v64qi)(__m512i)(b), \
                                         (p), (__mmask64)(m)); })
/* _mm512_cmp_epu8_mask(a, b, p)
 *
 * Generic compare of the 8-bit elements of a and b (epu8 / ucmpb variant).
 *
 *   a, b  operands; each is converted to __m512i and reinterpreted as __v64qi
 *   p     comparison predicate, forwarded unchanged to the builtin
 *
 * The builtin is called with an all-ones __mmask64, so no lane is masked
 * off.  The result is a __mmask64.  It must not be narrowed to __mmask16:
 * that would silently discard the upper 48 result bits, and the fixed-
 * predicate wrappers around the same builtin in this header return
 * __mmask64. */
#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \
  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
                                          (__v64qi)(__m512i)(b), \
                                          (p), (__mmask64)-1); })
/* _mm512_mask_cmp_epu8_mask(m, a, b, p)
 *
 * Generic compare of the 8-bit elements of a and b (epu8 / ucmpb variant)
 * with a caller-supplied mask.
 *
 *   m     mask, converted to __mmask64 and passed as the builtin's last operand
 *   a, b  operands; each is converted to __m512i and reinterpreted as __v64qi
 *   p     comparison predicate, forwarded unchanged to the builtin
 *
 * The result is a __mmask64.  It must not be narrowed to __mmask16: that
 * would silently discard the upper 48 result bits, and the fixed-predicate
 * wrappers around the same builtin in this header return __mmask64. */
#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
  (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
                                          (__v64qi)(__m512i)(b), \
                                          (p), (__mmask64)(m)); })
/* _mm512_cmp_epi16_mask(a, b, p)
 *
 * Generic compare of the 16-bit elements of a and b (epi16 / cmpw variant).
 *
 *   a, b  operands; each is converted to __m512i and reinterpreted as __v32hi
 *   p     comparison predicate, forwarded unchanged to the builtin
 *
 * The builtin is called with an all-ones __mmask32, so no lane is masked
 * off.  The result is a __mmask32.  It must not be narrowed to __mmask16:
 * that would silently discard the upper 16 result bits, and the fixed-
 * predicate wrappers around the same builtin in this header return
 * __mmask32. */
#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \
  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
                                         (__v32hi)(__m512i)(b), \
                                         (p), (__mmask32)-1); })
/* _mm512_mask_cmp_epi16_mask(m, a, b, p)
 *
 * Generic compare of the 16-bit elements of a and b (epi16 / cmpw variant)
 * with a caller-supplied mask.
 *
 *   m     mask, converted to __mmask32 and passed as the builtin's last operand
 *   a, b  operands; each is converted to __m512i and reinterpreted as __v32hi
 *   p     comparison predicate, forwarded unchanged to the builtin
 *
 * The result is a __mmask32.  It must not be narrowed to __mmask16: that
 * would silently discard the upper 16 result bits, and the fixed-predicate
 * wrappers around the same builtin in this header return __mmask32. */
#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
  (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
                                         (__v32hi)(__m512i)(b), \
                                         (p), (__mmask32)(m)); })
/* _mm512_cmp_epu16_mask(a, b, p)
 *
 * Generic compare of the 16-bit elements of a and b (epu16 / ucmpw variant).
 *
 *   a, b  operands; each is converted to __m512i and reinterpreted as __v32hi
 *   p     comparison predicate, forwarded unchanged to the builtin
 *
 * The builtin is called with an all-ones __mmask32, so no lane is masked
 * off.  The result is a __mmask32.  It must not be narrowed to __mmask16:
 * that would silently discard the upper 16 result bits, and the fixed-
 * predicate wrappers around the same builtin in this header return
 * __mmask32. */
#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \
  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
                                          (__v32hi)(__m512i)(b), \
                                          (p), (__mmask32)-1); })
/* _mm512_mask_cmp_epu16_mask(m, a, b, p)
 *
 * Generic compare of the 16-bit elements of a and b (epu16 / ucmpw variant)
 * with a caller-supplied mask.
 *
 *   m     mask, converted to __mmask32 and passed as the builtin's last operand
 *   a, b  operands; each is converted to __m512i and reinterpreted as __v32hi
 *   p     comparison predicate, forwarded unchanged to the builtin
 *
 * The result is a __mmask32.  It must not be narrowed to __mmask16: that
 * would silently discard the upper 16 result bits, and the fixed-predicate
 * wrappers around the same builtin in this header return __mmask32. */
#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
  (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
                                          (__v32hi)(__m512i)(b), \
                                          (p), (__mmask32)(m)); })