1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
4 declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly
6 define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) {
7 ; CHECK-LABEL: test_x86_vbroadcast_ss_ps_512:
9 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm2
10 ; CHECK-NEXT: kmovw %edi, %k1
11 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
12 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
13 ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
14 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
17 %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 -1)
18 %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask)
19 %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask)
20 %res3 = fadd <16 x float> %res, %res1
21 %res4 = fadd <16 x float> %res2, %res3
22 ret <16 x float> %res4
25 declare <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double>, <8 x double>, i8) nounwind readonly
27 define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask ) {
28 ; CHECK-LABEL: test_x86_vbroadcast_sd_pd_512:
30 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2
31 ; CHECK-NEXT: kmovw %edi, %k1
32 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
33 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
34 ; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1
35 ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
38 %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 -1)
39 %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask)
40 %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask)
41 %res3 = fadd <8 x double> %res, %res1
42 %res4 = fadd <8 x double> %res2, %res3
43 ret <8 x double> %res4
46 declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16)
48 define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) {
49 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_512:
51 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2
52 ; CHECK-NEXT: kmovw %edi, %k1
53 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1}
54 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
55 ; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1
56 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
58 %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
59 %res1 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask)
60 %res2 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
61 %res3 = add <16 x i32> %res, %res1
62 %res4 = add <16 x i32> %res2, %res3
66 declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8)
68 define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) {
69 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512:
71 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2
72 ; CHECK-NEXT: kmovw %edi, %k1
73 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1}
74 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
75 ; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1
76 ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
78 %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1)
79 %res1 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask)
80 %res2 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer,i8 %mask)
81 %res3 = add <8 x i64> %res, %res1
82 %res4 = add <8 x i64> %res2, %res3
86 declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16)
88 define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
89 ; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_512:
91 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
92 ; CHECK-NEXT: kmovw %edi, %k1
93 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
94 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
95 ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
96 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
98 %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
99 %res1 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
100 %res2 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
101 %res3 = fadd <16 x float> %res, %res1
102 %res4 = fadd <16 x float> %res2, %res3
103 ret <16 x float> %res4
106 declare <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float>, <16 x float>, i16)
108 define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
109 ; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_512:
111 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
112 ; CHECK-NEXT: kmovw %edi, %k1
113 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
114 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
115 ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
116 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
118 %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
119 %res1 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
120 %res2 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
121 %res3 = fadd <16 x float> %res, %res1
122 %res4 = fadd <16 x float> %res2, %res3
123 ret <16 x float> %res4
126 declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8)
128 define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) {
129 ; CHECK-LABEL: test_int_x86_avx512_mask_movddup_512:
131 ; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
132 ; CHECK-NEXT: kmovw %edi, %k1
133 ; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
134 ; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
135 ; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1
136 ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
138 %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
139 %res1 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1)
140 %res2 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2)
141 %res3 = fadd <8 x double> %res, %res1
142 %res4 = fadd <8 x double> %res2, %res3
143 ret <8 x double> %res4
146 declare <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double>, i32, <8 x double>, i8)
148 define <8 x double>@test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, i32 %x1, <8 x double> %x2, i8 %x3) {
149 ; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_512:
151 ; CHECK-NEXT: vpermpd {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4]
152 ; CHECK-NEXT: kmovw %esi, %k1
153 ; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
154 ; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
155 ; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
156 ; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0
158 %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 %x3)
159 %res1 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> zeroinitializer, i8 %x3)
160 %res2 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 -1)
161 %res3 = fadd <8 x double> %res, %res1
162 %res4 = fadd <8 x double> %res3, %res2
163 ret <8 x double> %res4
166 declare <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64>, i32, <8 x i64>, i8)
168 define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
169 ; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_512:
171 ; CHECK-NEXT: vpermq {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4]
172 ; CHECK-NEXT: kmovw %esi, %k1
173 ; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
174 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
175 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
176 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
178 %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
179 %res1 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
180 %res2 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
181 %res3 = add <8 x i64> %res, %res1
182 %res4 = add <8 x i64> %res3, %res2
186 define void @test_store1(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
187 ; CHECK-LABEL: test_store1:
189 ; CHECK-NEXT: kmovw %edx, %k1
190 ; CHECK-NEXT: vmovups %zmm0, (%rdi) {%k1}
191 ; CHECK-NEXT: vmovups %zmm0, (%rsi)
193 call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
194 call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
198 declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16 )
200 define void @test_store2(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) {
201 ; CHECK-LABEL: test_store2:
203 ; CHECK-NEXT: kmovw %edx, %k1
204 ; CHECK-NEXT: vmovupd %zmm0, (%rdi) {%k1}
205 ; CHECK-NEXT: vmovupd %zmm0, (%rsi)
207 call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
208 call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
212 declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8)
214 define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
215 ; CHECK-LABEL: test_mask_store_aligned_ps:
217 ; CHECK-NEXT: kmovw %edx, %k1
218 ; CHECK-NEXT: vmovaps %zmm0, (%rdi) {%k1}
219 ; CHECK-NEXT: vmovaps %zmm0, (%rsi)
221 call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
222 call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
226 declare void @llvm.x86.avx512.mask.store.ps.512(i8*, <16 x float>, i16 )
228 define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) {
229 ; CHECK-LABEL: test_mask_store_aligned_pd:
231 ; CHECK-NEXT: kmovw %edx, %k1
232 ; CHECK-NEXT: vmovapd %zmm0, (%rdi) {%k1}
233 ; CHECK-NEXT: vmovapd %zmm0, (%rsi)
235 call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
236 call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
240 declare void @llvm.x86.avx512.mask.store.pd.512(i8*, <8 x double>, i8)
242 define void@test_int_x86_avx512_mask_storeu_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) {
243 ; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_512:
245 ; CHECK-NEXT: kmovw %edx, %k1
246 ; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi) {%k1}
247 ; CHECK-NEXT: vmovdqu64 %zmm0, (%rsi)
249 call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
250 call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
254 declare void @llvm.x86.avx512.mask.storeu.q.512(i8*, <8 x i64>, i8)
256 define void@test_int_x86_avx512_mask_storeu_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) {
257 ; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_512:
259 ; CHECK-NEXT: kmovw %edx, %k1
260 ; CHECK-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
261 ; CHECK-NEXT: vmovdqu32 %zmm0, (%rsi)
263 call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
264 call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
268 declare void @llvm.x86.avx512.mask.storeu.d.512(i8*, <16 x i32>, i16)
270 define void@test_int_x86_avx512_mask_store_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) {
271 ; CHECK-LABEL: test_int_x86_avx512_mask_store_q_512:
273 ; CHECK-NEXT: kmovw %edx, %k1
274 ; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi) {%k1}
275 ; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi)
277 call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
278 call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
282 declare void @llvm.x86.avx512.mask.store.q.512(i8*, <8 x i64>, i8)
284 define void@test_int_x86_avx512_mask_store_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) {
285 ; CHECK-LABEL: test_int_x86_avx512_mask_store_d_512:
287 ; CHECK-NEXT: kmovw %edx, %k1
288 ; CHECK-NEXT: vmovdqa32 %zmm0, (%rdi) {%k1}
289 ; CHECK-NEXT: vmovdqa32 %zmm0, (%rsi)
291 call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
292 call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
296 declare void @llvm.x86.avx512.mask.store.d.512(i8*, <16 x i32>, i16)
298 define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
299 ; CHECK-LABEL: test_mask_load_aligned_ps:
301 ; CHECK-NEXT: vmovaps (%rdi), %zmm0
302 ; CHECK-NEXT: kmovw %esi, %k1
303 ; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1}
304 ; CHECK-NEXT: vmovaps (%rdi), %zmm1 {%k1} {z}
305 ; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
307 %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
308 %res1 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
309 %res2 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
310 %res4 = fadd <16 x float> %res2, %res1
311 ret <16 x float> %res4
314 declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16)
316 define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
317 ; CHECK-LABEL: test_mask_load_unaligned_ps:
319 ; CHECK-NEXT: vmovups (%rdi), %zmm0
320 ; CHECK-NEXT: kmovw %esi, %k1
321 ; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1}
322 ; CHECK-NEXT: vmovups (%rdi), %zmm1 {%k1} {z}
323 ; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
325 %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
326 %res1 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
327 %res2 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
328 %res4 = fadd <16 x float> %res2, %res1
329 ret <16 x float> %res4
332 declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8*, <16 x float>, i16)
334 define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
335 ; CHECK-LABEL: test_mask_load_aligned_pd:
337 ; CHECK-NEXT: vmovapd (%rdi), %zmm0
338 ; CHECK-NEXT: kmovw %esi, %k1
339 ; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1}
340 ; CHECK-NEXT: vmovapd (%rdi), %zmm1 {%k1} {z}
341 ; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
343 %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
344 %res1 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
345 %res2 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
346 %res4 = fadd <8 x double> %res2, %res1
347 ret <8 x double> %res4
350 declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)
352 define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
353 ; CHECK-LABEL: test_mask_load_unaligned_pd:
355 ; CHECK-NEXT: vmovupd (%rdi), %zmm0
356 ; CHECK-NEXT: kmovw %esi, %k1
357 ; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1}
358 ; CHECK-NEXT: vmovupd (%rdi), %zmm1 {%k1} {z}
359 ; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
361 %res = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
362 %res1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
363 %res2 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
364 %res4 = fadd <8 x double> %res2, %res1
365 ret <8 x double> %res4
368 declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8*, <8 x double>, i8)
370 declare <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8*, <16 x i32>, i16)
372 define <16 x i32> @test_mask_load_unaligned_d(i8* %ptr, i8* %ptr2, <16 x i32> %data, i16 %mask) {
373 ; CHECK-LABEL: test_mask_load_unaligned_d:
375 ; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0
376 ; CHECK-NEXT: kmovw %edx, %k1
377 ; CHECK-NEXT: vmovdqu32 (%rsi), %zmm0 {%k1}
378 ; CHECK-NEXT: vmovdqu32 (%rdi), %zmm1 {%k1} {z}
379 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
381 %res = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 -1)
382 %res1 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr2, <16 x i32> %res, i16 %mask)
383 %res2 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask)
384 %res4 = add <16 x i32> %res2, %res1
388 declare <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8*, <8 x i64>, i8)
390 define <8 x i64> @test_mask_load_unaligned_q(i8* %ptr, i8* %ptr2, <8 x i64> %data, i8 %mask) {
391 ; CHECK-LABEL: test_mask_load_unaligned_q:
393 ; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0
394 ; CHECK-NEXT: kmovw %edx, %k1
395 ; CHECK-NEXT: vmovdqu64 (%rsi), %zmm0 {%k1}
396 ; CHECK-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1} {z}
397 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
399 %res = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 -1)
400 %res1 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr2, <8 x i64> %res, i8 %mask)
401 %res2 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask)
402 %res4 = add <8 x i64> %res2, %res1
406 declare <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8*, <16 x i32>, i16)
408 define <16 x i32> @test_mask_load_aligned_d(<16 x i32> %data, i8* %ptr, i16 %mask) {
409 ; CHECK-LABEL: test_mask_load_aligned_d:
411 ; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0
412 ; CHECK-NEXT: kmovw %esi, %k1
413 ; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1}
414 ; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1 {%k1} {z}
415 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
417 %res = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 -1)
418 %res1 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> %res, i16 %mask)
419 %res2 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask)
420 %res4 = add <16 x i32> %res2, %res1
424 declare <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8*, <8 x i64>, i8)
426 define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, i8* %ptr, i8 %mask) {
427 ; CHECK-LABEL: test_mask_load_aligned_q:
429 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
430 ; CHECK-NEXT: kmovw %esi, %k1
431 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1}
432 ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 {%k1} {z}
433 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
435 %res = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 -1)
436 %res1 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> %res, i8 %mask)
437 %res2 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask)
438 %res4 = add <8 x i64> %res2, %res1
442 declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8 x double>, i8)
444 define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
445 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_512:
447 ; CHECK-NEXT: vpermilpd {{.*#+}} zmm2 = zmm0[0,1,3,2,5,4,6,6]
448 ; CHECK-NEXT: kmovw %edi, %k1
449 ; CHECK-NEXT: vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,1,3,2,5,4,6,6]
450 ; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,3,2,5,4,6,6]
451 ; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
452 ; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0
454 %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3)
455 %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3)
456 %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1)
457 %res3 = fadd <8 x double> %res, %res1
458 %res4 = fadd <8 x double> %res3, %res2
459 ret <8 x double> %res4
462 declare <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float>, i32, <16 x float>, i16)
464 define <16 x float>@test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
465 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_512:
467 ; CHECK-NEXT: vpermilps {{.*#+}} zmm2 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
468 ; CHECK-NEXT: kmovw %edi, %k1
469 ; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
470 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
471 ; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
472 ; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0
474 %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3)
475 %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3)
476 %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1)
477 %res3 = fadd <16 x float> %res, %res1
478 %res4 = fadd <16 x float> %res3, %res2
479 ret <16 x float> %res4
482 declare <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32>, i32, <16 x i32>, i16)
484 define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
485 ; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_512:
487 ; CHECK-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
488 ; CHECK-NEXT: kmovw %esi, %k1
489 ; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
490 ; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
491 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
492 ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
494 %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
495 %res1 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
496 %res2 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
497 %res3 = add <16 x i32> %res, %res1
498 %res4 = add <16 x i32> %res3, %res2
502 define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) {
503 ; CHECK-LABEL: test_pcmpeq_d:
505 ; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
506 ; CHECK-NEXT: kmovw %k0, %eax
508 %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
512 define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
513 ; CHECK-LABEL: test_mask_pcmpeq_d:
515 ; CHECK-NEXT: kmovw %edi, %k1
516 ; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
517 ; CHECK-NEXT: kmovw %k0, %eax
519 %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
523 declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16)
525 define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) {
526 ; CHECK-LABEL: test_pcmpeq_q:
528 ; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
529 ; CHECK-NEXT: kmovw %k0, %eax
531 %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
535 define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
536 ; CHECK-LABEL: test_mask_pcmpeq_q:
538 ; CHECK-NEXT: kmovw %edi, %k1
539 ; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
540 ; CHECK-NEXT: kmovw %k0, %eax
542 %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
546 declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8)
548 define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) {
549 ; CHECK-LABEL: test_pcmpgt_d:
551 ; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
552 ; CHECK-NEXT: kmovw %k0, %eax
554 %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
558 define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
559 ; CHECK-LABEL: test_mask_pcmpgt_d:
561 ; CHECK-NEXT: kmovw %edi, %k1
562 ; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
563 ; CHECK-NEXT: kmovw %k0, %eax
565 %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
569 declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16)
571 define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) {
572 ; CHECK-LABEL: test_pcmpgt_q:
574 ; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
575 ; CHECK-NEXT: kmovw %k0, %eax
577 %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
581 define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
582 ; CHECK-LABEL: test_mask_pcmpgt_q:
584 ; CHECK-NEXT: kmovw %edi, %k1
585 ; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
586 ; CHECK-NEXT: kmovw %k0, %eax
588 %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
592 declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8)
594 declare <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)
596 define <8 x double>@test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
597 ; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_512:
599 ; CHECK-NEXT: vunpckhpd {{.*#+}} zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
600 ; CHECK-NEXT: kmovw %edi, %k1
601 ; CHECK-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
602 ; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm0
604 %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
605 %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
606 %res2 = fadd <8 x double> %res, %res1
607 ret <8 x double> %res2
610 declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
612 define <16 x float>@test_int_x86_avx512_mask_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
613 ; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_512:
615 ; CHECK-NEXT: vunpckhps {{.*#+}} zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
616 ; CHECK-NEXT: kmovw %edi, %k1
617 ; CHECK-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
618 ; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0
620 %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
621 %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
622 %res2 = fadd <16 x float> %res, %res1
623 ret <16 x float> %res2
626 declare <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)
628 define <8 x double>@test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
629 ; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_512:
631 ; CHECK-NEXT: vunpcklpd {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
632 ; CHECK-NEXT: kmovw %edi, %k1
633 ; CHECK-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
634 ; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm0
636 %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
637 %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
638 %res2 = fadd <8 x double> %res, %res1
639 ret <8 x double> %res2
642 declare <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
644 define <16 x float>@test_int_x86_avx512_mask_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
645 ; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_512:
647 ; CHECK-NEXT: vunpcklps {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
648 ; CHECK-NEXT: kmovw %edi, %k1
649 ; CHECK-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
650 ; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0
652 %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
653 %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
654 %res2 = fadd <16 x float> %res, %res1
655 ret <16 x float> %res2
658 declare <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
660 define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
661 ; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512:
663 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
664 ; CHECK-NEXT: kmovw %edi, %k1
665 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
666 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
667 ; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
668 ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
670 %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
671 %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
672 %res2 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer,i8 %x3)
673 %res3 = add <8 x i64> %res, %res1
674 %res4 = add <8 x i64> %res2, %res3
678 declare <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
680 define <8 x i64>@test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
681 ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512:
683 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
684 ; CHECK-NEXT: kmovw %edi, %k1
685 ; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
686 ; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm0
688 %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
689 %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
690 %res2 = add <8 x i64> %res, %res1
694 declare <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
696 define <16 x i32>@test_int_x86_avx512_mask_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
697 ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_512:
699 ; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
700 ; CHECK-NEXT: kmovw %edi, %k1
701 ; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
702 ; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm0
704 %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
705 %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
706 %res2 = add <16 x i32> %res, %res1
710 declare <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
712 define <16 x i32>@test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
713 ; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_512:
715 ; CHECK-NEXT: vpunpckldq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
716 ; CHECK-NEXT: kmovw %edi, %k1
717 ; CHECK-NEXT: vpunpckldq {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
718 ; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm0
720 %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
721 %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
722 %res2 = add <16 x i32> %res, %res1
726 define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) {
727 ; CHECK-LABEL: test_x86_avx512_pslli_d:
729 ; CHECK-NEXT: vpslld $7, %zmm0, %zmm0
731 %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
735 define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
736 ; CHECK-LABEL: test_x86_avx512_mask_pslli_d:
738 ; CHECK-NEXT: kmovw %edi, %k1
739 ; CHECK-NEXT: vpslld $7, %zmm0, %zmm1 {%k1}
740 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
742 %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
746 define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) {
747 ; CHECK-LABEL: test_x86_avx512_maskz_pslli_d:
749 ; CHECK-NEXT: kmovw %edi, %k1
750 ; CHECK-NEXT: vpslld $7, %zmm0, %zmm0 {%k1} {z}
752 %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
756 declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
758 define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) {
759 ; CHECK-LABEL: test_x86_avx512_pslli_q:
761 ; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0
763 %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
767 define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
768 ; CHECK-LABEL: test_x86_avx512_mask_pslli_q:
770 ; CHECK-NEXT: kmovw %edi, %k1
771 ; CHECK-NEXT: vpsllq $7, %zmm0, %zmm1 {%k1}
772 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
774 %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
778 define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) {
779 ; CHECK-LABEL: test_x86_avx512_maskz_pslli_q:
781 ; CHECK-NEXT: kmovw %edi, %k1
782 ; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0 {%k1} {z}
784 %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
788 declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
790 define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) {
791 ; CHECK-LABEL: test_x86_avx512_psrli_d:
793 ; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0
795 %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
799 define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
800 ; CHECK-LABEL: test_x86_avx512_mask_psrli_d:
802 ; CHECK-NEXT: kmovw %edi, %k1
803 ; CHECK-NEXT: vpsrld $7, %zmm0, %zmm1 {%k1}
804 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
806 %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
810 define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) {
811 ; CHECK-LABEL: test_x86_avx512_maskz_psrli_d:
813 ; CHECK-NEXT: kmovw %edi, %k1
814 ; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0 {%k1} {z}
816 %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
820 declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
822 define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) {
823 ; CHECK-LABEL: test_x86_avx512_psrli_q:
825 ; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0
827 %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
831 define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
832 ; CHECK-LABEL: test_x86_avx512_mask_psrli_q:
834 ; CHECK-NEXT: kmovw %edi, %k1
835 ; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm1 {%k1}
836 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
838 %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
842 define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) {
843 ; CHECK-LABEL: test_x86_avx512_maskz_psrli_q:
845 ; CHECK-NEXT: kmovw %edi, %k1
846 ; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
848 %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
852 declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
854 define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) {
855 ; CHECK-LABEL: test_x86_avx512_psrai_d:
857 ; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0
859 %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
863 define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
864 ; CHECK-LABEL: test_x86_avx512_mask_psrai_d:
866 ; CHECK-NEXT: kmovw %edi, %k1
867 ; CHECK-NEXT: vpsrad $7, %zmm0, %zmm1 {%k1}
868 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
870 %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
874 define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) {
875 ; CHECK-LABEL: test_x86_avx512_maskz_psrai_d:
877 ; CHECK-NEXT: kmovw %edi, %k1
878 ; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0 {%k1} {z}
880 %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
884 declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone
886 define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) {
887 ; CHECK-LABEL: test_x86_avx512_psrai_q:
889 ; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0
891 %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
895 define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
896 ; CHECK-LABEL: test_x86_avx512_mask_psrai_q:
898 ; CHECK-NEXT: kmovw %edi, %k1
899 ; CHECK-NEXT: vpsraq $7, %zmm0, %zmm1 {%k1}
900 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
902 %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
906 define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) {
907 ; CHECK-LABEL: test_x86_avx512_maskz_psrai_q:
909 ; CHECK-NEXT: kmovw %edi, %k1
910 ; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0 {%k1} {z}
912 %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
916 declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
918 declare void @llvm.x86.avx512.storent.q.512(i8*, <8 x i64>)
920 define void@test_storent_q_512(<8 x i64> %data, i8* %ptr) {
921 ; CHECK-LABEL: test_storent_q_512:
923 ; CHECK-NEXT: vmovntps %zmm0, (%rdi)
925 call void @llvm.x86.avx512.storent.q.512(i8* %ptr, <8 x i64> %data)
929 declare void @llvm.x86.avx512.storent.pd.512(i8*, <8 x double>)
931 define void @test_storent_pd_512(<8 x double> %data, i8* %ptr) {
932 ; CHECK-LABEL: test_storent_pd_512:
934 ; CHECK-NEXT: vmovntps %zmm0, (%rdi)
936 call void @llvm.x86.avx512.storent.pd.512(i8* %ptr, <8 x double> %data)
940 declare void @llvm.x86.avx512.storent.ps.512(i8*, <16 x float>)
942 define void @test_storent_ps_512(<16 x float> %data, i8* %ptr) {
943 ; CHECK-LABEL: test_storent_ps_512:
945 ; CHECK-NEXT: vmovntps %zmm0, (%rdi)
947 call void @llvm.x86.avx512.storent.ps.512(i8* %ptr, <16 x float> %data)
951 define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) {
952 ; CHECK-LABEL: test_xor_epi32:
954 ; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm0
956 %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
960 define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
961 ; CHECK-LABEL: test_mask_xor_epi32:
963 ; CHECK-NEXT: kmovw %edi, %k1
964 ; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm2 {%k1}
965 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
967 %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
971 declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
973 define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) {
974 ; CHECK-LABEL: test_or_epi32:
976 ; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0
978 %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
982 define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
983 ; CHECK-LABEL: test_mask_or_epi32:
985 ; CHECK-NEXT: kmovw %edi, %k1
986 ; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm2 {%k1}
987 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
989 %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
993 declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
995 define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) {
996 ; CHECK-LABEL: test_and_epi32:
998 ; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm0
1000 %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
1001 ret < 16 x i32> %res
1004 define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
1005 ; CHECK-LABEL: test_mask_and_epi32:
1007 ; CHECK-NEXT: kmovw %edi, %k1
1008 ; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm2 {%k1}
1009 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
1011 %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
1012 ret < 16 x i32> %res
1015 declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
1017 define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) {
1018 ; CHECK-LABEL: test_xor_epi64:
1020 ; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0
1022 %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
1026 define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
1027 ; CHECK-LABEL: test_mask_xor_epi64:
1029 ; CHECK-NEXT: kmovw %edi, %k1
1030 ; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm2 {%k1}
1031 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
1033 %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
1037 declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
1039 define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) {
1040 ; CHECK-LABEL: test_or_epi64:
1042 ; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0
1044 %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
1048 define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
1049 ; CHECK-LABEL: test_mask_or_epi64:
1051 ; CHECK-NEXT: kmovw %edi, %k1
1052 ; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm2 {%k1}
1053 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
1055 %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
1059 declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
1061 define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) {
1062 ; CHECK-LABEL: test_and_epi64:
1064 ; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0
1066 %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
1070 define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
1071 ; CHECK-LABEL: test_mask_and_epi64:
1073 ; CHECK-NEXT: kmovw %edi, %k1
1074 ; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm2 {%k1}
1075 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
1077 %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
1081 declare <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
1083 define <16 x i32> @test_mask_add_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
1084 ; CHECK-LABEL: test_mask_add_epi32_rr:
1086 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
1088 %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
1089 ret < 16 x i32> %res
1092 define <16 x i32> @test_mask_add_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
1093 ; CHECK-LABEL: test_mask_add_epi32_rrk:
1095 ; CHECK-NEXT: kmovw %edi, %k1
1096 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm2 {%k1}
1097 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
1099 %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
1100 ret < 16 x i32> %res
1103 define <16 x i32> @test_mask_add_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
1104 ; CHECK-LABEL: test_mask_add_epi32_rrkz:
1106 ; CHECK-NEXT: kmovw %edi, %k1
1107 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z}
1109 %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
1110 ret < 16 x i32> %res
1113 define <16 x i32> @test_mask_add_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
1114 ; CHECK-LABEL: test_mask_add_epi32_rm:
1116 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0
1118 %b = load <16 x i32>, <16 x i32>* %ptr_b
1119 %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
1120 ret < 16 x i32> %res
1123 define <16 x i32> @test_mask_add_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
1124 ; CHECK-LABEL: test_mask_add_epi32_rmk:
1126 ; CHECK-NEXT: kmovw %esi, %k1
1127 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm1 {%k1}
1128 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
1130 %b = load <16 x i32>, <16 x i32>* %ptr_b
1131 %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
1132 ret < 16 x i32> %res
1135 define <16 x i32> @test_mask_add_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
1136 ; CHECK-LABEL: test_mask_add_epi32_rmkz:
1138 ; CHECK-NEXT: kmovw %esi, %k1
1139 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z}
1141 %b = load <16 x i32>, <16 x i32>* %ptr_b
1142 %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
1143 ret < 16 x i32> %res
1146 define <16 x i32> @test_mask_add_epi32_rmb(<16 x i32> %a, i32* %ptr_b) {
1147 ; CHECK-LABEL: test_mask_add_epi32_rmb:
1149 ; CHECK-NEXT: vpaddd (%rdi){1to16}, %zmm0, %zmm0
1151 %q = load i32, i32* %ptr_b
1152 %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
1153 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
1154 %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
1155 ret < 16 x i32> %res
1158 define <16 x i32> @test_mask_add_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
1159 ; CHECK-LABEL: test_mask_add_epi32_rmbk:
1161 ; CHECK-NEXT: kmovw %esi, %k1
1162 ; CHECK-NEXT: vpaddd (%rdi){1to16}, %zmm0, %zmm1 {%k1}
1163 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
1165 %q = load i32, i32* %ptr_b
1166 %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
1167 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
1168 %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
1169 ret < 16 x i32> %res
1172 define <16 x i32> @test_mask_add_epi32_rmbkz(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
1173 ; CHECK-LABEL: test_mask_add_epi32_rmbkz:
1175 ; CHECK-NEXT: kmovw %esi, %k1
1176 ; CHECK-NEXT: vpaddd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
1178 %q = load i32, i32* %ptr_b
1179 %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
1180 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
1181 %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
1182 ret < 16 x i32> %res
1185 declare <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
1187 define <16 x i32> @test_mask_sub_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
1188 ; CHECK-LABEL: test_mask_sub_epi32_rr:
1190 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0
1192 %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
1193 ret < 16 x i32> %res
1196 define <16 x i32> @test_mask_sub_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
1197 ; CHECK-LABEL: test_mask_sub_epi32_rrk:
1199 ; CHECK-NEXT: kmovw %edi, %k1
1200 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm2 {%k1}
1201 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
1203 %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
1204 ret < 16 x i32> %res
1207 define <16 x i32> @test_mask_sub_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
1208 ; CHECK-LABEL: test_mask_sub_epi32_rrkz:
1210 ; CHECK-NEXT: kmovw %edi, %k1
1211 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 {%k1} {z}
1213 %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
1214 ret < 16 x i32> %res
1217 define <16 x i32> @test_mask_sub_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
1218 ; CHECK-LABEL: test_mask_sub_epi32_rm:
1220 ; CHECK-NEXT: vpsubd (%rdi), %zmm0, %zmm0
1222 %b = load <16 x i32>, <16 x i32>* %ptr_b
1223 %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
1224 ret < 16 x i32> %res
1227 define <16 x i32> @test_mask_sub_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
1228 ; CHECK-LABEL: test_mask_sub_epi32_rmk:
1230 ; CHECK-NEXT: kmovw %esi, %k1
1231 ; CHECK-NEXT: vpsubd (%rdi), %zmm0, %zmm1 {%k1}
1232 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
1234 %b = load <16 x i32>, <16 x i32>* %ptr_b
1235 %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
1236 ret < 16 x i32> %res
1239 define <16 x i32> @test_mask_sub_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
1240 ; CHECK-LABEL: test_mask_sub_epi32_rmkz:
1242 ; CHECK-NEXT: kmovw %esi, %k1
1243 ; CHECK-NEXT: vpsubd (%rdi), %zmm0, %zmm0 {%k1} {z}
1245 %b = load <16 x i32>, <16 x i32>* %ptr_b
1246 %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
1247 ret < 16 x i32> %res
1250 define <16 x i32> @test_mask_sub_epi32_rmb(<16 x i32> %a, i32* %ptr_b) {
1251 ; CHECK-LABEL: test_mask_sub_epi32_rmb:
1253 ; CHECK-NEXT: vpsubd (%rdi){1to16}, %zmm0, %zmm0
1255 %q = load i32, i32* %ptr_b
1256 %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
1257 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
1258 %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
1259 ret < 16 x i32> %res
1262 define <16 x i32> @test_mask_sub_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
1263 ; CHECK-LABEL: test_mask_sub_epi32_rmbk:
1265 ; CHECK-NEXT: kmovw %esi, %k1
1266 ; CHECK-NEXT: vpsubd (%rdi){1to16}, %zmm0, %zmm1 {%k1}
1267 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
1269 %q = load i32, i32* %ptr_b
1270 %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
1271 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
1272 %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
1273 ret < 16 x i32> %res
1276 define <16 x i32> @test_mask_sub_epi32_rmbkz(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
1277 ; CHECK-LABEL: test_mask_sub_epi32_rmbkz:
1279 ; CHECK-NEXT: kmovw %esi, %k1
1280 ; CHECK-NEXT: vpsubd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
1282 %q = load i32, i32* %ptr_b
1283 %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
1284 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
1285 %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
1286 ret < 16 x i32> %res
1289 declare <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
1291 define <8 x i64> @test_mask_add_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
1292 ; CHECK-LABEL: test_mask_add_epi64_rr:
1294 ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
1296 %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
1300 define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
1301 ; CHECK-LABEL: test_mask_add_epi64_rrk:
1303 ; CHECK-NEXT: kmovw %edi, %k1
1304 ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm2 {%k1}
1305 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
1307 %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
1311 define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
1312 ; CHECK-LABEL: test_mask_add_epi64_rrkz:
1314 ; CHECK-NEXT: kmovw %edi, %k1
1315 ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z}
1317 %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
1321 define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
1322 ; CHECK-LABEL: test_mask_add_epi64_rm:
1324 ; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm0
1326 %b = load <8 x i64>, <8 x i64>* %ptr_b
1327 %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
1331 define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
1332 ; CHECK-LABEL: test_mask_add_epi64_rmk:
1334 ; CHECK-NEXT: kmovw %esi, %k1
1335 ; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm1 {%k1}
1336 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
1338 %b = load <8 x i64>, <8 x i64>* %ptr_b
1339 %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
1343 define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
1344 ; CHECK-LABEL: test_mask_add_epi64_rmkz:
1346 ; CHECK-NEXT: kmovw %esi, %k1
1347 ; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm0 {%k1} {z}
1349 %b = load <8 x i64>, <8 x i64>* %ptr_b
1350 %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
1354 define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
1355 ; CHECK-LABEL: test_mask_add_epi64_rmb:
1357 ; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0
1359 %q = load i64, i64* %ptr_b
1360 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
1361 %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
1362 %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
1366 define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
1367 ; CHECK-LABEL: test_mask_add_epi64_rmbk:
1369 ; CHECK-NEXT: kmovw %esi, %k1
1370 ; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
1371 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
1373 %q = load i64, i64* %ptr_b
1374 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
1375 %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
1376 %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
1380 define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
1381 ; CHECK-LABEL: test_mask_add_epi64_rmbkz:
1383 ; CHECK-NEXT: kmovw %esi, %k1
1384 ; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
1386 %q = load i64, i64* %ptr_b
1387 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
1388 %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
1389 %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
1393 declare <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
1395 define <8 x i64> @test_mask_sub_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
1396 ; CHECK-LABEL: test_mask_sub_epi64_rr:
1398 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0
1400 %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
1404 define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
1405 ; CHECK-LABEL: test_mask_sub_epi64_rrk:
1407 ; CHECK-NEXT: kmovw %edi, %k1
1408 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm2 {%k1}
1409 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
1411 %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
1415 define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
1416 ; CHECK-LABEL: test_mask_sub_epi64_rrkz:
1418 ; CHECK-NEXT: kmovw %edi, %k1
1419 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z}
1421 %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
1425 define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
1426 ; CHECK-LABEL: test_mask_sub_epi64_rm:
1428 ; CHECK-NEXT: vpsubq (%rdi), %zmm0, %zmm0
1430 %b = load <8 x i64>, <8 x i64>* %ptr_b
1431 %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
1435 define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
1436 ; CHECK-LABEL: test_mask_sub_epi64_rmk:
1438 ; CHECK-NEXT: kmovw %esi, %k1
1439 ; CHECK-NEXT: vpsubq (%rdi), %zmm0, %zmm1 {%k1}
1440 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
1442 %b = load <8 x i64>, <8 x i64>* %ptr_b
1443 %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
1447 define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
1448 ; CHECK-LABEL: test_mask_sub_epi64_rmkz:
1450 ; CHECK-NEXT: kmovw %esi, %k1
1451 ; CHECK-NEXT: vpsubq (%rdi), %zmm0, %zmm0 {%k1} {z}
1453 %b = load <8 x i64>, <8 x i64>* %ptr_b
1454 %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
1458 define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
1459 ; CHECK-LABEL: test_mask_sub_epi64_rmb:
1461 ; CHECK-NEXT: vpsubq (%rdi){1to8}, %zmm0, %zmm0
1463 %q = load i64, i64* %ptr_b
1464 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
1465 %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
1466 %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
1470 define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
1471 ; CHECK-LABEL: test_mask_sub_epi64_rmbk:
1473 ; CHECK-NEXT: kmovw %esi, %k1
1474 ; CHECK-NEXT: vpsubq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
1475 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
1477 %q = load i64, i64* %ptr_b
1478 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
1479 %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
1480 %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
1484 define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
1485 ; CHECK-LABEL: test_mask_sub_epi64_rmbkz:
1487 ; CHECK-NEXT: kmovw %esi, %k1
1488 ; CHECK-NEXT: vpsubq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
1490 %q = load i64, i64* %ptr_b
1491 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
1492 %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
1493 %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
1497 declare <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
1499 define <16 x i32> @test_mask_mullo_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
1500 ; CHECK-LABEL: test_mask_mullo_epi32_rr_512:
1502 ; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0
1504 %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
1508 define <16 x i32> @test_mask_mullo_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
1509 ; CHECK-LABEL: test_mask_mullo_epi32_rrk_512:
1511 ; CHECK-NEXT: kmovw %edi, %k1
1512 ; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm2 {%k1}
1513 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
1515 %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
1516 ret < 16 x i32> %res
1519 define <16 x i32> @test_mask_mullo_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
1520 ; CHECK-LABEL: test_mask_mullo_epi32_rrkz_512:
1522 ; CHECK-NEXT: kmovw %edi, %k1
1523 ; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0 {%k1} {z}
1525 %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
1526 ret < 16 x i32> %res
1529 define <16 x i32> @test_mask_mullo_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
1530 ; CHECK-LABEL: test_mask_mullo_epi32_rm_512:
1532 ; CHECK-NEXT: vpmulld (%rdi), %zmm0, %zmm0
1534 %b = load <16 x i32>, <16 x i32>* %ptr_b
1535 %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
1536 ret < 16 x i32> %res
1539 define <16 x i32> @test_mask_mullo_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
1540 ; CHECK-LABEL: test_mask_mullo_epi32_rmk_512:
1542 ; CHECK-NEXT: kmovw %esi, %k1
1543 ; CHECK-NEXT: vpmulld (%rdi), %zmm0, %zmm1 {%k1}
1544 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
1546 %b = load <16 x i32>, <16 x i32>* %ptr_b
1547 %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
1548 ret < 16 x i32> %res
1551 define <16 x i32> @test_mask_mullo_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
1552 ; CHECK-LABEL: test_mask_mullo_epi32_rmkz_512:
1554 ; CHECK-NEXT: kmovw %esi, %k1
1555 ; CHECK-NEXT: vpmulld (%rdi), %zmm0, %zmm0 {%k1} {z}
1557 %b = load <16 x i32>, <16 x i32>* %ptr_b
1558 %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
1559 ret < 16 x i32> %res
1562 define <16 x i32> @test_mask_mullo_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
1563 ; CHECK-LABEL: test_mask_mullo_epi32_rmb_512:
1565 ; CHECK-NEXT: vpmulld (%rdi){1to16}, %zmm0, %zmm0
1567 %q = load i32, i32* %ptr_b
1568 %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
1569 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
1570 %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
1571 ret < 16 x i32> %res
1574 define <16 x i32> @test_mask_mullo_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
1575 ; CHECK-LABEL: test_mask_mullo_epi32_rmbk_512:
1577 ; CHECK-NEXT: kmovw %esi, %k1
1578 ; CHECK-NEXT: vpmulld (%rdi){1to16}, %zmm0, %zmm1 {%k1}
1579 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
1581 %q = load i32, i32* %ptr_b
1582 %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
1583 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
1584 %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
1585 ret < 16 x i32> %res
1588 define <16 x i32> @test_mask_mullo_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
1589 ; CHECK-LABEL: test_mask_mullo_epi32_rmbkz_512:
1591 ; CHECK-NEXT: kmovw %esi, %k1
1592 ; CHECK-NEXT: vpmulld (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
1594 %q = load i32, i32* %ptr_b
1595 %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
1596 %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
1597 %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
1598 ret < 16 x i32> %res
1601 declare <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
1603 declare <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double>, <8 x double>, i32, <8 x double>, i8)
1605 define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
1606 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_512:
1608 ; CHECK-NEXT: vshufpd {{.*#+}} zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
1609 ; CHECK-NEXT: kmovw %edi, %k1
1610 ; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
1611 ; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
1612 ; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
1613 ; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
1615 %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
1616 %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
1617 %res2 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4)
1619 %res3 = fadd <8 x double> %res, %res1
1620 %res4 = fadd <8 x double> %res3, %res2
1621 ret <8 x double> %res4
1624 declare <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float>, <16 x float>, i32, <16 x float>, i16)
1626 define <16 x float>@test_int_x86_avx512_mask_shuf_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) {
1627 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_512:
1629 ; CHECK-NEXT: vshufps {{.*#+}} zmm3 = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12]
1630 ; CHECK-NEXT: kmovw %edi, %k1
1631 ; CHECK-NEXT: vshufps {{.*#+}} zmm2 {%k1} = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12]
1632 ; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0
1634 %res = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
1635 %res1 = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1)
1636 %res2 = fadd <16 x float> %res, %res1
1637 ret <16 x float> %res2
1640 declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
1642 define <16 x i32>@test_int_x86_avx512_mask_pmaxs_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
1643 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_d_512:
1645 ; CHECK-NEXT: vpmaxsd %zmm1, %zmm0, %zmm3
1646 ; CHECK-NEXT: kmovw %edi, %k1
1647 ; CHECK-NEXT: vpmaxsd %zmm1, %zmm0, %zmm2 {%k1}
1648 ; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm0
1650 %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
1651 %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
1652 %res2 = add <16 x i32> %res, %res1
1653 ret <16 x i32> %res2
1656 declare <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
1658 define <8 x i64>@test_int_x86_avx512_mask_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
1659 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_q_512:
1661 ; CHECK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm3
1662 ; CHECK-NEXT: kmovw %edi, %k1
1663 ; CHECK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm2 {%k1}
1664 ; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm0
1666 %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
1667 %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
1668 %res2 = add <8 x i64> %res, %res1
1672 declare <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
1674 define <16 x i32>@test_int_x86_avx512_mask_pmaxu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
1675 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_d_512:
1677 ; CHECK-NEXT: vpmaxud %zmm1, %zmm0, %zmm3
1678 ; CHECK-NEXT: kmovw %edi, %k1
1679 ; CHECK-NEXT: vpmaxud %zmm1, %zmm0, %zmm2 {%k1}
1680 ; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm0
1682 %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
1683 %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
1684 %res2 = add <16 x i32> %res, %res1
1685 ret <16 x i32> %res2
1688 declare <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
1690 define <8 x i64>@test_int_x86_avx512_mask_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
1691 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_q_512:
1693 ; CHECK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm3
1694 ; CHECK-NEXT: kmovw %edi, %k1
1695 ; CHECK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm2 {%k1}
1696 ; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm0
1698 %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
1699 %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
1700 %res2 = add <8 x i64> %res, %res1
1704 declare <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
1706 define <16 x i32>@test_int_x86_avx512_mask_pmins_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
1707 ; CHECK-LABEL: test_int_x86_avx512_mask_pmins_d_512:
1709 ; CHECK-NEXT: vpminsd %zmm1, %zmm0, %zmm3
1710 ; CHECK-NEXT: kmovw %edi, %k1
1711 ; CHECK-NEXT: vpminsd %zmm1, %zmm0, %zmm2 {%k1}
1712 ; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm0
1714 %res = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
1715 %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
1716 %res2 = add <16 x i32> %res, %res1
1717 ret <16 x i32> %res2
1720 declare <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
1722 define <8 x i64>@test_int_x86_avx512_mask_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
1723 ; CHECK-LABEL: test_int_x86_avx512_mask_pmins_q_512:
1725 ; CHECK-NEXT: vpminsq %zmm1, %zmm0, %zmm3
1726 ; CHECK-NEXT: kmovw %edi, %k1
1727 ; CHECK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 {%k1}
1728 ; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm0
1730 %res = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
1731 %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
1732 %res2 = add <8 x i64> %res, %res1
1736 declare <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
1738 define <16 x i32>@test_int_x86_avx512_mask_pminu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
1739 ; CHECK-LABEL: test_int_x86_avx512_mask_pminu_d_512:
1741 ; CHECK-NEXT: vpminud %zmm1, %zmm0, %zmm3
1742 ; CHECK-NEXT: kmovw %edi, %k1
1743 ; CHECK-NEXT: vpminud %zmm1, %zmm0, %zmm2 {%k1}
1744 ; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm0
1746 %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
1747 %res1 = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
1748 %res2 = add <16 x i32> %res, %res1
1749 ret <16 x i32> %res2
1752 declare <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
1754 define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
1755 ; CHECK-LABEL: test_int_x86_avx512_mask_pminu_q_512:
1757 ; CHECK-NEXT: vpminuq %zmm1, %zmm0, %zmm3
1758 ; CHECK-NEXT: kmovw %edi, %k1
1759 ; CHECK-NEXT: vpminuq %zmm1, %zmm0, %zmm2 {%k1}
1760 ; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm0
1762 %res = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
1763 %res1 = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
1764 %res2 = add <8 x i64> %res, %res1
1768 define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
1769 ; CHECK-LABEL: test_mm_mask_move_ss:
1770 ; CHECK: ## BB#0: ## %entry
1771 ; CHECK-NEXT: kmovw %edi, %k1
1772 ; CHECK-NEXT: vmovss %xmm2, %xmm1, %xmm0 {%k1}
1775 %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> %__W, i8 %__U)
1776 ret <4 x float> %res
1780 define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
1781 ; CHECK-LABEL: test_mm_maskz_move_ss:
1782 ; CHECK: ## BB#0: ## %entry
1783 ; CHECK-NEXT: kmovw %edi, %k1
1784 ; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
1787 %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> zeroinitializer, i8 %__U)
1788 ret <4 x float> %res
1791 define <2 x double> @test_mm_mask_move_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
1792 ; CHECK-LABEL: test_mm_mask_move_sd:
1793 ; CHECK: ## BB#0: ## %entry
1794 ; CHECK-NEXT: kmovw %edi, %k1
1795 ; CHECK-NEXT: vmovsd %xmm2, %xmm1, %xmm0 {%k1}
1798 %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__W, i8 %__U)
1799 ret <2 x double> %res
1802 define <2 x double> @test_mm_maskz_move_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
1803 ; CHECK-LABEL: test_mm_maskz_move_sd:
1804 ; CHECK: ## BB#0: ## %entry
1805 ; CHECK-NEXT: kmovw %edi, %k1
1806 ; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
1809 %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> zeroinitializer, i8 %__U)
1810 ret <2 x double> %res
1813 declare <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float>, <4 x float>, <4 x float>, i8)
1814 declare <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double>, <2 x double>, <2 x double>, i8)
1816 declare <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8>, <16 x i32>, i16)
1818 define <16 x i32>@test_int_x86_avx512_mask_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) {
1819 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_512:
1821 ; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1822 ; CHECK-NEXT: kmovw %edi, %k1
1823 ; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1824 ; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1825 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
1826 ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
1828 %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
1829 %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
1830 %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
1831 %res3 = add <16 x i32> %res, %res1
1832 %res4 = add <16 x i32> %res3, %res2
1833 ret <16 x i32> %res4
1836 declare <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8>, <8 x i64>, i8)
1838 define <8 x i64>@test_int_x86_avx512_mask_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) {
1839 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_512:
1841 ; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
1842 ; CHECK-NEXT: kmovw %edi, %k1
1843 ; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
1844 ; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
1845 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
1846 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
1848 %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
1849 %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
1850 %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
1851 %res3 = add <8 x i64> %res, %res1
1852 %res4 = add <8 x i64> %res3, %res2
1856 declare <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32>, <8 x i64>, i8)
1858 define <8 x i64>@test_int_x86_avx512_mask_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) {
1859 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_512:
1861 ; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
1862 ; CHECK-NEXT: kmovw %edi, %k1
1863 ; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
1864 ; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
1865 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
1866 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
1868 %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
1869 %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
1870 %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
1871 %res3 = add <8 x i64> %res, %res1
1872 %res4 = add <8 x i64> %res3, %res2
1876 declare <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16>, <16 x i32>, i16)
1878 define <16 x i32>@test_int_x86_avx512_mask_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) {
1879 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_512:
1881 ; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1882 ; CHECK-NEXT: kmovw %edi, %k1
1883 ; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1884 ; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1885 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
1886 ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
1888 %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
1889 %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
1890 %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
1891 %res3 = add <16 x i32> %res, %res1
1892 %res4 = add <16 x i32> %res3, %res2
1893 ret <16 x i32> %res4
1896 declare <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16>, <8 x i64>, i8)
1898 define <8 x i64>@test_int_x86_avx512_mask_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) {
1899 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_512:
1901 ; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1902 ; CHECK-NEXT: kmovw %edi, %k1
1903 ; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1904 ; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1905 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
1906 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
1908 %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
1909 %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
1910 %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1)
1911 %res3 = add <8 x i64> %res, %res1
1912 %res4 = add <8 x i64> %res3, %res2
1916 declare <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8>, <16 x i32>, i16)
1918 define <16 x i32>@test_int_x86_avx512_mask_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) {
1919 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_512:
1921 ; CHECK-NEXT: vpmovsxbd %xmm0, %zmm2
1922 ; CHECK-NEXT: kmovw %edi, %k1
1923 ; CHECK-NEXT: vpmovsxbd %xmm0, %zmm1 {%k1}
1924 ; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z}
1925 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
1926 ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
1928 %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
1929 %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
1930 %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
1931 %res3 = add <16 x i32> %res, %res1
1932 %res4 = add <16 x i32> %res3, %res2
1933 ret <16 x i32> %res4
1936 declare <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8>, <8 x i64>, i8)
1938 define <8 x i64>@test_int_x86_avx512_mask_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) {
1939 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_512:
1941 ; CHECK-NEXT: vpmovsxbq %xmm0, %zmm2
1942 ; CHECK-NEXT: kmovw %edi, %k1
1943 ; CHECK-NEXT: vpmovsxbq %xmm0, %zmm1 {%k1}
1944 ; CHECK-NEXT: vpmovsxbq %xmm0, %zmm0 {%k1} {z}
1945 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
1946 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
1948 %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
1949 %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
1950 %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
1951 %res3 = add <8 x i64> %res, %res1
1952 %res4 = add <8 x i64> %res3, %res2
1956 declare <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32>, <8 x i64>, i8)
1958 define <8 x i64>@test_int_x86_avx512_mask_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) {
1959 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_512:
1961 ; CHECK-NEXT: vpmovsxdq %ymm0, %zmm2
1962 ; CHECK-NEXT: kmovw %edi, %k1
1963 ; CHECK-NEXT: vpmovsxdq %ymm0, %zmm1 {%k1}
1964 ; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0 {%k1} {z}
1965 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
1966 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
1968 %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
1969 %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
1970 %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
1971 %res3 = add <8 x i64> %res, %res1
1972 %res4 = add <8 x i64> %res3, %res2
1977 declare <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16>, <16 x i32>, i16)
1979 define <16 x i32>@test_int_x86_avx512_mask_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) {
1980 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_512:
1982 ; CHECK-NEXT: vpmovsxwd %ymm0, %zmm2
1983 ; CHECK-NEXT: kmovw %edi, %k1
1984 ; CHECK-NEXT: vpmovsxwd %ymm0, %zmm1 {%k1}
1985 ; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0 {%k1} {z}
1986 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
1987 ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
1989 %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
1990 %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
1991 %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
1992 %res3 = add <16 x i32> %res, %res1
1993 %res4 = add <16 x i32> %res3, %res2
1994 ret <16 x i32> %res4
1998 declare <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16>, <8 x i64>, i8)
2000 define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) {
2001 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_512:
2003 ; CHECK-NEXT: vpmovsxwq %xmm0, %zmm2
2004 ; CHECK-NEXT: kmovw %edi, %k1
2005 ; CHECK-NEXT: vpmovsxwq %xmm0, %zmm1 {%k1}
2006 ; CHECK-NEXT: vpmovsxwq %xmm0, %zmm0 {%k1} {z}
2007 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
2008 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
2010 %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
2011 %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
2012 %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1)
2013 %res3 = add <8 x i64> %res, %res1
2014 %res4 = add <8 x i64> %res3, %res2
2018 declare <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64>, i32, <8 x i64>, i8)
2020 define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
2021 ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_512:
2023 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm2
2024 ; CHECK-NEXT: kmovw %esi, %k1
2025 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm1 {%k1}
2026 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z}
2027 ; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
2028 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
2030 %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3)
2031 %res1 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 -1)
2032 %res2 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> zeroinitializer, i8 %x3)
2033 %res3 = add <8 x i64> %res, %res1
2034 %res4 = add <8 x i64> %res3, %res2
2038 declare <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i32, <16 x i32>, i16)
2040 define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
2041 ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_512:
2043 ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm2
2044 ; CHECK-NEXT: kmovw %esi, %k1
2045 ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm1 {%k1}
2046 ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z}
2047 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
2048 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
2050 %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3)
2051 %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 -1)
2052 %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> zeroinitializer, i16 %x3)
2053 %res3 = add <16 x i32> %res, %res1
2054 %res4 = add <16 x i32> %res3, %res2
2055 ret <16 x i32> %res4
2058 declare <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32>, i32, <16 x i32>, i16)
2060 define <16 x i32>@test_int_x86_avx512_mask_psra_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
2061 ; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_512:
2063 ; CHECK-NEXT: vpsrad $3, %zmm0, %zmm2
2064 ; CHECK-NEXT: kmovw %esi, %k1
2065 ; CHECK-NEXT: vpsrad $3, %zmm0, %zmm1 {%k1}
2066 ; CHECK-NEXT: vpsrad $3, %zmm0, %zmm0 {%k1} {z}
2067 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
2068 ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
2070 %res = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
2071 %res1 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
2072 %res2 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
2073 %res3 = add <16 x i32> %res, %res1
2074 %res4 = add <16 x i32> %res3, %res2
2075 ret <16 x i32> %res4
2078 declare <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64>, i32, <8 x i64>, i8)
2080 define <8 x i64>@test_int_x86_avx512_mask_psra_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
2081 ; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_512:
2083 ; CHECK-NEXT: vpsraq $3, %zmm0, %zmm2
2084 ; CHECK-NEXT: kmovw %esi, %k1
2085 ; CHECK-NEXT: vpsraq $3, %zmm0, %zmm1 {%k1}
2086 ; CHECK-NEXT: vpsraq $3, %zmm0, %zmm0 {%k1} {z}
2087 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
2088 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
2090 %res = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
2091 %res1 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
2092 %res2 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
2093 %res3 = add <8 x i64> %res, %res1
2094 %res4 = add <8 x i64> %res3, %res2
2098 declare <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32>, i32, <16 x i32>, i16)
2100 define <16 x i32>@test_int_x86_avx512_mask_psll_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
2101 ; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_512:
2103 ; CHECK-NEXT: vpslld $3, %zmm0, %zmm2
2104 ; CHECK-NEXT: kmovw %esi, %k1
2105 ; CHECK-NEXT: vpslld $3, %zmm0, %zmm1 {%k1}
2106 ; CHECK-NEXT: vpslld $3, %zmm0, %zmm0 {%k1} {z}
2107 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
2108 ; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
2110 %res = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
2111 %res1 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
2112 %res2 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
2113 %res3 = add <16 x i32> %res, %res1
2114 %res4 = add <16 x i32> %res3, %res2
2115 ret <16 x i32> %res4
2118 declare <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64>, i32, <8 x i64>, i8)
2120 define <8 x i64>@test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
2121 ; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_512:
2123 ; CHECK-NEXT: vpsllq $3, %zmm0, %zmm2
2124 ; CHECK-NEXT: kmovw %esi, %k1
2125 ; CHECK-NEXT: vpsllq $3, %zmm0, %zmm1 {%k1}
2126 ; CHECK-NEXT: vpsllq $3, %zmm0, %zmm0 {%k1} {z}
2127 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
2128 ; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0
2130 %res = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
2131 %res1 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
2132 %res2 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
2133 %res3 = add <8 x i64> %res, %res1
2134 %res4 = add <8 x i64> %res3, %res2
2138 define <16 x i32> @test_x86_avx512_psll_d(<16 x i32> %a0, <4 x i32> %a1) {
2139 ; CHECK-LABEL: test_x86_avx512_psll_d:
2141 ; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm0
2143 %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
2147 define <16 x i32> @test_x86_avx512_mask_psll_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
2148 ; CHECK-LABEL: test_x86_avx512_mask_psll_d:
2150 ; CHECK-NEXT: kmovw %edi, %k1
2151 ; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm2 {%k1}
2152 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2154 %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
2158 define <16 x i32> @test_x86_avx512_maskz_psll_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
2159 ; CHECK-LABEL: test_x86_avx512_maskz_psll_d:
2161 ; CHECK-NEXT: kmovw %edi, %k1
2162 ; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm0 {%k1} {z}
2164 %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
2168 declare <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
2170 define <8 x i64> @test_x86_avx512_psll_q(<8 x i64> %a0, <2 x i64> %a1) {
2171 ; CHECK-LABEL: test_x86_avx512_psll_q:
2173 ; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm0
2175 %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
2179 define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
2180 ; CHECK-LABEL: test_x86_avx512_mask_psll_q:
2182 ; CHECK-NEXT: kmovw %edi, %k1
2183 ; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm2 {%k1}
2184 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2186 %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
2190 define <8 x i64> @test_x86_avx512_maskz_psll_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
2191 ; CHECK-LABEL: test_x86_avx512_maskz_psll_q:
2193 ; CHECK-NEXT: kmovw %edi, %k1
2194 ; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
2196 %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
2200 declare <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
2202 define <16 x i32> @test_x86_avx512_psrl_d(<16 x i32> %a0, <4 x i32> %a1) {
2203 ; CHECK-LABEL: test_x86_avx512_psrl_d:
2205 ; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm0
2207 %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
2211 define <16 x i32> @test_x86_avx512_mask_psrl_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
2212 ; CHECK-LABEL: test_x86_avx512_mask_psrl_d:
2214 ; CHECK-NEXT: kmovw %edi, %k1
2215 ; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm2 {%k1}
2216 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2218 %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
2222 define <16 x i32> @test_x86_avx512_maskz_psrl_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
2223 ; CHECK-LABEL: test_x86_avx512_maskz_psrl_d:
2225 ; CHECK-NEXT: kmovw %edi, %k1
2226 ; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z}
2228 %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
2232 declare <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
2234 define <8 x i64> @test_x86_avx512_psrl_q(<8 x i64> %a0, <2 x i64> %a1) {
2235 ; CHECK-LABEL: test_x86_avx512_psrl_q:
2237 ; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm0
2239 %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
2243 define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
2244 ; CHECK-LABEL: test_x86_avx512_mask_psrl_q:
2246 ; CHECK-NEXT: kmovw %edi, %k1
2247 ; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
2248 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2250 %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
2254 define <8 x i64> @test_x86_avx512_maskz_psrl_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
2255 ; CHECK-LABEL: test_x86_avx512_maskz_psrl_q:
2257 ; CHECK-NEXT: kmovw %edi, %k1
2258 ; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
2260 %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
2264 declare <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
2266 define <16 x i32> @test_x86_avx512_psra_d(<16 x i32> %a0, <4 x i32> %a1) {
2267 ; CHECK-LABEL: test_x86_avx512_psra_d:
2269 ; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm0
2271 %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
2275 define <16 x i32> @test_x86_avx512_mask_psra_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
2276 ; CHECK-LABEL: test_x86_avx512_mask_psra_d:
2278 ; CHECK-NEXT: kmovw %edi, %k1
2279 ; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm2 {%k1}
2280 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2282 %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
2286 define <16 x i32> @test_x86_avx512_maskz_psra_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
2287 ; CHECK-LABEL: test_x86_avx512_maskz_psra_d:
2289 ; CHECK-NEXT: kmovw %edi, %k1
2290 ; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z}
2292 %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
2296 declare <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
2298 define <8 x i64> @test_x86_avx512_psra_q(<8 x i64> %a0, <2 x i64> %a1) {
2299 ; CHECK-LABEL: test_x86_avx512_psra_q:
2301 ; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm0
2303 %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
2307 define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
2308 ; CHECK-LABEL: test_x86_avx512_mask_psra_q:
2310 ; CHECK-NEXT: kmovw %edi, %k1
2311 ; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm2 {%k1}
2312 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2314 %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
2318 define <8 x i64> @test_x86_avx512_maskz_psra_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
2319 ; CHECK-LABEL: test_x86_avx512_maskz_psra_q:
2321 ; CHECK-NEXT: kmovw %edi, %k1
2322 ; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
2324 %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
2328 declare <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
2330 define <16 x i32> @test_x86_avx512_psllv_d(<16 x i32> %a0, <16 x i32> %a1) {
2331 ; CHECK-LABEL: test_x86_avx512_psllv_d:
2333 ; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
2335 %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
2339 define <16 x i32> @test_x86_avx512_mask_psllv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
2340 ; CHECK-LABEL: test_x86_avx512_mask_psllv_d:
2342 ; CHECK-NEXT: kmovw %edi, %k1
2343 ; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm2 {%k1}
2344 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2346 %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
2350 define <16 x i32> @test_x86_avx512_maskz_psllv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
2351 ; CHECK-LABEL: test_x86_avx512_maskz_psllv_d:
2353 ; CHECK-NEXT: kmovw %edi, %k1
2354 ; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z}
2356 %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
2360 declare <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
2362 define <8 x i64> @test_x86_avx512_psllv_q(<8 x i64> %a0, <8 x i64> %a1) {
2363 ; CHECK-LABEL: test_x86_avx512_psllv_q:
2365 ; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0
2367 %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
2371 define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
2372 ; CHECK-LABEL: test_x86_avx512_mask_psllv_q:
2374 ; CHECK-NEXT: kmovw %edi, %k1
2375 ; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
2376 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2378 %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
2382 define <8 x i64> @test_x86_avx512_maskz_psllv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
2383 ; CHECK-LABEL: test_x86_avx512_maskz_psllv_q:
2385 ; CHECK-NEXT: kmovw %edi, %k1
2386 ; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
2388 %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
2392 declare <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
2395 define <16 x i32> @test_x86_avx512_psrav_d(<16 x i32> %a0, <16 x i32> %a1) {
2396 ; CHECK-LABEL: test_x86_avx512_psrav_d:
2398 ; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm0
2400 %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
2404 define <16 x i32> @test_x86_avx512_mask_psrav_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
2405 ; CHECK-LABEL: test_x86_avx512_mask_psrav_d:
2407 ; CHECK-NEXT: kmovw %edi, %k1
2408 ; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm2 {%k1}
2409 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2411 %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
2415 define <16 x i32> @test_x86_avx512_maskz_psrav_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
2416 ; CHECK-LABEL: test_x86_avx512_maskz_psrav_d:
2418 ; CHECK-NEXT: kmovw %edi, %k1
2419 ; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z}
2421 %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
2425 declare <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
2427 define <8 x i64> @test_x86_avx512_psrav_q(<8 x i64> %a0, <8 x i64> %a1) {
2428 ; CHECK-LABEL: test_x86_avx512_psrav_q:
2430 ; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm0
2432 %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
2436 define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
2437 ; CHECK-LABEL: test_x86_avx512_mask_psrav_q:
2439 ; CHECK-NEXT: kmovw %edi, %k1
2440 ; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm2 {%k1}
2441 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2443 %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
2447 define <8 x i64> @test_x86_avx512_maskz_psrav_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
2448 ; CHECK-LABEL: test_x86_avx512_maskz_psrav_q:
2450 ; CHECK-NEXT: kmovw %edi, %k1
2451 ; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
2453 %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
2457 declare <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
2459 define <16 x i32> @test_x86_avx512_psrlv_d(<16 x i32> %a0, <16 x i32> %a1) {
2460 ; CHECK-LABEL: test_x86_avx512_psrlv_d:
2462 ; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
2464 %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
2468 define <16 x i32> @test_x86_avx512_mask_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
2469 ; CHECK-LABEL: test_x86_avx512_mask_psrlv_d:
2471 ; CHECK-NEXT: kmovw %edi, %k1
2472 ; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm2 {%k1}
2473 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2475 %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
2479 define <16 x i32> @test_x86_avx512_maskz_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
2480 ; CHECK-LABEL: test_x86_avx512_maskz_psrlv_d:
2482 ; CHECK-NEXT: kmovw %edi, %k1
2483 ; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z}
2485 %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
2489 declare <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
2491 define <8 x i64> @test_x86_avx512_psrlv_q(<8 x i64> %a0, <8 x i64> %a1) {
2492 ; CHECK-LABEL: test_x86_avx512_psrlv_q:
2494 ; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0
2496 %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
2500 define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
2501 ; CHECK-LABEL: test_x86_avx512_mask_psrlv_q:
2503 ; CHECK-NEXT: kmovw %edi, %k1
2504 ; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
2505 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2507 %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
2511 define <8 x i64> @test_x86_avx512_maskz_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
2512 ; CHECK-LABEL: test_x86_avx512_maskz_psrlv_q:
2514 ; CHECK-NEXT: kmovw %edi, %k1
2515 ; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
2517 %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
2521 declare <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
2523 define <8 x i64> @test_x86_avx512_psrlv_q_memop(<8 x i64> %a0, <8 x i64>* %ptr) {
2524 ; CHECK-LABEL: test_x86_avx512_psrlv_q_memop:
2526 ; CHECK-NEXT: vpsrlvq (%rdi), %zmm0, %zmm0
2528 %b = load <8 x i64>, <8 x i64>* %ptr
2529 %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
2533 declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>, i8)
2535 define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) {
2536 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_512:
2538 ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm2
2539 ; CHECK-NEXT: kmovw %edi, %k1
2540 ; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1 {%k1}
2541 ; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm0
2543 %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2)
2544 %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1)
2545 %res2 = fadd <8 x double> %res, %res1
2546 ret <8 x double> %res2
2549 declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8)
2551 define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) {
2552 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_512:
2554 ; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm2
2555 ; CHECK-NEXT: kmovw %edi, %k1
2556 ; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm1 {%k1}
2557 ; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm0
2559 %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2)
2560 %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1)
2561 %res2 = fadd <8 x double> %res, %res1
2562 ret <8 x double> %res2
2565 define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) {
2566 ; CHECK-LABEL: test_valign_q:
2568 ; CHECK-NEXT: valignq {{.*#+}} zmm0 = zmm1[2,3,4,5,6,7],zmm0[0,1]
2570 %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> zeroinitializer, i8 -1)
2574 define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) {
2575 ; CHECK-LABEL: test_mask_valign_q:
2577 ; CHECK-NEXT: kmovw %edi, %k1
2578 ; CHECK-NEXT: valignq {{.*#+}} zmm2 {%k1} = zmm1[2,3,4,5,6,7],zmm0[0,1]
2579 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2581 %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> %src, i8 %mask)
2585 declare <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
2587 define <16 x i32> @test_maskz_valign_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
2588 ; CHECK-LABEL: test_maskz_valign_d:
2590 ; CHECK-NEXT: kmovw %edi, %k1
2591 ; CHECK-NEXT: valignd {{.*#+}} zmm0 {%k1} {z} = zmm1[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4]
2593 %res = call <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32> %a, <16 x i32> %b, i32 5, <16 x i32> zeroinitializer, i16 %mask)
2597 declare <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
2599 declare <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
2601 define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
2602 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_512:
2604 ; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm3
2605 ; CHECK-NEXT: kmovw %edi, %k1
2606 ; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm2 {%k1}
2607 ; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z}
2608 ; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
2609 ; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
2611 %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
2612 %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3)
2613 %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
2614 %res3 = fadd <8 x double> %res, %res1
2615 %res4 = fadd <8 x double> %res2, %res3
2616 ret <8 x double> %res4
2619 declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
2621 define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
2622 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_512:
2624 ; CHECK-NEXT: vpermilps %zmm1, %zmm0, %zmm3
2625 ; CHECK-NEXT: kmovw %edi, %k1
2626 ; CHECK-NEXT: vpermilps %zmm1, %zmm0, %zmm2 {%k1}
2627 ; CHECK-NEXT: vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z}
2628 ; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
2629 ; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
2631 %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
2632 %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3)
2633 %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
2634 %res3 = fadd <16 x float> %res, %res1
2635 %res4 = fadd <16 x float> %res2, %res3
2636 ret <16 x float> %res4
2639 ; Test case to make sure we can print shuffle decode comments for constant pool loads.
2640 define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
2641 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool:
2643 ; CHECK-NEXT: kmovw %edi, %k1
2644 ; CHECK-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15]
2645 ; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
2646 ; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
2647 ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
2648 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
2650 %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3>, <16 x float> %x2, i16 %x3)
2651 %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3>, <16 x float> zeroinitializer, i16 %x3)
2652 %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>, <16 x float> %x2, i16 -1)
2653 %res3 = fadd <16 x float> %res, %res1
2654 %res4 = fadd <16 x float> %res2, %res3
2655 ret <16 x float> %res4
2658 define <8 x i64> @test_mask_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
2659 ; CHECK-LABEL: test_mask_mul_epi32_rr:
2661 ; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
2663 %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
2667 define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
2668 ; CHECK-LABEL: test_mask_mul_epi32_rrk:
2670 ; CHECK-NEXT: kmovw %edi, %k1
2671 ; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm2 {%k1}
2672 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2674 %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
2678 define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
2679 ; CHECK-LABEL: test_mask_mul_epi32_rrkz:
2681 ; CHECK-NEXT: kmovw %edi, %k1
2682 ; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z}
2684 %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
2688 define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
2689 ; CHECK-LABEL: test_mask_mul_epi32_rm:
2691 ; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm0
2693 %b = load <16 x i32>, <16 x i32>* %ptr_b
2694 %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
2698 define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
2699 ; CHECK-LABEL: test_mask_mul_epi32_rmk:
2701 ; CHECK-NEXT: kmovw %esi, %k1
2702 ; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm1 {%k1}
2703 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
2705 %b = load <16 x i32>, <16 x i32>* %ptr_b
2706 %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
2710 define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
2711 ; CHECK-LABEL: test_mask_mul_epi32_rmkz:
2713 ; CHECK-NEXT: kmovw %esi, %k1
2714 ; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z}
2716 %b = load <16 x i32>, <16 x i32>* %ptr_b
2717 %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
2721 define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) {
2722 ; CHECK-LABEL: test_mask_mul_epi32_rmb:
2724 ; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm0
2726 %q = load i64, i64* %ptr_b
2727 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
2728 %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
2729 %b = bitcast <8 x i64> %b64 to <16 x i32>
2730 %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
2734 define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
2735 ; CHECK-LABEL: test_mask_mul_epi32_rmbk:
2737 ; CHECK-NEXT: kmovw %esi, %k1
2738 ; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
2739 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
2741 %q = load i64, i64* %ptr_b
2742 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
2743 %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
2744 %b = bitcast <8 x i64> %b64 to <16 x i32>
2745 %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
2749 define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
2750 ; CHECK-LABEL: test_mask_mul_epi32_rmbkz:
2752 ; CHECK-NEXT: kmovw %esi, %k1
2753 ; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
2755 %q = load i64, i64* %ptr_b
2756 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
2757 %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
2758 %b = bitcast <8 x i64> %b64 to <16 x i32>
2759 %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
2763 declare <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
2765 define <8 x i64> @test_mask_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) {
2766 ; CHECK-LABEL: test_mask_mul_epu32_rr:
2768 ; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
2770 %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
2774 define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
2775 ; CHECK-LABEL: test_mask_mul_epu32_rrk:
2777 ; CHECK-NEXT: kmovw %edi, %k1
2778 ; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 {%k1}
2779 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
2781 %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
2785 define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
2786 ; CHECK-LABEL: test_mask_mul_epu32_rrkz:
2788 ; CHECK-NEXT: kmovw %edi, %k1
2789 ; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z}
2791 %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
2795 define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
2796 ; CHECK-LABEL: test_mask_mul_epu32_rm:
2798 ; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm0
2800 %b = load <16 x i32>, <16 x i32>* %ptr_b
2801 %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
2805 define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
2806 ; CHECK-LABEL: test_mask_mul_epu32_rmk:
2808 ; CHECK-NEXT: kmovw %esi, %k1
2809 ; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm1 {%k1}
2810 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
2812 %b = load <16 x i32>, <16 x i32>* %ptr_b
2813 %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
2817 define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
2818 ; CHECK-LABEL: test_mask_mul_epu32_rmkz:
2820 ; CHECK-NEXT: kmovw %esi, %k1
2821 ; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z}
2823 %b = load <16 x i32>, <16 x i32>* %ptr_b
2824 %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
2828 define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) {
2829 ; CHECK-LABEL: test_mask_mul_epu32_rmb:
2831 ; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm0
2833 %q = load i64, i64* %ptr_b
2834 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
2835 %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
2836 %b = bitcast <8 x i64> %b64 to <16 x i32>
2837 %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
2841 define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
2842 ; CHECK-LABEL: test_mask_mul_epu32_rmbk:
2844 ; CHECK-NEXT: kmovw %esi, %k1
2845 ; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
2846 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
2848 %q = load i64, i64* %ptr_b
2849 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
2850 %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
2851 %b = bitcast <8 x i64> %b64 to <16 x i32>
2852 %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
2856 define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
2857 ; CHECK-LABEL: test_mask_mul_epu32_rmbkz:
2859 ; CHECK-NEXT: kmovw %esi, %k1
2860 ; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
2862 %q = load i64, i64* %ptr_b
2863 %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
2864 %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
2865 %b = bitcast <8 x i64> %b64 to <16 x i32>
2866 %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
2870 declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
2872 define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
2873 ; CHECK-LABEL: test_mask_vextractf32x4:
2875 ; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm1
2876 ; CHECK-NEXT: kmovw %edi, %k1
2877 ; CHECK-NEXT: kshiftlw $12, %k1, %k0
2878 ; CHECK-NEXT: kshiftrw $15, %k0, %k0
2879 ; CHECK-NEXT: kshiftlw $13, %k1, %k2
2880 ; CHECK-NEXT: kshiftrw $15, %k2, %k2
2881 ; CHECK-NEXT: kshiftlw $15, %k1, %k3
2882 ; CHECK-NEXT: kshiftrw $15, %k3, %k3
2883 ; CHECK-NEXT: kshiftlw $14, %k1, %k1
2884 ; CHECK-NEXT: kshiftrw $15, %k1, %k1
2885 ; CHECK-NEXT: kmovw %k1, %eax
2886 ; CHECK-NEXT: kmovw %k3, %ecx
2887 ; CHECK-NEXT: vmovd %ecx, %xmm2
2888 ; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
2889 ; CHECK-NEXT: kmovw %k2, %eax
2890 ; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
2891 ; CHECK-NEXT: kmovw %k0, %eax
2892 ; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2
2893 ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
2894 ; CHECK-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
2896 %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask)
2897 ret <4 x float> %res
2900 declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8)
2902 define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) {
2903 ; CHECK-LABEL: test_mask_vextracti64x4:
2905 ; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1
2906 ; CHECK-NEXT: kmovw %edi, %k1
2907 ; CHECK-NEXT: kshiftlw $12, %k1, %k0
2908 ; CHECK-NEXT: kshiftrw $15, %k0, %k0
2909 ; CHECK-NEXT: kshiftlw $13, %k1, %k2
2910 ; CHECK-NEXT: kshiftrw $15, %k2, %k2
2911 ; CHECK-NEXT: kshiftlw $15, %k1, %k3
2912 ; CHECK-NEXT: kshiftrw $15, %k3, %k3
2913 ; CHECK-NEXT: kshiftlw $14, %k1, %k1
2914 ; CHECK-NEXT: kshiftrw $15, %k1, %k1
2915 ; CHECK-NEXT: kmovw %k1, %eax
2916 ; CHECK-NEXT: kmovw %k3, %ecx
2917 ; CHECK-NEXT: vmovd %ecx, %xmm2
2918 ; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
2919 ; CHECK-NEXT: kmovw %k2, %eax
2920 ; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
2921 ; CHECK-NEXT: kmovw %k0, %eax
2922 ; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2
2923 ; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
2924 ; CHECK-NEXT: vpmovsxdq %xmm2, %ymm2
2925 ; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
2927 %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 1, <4 x i64> %b, i8 %mask)
2931 declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8)
2933 define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
2934 ; CHECK-LABEL: test_maskz_vextracti32x4:
2936 ; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0
2937 ; CHECK-NEXT: kmovw %edi, %k1
2938 ; CHECK-NEXT: kshiftlw $12, %k1, %k0
2939 ; CHECK-NEXT: kshiftrw $15, %k0, %k0
2940 ; CHECK-NEXT: kshiftlw $13, %k1, %k2
2941 ; CHECK-NEXT: kshiftrw $15, %k2, %k2
2942 ; CHECK-NEXT: kshiftlw $15, %k1, %k3
2943 ; CHECK-NEXT: kshiftrw $15, %k3, %k3
2944 ; CHECK-NEXT: kshiftlw $14, %k1, %k1
2945 ; CHECK-NEXT: kshiftrw $15, %k1, %k1
2946 ; CHECK-NEXT: kmovw %k1, %eax
2947 ; CHECK-NEXT: kmovw %k3, %ecx
2948 ; CHECK-NEXT: vmovd %ecx, %xmm1
2949 ; CHECK-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
2950 ; CHECK-NEXT: kmovw %k2, %eax
2951 ; CHECK-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
2952 ; CHECK-NEXT: kmovw %k0, %eax
2953 ; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
2954 ; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
2955 ; CHECK-NEXT: vpsrad $31, %xmm1, %xmm1
2956 ; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
2958 %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask)
2962 declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x i32>, i8)
2964 define <4 x double> @test_vextractf64x4(<8 x double> %a) {
2965 ; CHECK-LABEL: test_vextractf64x4:
2967 ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2969 %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 1, <4 x double> zeroinitializer, i8 -1)
2970 ret <4 x double> %res
2973 declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8)
2975 declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i16)
2977 define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) {
2978 ; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512:
2980 ; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
2981 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3
2982 ; CHECK-NEXT: kmovw %edi, %k1
2983 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
2984 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
2985 ; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1
2986 ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
2988 %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
2989 %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
2990 %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4)
2991 %res3 = fadd <16 x float> %res, %res1
2992 %res4 = fadd <16 x float> %res2, %res3
2993 ret <16 x float> %res4
2996 declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i16)
2998 define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) {
2999 ; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512:
3001 ; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
3002 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3
3003 ; CHECK-NEXT: kmovw %edi, %k1
3004 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
3005 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
3006 ; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1
3007 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
3009 %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
3010 %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
3011 %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4)
3012 %res3 = add <16 x i32> %res, %res1
3013 %res4 = add <16 x i32> %res2, %res3
3014 ret <16 x i32> %res4
3017 declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x double>, i32, <8 x double>, i8)
3019 define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) {
3020 ; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x4_512:
3022 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3
3023 ; CHECK-NEXT: kmovw %edi, %k1
3024 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
3025 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
3026 ; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
3027 ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
3029 %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
3030 %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1)
3031 %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4)
3032 %res3 = fadd <8 x double> %res, %res1
3033 %res4 = fadd <8 x double> %res2, %res3
3034 ret <8 x double> %res4
3037 declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i32, <8 x i64>, i8)
3039 define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) {
3040 ; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x4_512:
3042 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3
3043 ; CHECK-NEXT: kmovw %edi, %k1
3044 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
3045 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
3046 ; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
3047 ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
3049 %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
3050 %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
3051 %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4)
3052 %res3 = add <8 x i64> %res, %res1
3053 %res4 = add <8 x i64> %res2, %res3