1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_64
3 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_32
4 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_SMALL
5 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq -code-model=large < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_LARGE
6 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX_32
7 ; RUN: opt -mtriple=x86_64-apple-darwin -scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
8 ; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null
10 @glob_array = internal unnamed_addr constant [16 x i32] [i32 1, i32 1, i32 2, i32 3, i32 5, i32 8, i32 13, i32 21, i32 34, i32 55, i32 89, i32 144, i32 233, i32 377, i32 610, i32 987], align 16
; test1: all-true-mask gather of 16 floats. The base pointer is splatted into a
; pointer vector and indexed with sign-extended i32 indices; the CHECK lines
; show this folds to a single vgatherdps with the base in a scalar register.
13 ; SCALAR: extractelement <16 x float*>
14 ; SCALAR-NEXT: load float
15 ; SCALAR-NEXT: insertelement <16 x float>
16 ; SCALAR-NEXT: extractelement <16 x float*>
17 ; SCALAR-NEXT: load float
19 define <16 x float> @test1(float* %base, <16 x i32> %ind) {
20 ; KNL_64-LABEL: test1:
22 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
23 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
24 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
27 ; KNL_32-LABEL: test1:
29 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
30 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
31 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
32 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
37 ; SKX-NEXT: kxnorw %k0, %k0, %k1
38 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
39 ; SKX-NEXT: vmovaps %zmm1, %zmm0
42 ; SKX_32-LABEL: test1:
44 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
45 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
46 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
47 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
50 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
51 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
53 %sext_ind = sext <16 x i32> %ind to <16 x i64>
54 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
56 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
60 declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
61 declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
62 declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )
; test2: gather of 16 floats under a runtime i16 mask (bitcast to <16 x i1>).
; The SCALAR checks verify the scalarize-masked-mem-intrin expansion: a
; per-lane extract/load/insert with branchy control flow keyed on each mask bit.
66 ; SCALAR: extractelement <16 x float*>
67 ; SCALAR-NEXT: load float
68 ; SCALAR-NEXT: insertelement <16 x float>
69 ; SCALAR-NEXT: br label %else
71 ; SCALAR-NEXT: %res.phi.else = phi
72 ; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
73 ; SCALAR-NEXT: %ToLoad1 = icmp eq i1 %Mask1, true
74 ; SCALAR-NEXT: br i1 %ToLoad1, label %cond.load1, label %else2
76 define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
77 ; KNL_64-LABEL: test2:
79 ; KNL_64-NEXT: kmovw %esi, %k1
80 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
81 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
84 ; KNL_32-LABEL: test2:
86 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
87 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
88 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
89 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
94 ; SKX-NEXT: kmovw %esi, %k1
95 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
96 ; SKX-NEXT: vmovaps %zmm1, %zmm0
99 ; SKX_32-LABEL: test2:
101 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
102 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
103 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
104 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
107 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
108 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
110 %sext_ind = sext <16 x i32> %ind to <16 x i64>
111 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
112 %imask = bitcast i16 %mask to <16 x i1>
113 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
114 ret <16 x float> %res
; test3: same shape as test2 but gathering 16 x i32, checking the integer
; form lowers to vpgatherdd under the i16 runtime mask.
117 define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
118 ; KNL_64-LABEL: test3:
120 ; KNL_64-NEXT: kmovw %esi, %k1
121 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
122 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0
125 ; KNL_32-LABEL: test3:
127 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
128 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
129 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
130 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
135 ; SKX-NEXT: kmovw %esi, %k1
136 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
137 ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
140 ; SKX_32-LABEL: test3:
142 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
143 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
144 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
145 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
148 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
149 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
151 %sext_ind = sext <16 x i32> %ind to <16 x i64>
152 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind
153 %imask = bitcast i16 %mask to <16 x i1>
154 %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
; test4: two masked gathers from the same addresses; the second uses the first
; result as its passthru, then both are added. The CHECK lines show two
; vpgatherdd instructions (the mask is copied to a second k-register because a
; gather clobbers its mask operand).
159 define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
160 ; KNL_64-LABEL: test4:
162 ; KNL_64-NEXT: kmovw %esi, %k1
163 ; KNL_64-NEXT: kmovw %k1, %k2
164 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
165 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
166 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
167 ; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0
170 ; KNL_32-LABEL: test4:
172 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
173 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
174 ; KNL_32-NEXT: kmovw %k1, %k2
175 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
176 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
177 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
178 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
183 ; SKX-NEXT: kmovw %esi, %k1
184 ; SKX-NEXT: kmovw %k1, %k2
185 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
186 ; SKX-NEXT: vmovdqa64 %zmm1, %zmm2
187 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
188 ; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0
191 ; SKX_32-LABEL: test4:
193 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
194 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
195 ; SKX_32-NEXT: kmovw %k1, %k2
196 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
197 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
198 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
199 ; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
202 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
203 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
205 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
206 %imask = bitcast i16 %mask to <16 x i1>
207 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
208 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
209 %res = add <16 x i32> %gt1, %gt2
; test5: two identical masked scatters of the same value vector through the
; same addresses. The SCALAR checks verify the per-lane store expansion
; (extract value, extract pointer, conditional store per mask bit); the llc
; checks show two vpscatterdd instructions with the mask duplicated.
214 ; SCALAR-LABEL: test5
215 ; SCALAR: %Mask0 = extractelement <16 x i1> %imask, i32 0
216 ; SCALAR-NEXT: %ToStore0 = icmp eq i1 %Mask0, true
217 ; SCALAR-NEXT: br i1 %ToStore0, label %cond.store, label %else
218 ; SCALAR: cond.store:
219 ; SCALAR-NEXT: %Elt0 = extractelement <16 x i32> %val, i32 0
220 ; SCALAR-NEXT: %Ptr0 = extractelement <16 x i32*> %gep.random, i32 0
221 ; SCALAR-NEXT: store i32 %Elt0, i32* %Ptr0, align 4
222 ; SCALAR-NEXT: br label %else
224 ; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
225 ; SCALAR-NEXT: %ToStore1 = icmp eq i1 %Mask1, true
226 ; SCALAR-NEXT: br i1 %ToStore1, label %cond.store1, label %else2
228 define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
229 ; KNL_64-LABEL: test5:
231 ; KNL_64-NEXT: kmovw %esi, %k1
232 ; KNL_64-NEXT: kmovw %k1, %k2
233 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
234 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
235 ; KNL_64-NEXT: vzeroupper
238 ; KNL_32-LABEL: test5:
240 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
241 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
242 ; KNL_32-NEXT: kmovw %k1, %k2
243 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
244 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
245 ; KNL_32-NEXT: vzeroupper
250 ; SKX-NEXT: kmovw %esi, %k1
251 ; SKX-NEXT: kmovw %k1, %k2
252 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
253 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
254 ; SKX-NEXT: vzeroupper
257 ; SKX_32-LABEL: test5:
259 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
260 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
261 ; SKX_32-NEXT: kmovw %k1, %k2
262 ; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
263 ; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
264 ; SKX_32-NEXT: vzeroupper
267 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
268 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
270 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
271 %imask = bitcast i16 %mask to <16 x i1>
272 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
273 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
277 declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
278 declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )
; test6: all-true gather followed by all-true scatter through the same vector
; of pointers (no uniform base, so the pointer vector itself is the index
; operand with no base register).
281 ; SCALAR-LABEL: test6
282 ; SCALAR: store i32 %Elt0, i32* %Ptr01, align 4
283 ; SCALAR-NEXT: %Elt1 = extractelement <8 x i32> %a1, i32 1
284 ; SCALAR-NEXT: %Ptr12 = extractelement <8 x i32*> %ptr, i32 1
285 ; SCALAR-NEXT: store i32 %Elt1, i32* %Ptr12, align 4
286 ; SCALAR-NEXT: %Elt2 = extractelement <8 x i32> %a1, i32 2
287 ; SCALAR-NEXT: %Ptr23 = extractelement <8 x i32*> %ptr, i32 2
288 ; SCALAR-NEXT: store i32 %Elt2, i32* %Ptr23, align 4
290 define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
291 ; KNL_64-LABEL: test6:
293 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
294 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
295 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
296 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
297 ; KNL_64-NEXT: vmovdqa %ymm2, %ymm0
300 ; KNL_32-LABEL: test6:
302 ; KNL_32-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
303 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
304 ; KNL_32-NEXT: movw $255, %ax
305 ; KNL_32-NEXT: kmovw %eax, %k1
306 ; KNL_32-NEXT: kmovw %k1, %k2
307 ; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm2 {%k2}
308 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
309 ; KNL_32-NEXT: vmovdqa %ymm2, %ymm0
314 ; SKX-NEXT: kxnorw %k0, %k0, %k1
315 ; SKX-NEXT: kxnorw %k0, %k0, %k2
316 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
317 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
318 ; SKX-NEXT: vmovdqa %ymm2, %ymm0
321 ; SKX_32-LABEL: test6:
323 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
324 ; SKX_32-NEXT: kxnorw %k0, %k0, %k2
325 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2}
326 ; SKX_32-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1}
327 ; SKX_32-NEXT: vmovdqa %ymm2, %ymm0
330 %a = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
332 call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
; test7: 8-element i32 gather under an i8 runtime mask, performed twice and
; summed (as in test4). KNL has no 256-bit gather, so the checks show the
; index widened to zmm and the 8-bit mask zero-extended into a 16-bit
; k-register via kshiftl/kshiftr; SKX uses the native ymm form.
336 define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
338 ; KNL_64-LABEL: test7:
340 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
341 ; KNL_64-NEXT: kmovw %esi, %k0
342 ; KNL_64-NEXT: kshiftlw $8, %k0, %k0
343 ; KNL_64-NEXT: kshiftrw $8, %k0, %k1
344 ; KNL_64-NEXT: kmovw %k1, %k2
345 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
346 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
347 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
348 ; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0
351 ; KNL_32-LABEL: test7:
353 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
354 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
355 ; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
356 ; KNL_32-NEXT: kmovw %ecx, %k0
357 ; KNL_32-NEXT: kshiftlw $8, %k0, %k0
358 ; KNL_32-NEXT: kshiftrw $8, %k0, %k1
359 ; KNL_32-NEXT: kmovw %k1, %k2
360 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
361 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
362 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
363 ; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
368 ; SKX-NEXT: kmovw %esi, %k1
369 ; SKX-NEXT: kmovw %k1, %k2
370 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
371 ; SKX-NEXT: vmovdqa %ymm1, %ymm2
372 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
373 ; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0
376 ; SKX_32-LABEL: test7:
378 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
379 ; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
380 ; SKX_32-NEXT: kmovw %k1, %k2
381 ; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm1 {%k2}
382 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm2
383 ; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm2 {%k1}
384 ; SKX_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
387 %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
388 %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer
390 %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind
391 %imask = bitcast i8 %mask to <8 x i1>
392 %gt1 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
393 %gt2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
394 %res = add <8 x i32> %gt1, %gt2
; test8: gather of 16 x i32 through a vector of 64-bit pointers on the 64-bit
; targets, so each gather is split into two vpgatherqd halves (mask split with
; kshiftrw, results recombined with vinserti64x4). On 32-bit targets pointers
; are 32-bit and a single vpgatherdd suffices.
398 ; No uniform base in this case, index <8 x i64> contains addresses,
399 ; each gather call will be split into two
400 define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
401 ; KNL_64-LABEL: test8:
403 ; KNL_64-NEXT: kmovw %edi, %k1
404 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
405 ; KNL_64-NEXT: kmovw %k2, %k3
406 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
407 ; KNL_64-NEXT: kmovw %k1, %k3
408 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
409 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
410 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
411 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
412 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
413 ; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0
416 ; KNL_32-LABEL: test8:
418 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
419 ; KNL_32-NEXT: kmovw %k1, %k2
420 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
421 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
422 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
423 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
428 ; SKX-NEXT: kmovw %edi, %k1
429 ; SKX-NEXT: kshiftrw $8, %k1, %k2
430 ; SKX-NEXT: kmovw %k2, %k3
431 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
432 ; SKX-NEXT: kmovw %k1, %k3
433 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
434 ; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
435 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
436 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
437 ; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
438 ; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
441 ; SKX_32-LABEL: test8:
443 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
444 ; SKX_32-NEXT: kmovw %k1, %k2
445 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
446 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
447 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
448 ; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
451 %imask = bitcast i16 %mask to <16 x i1>
452 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
453 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
454 %res = add <16 x i32> %gt1, %gt2
; test9: gather through a GEP into an aggregate (%struct.ST) where every GEP
; index is a vector. The checks show the struct offset arithmetic being
; expanded to vector multiplies/adds before a single gather of the final
; addresses; SKX_SMALL vs SKX_LARGE differ only in how the constant-pool
; broadcasts are addressed (RIP-relative vs movabsq under -code-model=large).
458 %struct.RT = type { i8, [10 x [20 x i32]], i8 }
459 %struct.ST = type { i32, double, %struct.RT }
461 ; Masked gather for agregate types
462 ; Test9 and Test10 should give the same result (scalar and vector indices in GEP)
465 define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
466 ; KNL_64-LABEL: test9:
467 ; KNL_64: # %bb.0: # %entry
468 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
469 ; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
470 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
471 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
472 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
473 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
474 ; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
475 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
476 ; KNL_64-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
477 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
478 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
479 ; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
480 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
481 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
484 ; KNL_32-LABEL: test9:
485 ; KNL_32: # %bb.0: # %entry
486 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
487 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
488 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
489 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
490 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
491 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
492 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
493 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
494 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
495 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1
496 ; KNL_32-NEXT: movw $255, %ax
497 ; KNL_32-NEXT: kmovw %eax, %k1
498 ; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm0 {%k1}
499 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
502 ; SKX_SMALL-LABEL: test9:
503 ; SKX_SMALL: # %bb.0: # %entry
504 ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
505 ; SKX_SMALL-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
506 ; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
507 ; SKX_SMALL-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
508 ; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
509 ; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
510 ; SKX_SMALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
511 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
512 ; SKX_SMALL-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
513 ; SKX_SMALL-NEXT: retq
515 ; SKX_LARGE-LABEL: test9:
516 ; SKX_LARGE: # %bb.0: # %entry
517 ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2
518 ; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
519 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
520 ; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1
521 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
522 ; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
523 ; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
524 ; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
525 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
526 ; SKX_LARGE-NEXT: vpaddq (%rax){1to8}, %zmm0, %zmm1
527 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
528 ; SKX_LARGE-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
529 ; SKX_LARGE-NEXT: retq
531 ; SKX_32-LABEL: test9:
532 ; SKX_32: # %bb.0: # %entry
533 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1
534 ; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
535 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
536 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
537 ; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
538 ; SKX_32-NEXT: vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1
539 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
540 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm0 {%k1}
543 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
544 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
546 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13>
547 %res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
; test10: identical GEP computation to test9 but with the constant middle
; indices written as scalars (i32 2, i32 1, i64 13) instead of splat vectors;
; the expected codegen is the same as test9 for every prefix.
551 define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
552 ; KNL_64-LABEL: test10:
553 ; KNL_64: # %bb.0: # %entry
554 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
555 ; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
556 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
557 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
558 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
559 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
560 ; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
561 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
562 ; KNL_64-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
563 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
564 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
565 ; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
566 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
567 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
570 ; KNL_32-LABEL: test10:
571 ; KNL_32: # %bb.0: # %entry
572 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
573 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
574 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
575 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
576 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
577 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
578 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
579 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
580 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
581 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1
582 ; KNL_32-NEXT: movw $255, %ax
583 ; KNL_32-NEXT: kmovw %eax, %k1
584 ; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm0 {%k1}
585 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
588 ; SKX_SMALL-LABEL: test10:
589 ; SKX_SMALL: # %bb.0: # %entry
590 ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
591 ; SKX_SMALL-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
592 ; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
593 ; SKX_SMALL-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
594 ; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
595 ; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
596 ; SKX_SMALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
597 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
598 ; SKX_SMALL-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
599 ; SKX_SMALL-NEXT: retq
601 ; SKX_LARGE-LABEL: test10:
602 ; SKX_LARGE: # %bb.0: # %entry
603 ; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2
604 ; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
605 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
606 ; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1
607 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
608 ; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
609 ; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
610 ; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
611 ; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
612 ; SKX_LARGE-NEXT: vpaddq (%rax){1to8}, %zmm0, %zmm1
613 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
614 ; SKX_LARGE-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
615 ; SKX_LARGE-NEXT: retq
617 ; SKX_32-LABEL: test10:
618 ; SKX_32: # %bb.0: # %entry
619 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1
620 ; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
621 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
622 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
623 ; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
624 ; SKX_32-NEXT: vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1
625 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
626 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm0 {%k1}
629 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
630 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
632 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13
633 %res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
; test11: the GEP uses a single scalar index against a splatted base, so the
; checks verify the index itself is broadcast into a vector register
; (vpbroadcastd/vbroadcastss) and fed to one vgatherdps.
637 ; Splat index in GEP, requires broadcast
638 define <16 x float> @test11(float* %base, i32 %ind) {
639 ; KNL_64-LABEL: test11:
641 ; KNL_64-NEXT: vpbroadcastd %esi, %zmm1
642 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
643 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
646 ; KNL_32-LABEL: test11:
648 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
649 ; KNL_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
650 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
651 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
656 ; SKX-NEXT: vpbroadcastd %esi, %zmm1
657 ; SKX-NEXT: kxnorw %k0, %k0, %k1
658 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
661 ; SKX_32-LABEL: test11:
663 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
664 ; SKX_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
665 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
666 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
669 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
670 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
672 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
674 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
; test12: the GEP has a scalar base and a vector of indices, so the base
; pointer feeds vgatherdps directly as the scalar base operand (no splat
; needed); same expected output as test1.
678 ; We are checking the uniform base here. It is taken directly from input to vgatherdps
679 define <16 x float> @test12(float* %base, <16 x i32> %ind) {
680 ; KNL_64-LABEL: test12:
682 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
683 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
684 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
687 ; KNL_32-LABEL: test12:
689 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
690 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
691 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
692 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
697 ; SKX-NEXT: kxnorw %k0, %k0, %k1
698 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
699 ; SKX-NEXT: vmovaps %zmm1, %zmm0
702 ; SKX_32-LABEL: test12:
704 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
705 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
706 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
707 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
710 %sext_ind = sext <16 x i32> %ind to <16 x i64>
711 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
713 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
; test13: same IR shape as test12; expected to produce identical codegen
; (single vgatherdps with the uniform base).
; NOTE(review): the original comment says "the mask is undefined" but the call
; below passes an all-true constant mask, same as test12 — confirm intent
; against the unabridged upstream test.
717 ; The same as the previous, but the mask is undefined
718 define <16 x float> @test13(float* %base, <16 x i32> %ind) {
719 ; KNL_64-LABEL: test13:
721 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
722 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
723 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
726 ; KNL_32-LABEL: test13:
728 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
729 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
730 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
731 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
736 ; SKX-NEXT: kxnorw %k0, %k0, %k1
737 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
738 ; SKX-NEXT: vmovaps %zmm1, %zmm0
741 ; SKX_32-LABEL: test13:
743 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
744 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
745 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
746 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
749 %sext_ind = sext <16 x i32> %ind to <16 x i64>
750 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
752 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
; test14: %base is inserted at lane 1 of an existing pointer vector before the
; splat shuffle (all lanes take lane 0 of %vec), so no uniform base can be
; recognized; the checks show full vector address computation feeding a
; base-less gather.
756 ; The base pointer is not splat, can't find unform base
757 define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
758 ; KNL_64-LABEL: test14:
760 ; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0
761 ; KNL_64-NEXT: vmovd %esi, %xmm1
762 ; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1
763 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
764 ; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1
765 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
766 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
767 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
768 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
771 ; KNL_32-LABEL: test14:
773 ; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0
774 ; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
775 ; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
776 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
777 ; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
782 ; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
783 ; SKX-NEXT: vpbroadcastd %esi, %ymm1
784 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
785 ; SKX-NEXT: vpsllq $2, %zmm1, %zmm1
786 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
787 ; SKX-NEXT: kxnorw %k0, %k0, %k1
788 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
789 ; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
792 ; SKX_32-LABEL: test14:
794 ; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0
795 ; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
796 ; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
797 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
798 ; SKX_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
801 %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
802 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
804 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
806 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
810 declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
811 declare <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
812 declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
814 ; Gather smaller than existing instruction
815 define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
816 ; KNL_64-LABEL: test15:
818 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
819 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
820 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
821 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
822 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
823 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
824 ; KNL_64-NEXT: vmovaps %xmm1, %xmm0
825 ; KNL_64-NEXT: vzeroupper
828 ; KNL_32-LABEL: test15:
830 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
831 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
832 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
833 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
834 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
835 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
836 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
837 ; KNL_32-NEXT: vmovaps %xmm1, %xmm0
838 ; KNL_32-NEXT: vzeroupper
843 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
844 ; SKX-NEXT: vpmovd2m %xmm1, %k1
845 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
846 ; SKX-NEXT: vmovaps %xmm1, %xmm0
849 ; SKX_32-LABEL: test15:
851 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
852 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
853 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
854 ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
855 ; SKX_32-NEXT: vmovaps %xmm1, %xmm0
858 %sext_ind = sext <4 x i32> %ind to <4 x i64>
859 %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
860 %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
864 ; Gather smaller than existing instruction
865 define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
866 ; KNL_64-LABEL: test16:
868 ; KNL_64-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
869 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
870 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
871 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
872 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
873 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
874 ; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1}
875 ; KNL_64-NEXT: vmovapd %ymm2, %ymm0
878 ; KNL_32-LABEL: test16:
880 ; KNL_32-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
881 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
882 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
883 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
884 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
885 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
886 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
887 ; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k1}
888 ; KNL_32-NEXT: vmovapd %ymm2, %ymm0
893 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
894 ; SKX-NEXT: vpmovd2m %xmm1, %k1
895 ; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
896 ; SKX-NEXT: vmovapd %ymm2, %ymm0
899 ; SKX_32-LABEL: test16:
901 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
902 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
903 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
904 ; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1}
905 ; SKX_32-NEXT: vmovapd %ymm2, %ymm0
908 %sext_ind = sext <4 x i32> %ind to <4 x i64>
909 %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
910 %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
914 define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
915 ; KNL_64-LABEL: test17:
917 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
918 ; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0
919 ; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0
920 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
921 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
922 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
923 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
924 ; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
925 ; KNL_64-NEXT: vmovapd %xmm2, %xmm0
926 ; KNL_64-NEXT: vzeroupper
929 ; KNL_32-LABEL: test17:
931 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
932 ; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0
933 ; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0
934 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
935 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
936 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
937 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
938 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
939 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
940 ; KNL_32-NEXT: vmovapd %xmm2, %xmm0
941 ; KNL_32-NEXT: vzeroupper
946 ; SKX-NEXT: vpsllq $32, %xmm0, %xmm0
947 ; SKX-NEXT: vpsraq $32, %xmm0, %xmm0
948 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
949 ; SKX-NEXT: vpmovq2m %xmm1, %k1
950 ; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
951 ; SKX-NEXT: vmovapd %xmm2, %xmm0
954 ; SKX_32-LABEL: test17:
956 ; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0
957 ; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0
958 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
959 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
960 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
961 ; SKX_32-NEXT: vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1}
962 ; SKX_32-NEXT: vmovapd %xmm2, %xmm0
965 %sext_ind = sext <2 x i32> %ind to <2 x i64>
966 %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
967 %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
971 declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
972 declare void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
973 declare void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
974 declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
975 declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
977 define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
978 ; KNL_64-LABEL: test18:
980 ; KNL_64-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
981 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
982 ; KNL_64-NEXT: vpslld $31, %xmm2, %xmm2
983 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k0
984 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
985 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
986 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
987 ; KNL_64-NEXT: vzeroupper
990 ; KNL_32-LABEL: test18:
992 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
993 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
994 ; KNL_32-NEXT: vpslld $31, %xmm2, %xmm2
995 ; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k0
996 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
997 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
998 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
999 ; KNL_32-NEXT: vzeroupper
1002 ; SKX-LABEL: test18:
1004 ; SKX-NEXT: vpslld $31, %xmm2, %xmm2
1005 ; SKX-NEXT: vpmovd2m %xmm2, %k1
1006 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
1007 ; SKX-NEXT: vzeroupper
1010 ; SKX_32-LABEL: test18:
1012 ; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
1013 ; SKX_32-NEXT: vpmovd2m %xmm2, %k1
1014 ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
1016 call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
1020 define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
1021 ; KNL_64-LABEL: test19:
1023 ; KNL_64-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
1024 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1025 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
1026 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
1027 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
1028 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
1029 ; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
1030 ; KNL_64-NEXT: vzeroupper
1033 ; KNL_32-LABEL: test19:
1035 ; KNL_32-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
1036 ; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
1037 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
1038 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
1039 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
1040 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
1041 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1042 ; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
1043 ; KNL_32-NEXT: vzeroupper
1046 ; SKX-LABEL: test19:
1048 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
1049 ; SKX-NEXT: vpmovd2m %xmm1, %k1
1050 ; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
1051 ; SKX-NEXT: vzeroupper
1054 ; SKX_32-LABEL: test19:
1056 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
1057 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
1058 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1059 ; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
1060 ; SKX_32-NEXT: vzeroupper
1062 %gep = getelementptr double, double* %ptr, <4 x i64> %ind
1063 call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
1067 ; Data type requires widening
1068 define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
1069 ; KNL_64-LABEL: test20:
1071 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1072 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
1073 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2
1074 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0
1075 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1076 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1077 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
1078 ; KNL_64-NEXT: vzeroupper
1081 ; KNL_32-LABEL: test20:
1083 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1084 ; KNL_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
1085 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
1086 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
1087 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1088 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1089 ; KNL_32-NEXT: vscatterdps %zmm0, (,%zmm1) {%k1}
1090 ; KNL_32-NEXT: vzeroupper
1093 ; SKX-LABEL: test20:
1095 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
1096 ; SKX-NEXT: vpmovq2m %xmm2, %k1
1097 ; SKX-NEXT: vscatterqps %xmm0, (,%xmm1) {%k1}
1100 ; SKX_32-LABEL: test20:
1102 ; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
1103 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
1104 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1
1105 ; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1}
1107 call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
1111 ; Data type requires promotion
1112 define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
1113 ; KNL_64-LABEL: test21:
1115 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1116 ; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2
1117 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0
1118 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1119 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1120 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1121 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
1122 ; KNL_64-NEXT: vzeroupper
1125 ; KNL_32-LABEL: test21:
1127 ; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2
1128 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0
1129 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1130 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1131 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1132 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1133 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
1134 ; KNL_32-NEXT: vzeroupper
1137 ; SKX-LABEL: test21:
1139 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
1140 ; SKX-NEXT: vpmovq2m %xmm2, %k1
1141 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1142 ; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1}
1145 ; SKX_32-LABEL: test21:
1147 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
1148 ; SKX_32-NEXT: vpmovq2m %xmm2, %k1
1149 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1150 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1151 ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
1153 call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
1157 ; The result type requires widening
1158 declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
1160 define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
1161 ; KNL_64-LABEL: test22:
1163 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1164 ; KNL_64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
1165 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1166 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1167 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1168 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1169 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
1170 ; KNL_64-NEXT: vmovaps %xmm2, %xmm0
1171 ; KNL_64-NEXT: vzeroupper
1174 ; KNL_32-LABEL: test22:
1176 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1177 ; KNL_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
1178 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1179 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1180 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1181 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1182 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1183 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm2 {%k1}
1184 ; KNL_32-NEXT: vmovaps %xmm2, %xmm0
1185 ; KNL_32-NEXT: vzeroupper
1188 ; SKX-LABEL: test22:
1190 ; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
1191 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1192 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1193 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
1194 ; SKX-NEXT: vmovaps %xmm2, %xmm0
1197 ; SKX_32-LABEL: test22:
1199 ; SKX_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
1200 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1201 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1202 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1203 ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1}
1204 ; SKX_32-NEXT: vmovaps %xmm2, %xmm0
1206 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1207 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
1208 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
1212 define <2 x float> @test22a(float* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x float> %src0) {
1213 ; KNL_64-LABEL: test22a:
1215 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
1216 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1217 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1218 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1219 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1220 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1221 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
1222 ; KNL_64-NEXT: vmovaps %xmm2, %xmm0
1223 ; KNL_64-NEXT: vzeroupper
1226 ; KNL_32-LABEL: test22a:
1228 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
1229 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1230 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1231 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1232 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1233 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1234 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1235 ; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
1236 ; KNL_32-NEXT: vmovaps %xmm2, %xmm0
1237 ; KNL_32-NEXT: vzeroupper
1240 ; SKX-LABEL: test22a:
1242 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1243 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1244 ; SKX-NEXT: vgatherqps (%rdi,%xmm0,4), %xmm2 {%k1}
1245 ; SKX-NEXT: vmovaps %xmm2, %xmm0
1248 ; SKX_32-LABEL: test22a:
1250 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1251 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1252 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1253 ; SKX_32-NEXT: vgatherqps (%eax,%xmm0,4), %xmm2 {%k1}
1254 ; SKX_32-NEXT: vmovaps %xmm2, %xmm0
1256 %gep.random = getelementptr float, float* %base, <2 x i64> %ind
1257 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
1261 declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
1262 declare <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
1264 define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
1265 ; KNL_64-LABEL: test23:
1267 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1268 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1269 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1270 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1271 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1272 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1273 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
1274 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1275 ; KNL_64-NEXT: vzeroupper
1278 ; KNL_32-LABEL: test23:
1280 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1281 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1282 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1283 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1284 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1285 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1286 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1287 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
1288 ; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1289 ; KNL_32-NEXT: vzeroupper
1292 ; SKX-LABEL: test23:
1294 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1295 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1296 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1297 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1298 ; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1}
1299 ; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1302 ; SKX_32-LABEL: test23:
1304 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1305 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1306 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1307 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1308 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1309 ; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1}
1310 ; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1312 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1313 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
1314 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
1318 define <2 x i32> @test23b(i32* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %src0) {
1319 ; KNL_64-LABEL: test23b:
1321 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1322 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1323 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1324 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1325 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1326 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1327 ; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k1}
1328 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1329 ; KNL_64-NEXT: vzeroupper
1332 ; KNL_32-LABEL: test23b:
1334 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
1335 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1336 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1337 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1338 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1339 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1340 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1341 ; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k1}
1342 ; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1343 ; KNL_32-NEXT: vzeroupper
1346 ; SKX-LABEL: test23b:
1348 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1349 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1350 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1351 ; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm1 {%k1}
1352 ; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1355 ; SKX_32-LABEL: test23b:
1357 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1358 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1359 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1360 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1361 ; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm1 {%k1}
1362 ; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1364 %gep.random = getelementptr i32, i32* %base, <2 x i64> %ind
1365 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
1369 define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
1370 ; KNL_64-LABEL: test24:
1372 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1373 ; KNL_64-NEXT: movw $3, %ax
1374 ; KNL_64-NEXT: kmovw %eax, %k1
1375 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
1376 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1377 ; KNL_64-NEXT: vzeroupper
1380 ; KNL_32-LABEL: test24:
1382 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1383 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1384 ; KNL_32-NEXT: movw $3, %cx
1385 ; KNL_32-NEXT: kmovw %ecx, %k1
1386 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
1387 ; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1388 ; KNL_32-NEXT: vzeroupper
1391 ; SKX-LABEL: test24:
1393 ; SKX-NEXT: movb $3, %al
1394 ; SKX-NEXT: kmovw %eax, %k1
1395 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1396 ; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1}
1397 ; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1400 ; SKX_32-LABEL: test24:
1402 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1403 ; SKX_32-NEXT: movb $3, %cl
1404 ; SKX_32-NEXT: kmovw %ecx, %k1
1405 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1406 ; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1}
1407 ; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
1409 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1410 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
1411 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
1415 define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
1416 ; KNL_64-LABEL: test25:
1418 ; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1419 ; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0
1420 ; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0
1421 ; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1
1422 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0
1423 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
1424 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
1425 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
1426 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1427 ; KNL_64-NEXT: vzeroupper
1430 ; KNL_32-LABEL: test25:
1432 ; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
1433 ; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0
1434 ; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0
1435 ; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1
1436 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0
1437 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
1438 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
1439 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1440 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
1441 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1442 ; KNL_32-NEXT: vzeroupper
1445 ; SKX-LABEL: test25:
1447 ; SKX-NEXT: vpsllq $32, %xmm0, %xmm0
1448 ; SKX-NEXT: vpsraq $32, %xmm0, %xmm0
1449 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1450 ; SKX-NEXT: vpmovq2m %xmm1, %k1
1451 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
1452 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1455 ; SKX_32-LABEL: test25:
1457 ; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0
1458 ; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0
1459 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1460 ; SKX_32-NEXT: vpmovq2m %xmm1, %k1
1461 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1462 ; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
1463 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
1465 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1466 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
1467 %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
1471 define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
1472 ; KNL_64-LABEL: test26:
1474 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1475 ; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0
1476 ; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0
1477 ; KNL_64-NEXT: movb $3, %al
1478 ; KNL_64-NEXT: kmovw %eax, %k1
1479 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
1480 ; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
1481 ; KNL_64-NEXT: vzeroupper
1484 ; KNL_32-LABEL: test26:
1486 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1487 ; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0
1488 ; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0
1489 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1490 ; KNL_32-NEXT: movb $3, %cl
1491 ; KNL_32-NEXT: kmovw %ecx, %k1
1492 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
1493 ; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
1494 ; KNL_32-NEXT: vzeroupper
1497 ; SKX-LABEL: test26:
1499 ; SKX-NEXT: vpsllq $32, %xmm0, %xmm0
1500 ; SKX-NEXT: vpsraq $32, %xmm0, %xmm0
1501 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1502 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
1503 ; SKX-NEXT: vmovdqa %xmm1, %xmm0
1506 ; SKX_32-LABEL: test26:
1508 ; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0
1509 ; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0
1510 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1511 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
1512 ; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
1513 ; SKX_32-NEXT: vmovdqa %xmm1, %xmm0
1515 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1516 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
1517 %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
1521 ; Result type requires widening; all-ones mask
1522 define <2 x float> @test27(float* %base, <2 x i32> %ind) {
1523 ; KNL_64-LABEL: test27:
1525 ; KNL_64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
1526 ; KNL_64-NEXT: movw $3, %ax
1527 ; KNL_64-NEXT: kmovw %eax, %k1
1528 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
1529 ; KNL_64-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1530 ; KNL_64-NEXT: vzeroupper
1533 ; KNL_32-LABEL: test27:
1535 ; KNL_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
1536 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1537 ; KNL_32-NEXT: movw $3, %cx
1538 ; KNL_32-NEXT: kmovw %ecx, %k1
1539 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
1540 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1541 ; KNL_32-NEXT: vzeroupper
1544 ; SKX-LABEL: test27:
1546 ; SKX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
1547 ; SKX-NEXT: movb $3, %al
1548 ; SKX-NEXT: kmovw %eax, %k1
1549 ; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
1552 ; SKX_32-LABEL: test27:
1554 ; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
1555 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1556 ; SKX_32-NEXT: movb $3, %cl
1557 ; SKX_32-NEXT: kmovw %ecx, %k1
1558 ; SKX_32-NEXT: vgatherdps (%eax,%xmm1,4), %xmm0 {%k1}
1560 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1561 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
1562 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
1566 ; Data type requires promotion, mask is all-ones
1567 define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
1568 ; KNL_64-LABEL: test28:
1570 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1571 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1572 ; KNL_64-NEXT: movb $3, %al
1573 ; KNL_64-NEXT: kmovw %eax, %k1
1574 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
1575 ; KNL_64-NEXT: vzeroupper
1578 ; KNL_32-LABEL: test28:
1580 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1581 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1582 ; KNL_32-NEXT: movw $3, %ax
1583 ; KNL_32-NEXT: kmovw %eax, %k1
1584 ; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1}
1585 ; KNL_32-NEXT: vzeroupper
1588 ; SKX-LABEL: test28:
1590 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1591 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1592 ; SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1}
1595 ; SKX_32-LABEL: test28:
1597 ; SKX_32-NEXT: movb $3, %al
1598 ; SKX_32-NEXT: kmovw %eax, %k1
1599 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1600 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1601 ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
1603 call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
1607 ; SCALAR-LABEL: test29
1608 ; SCALAR: extractelement <16 x float*>
1609 ; SCALAR-NEXT: load float
1610 ; SCALAR-NEXT: insertelement <16 x float>
1611 ; SCALAR-NEXT: extractelement <16 x float*>
1612 ; SCALAR-NEXT: load float
1614 define <16 x float> @test29(float* %base, <16 x i32> %ind) {
1615 ; KNL_64-LABEL: test29:
1617 ; KNL_64-NEXT: movw $44, %ax
1618 ; KNL_64-NEXT: kmovw %eax, %k1
1619 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1620 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
1623 ; KNL_32-LABEL: test29:
1625 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1626 ; KNL_32-NEXT: movw $44, %cx
1627 ; KNL_32-NEXT: kmovw %ecx, %k1
1628 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
1629 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
1632 ; SKX-LABEL: test29:
1634 ; SKX-NEXT: movw $44, %ax
1635 ; SKX-NEXT: kmovw %eax, %k1
1636 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1637 ; SKX-NEXT: vmovaps %zmm1, %zmm0
1640 ; SKX_32-LABEL: test29:
1642 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1643 ; SKX_32-NEXT: movw $44, %cx
1644 ; SKX_32-NEXT: kmovw %ecx, %k1
1645 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
1646 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
1649 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
1650 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
1652 %sext_ind = sext <16 x i32> %ind to <16 x i64>
1653 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
1655 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
1656 ret <16 x float>%res
1659 ; Check non-power-of-2 case. It should be scalarized.
1660 declare <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
1661 define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
1662 ; KNL_64-LABEL: test30:
1664 ; KNL_64-NEXT: # kill: def $xmm3 killed $xmm3 def $zmm3
1665 ; KNL_64-NEXT: vpslld $31, %xmm2, %xmm2
1666 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1667 ; KNL_64-NEXT: kmovw %k1, %eax
1668 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
1669 ; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
1670 ; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm1
1671 ; KNL_64-NEXT: testb $1, %al
1672 ; KNL_64-NEXT: # implicit-def: $xmm0
1673 ; KNL_64-NEXT: je .LBB31_2
1674 ; KNL_64-NEXT: # %bb.1: # %cond.load
1675 ; KNL_64-NEXT: vmovq %xmm1, %rax
1676 ; KNL_64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1677 ; KNL_64-NEXT: .LBB31_2: # %else
1678 ; KNL_64-NEXT: kshiftrw $1, %k1, %k0
1679 ; KNL_64-NEXT: kmovw %k0, %eax
1680 ; KNL_64-NEXT: testb $1, %al
1681 ; KNL_64-NEXT: je .LBB31_4
1682 ; KNL_64-NEXT: # %bb.3: # %cond.load1
1683 ; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
1684 ; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
1685 ; KNL_64-NEXT: .LBB31_4: # %else2
1686 ; KNL_64-NEXT: kshiftrw $2, %k1, %k0
1687 ; KNL_64-NEXT: kmovw %k0, %eax
1688 ; KNL_64-NEXT: testb $1, %al
1689 ; KNL_64-NEXT: je .LBB31_6
1690 ; KNL_64-NEXT: # %bb.5: # %cond.load4
1691 ; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1
1692 ; KNL_64-NEXT: vmovq %xmm1, %rax
1693 ; KNL_64-NEXT: vpinsrd $2, (%rax), %xmm0, %xmm0
1694 ; KNL_64-NEXT: .LBB31_6: # %else5
1695 ; KNL_64-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
1696 ; KNL_64-NEXT: vmovdqa %xmm3, %xmm0
1697 ; KNL_64-NEXT: vzeroupper
1700 ; KNL_32-LABEL: test30:
1702 ; KNL_32-NEXT: subl $12, %esp
1703 ; KNL_32-NEXT: .cfi_def_cfa_offset 16
1704 ; KNL_32-NEXT: vpslld $31, %xmm2, %xmm2
1705 ; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1
1706 ; KNL_32-NEXT: kmovw %k1, %eax
1707 ; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
1708 ; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm2
1709 ; KNL_32-NEXT: testb $1, %al
1710 ; KNL_32-NEXT: # implicit-def: $xmm1
1711 ; KNL_32-NEXT: je .LBB31_2
1712 ; KNL_32-NEXT: # %bb.1: # %cond.load
1713 ; KNL_32-NEXT: vmovd %xmm2, %eax
1714 ; KNL_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1715 ; KNL_32-NEXT: .LBB31_2: # %else
1716 ; KNL_32-NEXT: kshiftrw $1, %k1, %k0
1717 ; KNL_32-NEXT: kmovw %k0, %eax
1718 ; KNL_32-NEXT: testb $1, %al
1719 ; KNL_32-NEXT: je .LBB31_4
1720 ; KNL_32-NEXT: # %bb.3: # %cond.load1
1721 ; KNL_32-NEXT: vpextrd $1, %xmm2, %eax
1722 ; KNL_32-NEXT: vpinsrd $1, (%eax), %xmm1, %xmm1
1723 ; KNL_32-NEXT: .LBB31_4: # %else2
1724 ; KNL_32-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0
1725 ; KNL_32-NEXT: kshiftrw $2, %k1, %k0
1726 ; KNL_32-NEXT: kmovw %k0, %eax
1727 ; KNL_32-NEXT: testb $1, %al
1728 ; KNL_32-NEXT: je .LBB31_6
1729 ; KNL_32-NEXT: # %bb.5: # %cond.load4
1730 ; KNL_32-NEXT: vpextrd $2, %xmm2, %eax
1731 ; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm1
1732 ; KNL_32-NEXT: .LBB31_6: # %else5
1733 ; KNL_32-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
1734 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
1735 ; KNL_32-NEXT: addl $12, %esp
1736 ; KNL_32-NEXT: .cfi_def_cfa_offset 4
1737 ; KNL_32-NEXT: vzeroupper
1740 ; SKX-LABEL: test30:
1742 ; SKX-NEXT: vpslld $31, %xmm2, %xmm2
1743 ; SKX-NEXT: vpmovd2m %xmm2, %k1
1744 ; SKX-NEXT: kmovw %k1, %eax
1745 ; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
1746 ; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
1747 ; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm1
1748 ; SKX-NEXT: testb $1, %al
1749 ; SKX-NEXT: # implicit-def: $xmm0
1750 ; SKX-NEXT: je .LBB31_2
1751 ; SKX-NEXT: # %bb.1: # %cond.load
1752 ; SKX-NEXT: vmovq %xmm1, %rax
1753 ; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1754 ; SKX-NEXT: .LBB31_2: # %else
1755 ; SKX-NEXT: kshiftrw $1, %k1, %k0
1756 ; SKX-NEXT: kmovw %k0, %eax
1757 ; SKX-NEXT: testb $1, %al
1758 ; SKX-NEXT: je .LBB31_4
1759 ; SKX-NEXT: # %bb.3: # %cond.load1
1760 ; SKX-NEXT: vpextrq $1, %xmm1, %rax
1761 ; SKX-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
1762 ; SKX-NEXT: .LBB31_4: # %else2
1763 ; SKX-NEXT: kshiftrw $2, %k1, %k0
1764 ; SKX-NEXT: kmovw %k0, %eax
1765 ; SKX-NEXT: testb $1, %al
1766 ; SKX-NEXT: je .LBB31_6
1767 ; SKX-NEXT: # %bb.5: # %cond.load4
1768 ; SKX-NEXT: vextracti128 $1, %ymm1, %xmm1
1769 ; SKX-NEXT: vmovq %xmm1, %rax
1770 ; SKX-NEXT: vpinsrd $2, (%rax), %xmm0, %xmm0
1771 ; SKX-NEXT: .LBB31_6: # %else5
1772 ; SKX-NEXT: vmovdqa32 %xmm0, %xmm3 {%k1}
1773 ; SKX-NEXT: vmovdqa %xmm3, %xmm0
1774 ; SKX-NEXT: vzeroupper
1777 ; SKX_32-LABEL: test30:
1779 ; SKX_32-NEXT: subl $12, %esp
1780 ; SKX_32-NEXT: .cfi_def_cfa_offset 16
1781 ; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
1782 ; SKX_32-NEXT: vpmovd2m %xmm2, %k1
1783 ; SKX_32-NEXT: kmovw %k1, %eax
1784 ; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
1785 ; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm2
1786 ; SKX_32-NEXT: testb $1, %al
1787 ; SKX_32-NEXT: # implicit-def: $xmm1
1788 ; SKX_32-NEXT: je .LBB31_2
1789 ; SKX_32-NEXT: # %bb.1: # %cond.load
1790 ; SKX_32-NEXT: vmovd %xmm2, %eax
1791 ; SKX_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1792 ; SKX_32-NEXT: .LBB31_2: # %else
1793 ; SKX_32-NEXT: kshiftrw $1, %k1, %k0
1794 ; SKX_32-NEXT: kmovw %k0, %eax
1795 ; SKX_32-NEXT: testb $1, %al
1796 ; SKX_32-NEXT: je .LBB31_4
1797 ; SKX_32-NEXT: # %bb.3: # %cond.load1
1798 ; SKX_32-NEXT: vpextrd $1, %xmm2, %eax
1799 ; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm1, %xmm1
1800 ; SKX_32-NEXT: .LBB31_4: # %else2
1801 ; SKX_32-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0
1802 ; SKX_32-NEXT: kshiftrw $2, %k1, %k0
1803 ; SKX_32-NEXT: kmovw %k0, %eax
1804 ; SKX_32-NEXT: testb $1, %al
1805 ; SKX_32-NEXT: je .LBB31_6
1806 ; SKX_32-NEXT: # %bb.5: # %cond.load4
1807 ; SKX_32-NEXT: vpextrd $2, %xmm2, %eax
1808 ; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm1
1809 ; SKX_32-NEXT: .LBB31_6: # %else5
1810 ; SKX_32-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
1811 ; SKX_32-NEXT: addl $12, %esp
1812 ; SKX_32-NEXT: .cfi_def_cfa_offset 4
1815 %sext_ind = sext <3 x i32> %ind to <3 x i64>
1816 %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
1817 %res = call <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
1821 declare <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)
1822 define <16 x float*> @test31(<16 x float**> %ptrs) {
1823 ; KNL_64-LABEL: test31:
1825 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
1826 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
1827 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
1828 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
1829 ; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0
1830 ; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm1
1833 ; KNL_32-LABEL: test31:
1835 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
1836 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
1837 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
1840 ; SKX-LABEL: test31:
1842 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1843 ; SKX-NEXT: kxnorw %k0, %k0, %k2
1844 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
1845 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
1846 ; SKX-NEXT: vmovdqa64 %zmm2, %zmm0
1847 ; SKX-NEXT: vmovdqa64 %zmm3, %zmm1
1850 ; SKX_32-LABEL: test31:
1852 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
1853 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
1854 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
1857 %res = call <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
1858 ret <16 x float*>%res
1861 define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
1862 ; KNL_64-LABEL: test_gather_16i32:
1864 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1865 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1866 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1867 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2
1868 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1869 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
1870 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
1871 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
1874 ; KNL_32-LABEL: test_gather_16i32:
1876 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1877 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1878 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1879 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
1880 ; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
1883 ; SKX-LABEL: test_gather_16i32:
1885 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1886 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1887 ; SKX-NEXT: vpmovd2m %zmm2, %k1
1888 ; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm2
1889 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1890 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
1891 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
1892 ; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
1895 ; SKX_32-LABEL: test_gather_16i32:
1897 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1898 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1899 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
1900 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
1901 ; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
1903 %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
1906 define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
1907 ; KNL_64-LABEL: test_gather_16i64:
1909 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1910 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1911 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1912 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1913 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
1914 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
1915 ; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm0
1916 ; KNL_64-NEXT: vmovdqa64 %zmm4, %zmm1
1919 ; KNL_32-LABEL: test_gather_16i64:
1921 ; KNL_32-NEXT: pushl %ebp
1922 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1923 ; KNL_32-NEXT: .cfi_offset %ebp, -8
1924 ; KNL_32-NEXT: movl %esp, %ebp
1925 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
1926 ; KNL_32-NEXT: andl $-64, %esp
1927 ; KNL_32-NEXT: subl $64, %esp
1928 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1929 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1930 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1931 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1932 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
1933 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
1934 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1935 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
1936 ; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
1937 ; KNL_32-NEXT: movl %ebp, %esp
1938 ; KNL_32-NEXT: popl %ebp
1939 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
1942 ; SKX-LABEL: test_gather_16i64:
1944 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1945 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1946 ; SKX-NEXT: vpmovd2m %zmm2, %k1
1947 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1948 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
1949 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
1950 ; SKX-NEXT: vmovdqa64 %zmm3, %zmm0
1951 ; SKX-NEXT: vmovdqa64 %zmm4, %zmm1
1954 ; SKX_32-LABEL: test_gather_16i64:
1956 ; SKX_32-NEXT: pushl %ebp
1957 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
1958 ; SKX_32-NEXT: .cfi_offset %ebp, -8
1959 ; SKX_32-NEXT: movl %esp, %ebp
1960 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
1961 ; SKX_32-NEXT: andl $-64, %esp
1962 ; SKX_32-NEXT: subl $64, %esp
1963 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1964 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1965 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
1966 ; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1967 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
1968 ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
1969 ; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1970 ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
1971 ; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
1972 ; SKX_32-NEXT: movl %ebp, %esp
1973 ; SKX_32-NEXT: popl %ebp
1974 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
1976 %res = call <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
1979 declare <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
1980 define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
1981 ; KNL_64-LABEL: test_gather_16f32:
1983 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1984 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1985 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1986 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2
1987 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1988 ; KNL_64-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
1989 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
1990 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
1993 ; KNL_32-LABEL: test_gather_16f32:
1995 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1996 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1997 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1998 ; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
1999 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
2002 ; SKX-LABEL: test_gather_16f32:
2004 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2005 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2006 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2007 ; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm2
2008 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2009 ; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
2010 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
2011 ; SKX-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
2014 ; SKX_32-LABEL: test_gather_16f32:
2016 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2017 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2018 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2019 ; SKX_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
2020 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0
2022 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
2023 ret <16 x float> %res
2025 define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
2026 ; KNL_64-LABEL: test_gather_16f64:
2028 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2029 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2030 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2031 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2032 ; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
2033 ; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
2034 ; KNL_64-NEXT: vmovapd %zmm3, %zmm0
2035 ; KNL_64-NEXT: vmovapd %zmm4, %zmm1
2038 ; KNL_32-LABEL: test_gather_16f64:
2040 ; KNL_32-NEXT: pushl %ebp
2041 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2042 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2043 ; KNL_32-NEXT: movl %esp, %ebp
2044 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2045 ; KNL_32-NEXT: andl $-64, %esp
2046 ; KNL_32-NEXT: subl $64, %esp
2047 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2048 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2049 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2050 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
2051 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
2052 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
2053 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2054 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
2055 ; KNL_32-NEXT: vmovapd %zmm2, %zmm0
2056 ; KNL_32-NEXT: movl %ebp, %esp
2057 ; KNL_32-NEXT: popl %ebp
2058 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2061 ; SKX-LABEL: test_gather_16f64:
2063 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2064 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2065 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2066 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2067 ; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
2068 ; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
2069 ; SKX-NEXT: vmovapd %zmm3, %zmm0
2070 ; SKX-NEXT: vmovapd %zmm4, %zmm1
2073 ; SKX_32-LABEL: test_gather_16f64:
2075 ; SKX_32-NEXT: pushl %ebp
2076 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2077 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2078 ; SKX_32-NEXT: movl %esp, %ebp
2079 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2080 ; SKX_32-NEXT: andl $-64, %esp
2081 ; SKX_32-NEXT: subl $64, %esp
2082 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2083 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2084 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2085 ; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
2086 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
2087 ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
2088 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2089 ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
2090 ; SKX_32-NEXT: vmovapd %zmm2, %zmm0
2091 ; SKX_32-NEXT: movl %ebp, %esp
2092 ; SKX_32-NEXT: popl %ebp
2093 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2095 %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
2096 ret <16 x double> %res
2098 declare <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
2099 define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
2100 ; KNL_64-LABEL: test_scatter_16i32:
2102 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2103 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2104 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2105 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2106 ; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
2107 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0
2108 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
2109 ; KNL_64-NEXT: vzeroupper
2112 ; KNL_32-LABEL: test_scatter_16i32:
2114 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2115 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2116 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2117 ; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
2118 ; KNL_32-NEXT: vzeroupper
2121 ; SKX-LABEL: test_scatter_16i32:
2123 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2124 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2125 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2126 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2127 ; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
2128 ; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm0
2129 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
2130 ; SKX-NEXT: vzeroupper
2133 ; SKX_32-LABEL: test_scatter_16i32:
2135 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2136 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2137 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2138 ; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
2139 ; SKX_32-NEXT: vzeroupper
2141 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
2144 define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
2145 ; KNL_64-LABEL: test_scatter_16i64:
2147 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2148 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2149 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2150 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2151 ; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
2152 ; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
2153 ; KNL_64-NEXT: vzeroupper
2156 ; KNL_32-LABEL: test_scatter_16i64:
2158 ; KNL_32-NEXT: pushl %ebp
2159 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2160 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2161 ; KNL_32-NEXT: movl %esp, %ebp
2162 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2163 ; KNL_32-NEXT: andl $-64, %esp
2164 ; KNL_32-NEXT: subl $64, %esp
2165 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2166 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2167 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2168 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
2169 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
2170 ; KNL_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
2171 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2172 ; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
2173 ; KNL_32-NEXT: movl %ebp, %esp
2174 ; KNL_32-NEXT: popl %ebp
2175 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2176 ; KNL_32-NEXT: vzeroupper
2179 ; SKX-LABEL: test_scatter_16i64:
2181 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2182 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2183 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2184 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2185 ; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
2186 ; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
2187 ; SKX-NEXT: vzeroupper
2190 ; SKX_32-LABEL: test_scatter_16i64:
2192 ; SKX_32-NEXT: pushl %ebp
2193 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2194 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2195 ; SKX_32-NEXT: movl %esp, %ebp
2196 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2197 ; SKX_32-NEXT: andl $-64, %esp
2198 ; SKX_32-NEXT: subl $64, %esp
2199 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2200 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2201 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2202 ; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
2203 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
2204 ; SKX_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
2205 ; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2206 ; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
2207 ; SKX_32-NEXT: movl %ebp, %esp
2208 ; SKX_32-NEXT: popl %ebp
2209 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2210 ; SKX_32-NEXT: vzeroupper
2212 call void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
2215 declare void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
2216 define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
2217 ; KNL_64-LABEL: test_scatter_16f32:
2219 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2220 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2221 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2222 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2223 ; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
2224 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0
2225 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
2226 ; KNL_64-NEXT: vzeroupper
2229 ; KNL_32-LABEL: test_scatter_16f32:
2231 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2232 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2233 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2234 ; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
2235 ; KNL_32-NEXT: vzeroupper
2238 ; SKX-LABEL: test_scatter_16f32:
2240 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2241 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2242 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2243 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2244 ; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
2245 ; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm0
2246 ; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
2247 ; SKX-NEXT: vzeroupper
2250 ; SKX_32-LABEL: test_scatter_16f32:
2252 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2253 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2254 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2255 ; SKX_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
2256 ; SKX_32-NEXT: vzeroupper
2258 call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
2261 declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
2262 define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
2263 ; KNL_64-LABEL: test_scatter_16f64:
2265 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2266 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2267 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2268 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2269 ; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
2270 ; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
2271 ; KNL_64-NEXT: vzeroupper
2274 ; KNL_32-LABEL: test_scatter_16f64:
2276 ; KNL_32-NEXT: pushl %ebp
2277 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2278 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2279 ; KNL_32-NEXT: movl %esp, %ebp
2280 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2281 ; KNL_32-NEXT: andl $-64, %esp
2282 ; KNL_32-NEXT: subl $64, %esp
2283 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2284 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2285 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2286 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
2287 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
2288 ; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
2289 ; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2290 ; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
2291 ; KNL_32-NEXT: movl %ebp, %esp
2292 ; KNL_32-NEXT: popl %ebp
2293 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2294 ; KNL_32-NEXT: vzeroupper
2297 ; SKX-LABEL: test_scatter_16f64:
2299 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2300 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2301 ; SKX-NEXT: vpmovd2m %zmm2, %k1
2302 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2303 ; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
2304 ; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
2305 ; SKX-NEXT: vzeroupper
2308 ; SKX_32-LABEL: test_scatter_16f64:
2310 ; SKX_32-NEXT: pushl %ebp
2311 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2312 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2313 ; SKX_32-NEXT: movl %esp, %ebp
2314 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2315 ; SKX_32-NEXT: andl $-64, %esp
2316 ; SKX_32-NEXT: subl $64, %esp
2317 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2318 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2319 ; SKX_32-NEXT: vpmovd2m %zmm1, %k1
2320 ; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
2321 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
2322 ; SKX_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
2323 ; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
2324 ; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
2325 ; SKX_32-NEXT: movl %ebp, %esp
2326 ; SKX_32-NEXT: popl %ebp
2327 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2328 ; SKX_32-NEXT: vzeroupper
2330 call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
2333 declare void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)
2335 define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64> %d) {
2336 ; KNL_64-LABEL: test_pr28312:
2338 ; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
2339 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
2340 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0
2341 ; KNL_64-NEXT: kshiftlw $12, %k0, %k0
2342 ; KNL_64-NEXT: kshiftrw $12, %k0, %k1
2343 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm1 {%k1}
2344 ; KNL_64-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2345 ; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2348 ; KNL_32-LABEL: test_pr28312:
2350 ; KNL_32-NEXT: pushl %ebp
2351 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2352 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2353 ; KNL_32-NEXT: movl %esp, %ebp
2354 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2355 ; KNL_32-NEXT: andl $-32, %esp
2356 ; KNL_32-NEXT: subl $32, %esp
2357 ; KNL_32-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
2358 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
2359 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0
2360 ; KNL_32-NEXT: kshiftlw $12, %k0, %k0
2361 ; KNL_32-NEXT: kshiftrw $12, %k0, %k1
2362 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k1}
2363 ; KNL_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2364 ; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2365 ; KNL_32-NEXT: movl %ebp, %esp
2366 ; KNL_32-NEXT: popl %ebp
2367 ; KNL_32-NEXT: .cfi_def_cfa %esp, 4
2370 ; SKX-LABEL: test_pr28312:
2372 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
2373 ; SKX-NEXT: vpmovd2m %xmm1, %k1
2374 ; SKX-NEXT: vpgatherqq (,%ymm0), %ymm1 {%k1}
2375 ; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2376 ; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2379 ; SKX_32-LABEL: test_pr28312:
2381 ; SKX_32-NEXT: pushl %ebp
2382 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2383 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2384 ; SKX_32-NEXT: movl %esp, %ebp
2385 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2386 ; SKX_32-NEXT: andl $-32, %esp
2387 ; SKX_32-NEXT: subl $32, %esp
2388 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
2389 ; SKX_32-NEXT: vpmovd2m %xmm1, %k1
2390 ; SKX_32-NEXT: vpgatherdq (,%xmm0), %ymm1 {%k1}
2391 ; SKX_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2392 ; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2393 ; SKX_32-NEXT: movl %ebp, %esp
2394 ; SKX_32-NEXT: popl %ebp
2395 ; SKX_32-NEXT: .cfi_def_cfa %esp, 4
2397 %g1 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
2398 %g2 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
2399 %g3 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
2400 %a = add <4 x i64> %g1, %g2
2401 %b = add <4 x i64> %a, %g3
2404 declare <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>)
2406 define <8 x i32> @test_global_array(<8 x i64> %indxs) {
2407 ; KNL_64-LABEL: test_global_array:
2409 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
2410 ; KNL_64-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2411 ; KNL_64-NEXT: vmovdqa %ymm1, %ymm0
2414 ; KNL_32-LABEL: test_global_array:
2416 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
2417 ; KNL_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2418 ; KNL_32-NEXT: vmovdqa %ymm1, %ymm0
2421 ; SKX_SMALL-LABEL: test_global_array:
2422 ; SKX_SMALL: # %bb.0:
2423 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
2424 ; SKX_SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2425 ; SKX_SMALL-NEXT: vmovdqa %ymm1, %ymm0
2426 ; SKX_SMALL-NEXT: retq
2428 ; SKX_LARGE-LABEL: test_global_array:
2429 ; SKX_LARGE: # %bb.0:
2430 ; SKX_LARGE-NEXT: movabsq $glob_array, %rax
2431 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
2432 ; SKX_LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
2433 ; SKX_LARGE-NEXT: vmovdqa %ymm1, %ymm0
2434 ; SKX_LARGE-NEXT: retq
2436 ; SKX_32-LABEL: test_global_array:
2438 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2439 ; SKX_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
2440 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm0
2442 %p = getelementptr inbounds [16 x i32], [16 x i32]* @glob_array, i64 0, <8 x i64> %indxs
2443 %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
2447 define void @v1_scatter(<1 x i32>%a1, <1 x i32*> %ptr, <1 x i1> %mask) {
2448 ; KNL_64-LABEL: v1_scatter:
2450 ; KNL_64-NEXT: testb $1, %dl
2451 ; KNL_64-NEXT: jne .LBB43_1
2452 ; KNL_64-NEXT: # %bb.2: # %else
2454 ; KNL_64-NEXT: .LBB43_1: # %cond.store
2455 ; KNL_64-NEXT: movl %edi, (%rsi)
2458 ; KNL_32-LABEL: v1_scatter:
2460 ; KNL_32-NEXT: testb $1, {{[0-9]+}}(%esp)
2461 ; KNL_32-NEXT: jne .LBB43_1
2462 ; KNL_32-NEXT: # %bb.2: # %else
2464 ; KNL_32-NEXT: .LBB43_1: # %cond.store
2465 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2466 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
2467 ; KNL_32-NEXT: movl %ecx, (%eax)
2470 ; SKX-LABEL: v1_scatter:
2472 ; SKX-NEXT: testb $1, %dl
2473 ; SKX-NEXT: jne .LBB43_1
2474 ; SKX-NEXT: # %bb.2: # %else
2476 ; SKX-NEXT: .LBB43_1: # %cond.store
2477 ; SKX-NEXT: movl %edi, (%rsi)
2480 ; SKX_32-LABEL: v1_scatter:
2482 ; SKX_32-NEXT: testb $1, {{[0-9]+}}(%esp)
2483 ; SKX_32-NEXT: jne .LBB43_1
2484 ; SKX_32-NEXT: # %bb.2: # %else
2486 ; SKX_32-NEXT: .LBB43_1: # %cond.store
2487 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2488 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
2489 ; SKX_32-NEXT: movl %ecx, (%eax)
2491 call void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32> %a1, <1 x i32*> %ptr, i32 4, <1 x i1> %mask)
2494 declare void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32>, <1 x i32*>, i32, <1 x i1>)
2496 define <1 x i32> @v1_gather(<1 x i32*> %ptr, <1 x i1> %mask, <1 x i32> %src0) {
2497 ; KNL_64-LABEL: v1_gather:
2499 ; KNL_64-NEXT: movl (%rdi), %eax
2502 ; KNL_32-LABEL: v1_gather:
2504 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2505 ; KNL_32-NEXT: movl (%eax), %eax
2508 ; SKX-LABEL: v1_gather:
2510 ; SKX-NEXT: movl (%rdi), %eax
2513 ; SKX_32-LABEL: v1_gather:
2515 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2516 ; SKX_32-NEXT: movl (%eax), %eax
2518 %res = call <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*> %ptr, i32 4, <1 x i1> <i1 true>, <1 x i32> %src0)
2521 declare <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*>, i32, <1 x i1>, <1 x i32>)
2523 ; Make sure we don't crash when the index element type is larger than i64 and we need to widen the result.
2524 ; This previously triggered a bad interaction when we widened the vector and then tried to split it.
2525 define <2 x float> @large_index(float* %base, <2 x i128> %ind, <2 x i1> %mask, <2 x float> %src0) {
2526 ; KNL_64-LABEL: large_index:
2528 ; KNL_64-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
2529 ; KNL_64-NEXT: vpsllq $63, %xmm0, %xmm0
2530 ; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k0
2531 ; KNL_64-NEXT: kshiftlw $14, %k0, %k0
2532 ; KNL_64-NEXT: kshiftrw $14, %k0, %k1
2533 ; KNL_64-NEXT: vmovq %rcx, %xmm0
2534 ; KNL_64-NEXT: vmovq %rsi, %xmm2
2535 ; KNL_64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
2536 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k1}
2537 ; KNL_64-NEXT: vmovaps %xmm1, %xmm0
2538 ; KNL_64-NEXT: vzeroupper
2541 ; KNL_32-LABEL: large_index:
2543 ; KNL_32-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
2544 ; KNL_32-NEXT: vpsllq $63, %xmm0, %xmm0
2545 ; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0
2546 ; KNL_32-NEXT: kshiftlw $14, %k0, %k0
2547 ; KNL_32-NEXT: kshiftrw $14, %k0, %k1
2548 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2549 ; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2550 ; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2551 ; KNL_32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
2552 ; KNL_32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
2553 ; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm1 {%k1}
2554 ; KNL_32-NEXT: vmovaps %xmm1, %xmm0
2555 ; KNL_32-NEXT: vzeroupper
2558 ; SKX-LABEL: large_index:
2560 ; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
2561 ; SKX-NEXT: vpmovq2m %xmm0, %k1
2562 ; SKX-NEXT: vmovq %rcx, %xmm0
2563 ; SKX-NEXT: vmovq %rsi, %xmm2
2564 ; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
2565 ; SKX-NEXT: vgatherqps (%rdi,%xmm0,4), %xmm1 {%k1}
2566 ; SKX-NEXT: vmovaps %xmm1, %xmm0
2569 ; SKX_32-LABEL: large_index:
2571 ; SKX_32-NEXT: vpsllq $63, %xmm0, %xmm0
2572 ; SKX_32-NEXT: vpmovq2m %xmm0, %k1
2573 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2574 ; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2575 ; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2576 ; SKX_32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
2577 ; SKX_32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
2578 ; SKX_32-NEXT: vgatherqps (%eax,%xmm0,4), %xmm1 {%k1}
2579 ; SKX_32-NEXT: vmovaps %xmm1, %xmm0
2581 %gep.random = getelementptr float, float* %base, <2 x i128> %ind
2582 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
2586 ; Make sure we allow the index to be sign-extended from an element type smaller than i32.
2587 define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {
2588 ; KNL_64-LABEL: sext_i8_index:
2590 ; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm1
2591 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
2592 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2595 ; KNL_32-LABEL: sext_i8_index:
2597 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2598 ; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm1
2599 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
2600 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2603 ; SKX-LABEL: sext_i8_index:
2605 ; SKX-NEXT: vpmovsxbd %xmm0, %zmm1
2606 ; SKX-NEXT: kxnorw %k0, %k0, %k1
2607 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2610 ; SKX_32-LABEL: sext_i8_index:
2612 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
2613 ; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm1
2614 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
2615 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2618 %sext_ind = sext <16 x i8> %ind to <16 x i64>
2619 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
2621 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
2622 ret <16 x float>%res
2625 ; Make sure we allow index to be sign extended from a smaller than i32 element size.
; 8-element variant of the i8-index case. KNL widens the operation to a
; 512-bit gather and enables only the low 8 lanes (movw $255 -> kmovw k1);
; the i8 indices are sign-extended to dwords via zext-to-word + shl/sra 24.
; SKX can use a 256-bit gather directly with an all-ones mask (kxnorw).
2626 define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {
2627 ; KNL_64-LABEL: sext_v8i8_index:
2629 ; KNL_64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2630 ; KNL_64-NEXT:    vpslld $24, %ymm0, %ymm0
2631 ; KNL_64-NEXT:    vpsrad $24, %ymm0, %ymm1
2632 ; KNL_64-NEXT:    movw $255, %ax
2633 ; KNL_64-NEXT:    kmovw %eax, %k1
2634 ; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2635 ; KNL_64-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2638 ; KNL_32-LABEL: sext_v8i8_index:
2640 ; KNL_32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2641 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2642 ; KNL_32-NEXT:    vpslld $24, %ymm0, %ymm0
2643 ; KNL_32-NEXT:    vpsrad $24, %ymm0, %ymm1
2644 ; KNL_32-NEXT:    movw $255, %cx
2645 ; KNL_32-NEXT:    kmovw %ecx, %k1
2646 ; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2647 ; KNL_32-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
2650 ; SKX-LABEL: sext_v8i8_index:
2652 ; SKX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2653 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
2654 ; SKX-NEXT:    vpslld $24, %ymm0, %ymm0
2655 ; SKX-NEXT:    vpsrad $24, %ymm0, %ymm1
2656 ; SKX-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
2659 ; SKX_32-LABEL: sext_v8i8_index:
2661 ; SKX_32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2662 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2663 ; SKX_32-NEXT:    vpslld $24, %ymm0, %ymm0
2664 ; SKX_32-NEXT:    vpsrad $24, %ymm0, %ymm1
2665 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
2666 ; SKX_32-NEXT:    vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
2669   %sext_ind = sext <8 x i8> %ind to <8 x i64>
2670   %gep.random = getelementptr float, float *%base, <8 x i64> %sext_ind
2672   %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %gep.random, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
2675 declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>)
2677 ; Index requires promotion
; The <2 x i32> index is promoted to i64 elements by sign-extension in the
; wide lanes (vpsllq $32 + vpsraq $32), then a qword-indexed scatter
; (vscatterqpd) is used. The <2 x i1> mask is normalized by shifting the
; mask bit to bit 63 (vpsllq $63) before it is moved/tested into k1; KNL,
; which widens v2 to zmm, additionally trims the k-register to 2 lanes with
; a kshiftlw/kshiftrw $14 pair.
2678 define void @test_scatter_2i32_index(<2 x double> %a1, double* %base, <2 x i32> %ind, <2 x i1> %mask) {
2679 ; KNL_64-LABEL: test_scatter_2i32_index:
2681 ; KNL_64-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2682 ; KNL_64-NEXT:    vpsllq $32, %xmm1, %xmm1
2683 ; KNL_64-NEXT:    vpsraq $32, %zmm1, %zmm1
2684 ; KNL_64-NEXT:    vpsllq $63, %xmm2, %xmm2
2685 ; KNL_64-NEXT:    vptestmq %zmm2, %zmm2, %k0
2686 ; KNL_64-NEXT:    kshiftlw $14, %k0, %k0
2687 ; KNL_64-NEXT:    kshiftrw $14, %k0, %k1
2688 ; KNL_64-NEXT:    vscatterqpd %zmm0, (%rdi,%zmm1,8) {%k1}
2689 ; KNL_64-NEXT:    vzeroupper
2692 ; KNL_32-LABEL: test_scatter_2i32_index:
2694 ; KNL_32-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2695 ; KNL_32-NEXT:    vpsllq $32, %xmm1, %xmm1
2696 ; KNL_32-NEXT:    vpsraq $32, %zmm1, %zmm1
2697 ; KNL_32-NEXT:    vpsllq $63, %xmm2, %xmm2
2698 ; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k0
2699 ; KNL_32-NEXT:    kshiftlw $14, %k0, %k0
2700 ; KNL_32-NEXT:    kshiftrw $14, %k0, %k1
2701 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2702 ; KNL_32-NEXT:    vscatterqpd %zmm0, (%eax,%zmm1,8) {%k1}
2703 ; KNL_32-NEXT:    vzeroupper
2706 ; SKX-LABEL: test_scatter_2i32_index:
2708 ; SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
2709 ; SKX-NEXT:    vpmovq2m %xmm2, %k1
2710 ; SKX-NEXT:    vpsllq $32, %xmm1, %xmm1
2711 ; SKX-NEXT:    vpsraq $32, %xmm1, %xmm1
2712 ; SKX-NEXT:    vscatterqpd %xmm0, (%rdi,%xmm1,8) {%k1}
2715 ; SKX_32-LABEL: test_scatter_2i32_index:
2717 ; SKX_32-NEXT:    vpsllq $63, %xmm2, %xmm2
2718 ; SKX_32-NEXT:    vpmovq2m %xmm2, %k1
2719 ; SKX_32-NEXT:    vpsllq $32, %xmm1, %xmm1
2720 ; SKX_32-NEXT:    vpsraq $32, %xmm1, %xmm1
2721 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2722 ; SKX_32-NEXT:    vscatterqpd %xmm0, (%eax,%xmm1,8) {%k1}
2724   %gep = getelementptr double, double *%base, <2 x i32> %ind
2725   call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask)
2728 declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double>, <2 x double*>, i32, <2 x i1>)
; Indices are masked to the range 0..15 (and with 15) and zero-extended; the
; expected codegen keeps a dword-indexed gather: a broadcast constant-pool
; AND (vpandd/vandps {1to16}) followed by vgatherdps. Under the large code
; model (SKX_LARGE RUN line) the constant-pool address cannot be a RIP-relative
; operand, so it is materialized first with movabsq.
2730 define <16 x float> @zext_index(float* %base, <16 x i32> %ind) {
2731 ; KNL_64-LABEL: zext_index:
2733 ; KNL_64-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
2734 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
2735 ; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2738 ; KNL_32-LABEL: zext_index:
2740 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2741 ; KNL_32-NEXT:    vpandd {{\.LCPI.*}}{1to16}, %zmm0, %zmm1
2742 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
2743 ; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2746 ; SKX_SMALL-LABEL: zext_index:
2747 ; SKX_SMALL:       # %bb.0:
2748 ; SKX_SMALL-NEXT:    vandps {{.*}}(%rip){1to16}, %zmm0, %zmm1
2749 ; SKX_SMALL-NEXT:    kxnorw %k0, %k0, %k1
2750 ; SKX_SMALL-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2751 ; SKX_SMALL-NEXT:    retq
2753 ; SKX_LARGE-LABEL: zext_index:
2754 ; SKX_LARGE:       # %bb.0:
2755 ; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax
2756 ; SKX_LARGE-NEXT:    vandps (%rax){1to16}, %zmm0, %zmm1
2757 ; SKX_LARGE-NEXT:    kxnorw %k0, %k0, %k1
2758 ; SKX_LARGE-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
2759 ; SKX_LARGE-NEXT:    retq
2761 ; SKX_32-LABEL: zext_index:
2763 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2764 ; SKX_32-NEXT:    vandps {{\.LCPI.*}}{1to16}, %zmm0, %zmm1
2765 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
2766 ; SKX_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
2768   %ind_masked = and <16 x i32> %ind, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
2769   %sext_ind = zext <16 x i32> %ind_masked to <16 x i64>
2770   %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
2772   %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
2773   ret <16 x float>%res
; A v16f64 gather result does not fit in one zmm register, so the operation
; is split into two 8-lane vgatherdpd ops. The icmp-eq mask is likewise split:
; vptestnmd on each half of %cmp produces k2 (low lanes) and k1 (high lanes).
; On 32-bit targets the second passthru half is passed on the stack, so the
; frame is realigned to 64 bytes (andl $-64, %esp) before it is loaded.
2776 define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %passthru) {
2777 ; KNL_64-LABEL: test_gather_setcc_split:
2779 ; KNL_64-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
2780 ; KNL_64-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
2781 ; KNL_64-NEXT:    vptestnmd %zmm5, %zmm5, %k1
2782 ; KNL_64-NEXT:    vptestnmd %zmm1, %zmm1, %k2
2783 ; KNL_64-NEXT:    vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
2784 ; KNL_64-NEXT:    vgatherdpd (%rdi,%ymm4,8), %zmm3 {%k1}
2785 ; KNL_64-NEXT:    vmovapd %zmm2, %zmm0
2786 ; KNL_64-NEXT:    vmovapd %zmm3, %zmm1
2789 ; KNL_32-LABEL: test_gather_setcc_split:
2791 ; KNL_32-NEXT:    pushl %ebp
2792 ; KNL_32-NEXT:    .cfi_def_cfa_offset 8
2793 ; KNL_32-NEXT:    .cfi_offset %ebp, -8
2794 ; KNL_32-NEXT:    movl %esp, %ebp
2795 ; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
2796 ; KNL_32-NEXT:    andl $-64, %esp
2797 ; KNL_32-NEXT:    subl $64, %esp
2798 ; KNL_32-NEXT:    vmovapd 72(%ebp), %zmm3
2799 ; KNL_32-NEXT:    movl 8(%ebp), %eax
2800 ; KNL_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
2801 ; KNL_32-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
2802 ; KNL_32-NEXT:    vptestnmd %zmm5, %zmm5, %k1
2803 ; KNL_32-NEXT:    vptestnmd %zmm1, %zmm1, %k2
2804 ; KNL_32-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
2805 ; KNL_32-NEXT:    vgatherdpd (%eax,%ymm4,8), %zmm3 {%k1}
2806 ; KNL_32-NEXT:    vmovapd %zmm2, %zmm0
2807 ; KNL_32-NEXT:    vmovapd %zmm3, %zmm1
2808 ; KNL_32-NEXT:    movl %ebp, %esp
2809 ; KNL_32-NEXT:    popl %ebp
2810 ; KNL_32-NEXT:    .cfi_def_cfa %esp, 4
2813 ; SKX-LABEL: test_gather_setcc_split:
2815 ; SKX-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
2816 ; SKX-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
2817 ; SKX-NEXT:    vptestnmd %ymm5, %ymm5, %k1
2818 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k2
2819 ; SKX-NEXT:    vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
2820 ; SKX-NEXT:    vgatherdpd (%rdi,%ymm4,8), %zmm3 {%k1}
2821 ; SKX-NEXT:    vmovapd %zmm2, %zmm0
2822 ; SKX-NEXT:    vmovapd %zmm3, %zmm1
2825 ; SKX_32-LABEL: test_gather_setcc_split:
2827 ; SKX_32-NEXT:    pushl %ebp
2828 ; SKX_32-NEXT:    .cfi_def_cfa_offset 8
2829 ; SKX_32-NEXT:    .cfi_offset %ebp, -8
2830 ; SKX_32-NEXT:    movl %esp, %ebp
2831 ; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
2832 ; SKX_32-NEXT:    andl $-64, %esp
2833 ; SKX_32-NEXT:    subl $64, %esp
2834 ; SKX_32-NEXT:    vmovapd 72(%ebp), %zmm3
2835 ; SKX_32-NEXT:    movl 8(%ebp), %eax
2836 ; SKX_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
2837 ; SKX_32-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
2838 ; SKX_32-NEXT:    vptestnmd %ymm5, %ymm5, %k1
2839 ; SKX_32-NEXT:    vptestnmd %ymm1, %ymm1, %k2
2840 ; SKX_32-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
2841 ; SKX_32-NEXT:    vgatherdpd (%eax,%ymm4,8), %zmm3 {%k1}
2842 ; SKX_32-NEXT:    vmovapd %zmm2, %zmm0
2843 ; SKX_32-NEXT:    vmovapd %zmm3, %zmm1
2844 ; SKX_32-NEXT:    movl %ebp, %esp
2845 ; SKX_32-NEXT:    popl %ebp
2846 ; SKX_32-NEXT:    .cfi_def_cfa %esp, 4
2848   %sext_ind = sext <16 x i32> %ind to <16 x i64>
2849   %gep.random = getelementptr double, double *%base, <16 x i64> %sext_ind
2851   %mask = icmp eq <16 x i32> %cmp, zeroinitializer
2852   %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %gep.random, i32 4, <16 x i1> %mask, <16 x double> %passthru)
2853   ret <16 x double>%res
; Scatter counterpart of test_gather_setcc_split: a v16f64 scatter is split
; into two 8-lane vscatterdpd ops, with the icmp-eq mask split via two
; vptestnmd tests (k1 low half, k2 high half). 32-bit targets realign the
; stack to load the stack-passed upper half of %src0 (vmovapd 72(%ebp)).
2856 define void @test_scatter_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %src0) {
2857 ; KNL_64-LABEL: test_scatter_setcc_split:
2859 ; KNL_64-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
2860 ; KNL_64-NEXT:    vptestnmd %zmm1, %zmm1, %k1
2861 ; KNL_64-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
2862 ; KNL_64-NEXT:    vptestnmd %zmm1, %zmm1, %k2
2863 ; KNL_64-NEXT:    vscatterdpd %zmm3, (%rdi,%ymm4,8) {%k2}
2864 ; KNL_64-NEXT:    vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k1}
2865 ; KNL_64-NEXT:    vzeroupper
2868 ; KNL_32-LABEL: test_scatter_setcc_split:
2870 ; KNL_32-NEXT:    pushl %ebp
2871 ; KNL_32-NEXT:    .cfi_def_cfa_offset 8
2872 ; KNL_32-NEXT:    .cfi_offset %ebp, -8
2873 ; KNL_32-NEXT:    movl %esp, %ebp
2874 ; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
2875 ; KNL_32-NEXT:    andl $-64, %esp
2876 ; KNL_32-NEXT:    subl $64, %esp
2877 ; KNL_32-NEXT:    vmovapd 72(%ebp), %zmm3
2878 ; KNL_32-NEXT:    movl 8(%ebp), %eax
2879 ; KNL_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
2880 ; KNL_32-NEXT:    vptestnmd %zmm1, %zmm1, %k1
2881 ; KNL_32-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
2882 ; KNL_32-NEXT:    vptestnmd %zmm1, %zmm1, %k2
2883 ; KNL_32-NEXT:    vscatterdpd %zmm3, (%eax,%ymm4,8) {%k2}
2884 ; KNL_32-NEXT:    vscatterdpd %zmm2, (%eax,%ymm0,8) {%k1}
2885 ; KNL_32-NEXT:    movl %ebp, %esp
2886 ; KNL_32-NEXT:    popl %ebp
2887 ; KNL_32-NEXT:    .cfi_def_cfa %esp, 4
2888 ; KNL_32-NEXT:    vzeroupper
2891 ; SKX-LABEL: test_scatter_setcc_split:
2893 ; SKX-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
2894 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1
2895 ; SKX-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
2896 ; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k2
2897 ; SKX-NEXT:    vscatterdpd %zmm3, (%rdi,%ymm4,8) {%k2}
2898 ; SKX-NEXT:    vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k1}
2899 ; SKX-NEXT:    vzeroupper
2902 ; SKX_32-LABEL: test_scatter_setcc_split:
2904 ; SKX_32-NEXT:    pushl %ebp
2905 ; SKX_32-NEXT:    .cfi_def_cfa_offset 8
2906 ; SKX_32-NEXT:    .cfi_offset %ebp, -8
2907 ; SKX_32-NEXT:    movl %esp, %ebp
2908 ; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
2909 ; SKX_32-NEXT:    andl $-64, %esp
2910 ; SKX_32-NEXT:    subl $64, %esp
2911 ; SKX_32-NEXT:    vmovapd 72(%ebp), %zmm3
2912 ; SKX_32-NEXT:    movl 8(%ebp), %eax
2913 ; SKX_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm4
2914 ; SKX_32-NEXT:    vptestnmd %ymm1, %ymm1, %k1
2915 ; SKX_32-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
2916 ; SKX_32-NEXT:    vptestnmd %ymm1, %ymm1, %k2
2917 ; SKX_32-NEXT:    vscatterdpd %zmm3, (%eax,%ymm4,8) {%k2}
2918 ; SKX_32-NEXT:    vscatterdpd %zmm2, (%eax,%ymm0,8) {%k1}
2919 ; SKX_32-NEXT:    movl %ebp, %esp
2920 ; SKX_32-NEXT:    popl %ebp
2921 ; SKX_32-NEXT:    .cfi_def_cfa %esp, 4
2922 ; SKX_32-NEXT:    vzeroupper
2924   %sext_ind = sext <16 x i32> %ind to <16 x i64>
2925   %gep.random = getelementptr double, double *%base, <16 x i64> %sext_ind
2927   %mask = icmp eq <16 x i32> %cmp, zeroinitializer
2928   call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %gep.random, i32 4, <16 x i1> %mask)
2932 ; This test case previously triggered an infinite loop when the two gathers became identical after DAG combine removed the sign extend.
; Regression test: after DAG combine drops the i64 sext, the i64-indexed and
; i32-indexed gathers CSE into a single vgatherdps, and the final fadd of the
; two results becomes an add of the one gather with itself
; (vaddps %zmm1, %zmm1, %zmm0). Compilation must terminate.
2933 define <16 x float> @test_sext_cse(float* %base, <16 x i32> %ind, <16 x i32>* %foo) {
2934 ; KNL_64-LABEL: test_sext_cse:
2936 ; KNL_64-NEXT:    vmovaps %zmm0, (%rsi)
2937 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
2938 ; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
2939 ; KNL_64-NEXT:    vaddps %zmm1, %zmm1, %zmm0
2942 ; KNL_32-LABEL: test_sext_cse:
2944 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2945 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
2946 ; KNL_32-NEXT:    vmovaps %zmm0, (%ecx)
2947 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
2948 ; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
2949 ; KNL_32-NEXT:    vaddps %zmm1, %zmm1, %zmm0
2952 ; SKX-LABEL: test_sext_cse:
2954 ; SKX-NEXT:    vmovaps %zmm0, (%rsi)
2955 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
2956 ; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
2957 ; SKX-NEXT:    vaddps %zmm1, %zmm1, %zmm0
2960 ; SKX_32-LABEL: test_sext_cse:
2962 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
2963 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
2964 ; SKX_32-NEXT:    vmovaps %zmm0, (%ecx)
2965 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
2966 ; SKX_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
2967 ; SKX_32-NEXT:    vaddps %zmm1, %zmm1, %zmm0
2969   %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
2970   %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
2972   %sext_ind = sext <16 x i32> %ind to <16 x i64>
2973   %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
2975   store <16 x i32> %ind, <16 x i32>* %foo
2976   %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
2977   %gep.random2 = getelementptr float, <16 x float*> %broadcast.splat, <16 x i32> %ind
2978   %res2 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random2, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
2979   %res3 = fadd <16 x float> %res2, %res
2980   ret <16 x float>%res3