1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_64
3 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_32
4 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX
5 ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX_32
6 ; RUN: opt -mtriple=x86_64-apple-darwin -scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
7 ; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null
9 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
10 target triple = "x86_64-unknown-linux-gnu"
14 ; SCALAR: extractelement <16 x float*>
15 ; SCALAR-NEXT: load float
16 ; SCALAR-NEXT: insertelement <16 x float>
17 ; SCALAR-NEXT: extractelement <16 x float*>
18 ; SCALAR-NEXT: load float
20 define <16 x float> @test1(float* %base, <16 x i32> %ind) {
21 ; KNL_64-LABEL: test1:
23 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
24 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
25 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
28 ; KNL_32-LABEL: test1:
30 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
31 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
32 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
33 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
38 ; SKX-NEXT: kxnorw %k0, %k0, %k1
39 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
40 ; SKX-NEXT: vmovaps %zmm1, %zmm0
43 ; SKX_32-LABEL: test1:
45 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
46 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
47 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
48 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
51 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
52 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
54 %sext_ind = sext <16 x i32> %ind to <16 x i64>
55 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
57 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
61 declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
62 declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
63 declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )
67 ; SCALAR: extractelement <16 x float*>
68 ; SCALAR-NEXT: load float
69 ; SCALAR-NEXT: insertelement <16 x float>
70 ; SCALAR-NEXT: br label %else
72 ; SCALAR-NEXT: %res.phi.else = phi
73 ; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
74 ; SCALAR-NEXT: %ToLoad1 = icmp eq i1 %Mask1, true
75 ; SCALAR-NEXT: br i1 %ToLoad1, label %cond.load1, label %else2
77 define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
78 ; KNL_64-LABEL: test2:
80 ; KNL_64-NEXT: kmovw %esi, %k1
81 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
82 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
85 ; KNL_32-LABEL: test2:
87 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
88 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
89 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
90 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
95 ; SKX-NEXT: kmovw %esi, %k1
96 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
97 ; SKX-NEXT: vmovaps %zmm1, %zmm0
100 ; SKX_32-LABEL: test2:
102 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
103 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
104 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
105 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
108 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
109 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
111 %sext_ind = sext <16 x i32> %ind to <16 x i64>
112 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
113 %imask = bitcast i16 %mask to <16 x i1>
114 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
115 ret <16 x float> %res
118 define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
119 ; KNL_64-LABEL: test3:
121 ; KNL_64-NEXT: kmovw %esi, %k1
122 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
123 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0
126 ; KNL_32-LABEL: test3:
128 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
129 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
130 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
131 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
136 ; SKX-NEXT: kmovw %esi, %k1
137 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
138 ; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
141 ; SKX_32-LABEL: test3:
143 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
144 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
145 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
146 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
149 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
150 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
152 %sext_ind = sext <16 x i32> %ind to <16 x i64>
153 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind
154 %imask = bitcast i16 %mask to <16 x i1>
155 %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
160 define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
161 ; KNL_64-LABEL: test4:
163 ; KNL_64-NEXT: kmovw %esi, %k1
164 ; KNL_64-NEXT: kmovw %k1, %k2
165 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
166 ; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2
167 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
168 ; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0
171 ; KNL_32-LABEL: test4:
173 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
174 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
175 ; KNL_32-NEXT: kmovw %k1, %k2
176 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
177 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
178 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
179 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
184 ; SKX-NEXT: kmovw %esi, %k1
185 ; SKX-NEXT: kmovw %k1, %k2
186 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
187 ; SKX-NEXT: vmovdqa64 %zmm1, %zmm2
188 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
189 ; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0
192 ; SKX_32-LABEL: test4:
194 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
195 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
196 ; SKX_32-NEXT: kmovw %k1, %k2
197 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
198 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
199 ; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
200 ; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
203 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
204 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
206 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
207 %imask = bitcast i16 %mask to <16 x i1>
208 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
209 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
210 %res = add <16 x i32> %gt1, %gt2
215 ; SCALAR-LABEL: test5
216 ; SCALAR: %Mask0 = extractelement <16 x i1> %imask, i32 0
217 ; SCALAR-NEXT: %ToStore0 = icmp eq i1 %Mask0, true
218 ; SCALAR-NEXT: br i1 %ToStore0, label %cond.store, label %else
219 ; SCALAR: cond.store:
220 ; SCALAR-NEXT: %Elt0 = extractelement <16 x i32> %val, i32 0
221 ; SCALAR-NEXT: %Ptr0 = extractelement <16 x i32*> %gep.random, i32 0
222 ; SCALAR-NEXT: store i32 %Elt0, i32* %Ptr0, align 4
223 ; SCALAR-NEXT: br label %else
225 ; SCALAR-NEXT: %Mask1 = extractelement <16 x i1> %imask, i32 1
226 ; SCALAR-NEXT: %ToStore1 = icmp eq i1 %Mask1, true
227 ; SCALAR-NEXT: br i1 %ToStore1, label %cond.store1, label %else2
229 define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
230 ; KNL_64-LABEL: test5:
232 ; KNL_64-NEXT: kmovw %esi, %k1
233 ; KNL_64-NEXT: kmovw %k1, %k2
234 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
235 ; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
236 ; KNL_64-NEXT: vzeroupper
239 ; KNL_32-LABEL: test5:
241 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
242 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
243 ; KNL_32-NEXT: kmovw %k1, %k2
244 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
245 ; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
246 ; KNL_32-NEXT: vzeroupper
251 ; SKX-NEXT: kmovw %esi, %k1
252 ; SKX-NEXT: kmovw %k1, %k2
253 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
254 ; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
255 ; SKX-NEXT: vzeroupper
258 ; SKX_32-LABEL: test5:
260 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
261 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
262 ; SKX_32-NEXT: kmovw %k1, %k2
263 ; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
264 ; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
265 ; SKX_32-NEXT: vzeroupper
268 %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
269 %broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
271 %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
272 %imask = bitcast i16 %mask to <16 x i1>
273 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
274 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
278 declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
279 declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )
282 ; SCALAR-LABEL: test6
283 ; SCALAR: store i32 %Elt0, i32* %Ptr01, align 4
284 ; SCALAR-NEXT: %Elt1 = extractelement <8 x i32> %a1, i32 1
285 ; SCALAR-NEXT: %Ptr12 = extractelement <8 x i32*> %ptr, i32 1
286 ; SCALAR-NEXT: store i32 %Elt1, i32* %Ptr12, align 4
287 ; SCALAR-NEXT: %Elt2 = extractelement <8 x i32> %a1, i32 2
288 ; SCALAR-NEXT: %Ptr23 = extractelement <8 x i32*> %ptr, i32 2
289 ; SCALAR-NEXT: store i32 %Elt2, i32* %Ptr23, align 4
291 define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
292 ; KNL_64-LABEL: test6:
294 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
295 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
296 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
297 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
298 ; KNL_64-NEXT: vmovdqa %ymm2, %ymm0
301 ; KNL_32-LABEL: test6:
303 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm2
304 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
305 ; KNL_32-NEXT: kxnorw %k0, %k0, %k2
306 ; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2}
307 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1}
308 ; KNL_32-NEXT: vmovdqa %ymm1, %ymm0
313 ; SKX-NEXT: kxnorw %k0, %k0, %k1
314 ; SKX-NEXT: kxnorw %k0, %k0, %k2
315 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
316 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
317 ; SKX-NEXT: vmovdqa %ymm2, %ymm0
320 ; SKX_32-LABEL: test6:
322 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
323 ; SKX_32-NEXT: kxnorw %k0, %k0, %k2
324 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2}
325 ; SKX_32-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1}
326 ; SKX_32-NEXT: vmovdqa %ymm2, %ymm0
329 %a = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
331 call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
335 define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
337 ; KNL_64-LABEL: test7:
339 ; KNL_64-NEXT: kmovw %esi, %k1
340 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
341 ; KNL_64-NEXT: kmovw %k1, %k2
342 ; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
343 ; KNL_64-NEXT: vmovdqa %ymm1, %ymm2
344 ; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
345 ; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0
348 ; KNL_32-LABEL: test7:
350 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
351 ; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
352 ; KNL_32-NEXT: kmovw %ecx, %k1
353 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
354 ; KNL_32-NEXT: kmovw %k1, %k2
355 ; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2}
356 ; KNL_32-NEXT: vmovdqa %ymm1, %ymm2
357 ; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
358 ; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
363 ; SKX-NEXT: kmovw %esi, %k1
364 ; SKX-NEXT: kmovw %k1, %k2
365 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
366 ; SKX-NEXT: vmovdqa %ymm1, %ymm2
367 ; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
368 ; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0
371 ; SKX_32-LABEL: test7:
373 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
374 ; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
375 ; SKX_32-NEXT: kmovw %k1, %k2
376 ; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm1 {%k2}
377 ; SKX_32-NEXT: vmovdqa %ymm1, %ymm2
378 ; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm2 {%k1}
379 ; SKX_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0
382 %broadcast.splatinsert = insertelement <8 x i32*> undef, i32* %base, i32 0
383 %broadcast.splat = shufflevector <8 x i32*> %broadcast.splatinsert, <8 x i32*> undef, <8 x i32> zeroinitializer
385 %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind
386 %imask = bitcast i8 %mask to <8 x i1>
387 %gt1 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
388 %gt2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
389 %res = add <8 x i32> %gt1, %gt2
393 ; No uniform base in this case, index <8 x i64> contains addresses,
394 ; each gather call will be split into two
395 define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
396 ; KNL_64-LABEL: test8:
398 ; KNL_64-NEXT: kmovw %edi, %k1
399 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
400 ; KNL_64-NEXT: kmovw %k2, %k3
401 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
402 ; KNL_64-NEXT: kmovw %k1, %k3
403 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
404 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
405 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
406 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
407 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
408 ; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0
411 ; KNL_32-LABEL: test8:
413 ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
414 ; KNL_32-NEXT: kmovw %k1, %k2
415 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
416 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2
417 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
418 ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
423 ; SKX-NEXT: kmovw %edi, %k1
424 ; SKX-NEXT: kshiftrw $8, %k1, %k2
425 ; SKX-NEXT: kmovw %k2, %k3
426 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
427 ; SKX-NEXT: kmovw %k1, %k3
428 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
429 ; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm4
430 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
431 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
432 ; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
433 ; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
436 ; SKX_32-LABEL: test8:
438 ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
439 ; SKX_32-NEXT: kmovw %k1, %k2
440 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
441 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
442 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
443 ; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0
446 %imask = bitcast i16 %mask to <16 x i1>
447 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
448 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
449 %res = add <16 x i32> %gt1, %gt2
453 %struct.RT = type { i8, [10 x [20 x i32]], i8 }
454 %struct.ST = type { i32, double, %struct.RT }
456 ; Masked gather for aggregate types
457 ; Test9 and Test10 should give the same result (scalar and vector indices in GEP)
460 define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
461 ; KNL_64-LABEL: test9:
462 ; KNL_64: # BB#0: # %entry
463 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
464 ; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
465 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
466 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
467 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
468 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
469 ; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
470 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
471 ; KNL_64-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
472 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
473 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
474 ; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
475 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
476 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
479 ; KNL_32-LABEL: test9:
480 ; KNL_32: # BB#0: # %entry
481 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
482 ; KNL_32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm3
483 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
484 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
485 ; KNL_32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm3
486 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
487 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
488 ; KNL_32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1
489 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
490 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
491 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
492 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
493 ; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
497 ; SKX: # BB#0: # %entry
498 ; SKX-NEXT: vpbroadcastq %rdi, %zmm2
499 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
500 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
501 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
502 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
503 ; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
504 ; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
505 ; SKX-NEXT: kxnorw %k0, %k0, %k1
506 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
509 ; SKX_32-LABEL: test9:
510 ; SKX_32: # BB#0: # %entry
511 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1
512 ; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
513 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
514 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
515 ; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
516 ; SKX_32-NEXT: vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1
517 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
518 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm0 {%k1}
521 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
522 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
524 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13>
525 %res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
529 define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
530 ; KNL_64-LABEL: test10:
531 ; KNL_64: # BB#0: # %entry
532 ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
533 ; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
534 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
535 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
536 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
537 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0
538 ; KNL_64-NEXT: vpaddq %zmm2, %zmm0, %zmm0
539 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
540 ; KNL_64-NEXT: vpmuldq {{.*}}(%rip){1to8}, %zmm1, %zmm1
541 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
542 ; KNL_64-NEXT: vpaddq %zmm0, %zmm4, %zmm0
543 ; KNL_64-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
544 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
545 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
548 ; KNL_32-LABEL: test10:
549 ; KNL_32: # BB#0: # %entry
550 ; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
551 ; KNL_32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm3
552 ; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
553 ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
554 ; KNL_32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm3
555 ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
556 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
557 ; KNL_32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1
558 ; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
559 ; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
560 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
561 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
562 ; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
566 ; SKX: # BB#0: # %entry
567 ; SKX-NEXT: vpbroadcastq %rdi, %zmm2
568 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
569 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
570 ; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
571 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
572 ; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
573 ; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
574 ; SKX-NEXT: kxnorw %k0, %k0, %k1
575 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
578 ; SKX_32-LABEL: test10:
579 ; SKX_32: # BB#0: # %entry
580 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1
581 ; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
582 ; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
583 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
584 ; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
585 ; SKX_32-NEXT: vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm1
586 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
587 ; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm0 {%k1}
590 %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
591 %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer
593 %arrayidx = getelementptr %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13
594 %res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
598 ; Splat index in GEP, requires broadcast
599 define <16 x float> @test11(float* %base, i32 %ind) {
600 ; KNL_64-LABEL: test11:
602 ; KNL_64-NEXT: vpbroadcastd %esi, %zmm1
603 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
604 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
607 ; KNL_32-LABEL: test11:
609 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
610 ; KNL_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
611 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
612 ; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
617 ; SKX-NEXT: vpbroadcastd %esi, %zmm1
618 ; SKX-NEXT: kxnorw %k0, %k0, %k1
619 ; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
622 ; SKX_32-LABEL: test11:
624 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
625 ; SKX_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
626 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
627 ; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
630 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
631 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
633 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
635 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
639 ; We are checking the uniform base here. It is taken directly from input to vgatherdps
640 define <16 x float> @test12(float* %base, <16 x i32> %ind) {
641 ; KNL_64-LABEL: test12:
643 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
644 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
645 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
648 ; KNL_32-LABEL: test12:
650 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
651 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
652 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
653 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
658 ; SKX-NEXT: kxnorw %k0, %k0, %k1
659 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
660 ; SKX-NEXT: vmovaps %zmm1, %zmm0
663 ; SKX_32-LABEL: test12:
665 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
666 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
667 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
668 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
671 %sext_ind = sext <16 x i32> %ind to <16 x i64>
672 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
674 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
678 ; The same as the previous, but the mask is undefined
679 define <16 x float> @test13(float* %base, <16 x i32> %ind) {
680 ; KNL_64-LABEL: test13:
682 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
683 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
684 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
687 ; KNL_32-LABEL: test13:
689 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
690 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
691 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
692 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
697 ; SKX-NEXT: kxnorw %k0, %k0, %k1
698 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
699 ; SKX-NEXT: vmovaps %zmm1, %zmm0
702 ; SKX_32-LABEL: test13:
704 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
705 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
706 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
707 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
710 %sext_ind = sext <16 x i32> %ind to <16 x i64>
711 %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
713 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
717 ; The base pointer is not splat, can't find a uniform base
718 define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
719 ; KNL_64-LABEL: test14:
721 ; KNL_64-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
722 ; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0
723 ; KNL_64-NEXT: vmovd %esi, %xmm1
724 ; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1
725 ; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
726 ; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1
727 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
728 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
729 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
730 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k2}
731 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
732 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm0
735 ; KNL_32-LABEL: test14:
737 ; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
738 ; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0
739 ; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
740 ; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
741 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
742 ; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
747 ; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
748 ; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
749 ; SKX-NEXT: vpbroadcastd %esi, %ymm1
750 ; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
751 ; SKX-NEXT: vpsllq $2, %zmm1, %zmm1
752 ; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
753 ; SKX-NEXT: kxnorw %k0, %k0, %k1
754 ; SKX-NEXT: kshiftrw $8, %k1, %k2
755 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k2}
756 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
757 ; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm2, %zmm0
760 ; SKX_32-LABEL: test14:
762 ; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
763 ; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0
764 ; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
765 ; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
766 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
767 ; SKX_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
770 %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
771 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
773 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind
775 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
779 declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
780 declare <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
781 declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
783 ; Gather smaller than existing instruction
784 define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
786 ; KNL_64-LABEL: test15:
788 ; KNL_64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
789 ; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
790 ; KNL_64-NEXT: vpxor %ymm2, %ymm2, %ymm2
791 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
792 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2
793 ; KNL_64-NEXT: vpslld $31, %ymm1, %ymm0
794 ; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
795 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
796 ; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
797 ; KNL_64-NEXT: vzeroupper
800 ; KNL_32-LABEL: test15:
802 ; KNL_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
803 ; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
804 ; KNL_32-NEXT: vpxor %ymm2, %ymm2, %ymm2
805 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
806 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
807 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2
808 ; KNL_32-NEXT: vpslld $31, %ymm1, %ymm0
809 ; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
810 ; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
811 ; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
812 ; KNL_32-NEXT: vzeroupper
817 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
818 ; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
819 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
820 ; SKX-NEXT: vmovaps %xmm1, %xmm0
823 ; SKX_32-LABEL: test15:
825 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
826 ; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
827 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
828 ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1}
829 ; SKX_32-NEXT: vmovaps %xmm1, %xmm0
832 %sext_ind = sext <4 x i32> %ind to <4 x i64>
833 %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
834 %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
838 ; Gather smaller than existing instruction
839 define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
841 ; KNL_64-LABEL: test16:
843 ; KNL_64-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
844 ; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
845 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
846 ; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
847 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
848 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
849 ; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
850 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
851 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
852 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
853 ; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
854 ; KNL_64-NEXT: vmovapd %ymm2, %ymm0
857 ; KNL_32-LABEL: test16:
859 ; KNL_32-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
860 ; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
861 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
862 ; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
863 ; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
864 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
865 ; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
866 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
867 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
868 ; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
869 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
870 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
871 ; KNL_32-NEXT: vmovapd %ymm2, %ymm0
876 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
877 ; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
878 ; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
879 ; SKX-NEXT: vmovapd %ymm2, %ymm0
882 ; SKX_32-LABEL: test16:
884 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
885 ; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
886 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
887 ; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1}
888 ; SKX_32-NEXT: vmovapd %ymm2, %ymm0
891 %sext_ind = sext <4 x i32> %ind to <4 x i64>
892 %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
893 %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
897 define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
899 ; KNL_64-LABEL: test17:
901 ; KNL_64-NEXT: # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
902 ; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
903 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
904 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
905 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
906 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
907 ; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
908 ; KNL_64-NEXT: vmovapd %xmm2, %xmm0
909 ; KNL_64-NEXT: vzeroupper
912 ; KNL_32-LABEL: test17:
914 ; KNL_32-NEXT: # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
915 ; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
916 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
917 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
918 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
919 ; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
920 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
921 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
922 ; KNL_32-NEXT: vmovapd %xmm2, %xmm0
923 ; KNL_32-NEXT: vzeroupper
928 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
929 ; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
930 ; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
931 ; SKX-NEXT: vmovapd %xmm2, %xmm0
934 ; SKX_32-LABEL: test17:
936 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
937 ; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
938 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
939 ; SKX_32-NEXT: vgatherqpd (%eax,%xmm0,8), %xmm2 {%k1}
940 ; SKX_32-NEXT: vmovapd %xmm2, %xmm0
943 %sext_ind = sext <2 x i32> %ind to <2 x i64>
944 %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
945 %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
949 declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
950 declare void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
951 declare void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
952 declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
953 declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
955 define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
957 ; KNL_64-LABEL: test18:
959 ; KNL_64-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
960 ; KNL_64-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
961 ; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
962 ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
963 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
964 ; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2
965 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
966 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
967 ; KNL_64-NEXT: vzeroupper
970 ; KNL_32-LABEL: test18:
972 ; KNL_32-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
973 ; KNL_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
974 ; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
975 ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
976 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
977 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
978 ; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2
979 ; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1
980 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
981 ; KNL_32-NEXT: vzeroupper
986 ; SKX-NEXT: vpslld $31, %xmm2, %xmm2
987 ; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1
988 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
989 ; SKX-NEXT: vzeroupper
992 ; SKX_32-LABEL: test18:
994 ; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
995 ; SKX_32-NEXT: vptestmd %xmm2, %xmm2, %k1
996 ; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
998 call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
1002 define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
1004 ; KNL_64-LABEL: test19:
1006 ; KNL_64-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
1007 ; KNL_64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
1008 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
1009 ; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
1010 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
1011 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
1012 ; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
1013 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
1014 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
1015 ; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
1016 ; KNL_64-NEXT: vzeroupper
1019 ; KNL_32-LABEL: test19:
1021 ; KNL_32-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
1022 ; KNL_32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
1023 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
1024 ; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
1025 ; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
1026 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
1027 ; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
1028 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1029 ; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
1030 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
1031 ; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
1032 ; KNL_32-NEXT: vzeroupper
1035 ; SKX-LABEL: test19:
1037 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
1038 ; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
1039 ; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
1040 ; SKX-NEXT: vzeroupper
1043 ; SKX_32-LABEL: test19:
1045 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
1046 ; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
1047 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1048 ; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
1049 ; SKX_32-NEXT: vzeroupper
1051 %gep = getelementptr double, double* %ptr, <4 x i64> %ind
1052 call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
1056 ; Data type requires widening
1057 define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
1059 ; KNL_64-LABEL: test20:
1061 ; KNL_64-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
1062 ; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
1063 ; KNL_64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,2],zero,zero
1064 ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
1065 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
1066 ; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2
1067 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1068 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
1069 ; KNL_64-NEXT: vzeroupper
1072 ; KNL_32-LABEL: test20:
1074 ; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
1075 ; KNL_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,2],zero,zero
1076 ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
1077 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
1078 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1079 ; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
1080 ; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2
1081 ; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1
1082 ; KNL_32-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
1083 ; KNL_32-NEXT: vzeroupper
1086 ; SKX-LABEL: test20:
1088 ; SKX-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
1089 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
1090 ; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0
1091 ; SKX-NEXT: kshiftlb $6, %k0, %k0
1092 ; SKX-NEXT: kshiftrb $6, %k0, %k1
1093 ; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1}
1094 ; SKX-NEXT: vzeroupper
1097 ; SKX_32-LABEL: test20:
1099 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1100 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
1101 ; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0
1102 ; SKX_32-NEXT: kshiftlb $6, %k0, %k0
1103 ; SKX_32-NEXT: kshiftrb $6, %k0, %k1
1104 ; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1}
1106 call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
1110 ; Data type requires promotion
1111 define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
1113 ; KNL_64-LABEL: test21:
1115 ; KNL_64-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
1116 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
1117 ; KNL_64-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
1118 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1119 ; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2
1120 ; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
1121 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
1122 ; KNL_64-NEXT: vzeroupper
1125 ; KNL_32-LABEL: test21:
1127 ; KNL_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
1128 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
1129 ; KNL_32-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
1130 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1131 ; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2
1132 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
1133 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
1134 ; KNL_32-NEXT: vzeroupper
1137 ; SKX-LABEL: test21:
1139 ; SKX-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
1140 ; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
1141 ; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0
1142 ; SKX-NEXT: kshiftlb $6, %k0, %k0
1143 ; SKX-NEXT: kshiftrb $6, %k0, %k1
1144 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1145 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
1146 ; SKX-NEXT: vzeroupper
1149 ; SKX_32-LABEL: test21:
1151 ; SKX_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
1152 ; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
1153 ; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0
1154 ; SKX_32-NEXT: kshiftlb $6, %k0, %k0
1155 ; SKX_32-NEXT: kshiftrb $6, %k0, %k1
1156 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1157 ; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
1158 ; SKX_32-NEXT: vzeroupper
1160 call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
1164 ; The result type requires widening
1165 declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
1167 define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
1170 ; KNL_64-LABEL: test22:
1172 ; KNL_64-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
1173 ; KNL_64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
1174 ; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
1175 ; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
1176 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1177 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
1178 ; KNL_64-NEXT: vpslld $31, %ymm1, %ymm1
1179 ; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1
1180 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
1181 ; KNL_64-NEXT: vmovaps %xmm2, %xmm0
1182 ; KNL_64-NEXT: vzeroupper
1185 ; KNL_32-LABEL: test22:
1187 ; KNL_32-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
1188 ; KNL_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
1189 ; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
1190 ; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
1191 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1192 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1193 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
1194 ; KNL_32-NEXT: vpslld $31, %ymm1, %ymm1
1195 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1196 ; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
1197 ; KNL_32-NEXT: vmovaps %xmm2, %xmm0
1198 ; KNL_32-NEXT: vzeroupper
1201 ; SKX-LABEL: test22:
1203 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1204 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1205 ; SKX-NEXT: vptestmq %xmm1, %xmm1, %k0
1206 ; SKX-NEXT: kshiftlb $6, %k0, %k0
1207 ; SKX-NEXT: kshiftrb $6, %k0, %k1
1208 ; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
1209 ; SKX-NEXT: vmovaps %xmm2, %xmm0
1212 ; SKX_32-LABEL: test22:
1214 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1215 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1216 ; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k0
1217 ; SKX_32-NEXT: kshiftlb $6, %k0, %k0
1218 ; SKX_32-NEXT: kshiftrb $6, %k0, %k1
1219 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1220 ; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1}
1221 ; SKX_32-NEXT: vmovaps %xmm2, %xmm0
1223 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1224 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
1225 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
1229 declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
1230 declare <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
1232 define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
1234 ; KNL_64-LABEL: test23:
1236 ; KNL_64-NEXT: # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
1237 ; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
1238 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
1239 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1240 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
1241 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
1242 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
1243 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1244 ; KNL_64-NEXT: vzeroupper
1247 ; KNL_32-LABEL: test23:
1249 ; KNL_32-NEXT: # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
1250 ; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
1251 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
1252 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1253 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1254 ; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
1255 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
1256 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
1257 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1258 ; KNL_32-NEXT: vzeroupper
1261 ; SKX-LABEL: test23:
1263 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1264 ; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
1265 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
1266 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1269 ; SKX_32-LABEL: test23:
1271 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1272 ; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
1273 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1274 ; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
1275 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
1277 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1278 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
1279 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
1283 define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
1284 ; KNL_64-LABEL: test24:
1286 ; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
1287 ; KNL_64-NEXT: movb $3, %al
1288 ; KNL_64-NEXT: kmovw %eax, %k1
1289 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
1290 ; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
1291 ; KNL_64-NEXT: vzeroupper
1294 ; KNL_32-LABEL: test24:
1296 ; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
1297 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1298 ; KNL_32-NEXT: vpxord %zmm1, %zmm1, %zmm1
1299 ; KNL_32-NEXT: vinserti32x4 $0, {{\.LCPI.*}}, %zmm1, %zmm1
1300 ; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
1301 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
1302 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
1303 ; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
1304 ; KNL_32-NEXT: vzeroupper
1307 ; SKX-LABEL: test24:
1309 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1310 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
1311 ; SKX-NEXT: vmovdqa %xmm1, %xmm0
1314 ; SKX_32-LABEL: test24:
1316 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1317 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
1318 ; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
1319 ; SKX_32-NEXT: vmovdqa %xmm1, %xmm0
1321 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1322 %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
1323 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
1327 define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
1329 ; KNL_64-LABEL: test25:
1331 ; KNL_64-NEXT: # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
1332 ; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
1333 ; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
1334 ; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1335 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
1336 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
1337 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
1338 ; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
1339 ; KNL_64-NEXT: vzeroupper
1342 ; KNL_32-LABEL: test25:
1344 ; KNL_32-NEXT: # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
1345 ; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
1346 ; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
1347 ; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
1348 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1349 ; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
1350 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
1351 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
1352 ; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
1353 ; KNL_32-NEXT: vzeroupper
1356 ; SKX-LABEL: test25:
1358 ; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
1359 ; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
1360 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
1361 ; SKX-NEXT: vmovdqa %xmm2, %xmm0
1364 ; SKX_32-LABEL: test25:
1366 ; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
1367 ; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
1368 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1369 ; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm2 {%k1}
1370 ; SKX_32-NEXT: vmovdqa %xmm2, %xmm0
1372 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1373 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
1374 %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
1378 define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
1380 ; KNL_64-LABEL: test26:
1382 ; KNL_64-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
1383 ; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
1384 ; KNL_64-NEXT: movb $3, %al
1385 ; KNL_64-NEXT: kmovw %eax, %k1
1386 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
1387 ; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
1388 ; KNL_64-NEXT: vzeroupper
1391 ; KNL_32-LABEL: test26:
1393 ; KNL_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
1394 ; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
1395 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1396 ; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
1397 ; KNL_32-NEXT: vinserti32x4 $0, {{\.LCPI.*}}, %zmm2, %zmm2
1398 ; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2
1399 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
1400 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
1401 ; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
1402 ; KNL_32-NEXT: vzeroupper
1405 ; SKX-LABEL: test26:
1407 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1408 ; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
1409 ; SKX-NEXT: vmovdqa %xmm1, %xmm0
1412 ; SKX_32-LABEL: test26:
1414 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1415 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
1416 ; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
1417 ; SKX_32-NEXT: vmovdqa %xmm1, %xmm0
1419 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1420 %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
1421 %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
1425 ; Result type requires widening; all-ones mask
1426 define <2 x float> @test27(float* %base, <2 x i32> %ind) {
1428 ; KNL_64-LABEL: test27:
1430 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1431 ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1
1432 ; KNL_64-NEXT: movb $3, %al
1433 ; KNL_64-NEXT: kmovw %eax, %k1
1434 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
1435 ; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1436 ; KNL_64-NEXT: vzeroupper
1439 ; KNL_32-LABEL: test27:
1441 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1442 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1443 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
1444 ; KNL_32-NEXT: movb $3, %cl
1445 ; KNL_32-NEXT: kmovw %ecx, %k1
1446 ; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
1447 ; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
1448 ; KNL_32-NEXT: vzeroupper
1451 ; SKX-LABEL: test27:
1453 ; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
1454 ; SKX-NEXT: movb $3, %al
1455 ; SKX-NEXT: kmovw %eax, %k1
1456 ; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
1459 ; SKX_32-LABEL: test27:
1461 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
1462 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1463 ; SKX_32-NEXT: movb $3, %cl
1464 ; SKX_32-NEXT: kmovw %ecx, %k1
1465 ; SKX_32-NEXT: vgatherdps (%eax,%xmm1,4), %xmm0 {%k1}
1467 %sext_ind = sext <2 x i32> %ind to <2 x i64>
1468 %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
1469 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
1473 ; Data type requires promotion, mask is all-ones
1474 define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
1477 ; KNL_64-LABEL: test28:
1479 ; KNL_64-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
1480 ; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1481 ; KNL_64-NEXT: movb $3, %al
1482 ; KNL_64-NEXT: kmovw %eax, %k1
1483 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
1484 ; KNL_64-NEXT: vzeroupper
1487 ; KNL_32-LABEL: test28:
1489 ; KNL_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
1490 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1491 ; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
1492 ; KNL_32-NEXT: vinserti32x4 $0, {{\.LCPI.*}}, %zmm2, %zmm2
1493 ; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2
1494 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
1495 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
1496 ; KNL_32-NEXT: vzeroupper
1499 ; SKX-LABEL: test28:
1501 ; SKX-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
1502 ; SKX-NEXT: movb $3, %al
1503 ; SKX-NEXT: kmovw %eax, %k1
1504 ; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1505 ; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
1506 ; SKX-NEXT: vzeroupper
1509 ; SKX_32-LABEL: test28:
1511 ; SKX_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
1512 ; SKX_32-NEXT: movb $3, %al
1513 ; SKX_32-NEXT: kmovw %eax, %k1
1514 ; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1515 ; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
1516 ; SKX_32-NEXT: vzeroupper
1518 call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
1523 ; SCALAR-LABEL: test29
1524 ; SCALAR: extractelement <16 x float*>
1525 ; SCALAR-NEXT: load float
1526 ; SCALAR-NEXT: insertelement <16 x float>
1527 ; SCALAR-NEXT: extractelement <16 x float*>
1528 ; SCALAR-NEXT: load float
1530 define <16 x float> @test29(float* %base, <16 x i32> %ind) {
1531 ; KNL_64-LABEL: test29:
1533 ; KNL_64-NEXT: movw $44, %ax
1534 ; KNL_64-NEXT: kmovw %eax, %k1
1535 ; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1536 ; KNL_64-NEXT: vmovaps %zmm1, %zmm0
1539 ; KNL_32-LABEL: test29:
1541 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1542 ; KNL_32-NEXT: movw $44, %cx
1543 ; KNL_32-NEXT: kmovw %ecx, %k1
1544 ; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
1545 ; KNL_32-NEXT: vmovaps %zmm1, %zmm0
1548 ; SKX-LABEL: test29:
1550 ; SKX-NEXT: movw $44, %ax
1551 ; SKX-NEXT: kmovw %eax, %k1
1552 ; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
1553 ; SKX-NEXT: vmovaps %zmm1, %zmm0
1556 ; SKX_32-LABEL: test29:
1558 ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
1559 ; SKX_32-NEXT: movw $44, %cx
1560 ; SKX_32-NEXT: kmovw %ecx, %k1
1561 ; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
1562 ; SKX_32-NEXT: vmovaps %zmm1, %zmm0
1565 %broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
1566 %broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
1568 %sext_ind = sext <16 x i32> %ind to <16 x i64>
1569 %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
1571 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
1572 ret <16 x float>%res
1575 ; Check non-power-of-2 case. It should be scalarized.
1576 declare <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
1577 define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
1581 %sext_ind = sext <3 x i32> %ind to <3 x i64>
1582 %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
1583 %res = call <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
1587 declare <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)
1592 define <16 x float*> @test31(<16 x float**> %ptrs) {
1593 ; KNL_64-LABEL: test31:
1595 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
1596 ; KNL_64-NEXT: kxnorw %k0, %k0, %k2
1597 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
1598 ; KNL_64-NEXT: kshiftrw $8, %k1, %k1
1599 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
1600 ; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0
1601 ; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm1
1604 ; KNL_32-LABEL: test31:
1606 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
1607 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
1608 ; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
1611 ; SKX-LABEL: test31:
1613 ; SKX-NEXT: kxnorw %k0, %k0, %k1
1614 ; SKX-NEXT: kxnorw %k0, %k0, %k2
1615 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
1616 ; SKX-NEXT: kshiftrw $8, %k1, %k1
1617 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
1618 ; SKX-NEXT: vmovdqa64 %zmm2, %zmm0
1619 ; SKX-NEXT: vmovdqa64 %zmm3, %zmm1
1622 ; SKX_32-LABEL: test31:
1624 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
1625 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
1626 ; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
1629 %res = call <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
1630 ret <16 x float*>%res
1633 define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
1634 ; KNL_64-LABEL: test_gather_16i32:
1636 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1637 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1638 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1639 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2
1640 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1641 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
1642 ; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
1643 ; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
1646 ; KNL_32-LABEL: test_gather_16i32:
1648 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1649 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1650 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1651 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
1652 ; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
1655 ; SKX-LABEL: test_gather_16i32:
1657 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1658 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1659 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1660 ; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm2
1661 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1662 ; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
1663 ; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
1664 ; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
1667 ; SKX_32-LABEL: test_gather_16i32:
1669 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1670 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1671 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1672 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
1673 ; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
1675 %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
1678 define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
1679 ; KNL_64-LABEL: test_gather_16i64:
1681 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1682 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1683 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1684 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1685 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
1686 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
1687 ; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm0
1688 ; KNL_64-NEXT: vmovdqa64 %zmm4, %zmm1
1691 ; KNL_32-LABEL: test_gather_16i64:
1693 ; KNL_32-NEXT: pushl %ebp
1694 ; KNL_32-NEXT: .Lcfi0:
1695 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1696 ; KNL_32-NEXT: .Lcfi1:
1697 ; KNL_32-NEXT: .cfi_offset %ebp, -8
1698 ; KNL_32-NEXT: movl %esp, %ebp
1699 ; KNL_32-NEXT: .Lcfi2:
1700 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
1701 ; KNL_32-NEXT: andl $-64, %esp
1702 ; KNL_32-NEXT: subl $64, %esp
1703 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1704 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1705 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1706 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1707 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
1708 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
1709 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1710 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
1711 ; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0
1712 ; KNL_32-NEXT: movl %ebp, %esp
1713 ; KNL_32-NEXT: popl %ebp
1716 ; SKX-LABEL: test_gather_16i64:
1718 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1719 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1720 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1721 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1722 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
1723 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
1724 ; SKX-NEXT: vmovdqa64 %zmm3, %zmm0
1725 ; SKX-NEXT: vmovdqa64 %zmm4, %zmm1
1728 ; SKX_32-LABEL: test_gather_16i64:
1730 ; SKX_32-NEXT: pushl %ebp
1731 ; SKX_32-NEXT: .Lcfi1:
1732 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
1733 ; SKX_32-NEXT: .Lcfi2:
1734 ; SKX_32-NEXT: .cfi_offset %ebp, -8
1735 ; SKX_32-NEXT: movl %esp, %ebp
1736 ; SKX_32-NEXT: .Lcfi3:
1737 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
1738 ; SKX_32-NEXT: andl $-64, %esp
1739 ; SKX_32-NEXT: subl $64, %esp
1740 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1741 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1742 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1743 ; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1744 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
1745 ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
1746 ; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
1747 ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
1748 ; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
1749 ; SKX_32-NEXT: movl %ebp, %esp
1750 ; SKX_32-NEXT: popl %ebp
1752 %res = call <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
1755 declare <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
; NOTE(review): <16 x float> masked gather. The checks below show that on
; 64-bit targets the pointer operand occupies two zmm registers of qword
; pointers, so the mask is split with kshiftrw and two vgatherqps are issued,
; then the halves are reassembled with vinsert; 32-bit targets keep dword
; pointers and emit a single vgatherdps.
1756 define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
1757 ; KNL_64-LABEL: test_gather_16f32:
1759 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1760 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1761 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1762 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2
1763 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1764 ; KNL_64-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
1765 ; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
1766 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
1769 ; KNL_32-LABEL: test_gather_16f32:
1771 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1772 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1773 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1774 ; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
1775 ; KNL_32-NEXT: vmovaps %zmm2, %zmm0
1778 ; SKX-LABEL: test_gather_16f32:
1780 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1781 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1782 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1783 ; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm2
1784 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1785 ; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
1786 ; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
1787 ; SKX-NEXT: vinsertf32x8 $1, %ymm2, %zmm3, %zmm0
1790 ; SKX_32-LABEL: test_gather_16f32:
1792 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1793 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1794 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1795 ; SKX_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
1796 ; SKX_32-NEXT: vmovaps %zmm2, %zmm0
1798 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
1799 ret <16 x float> %res
; NOTE(review): <16 x double> masked gather; result needs two zmm registers.
; 64-bit targets split the mask with kshiftrw and issue two vgatherqpd.
; 32-bit targets realign the stack (andl $-64) to load the second half of the
; zmm passthru argument from 8(%ebp), and extract the upper ymm of the dword
; pointer vector for the second vgatherdpd.
1801 define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
1802 ; KNL_64-LABEL: test_gather_16f64:
1804 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1805 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1806 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1807 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1808 ; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
1809 ; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
1810 ; KNL_64-NEXT: vmovapd %zmm3, %zmm0
1811 ; KNL_64-NEXT: vmovapd %zmm4, %zmm1
1814 ; KNL_32-LABEL: test_gather_16f64:
1816 ; KNL_32-NEXT: pushl %ebp
1817 ; KNL_32-NEXT: .Lcfi3:
1818 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1819 ; KNL_32-NEXT: .Lcfi4:
1820 ; KNL_32-NEXT: .cfi_offset %ebp, -8
1821 ; KNL_32-NEXT: movl %esp, %ebp
1822 ; KNL_32-NEXT: .Lcfi5:
1823 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
1824 ; KNL_32-NEXT: andl $-64, %esp
1825 ; KNL_32-NEXT: subl $64, %esp
1826 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1827 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1828 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1829 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
1830 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
1831 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
1832 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1833 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
1834 ; KNL_32-NEXT: vmovapd %zmm2, %zmm0
1835 ; KNL_32-NEXT: movl %ebp, %esp
1836 ; KNL_32-NEXT: popl %ebp
1839 ; SKX-LABEL: test_gather_16f64:
1841 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1842 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1843 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1844 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1845 ; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
1846 ; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
1847 ; SKX-NEXT: vmovapd %zmm3, %zmm0
1848 ; SKX-NEXT: vmovapd %zmm4, %zmm1
1851 ; SKX_32-LABEL: test_gather_16f64:
1853 ; SKX_32-NEXT: pushl %ebp
1854 ; SKX_32-NEXT: .Lcfi4:
1855 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
1856 ; SKX_32-NEXT: .Lcfi5:
1857 ; SKX_32-NEXT: .cfi_offset %ebp, -8
1858 ; SKX_32-NEXT: movl %esp, %ebp
1859 ; SKX_32-NEXT: .Lcfi6:
1860 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
1861 ; SKX_32-NEXT: andl $-64, %esp
1862 ; SKX_32-NEXT: subl $64, %esp
1863 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1864 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1865 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1866 ; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
1867 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
1868 ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
1869 ; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
1870 ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
1871 ; SKX_32-NEXT: vmovapd %zmm2, %zmm0
1872 ; SKX_32-NEXT: movl %ebp, %esp
1873 ; SKX_32-NEXT: popl %ebp
1875 %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
1876 ret <16 x double> %res
1878 declare <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
; NOTE(review): <16 x i32> masked scatter. On 64-bit targets the qword pointer
; vector spans two zmms, so the data is split (vextract of the high ymm) and
; two vpscatterqd are issued with the mask halved via kshiftrw; 32-bit targets
; emit a single vpscatterdd.
1879 define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
1880 ; KNL_64-LABEL: test_scatter_16i32:
1882 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1883 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1884 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1885 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1886 ; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
1887 ; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0
1888 ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
1889 ; KNL_64-NEXT: vzeroupper
1892 ; KNL_32-LABEL: test_scatter_16i32:
1894 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1895 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1896 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1897 ; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
1898 ; KNL_32-NEXT: vzeroupper
1901 ; SKX-LABEL: test_scatter_16i32:
1903 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1904 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1905 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1906 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1907 ; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
1908 ; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm0
1909 ; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
1910 ; SKX-NEXT: vzeroupper
1913 ; SKX_32-LABEL: test_scatter_16i32:
1915 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1916 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1917 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1918 ; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
1919 ; SKX_32-NEXT: vzeroupper
1921 call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
; NOTE(review): <16 x i64> masked scatter; data spans two zmm registers.
; 64-bit targets issue two vpscatterqq with the mask split via kshiftrw.
; 32-bit targets realign the stack (andl $-64) to load the second data half
; from 8(%ebp) and scatter via two vpscatterdq over ymm dword-pointer halves.
1924 define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
1925 ; KNL_64-LABEL: test_scatter_16i64:
1927 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
1928 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
1929 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
1930 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
1931 ; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
1932 ; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
1933 ; KNL_64-NEXT: vzeroupper
1936 ; KNL_32-LABEL: test_scatter_16i64:
1938 ; KNL_32-NEXT: pushl %ebp
1939 ; KNL_32-NEXT: .Lcfi6:
1940 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
1941 ; KNL_32-NEXT: .Lcfi7:
1942 ; KNL_32-NEXT: .cfi_offset %ebp, -8
1943 ; KNL_32-NEXT: movl %esp, %ebp
1944 ; KNL_32-NEXT: .Lcfi8:
1945 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
1946 ; KNL_32-NEXT: andl $-64, %esp
1947 ; KNL_32-NEXT: subl $64, %esp
1948 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
1949 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
1950 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1951 ; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1952 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
1953 ; KNL_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
1954 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
1955 ; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
1956 ; KNL_32-NEXT: movl %ebp, %esp
1957 ; KNL_32-NEXT: popl %ebp
1958 ; KNL_32-NEXT: vzeroupper
1961 ; SKX-LABEL: test_scatter_16i64:
1963 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
1964 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
1965 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
1966 ; SKX-NEXT: kshiftrw $8, %k1, %k2
1967 ; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
1968 ; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
1969 ; SKX-NEXT: vzeroupper
1972 ; SKX_32-LABEL: test_scatter_16i64:
1974 ; SKX_32-NEXT: pushl %ebp
1975 ; SKX_32-NEXT: .Lcfi7:
1976 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
1977 ; SKX_32-NEXT: .Lcfi8:
1978 ; SKX_32-NEXT: .cfi_offset %ebp, -8
1979 ; SKX_32-NEXT: movl %esp, %ebp
1980 ; SKX_32-NEXT: .Lcfi9:
1981 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
1982 ; SKX_32-NEXT: andl $-64, %esp
1983 ; SKX_32-NEXT: subl $64, %esp
1984 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
1985 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
1986 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
1987 ; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
1988 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
1989 ; SKX_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
1990 ; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
1991 ; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
1992 ; SKX_32-NEXT: movl %ebp, %esp
1993 ; SKX_32-NEXT: popl %ebp
1994 ; SKX_32-NEXT: vzeroupper
1996 call void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
1999 declare void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
; NOTE(review): <16 x float> masked scatter. Mirrors the i32 scatter above:
; 64-bit targets split data and mask and issue two vscatterqps, 32-bit
; targets emit a single vscatterdps.
2000 define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
2001 ; KNL_64-LABEL: test_scatter_16f32:
2003 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2004 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2005 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2006 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2007 ; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
2008 ; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0
2009 ; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
2010 ; KNL_64-NEXT: vzeroupper
2013 ; KNL_32-LABEL: test_scatter_16f32:
2015 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2016 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2017 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2018 ; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
2019 ; KNL_32-NEXT: vzeroupper
2022 ; SKX-LABEL: test_scatter_16f32:
2024 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2025 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2026 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
2027 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2028 ; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
2029 ; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0
2030 ; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
2031 ; SKX-NEXT: vzeroupper
2034 ; SKX_32-LABEL: test_scatter_16f32:
2036 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2037 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2038 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2039 ; SKX_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
2040 ; SKX_32-NEXT: vzeroupper
2042 call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
2045 declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
; NOTE(review): <16 x double> masked scatter; data spans two zmm registers.
; Mirrors the i64 scatter above: 64-bit targets issue two vscatterqpd with a
; split mask; 32-bit targets realign the stack to load the second data half
; from 8(%ebp) and use two vscatterdpd over ymm dword-pointer halves.
2046 define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
2047 ; KNL_64-LABEL: test_scatter_16f64:
2049 ; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
2050 ; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
2051 ; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
2052 ; KNL_64-NEXT: kshiftrw $8, %k1, %k2
2053 ; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
2054 ; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
2055 ; KNL_64-NEXT: vzeroupper
2058 ; KNL_32-LABEL: test_scatter_16f64:
2060 ; KNL_32-NEXT: pushl %ebp
2061 ; KNL_32-NEXT: .Lcfi9:
2062 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2063 ; KNL_32-NEXT: .Lcfi10:
2064 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2065 ; KNL_32-NEXT: movl %esp, %ebp
2066 ; KNL_32-NEXT: .Lcfi11:
2067 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2068 ; KNL_32-NEXT: andl $-64, %esp
2069 ; KNL_32-NEXT: subl $64, %esp
2070 ; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
2071 ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
2072 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2073 ; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
2074 ; KNL_32-NEXT: kshiftrw $8, %k1, %k2
2075 ; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
2076 ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
2077 ; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
2078 ; KNL_32-NEXT: movl %ebp, %esp
2079 ; KNL_32-NEXT: popl %ebp
2080 ; KNL_32-NEXT: vzeroupper
2083 ; SKX-LABEL: test_scatter_16f64:
2085 ; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
2086 ; SKX-NEXT: vpslld $31, %zmm2, %zmm2
2087 ; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
2088 ; SKX-NEXT: kshiftrw $8, %k1, %k2
2089 ; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
2090 ; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
2091 ; SKX-NEXT: vzeroupper
2094 ; SKX_32-LABEL: test_scatter_16f64:
2096 ; SKX_32-NEXT: pushl %ebp
2097 ; SKX_32-NEXT: .Lcfi10:
2098 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2099 ; SKX_32-NEXT: .Lcfi11:
2100 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2101 ; SKX_32-NEXT: movl %esp, %ebp
2102 ; SKX_32-NEXT: .Lcfi12:
2103 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2104 ; SKX_32-NEXT: andl $-64, %esp
2105 ; SKX_32-NEXT: subl $64, %esp
2106 ; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
2107 ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
2108 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
2109 ; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
2110 ; SKX_32-NEXT: kshiftrw $8, %k1, %k2
2111 ; SKX_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
2112 ; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
2113 ; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
2114 ; SKX_32-NEXT: movl %ebp, %esp
2115 ; SKX_32-NEXT: popl %ebp
2116 ; SKX_32-NEXT: vzeroupper
2118 call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
2121 declare void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)
; NOTE(review): regression test for PR28312. The IR performs three identical
; masked gathers from the same pointers/mask; the checks confirm codegen CSEs
; them into a single vpgatherqq (or vpgatherdq on 32-bit SKX) followed by the
; two vpaddq that sum the three (identical) results. The unused %k2 argument
; is deliberate — only %k drives the gathers.
2123 define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64> %d) {
2124 ; KNL_64-LABEL: test_pr28312:
2126 ; KNL_64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
2127 ; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
2128 ; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
2129 ; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
2130 ; KNL_64-NEXT: vpxord %zmm2, %zmm2, %zmm2
2131 ; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
2132 ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
2133 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
2134 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm1 {%k1}
2135 ; KNL_64-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2136 ; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2139 ; KNL_32-LABEL: test_pr28312:
2141 ; KNL_32-NEXT: pushl %ebp
2142 ; KNL_32-NEXT: .Lcfi12:
2143 ; KNL_32-NEXT: .cfi_def_cfa_offset 8
2144 ; KNL_32-NEXT: .Lcfi13:
2145 ; KNL_32-NEXT: .cfi_offset %ebp, -8
2146 ; KNL_32-NEXT: movl %esp, %ebp
2147 ; KNL_32-NEXT: .Lcfi14:
2148 ; KNL_32-NEXT: .cfi_def_cfa_register %ebp
2149 ; KNL_32-NEXT: andl $-32, %esp
2150 ; KNL_32-NEXT: subl $32, %esp
2151 ; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
2152 ; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
2153 ; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
2154 ; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
2155 ; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
2156 ; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
2157 ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
2158 ; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
2159 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
2160 ; KNL_32-NEXT: vpgatherqq (,%zmm0), %zmm1 {%k1}
2161 ; KNL_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2162 ; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2163 ; KNL_32-NEXT: movl %ebp, %esp
2164 ; KNL_32-NEXT: popl %ebp
2167 ; SKX-LABEL: test_pr28312:
2169 ; SKX-NEXT: vpslld $31, %xmm1, %xmm1
2170 ; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
2171 ; SKX-NEXT: vpgatherqq (,%ymm0), %ymm1 {%k1}
2172 ; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2173 ; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2176 ; SKX_32-LABEL: test_pr28312:
2178 ; SKX_32-NEXT: pushl %ebp
2179 ; SKX_32-NEXT: .Lcfi13:
2180 ; SKX_32-NEXT: .cfi_def_cfa_offset 8
2181 ; SKX_32-NEXT: .Lcfi14:
2182 ; SKX_32-NEXT: .cfi_offset %ebp, -8
2183 ; SKX_32-NEXT: movl %esp, %ebp
2184 ; SKX_32-NEXT: .Lcfi15:
2185 ; SKX_32-NEXT: .cfi_def_cfa_register %ebp
2186 ; SKX_32-NEXT: andl $-32, %esp
2187 ; SKX_32-NEXT: subl $32, %esp
2188 ; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
2189 ; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
2190 ; SKX_32-NEXT: vpgatherdq (,%xmm0), %ymm1 {%k1}
2191 ; SKX_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0
2192 ; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0
2193 ; SKX_32-NEXT: movl %ebp, %esp
2194 ; SKX_32-NEXT: popl %ebp
2196 %g1 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
2197 %g2 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
2198 %g3 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
2199 %a = add <4 x i64> %g1, %g2
2200 %b = add <4 x i64> %a, %g3
2203 declare <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>)