1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
10 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
13 ; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
14 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
; Per-element variable shift of a 2 x i64 vector. Every lane of %a is shifted
; left by the matching lane of %b (the shl at the end of the body).
; Lowering expected by the autogenerated checks below
;   - SSE2, SSE4.1, AVX1 and the 32-bit run issue two psllq, one per lane
;     amount, and merge the two halves with movsd or pblendw
;   - AVX2, XOPAVX2 and all AVX512 runs use a single vpsllvq
;   - XOPAVX1 uses vpshlq
20 define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
21 ; SSE2-LABEL: var_shift_v2i64:
23 ; SSE2-NEXT: movdqa %xmm0, %xmm2
24 ; SSE2-NEXT: psllq %xmm1, %xmm2
25 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
26 ; SSE2-NEXT: psllq %xmm1, %xmm0
27 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
30 ; SSE41-LABEL: var_shift_v2i64:
32 ; SSE41-NEXT: movdqa %xmm0, %xmm2
33 ; SSE41-NEXT: psllq %xmm1, %xmm2
34 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
35 ; SSE41-NEXT: psllq %xmm1, %xmm0
36 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
39 ; AVX1-LABEL: var_shift_v2i64:
41 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
42 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
43 ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
44 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
47 ; AVX2-LABEL: var_shift_v2i64:
49 ; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
52 ; XOPAVX1-LABEL: var_shift_v2i64:
54 ; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
57 ; XOPAVX2-LABEL: var_shift_v2i64:
59 ; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
62 ; AVX512-LABEL: var_shift_v2i64:
64 ; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
67 ; AVX512VL-LABEL: var_shift_v2i64:
69 ; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
72 ; X32-SSE-LABEL: var_shift_v2i64:
74 ; X32-SSE-NEXT: movdqa %xmm0, %xmm2
75 ; X32-SSE-NEXT: psllq %xmm1, %xmm2
76 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
77 ; X32-SSE-NEXT: psllq %xmm1, %xmm0
78 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
80 %shift = shl <2 x i64> %a, %b
; Per-element variable shift of a 4 x i32 vector. Every lane of %a is shifted
; left by the matching lane of %b.
; Lowering expected by the autogenerated checks below
;   - SSE2 and the 32-bit run derive a per-lane multiplier from the shift
;     amounts (pslld $23, paddd with a constant, cvttps2dq) and multiply with
;     two pmuludq whose results are re-interleaved by pshufd and punpckldq
;   - SSE4.1 and AVX1 derive the same multiplier and use one pmulld
;   - AVX2, XOPAVX2 and all AVX512 runs use a single vpsllvd
;   - XOPAVX1 uses vpshld
84 define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
85 ; SSE2-LABEL: var_shift_v4i32:
87 ; SSE2-NEXT: pslld $23, %xmm1
88 ; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
89 ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
90 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
91 ; SSE2-NEXT: pmuludq %xmm0, %xmm1
92 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
93 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
94 ; SSE2-NEXT: pmuludq %xmm2, %xmm0
95 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
96 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
97 ; SSE2-NEXT: movdqa %xmm1, %xmm0
100 ; SSE41-LABEL: var_shift_v4i32:
102 ; SSE41-NEXT: pslld $23, %xmm1
103 ; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
104 ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
105 ; SSE41-NEXT: pmulld %xmm1, %xmm0
108 ; AVX1-LABEL: var_shift_v4i32:
110 ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
111 ; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
112 ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
113 ; AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm0
116 ; AVX2-LABEL: var_shift_v4i32:
118 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
121 ; XOPAVX1-LABEL: var_shift_v4i32:
123 ; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
126 ; XOPAVX2-LABEL: var_shift_v4i32:
128 ; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
131 ; AVX512-LABEL: var_shift_v4i32:
133 ; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
136 ; AVX512VL-LABEL: var_shift_v4i32:
138 ; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
139 ; AVX512VL-NEXT: retq
141 ; X32-SSE-LABEL: var_shift_v4i32:
143 ; X32-SSE-NEXT: pslld $23, %xmm1
144 ; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
145 ; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
146 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
147 ; X32-SSE-NEXT: pmuludq %xmm0, %xmm1
148 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
149 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
150 ; X32-SSE-NEXT: pmuludq %xmm2, %xmm0
151 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
152 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
153 ; X32-SSE-NEXT: movdqa %xmm1, %xmm0
155 %shift = shl <4 x i32> %a, %b
; Per-element variable shift of an 8 x i16 vector. Every lane of %a is shifted
; left by the matching lane of %b.
; Lowering expected by the autogenerated checks below
;   - SSE2 and the 32-bit run apply conditional shifts by 8, 4, 2 and 1,
;     selecting each step with a psraw $15 mask combined through
;     pandn, pand and por
;   - SSE4.1 and AVX1 apply the same 8, 4, 2, 1 steps but select with pblendvb
;   - AVX2 zero-extends both operands to i32 lanes, shifts with vpsllvd and
;     narrows again with vpshufb plus vpermq
;   - both XOP runs use a single vpshlw
;   - the AVX512DQ runs zero-extend to i32, use vpsllvd and truncate with vpmovdw
;   - AVX512BW runs vpsllvw on zmm registers, AVX512BW with VL runs it on xmm
159 define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
160 ; SSE2-LABEL: var_shift_v8i16:
162 ; SSE2-NEXT: psllw $12, %xmm1
163 ; SSE2-NEXT: movdqa %xmm1, %xmm2
164 ; SSE2-NEXT: psraw $15, %xmm2
165 ; SSE2-NEXT: movdqa %xmm2, %xmm3
166 ; SSE2-NEXT: pandn %xmm0, %xmm3
167 ; SSE2-NEXT: psllw $8, %xmm0
168 ; SSE2-NEXT: pand %xmm2, %xmm0
169 ; SSE2-NEXT: por %xmm3, %xmm0
170 ; SSE2-NEXT: paddw %xmm1, %xmm1
171 ; SSE2-NEXT: movdqa %xmm1, %xmm2
172 ; SSE2-NEXT: psraw $15, %xmm2
173 ; SSE2-NEXT: movdqa %xmm2, %xmm3
174 ; SSE2-NEXT: pandn %xmm0, %xmm3
175 ; SSE2-NEXT: psllw $4, %xmm0
176 ; SSE2-NEXT: pand %xmm2, %xmm0
177 ; SSE2-NEXT: por %xmm3, %xmm0
178 ; SSE2-NEXT: paddw %xmm1, %xmm1
179 ; SSE2-NEXT: movdqa %xmm1, %xmm2
180 ; SSE2-NEXT: psraw $15, %xmm2
181 ; SSE2-NEXT: movdqa %xmm2, %xmm3
182 ; SSE2-NEXT: pandn %xmm0, %xmm3
183 ; SSE2-NEXT: psllw $2, %xmm0
184 ; SSE2-NEXT: pand %xmm2, %xmm0
185 ; SSE2-NEXT: por %xmm3, %xmm0
186 ; SSE2-NEXT: paddw %xmm1, %xmm1
187 ; SSE2-NEXT: psraw $15, %xmm1
188 ; SSE2-NEXT: movdqa %xmm1, %xmm2
189 ; SSE2-NEXT: pandn %xmm0, %xmm2
190 ; SSE2-NEXT: psllw $1, %xmm0
191 ; SSE2-NEXT: pand %xmm1, %xmm0
192 ; SSE2-NEXT: por %xmm2, %xmm0
195 ; SSE41-LABEL: var_shift_v8i16:
197 ; SSE41-NEXT: movdqa %xmm0, %xmm2
198 ; SSE41-NEXT: movdqa %xmm1, %xmm0
199 ; SSE41-NEXT: psllw $12, %xmm0
200 ; SSE41-NEXT: psllw $4, %xmm1
201 ; SSE41-NEXT: por %xmm0, %xmm1
202 ; SSE41-NEXT: movdqa %xmm1, %xmm3
203 ; SSE41-NEXT: paddw %xmm3, %xmm3
204 ; SSE41-NEXT: movdqa %xmm2, %xmm4
205 ; SSE41-NEXT: psllw $8, %xmm4
206 ; SSE41-NEXT: movdqa %xmm1, %xmm0
207 ; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm2
208 ; SSE41-NEXT: movdqa %xmm2, %xmm1
209 ; SSE41-NEXT: psllw $4, %xmm1
210 ; SSE41-NEXT: movdqa %xmm3, %xmm0
211 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
212 ; SSE41-NEXT: movdqa %xmm2, %xmm1
213 ; SSE41-NEXT: psllw $2, %xmm1
214 ; SSE41-NEXT: paddw %xmm3, %xmm3
215 ; SSE41-NEXT: movdqa %xmm3, %xmm0
216 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
217 ; SSE41-NEXT: movdqa %xmm2, %xmm1
218 ; SSE41-NEXT: psllw $1, %xmm1
219 ; SSE41-NEXT: paddw %xmm3, %xmm3
220 ; SSE41-NEXT: movdqa %xmm3, %xmm0
221 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
222 ; SSE41-NEXT: movdqa %xmm2, %xmm0
225 ; AVX1-LABEL: var_shift_v8i16:
227 ; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
228 ; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
229 ; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
230 ; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
231 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm3
232 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
233 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm1
234 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
235 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1
236 ; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
237 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
238 ; AVX1-NEXT: vpsllw $1, %xmm0, %xmm1
239 ; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
240 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
243 ; AVX2-LABEL: var_shift_v8i16:
245 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
246 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
247 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
248 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
249 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
250 ; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
251 ; AVX2-NEXT: vzeroupper
254 ; XOP-LABEL: var_shift_v8i16:
256 ; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
259 ; AVX512DQ-LABEL: var_shift_v8i16:
261 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
262 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
263 ; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
264 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
265 ; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
266 ; AVX512DQ-NEXT: vzeroupper
267 ; AVX512DQ-NEXT: retq
269 ; AVX512BW-LABEL: var_shift_v8i16:
271 ; AVX512BW-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
272 ; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
273 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
274 ; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
275 ; AVX512BW-NEXT: vzeroupper
276 ; AVX512BW-NEXT: retq
278 ; AVX512DQVL-LABEL: var_shift_v8i16:
279 ; AVX512DQVL: # %bb.0:
280 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
281 ; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
282 ; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
283 ; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
284 ; AVX512DQVL-NEXT: vzeroupper
285 ; AVX512DQVL-NEXT: retq
287 ; AVX512BWVL-LABEL: var_shift_v8i16:
288 ; AVX512BWVL: # %bb.0:
289 ; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
290 ; AVX512BWVL-NEXT: retq
292 ; X32-SSE-LABEL: var_shift_v8i16:
294 ; X32-SSE-NEXT: psllw $12, %xmm1
295 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2
296 ; X32-SSE-NEXT: psraw $15, %xmm2
297 ; X32-SSE-NEXT: movdqa %xmm2, %xmm3
298 ; X32-SSE-NEXT: pandn %xmm0, %xmm3
299 ; X32-SSE-NEXT: psllw $8, %xmm0
300 ; X32-SSE-NEXT: pand %xmm2, %xmm0
301 ; X32-SSE-NEXT: por %xmm3, %xmm0
302 ; X32-SSE-NEXT: paddw %xmm1, %xmm1
303 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2
304 ; X32-SSE-NEXT: psraw $15, %xmm2
305 ; X32-SSE-NEXT: movdqa %xmm2, %xmm3
306 ; X32-SSE-NEXT: pandn %xmm0, %xmm3
307 ; X32-SSE-NEXT: psllw $4, %xmm0
308 ; X32-SSE-NEXT: pand %xmm2, %xmm0
309 ; X32-SSE-NEXT: por %xmm3, %xmm0
310 ; X32-SSE-NEXT: paddw %xmm1, %xmm1
311 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2
312 ; X32-SSE-NEXT: psraw $15, %xmm2
313 ; X32-SSE-NEXT: movdqa %xmm2, %xmm3
314 ; X32-SSE-NEXT: pandn %xmm0, %xmm3
315 ; X32-SSE-NEXT: psllw $2, %xmm0
316 ; X32-SSE-NEXT: pand %xmm2, %xmm0
317 ; X32-SSE-NEXT: por %xmm3, %xmm0
318 ; X32-SSE-NEXT: paddw %xmm1, %xmm1
319 ; X32-SSE-NEXT: psraw $15, %xmm1
320 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2
321 ; X32-SSE-NEXT: pandn %xmm0, %xmm2
322 ; X32-SSE-NEXT: psllw $1, %xmm0
323 ; X32-SSE-NEXT: pand %xmm1, %xmm0
324 ; X32-SSE-NEXT: por %xmm2, %xmm0
326 %shift = shl <8 x i16> %a, %b
; Per-element variable shift of a 16 x i8 vector. Every lane of %a is shifted
; left by the matching lane of %b.
; Lowering expected by the autogenerated checks below
;   - SSE2 and the 32-bit run apply conditional steps of 4, 2 and 1
;     (psllw $4, psllw $2, paddb), masking the word shifts with a constant
;     and selecting each step with a pcmpgtb mask
;   - SSE4.1 and both AVX runs apply the same steps but select with pblendvb
;   - both XOP runs use a single vpshlb
;   - the AVX512DQ runs zero-extend to i32 lanes in zmm registers, shift with
;     vpsllvd and truncate with vpmovdb
;   - the AVX512BW runs zero-extend to i16 lanes, shift with vpsllvw and
;     truncate with vpmovwb
330 define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
331 ; SSE2-LABEL: var_shift_v16i8:
333 ; SSE2-NEXT: psllw $5, %xmm1
334 ; SSE2-NEXT: pxor %xmm2, %xmm2
335 ; SSE2-NEXT: pxor %xmm3, %xmm3
336 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
337 ; SSE2-NEXT: movdqa %xmm3, %xmm4
338 ; SSE2-NEXT: pandn %xmm0, %xmm4
339 ; SSE2-NEXT: psllw $4, %xmm0
340 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
341 ; SSE2-NEXT: pand %xmm3, %xmm0
342 ; SSE2-NEXT: por %xmm4, %xmm0
343 ; SSE2-NEXT: paddb %xmm1, %xmm1
344 ; SSE2-NEXT: pxor %xmm3, %xmm3
345 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
346 ; SSE2-NEXT: movdqa %xmm3, %xmm4
347 ; SSE2-NEXT: pandn %xmm0, %xmm4
348 ; SSE2-NEXT: psllw $2, %xmm0
349 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
350 ; SSE2-NEXT: pand %xmm3, %xmm0
351 ; SSE2-NEXT: por %xmm4, %xmm0
352 ; SSE2-NEXT: paddb %xmm1, %xmm1
353 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
354 ; SSE2-NEXT: movdqa %xmm2, %xmm1
355 ; SSE2-NEXT: pandn %xmm0, %xmm1
356 ; SSE2-NEXT: paddb %xmm0, %xmm0
357 ; SSE2-NEXT: pand %xmm2, %xmm0
358 ; SSE2-NEXT: por %xmm1, %xmm0
361 ; SSE41-LABEL: var_shift_v16i8:
363 ; SSE41-NEXT: movdqa %xmm0, %xmm2
364 ; SSE41-NEXT: psllw $5, %xmm1
365 ; SSE41-NEXT: movdqa %xmm2, %xmm3
366 ; SSE41-NEXT: psllw $4, %xmm3
367 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
368 ; SSE41-NEXT: movdqa %xmm1, %xmm0
369 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
370 ; SSE41-NEXT: movdqa %xmm2, %xmm3
371 ; SSE41-NEXT: psllw $2, %xmm3
372 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
373 ; SSE41-NEXT: paddb %xmm1, %xmm1
374 ; SSE41-NEXT: movdqa %xmm1, %xmm0
375 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
376 ; SSE41-NEXT: movdqa %xmm2, %xmm3
377 ; SSE41-NEXT: paddb %xmm3, %xmm3
378 ; SSE41-NEXT: paddb %xmm1, %xmm1
379 ; SSE41-NEXT: movdqa %xmm1, %xmm0
380 ; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
381 ; SSE41-NEXT: movdqa %xmm2, %xmm0
384 ; AVX-LABEL: var_shift_v16i8:
386 ; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
387 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
388 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
389 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
390 ; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
391 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
392 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
393 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
394 ; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
395 ; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
396 ; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
399 ; XOP-LABEL: var_shift_v16i8:
401 ; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
404 ; AVX512DQ-LABEL: var_shift_v16i8:
406 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
407 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
408 ; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
409 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
410 ; AVX512DQ-NEXT: vzeroupper
411 ; AVX512DQ-NEXT: retq
413 ; AVX512BW-LABEL: var_shift_v16i8:
415 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
416 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
417 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
418 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
419 ; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
420 ; AVX512BW-NEXT: vzeroupper
421 ; AVX512BW-NEXT: retq
423 ; AVX512DQVL-LABEL: var_shift_v16i8:
424 ; AVX512DQVL: # %bb.0:
425 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
426 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
427 ; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
428 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
429 ; AVX512DQVL-NEXT: vzeroupper
430 ; AVX512DQVL-NEXT: retq
432 ; AVX512BWVL-LABEL: var_shift_v16i8:
433 ; AVX512BWVL: # %bb.0:
434 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
435 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
436 ; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
437 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
438 ; AVX512BWVL-NEXT: vzeroupper
439 ; AVX512BWVL-NEXT: retq
441 ; X32-SSE-LABEL: var_shift_v16i8:
443 ; X32-SSE-NEXT: psllw $5, %xmm1
444 ; X32-SSE-NEXT: pxor %xmm2, %xmm2
445 ; X32-SSE-NEXT: pxor %xmm3, %xmm3
446 ; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
447 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4
448 ; X32-SSE-NEXT: pandn %xmm0, %xmm4
449 ; X32-SSE-NEXT: psllw $4, %xmm0
450 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
451 ; X32-SSE-NEXT: pand %xmm3, %xmm0
452 ; X32-SSE-NEXT: por %xmm4, %xmm0
453 ; X32-SSE-NEXT: paddb %xmm1, %xmm1
454 ; X32-SSE-NEXT: pxor %xmm3, %xmm3
455 ; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
456 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4
457 ; X32-SSE-NEXT: pandn %xmm0, %xmm4
458 ; X32-SSE-NEXT: psllw $2, %xmm0
459 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
460 ; X32-SSE-NEXT: pand %xmm3, %xmm0
461 ; X32-SSE-NEXT: por %xmm4, %xmm0
462 ; X32-SSE-NEXT: paddb %xmm1, %xmm1
463 ; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
464 ; X32-SSE-NEXT: movdqa %xmm2, %xmm1
465 ; X32-SSE-NEXT: pandn %xmm0, %xmm1
466 ; X32-SSE-NEXT: paddb %xmm0, %xmm0
467 ; X32-SSE-NEXT: pand %xmm2, %xmm0
468 ; X32-SSE-NEXT: por %xmm1, %xmm0
470 %shift = shl <16 x i8> %a, %b
475 ; Uniform Variable Shifts
; Uniform variable shift of a 2 x i64 vector. Element 0 of %b is broadcast to
; both lanes by the shufflevector with a zeroinitializer mask, and %a is then
; shifted left by that splat.
; Every run configuration is expected to shift by the low element of xmm1
; directly with a single psllq or vpsllq, so no explicit broadcast appears in
; the checks.
478 define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
479 ; SSE-LABEL: splatvar_shift_v2i64:
481 ; SSE-NEXT: psllq %xmm1, %xmm0
484 ; AVX-LABEL: splatvar_shift_v2i64:
486 ; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0
489 ; XOP-LABEL: splatvar_shift_v2i64:
491 ; XOP-NEXT: vpsllq %xmm1, %xmm0, %xmm0
494 ; AVX512-LABEL: splatvar_shift_v2i64:
496 ; AVX512-NEXT: vpsllq %xmm1, %xmm0, %xmm0
499 ; AVX512VL-LABEL: splatvar_shift_v2i64:
501 ; AVX512VL-NEXT: vpsllq %xmm1, %xmm0, %xmm0
502 ; AVX512VL-NEXT: retq
504 ; X32-SSE-LABEL: splatvar_shift_v2i64:
506 ; X32-SSE-NEXT: psllq %xmm1, %xmm0
508 %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
509 %shift = shl <2 x i64> %a, %splat
; Uniform variable shift of a 4 x i32 vector. Element 0 of %b is broadcast to
; all lanes by the shufflevector with a zeroinitializer mask, and %a is then
; shifted left by that splat.
; Lowering expected by the autogenerated checks below
;   - SSE2 and the 32-bit run isolate the low element of xmm1 above a zeroed
;     register (xorps plus movss) and feed it to pslld as the shift count
;   - every other run zero-extends the low element with pmovzxdq or vpmovzxdq
;     and feeds it to pslld or vpslld
513 define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
514 ; SSE2-LABEL: splatvar_shift_v4i32:
516 ; SSE2-NEXT: xorps %xmm2, %xmm2
517 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
518 ; SSE2-NEXT: pslld %xmm2, %xmm0
521 ; SSE41-LABEL: splatvar_shift_v4i32:
523 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
524 ; SSE41-NEXT: pslld %xmm1, %xmm0
527 ; AVX-LABEL: splatvar_shift_v4i32:
529 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
530 ; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0
533 ; XOP-LABEL: splatvar_shift_v4i32:
535 ; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
536 ; XOP-NEXT: vpslld %xmm1, %xmm0, %xmm0
539 ; AVX512-LABEL: splatvar_shift_v4i32:
541 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
542 ; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0
545 ; AVX512VL-LABEL: splatvar_shift_v4i32:
547 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
548 ; AVX512VL-NEXT: vpslld %xmm1, %xmm0, %xmm0
549 ; AVX512VL-NEXT: retq
551 ; X32-SSE-LABEL: splatvar_shift_v4i32:
553 ; X32-SSE-NEXT: xorps %xmm2, %xmm2
554 ; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
555 ; X32-SSE-NEXT: pslld %xmm2, %xmm0
557 %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
558 %shift = shl <4 x i32> %a, %splat
; Uniform variable shift of an 8 x i16 vector. Element 0 of %b is broadcast to
; all lanes by the shufflevector with a zeroinitializer mask, and %a is then
; shifted left by that splat.
; Lowering expected by the autogenerated checks below
;   - SSE2 and the 32-bit run move the low word through a general purpose
;     register (pextrw $0 then movd) and feed it to psllw as the shift count
;   - every other run zero-extends the low word with pmovzxwq or vpmovzxwq
;     and feeds it to psllw or vpsllw
562 define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
563 ; SSE2-LABEL: splatvar_shift_v8i16:
565 ; SSE2-NEXT: pextrw $0, %xmm1, %eax
566 ; SSE2-NEXT: movd %eax, %xmm1
567 ; SSE2-NEXT: psllw %xmm1, %xmm0
570 ; SSE41-LABEL: splatvar_shift_v8i16:
572 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
573 ; SSE41-NEXT: psllw %xmm1, %xmm0
576 ; AVX-LABEL: splatvar_shift_v8i16:
578 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
579 ; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
582 ; XOP-LABEL: splatvar_shift_v8i16:
584 ; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
585 ; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0
588 ; AVX512-LABEL: splatvar_shift_v8i16:
590 ; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
591 ; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0
594 ; AVX512VL-LABEL: splatvar_shift_v8i16:
596 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
597 ; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0
598 ; AVX512VL-NEXT: retq
600 ; X32-SSE-LABEL: splatvar_shift_v8i16:
602 ; X32-SSE-NEXT: pextrw $0, %xmm1, %eax
603 ; X32-SSE-NEXT: movd %eax, %xmm1
604 ; X32-SSE-NEXT: psllw %xmm1, %xmm0
606 %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
607 %shift = shl <8 x i16> %a, %splat
; Uniform variable shift of a 16 x i8 vector. Element 0 of %b is broadcast to
; all lanes by the shufflevector with a zeroinitializer mask, and %a is then
; shifted left by that splat.
; Unlike the wider splat tests in this file, the checks below still expect
; the broadcast to be materialised, followed by a per-byte variable shift
;   - SSE2 and the 32-bit run broadcast with punpcklbw, pshuflw and pshufd,
;     then apply conditional steps of 4, 2 and 1 selected by pcmpgtb masks
;   - SSE4.1 and AVX1 broadcast with pshufb against a zeroed register, then
;     select the same steps with pblendvb
;   - AVX2 broadcasts with vpbroadcastb, then selects the steps with vpblendvb
;   - XOPAVX1 broadcasts with vpshufb and XOPAVX2 with vpbroadcastb, both
;     followed by a single vpshlb
;   - the AVX512DQ runs broadcast, zero-extend to i32 lanes, shift with
;     vpsllvd and truncate with vpmovdb
;   - the AVX512BW runs broadcast, zero-extend to i16 lanes, shift with
;     vpsllvw and truncate with vpmovwb
611 define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
612 ; SSE2-LABEL: splatvar_shift_v16i8:
614 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
615 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
616 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
617 ; SSE2-NEXT: psllw $5, %xmm2
618 ; SSE2-NEXT: pxor %xmm1, %xmm1
619 ; SSE2-NEXT: pxor %xmm3, %xmm3
620 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
621 ; SSE2-NEXT: movdqa %xmm3, %xmm4
622 ; SSE2-NEXT: pandn %xmm0, %xmm4
623 ; SSE2-NEXT: psllw $4, %xmm0
624 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
625 ; SSE2-NEXT: pand %xmm3, %xmm0
626 ; SSE2-NEXT: por %xmm4, %xmm0
627 ; SSE2-NEXT: paddb %xmm2, %xmm2
628 ; SSE2-NEXT: pxor %xmm3, %xmm3
629 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
630 ; SSE2-NEXT: movdqa %xmm3, %xmm4
631 ; SSE2-NEXT: pandn %xmm0, %xmm4
632 ; SSE2-NEXT: psllw $2, %xmm0
633 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
634 ; SSE2-NEXT: pand %xmm3, %xmm0
635 ; SSE2-NEXT: por %xmm4, %xmm0
636 ; SSE2-NEXT: paddb %xmm2, %xmm2
637 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
638 ; SSE2-NEXT: movdqa %xmm1, %xmm2
639 ; SSE2-NEXT: pandn %xmm0, %xmm2
640 ; SSE2-NEXT: paddb %xmm0, %xmm0
641 ; SSE2-NEXT: pand %xmm1, %xmm0
642 ; SSE2-NEXT: por %xmm2, %xmm0
645 ; SSE41-LABEL: splatvar_shift_v16i8:
647 ; SSE41-NEXT: movdqa %xmm0, %xmm2
648 ; SSE41-NEXT: pxor %xmm0, %xmm0
649 ; SSE41-NEXT: pshufb %xmm0, %xmm1
650 ; SSE41-NEXT: psllw $5, %xmm1
651 ; SSE41-NEXT: movdqa %xmm1, %xmm3
652 ; SSE41-NEXT: paddb %xmm3, %xmm3
653 ; SSE41-NEXT: movdqa %xmm2, %xmm4
654 ; SSE41-NEXT: psllw $4, %xmm4
655 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm4
656 ; SSE41-NEXT: movdqa %xmm1, %xmm0
657 ; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm2
658 ; SSE41-NEXT: movdqa %xmm2, %xmm1
659 ; SSE41-NEXT: psllw $2, %xmm1
660 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
661 ; SSE41-NEXT: movdqa %xmm3, %xmm0
662 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
663 ; SSE41-NEXT: movdqa %xmm2, %xmm1
664 ; SSE41-NEXT: paddb %xmm1, %xmm1
665 ; SSE41-NEXT: paddb %xmm3, %xmm3
666 ; SSE41-NEXT: movdqa %xmm3, %xmm0
667 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
668 ; SSE41-NEXT: movdqa %xmm2, %xmm0
671 ; AVX1-LABEL: splatvar_shift_v16i8:
673 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
674 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
675 ; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
676 ; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2
677 ; AVX1-NEXT: vpsllw $4, %xmm0, %xmm3
678 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm3, %xmm3
679 ; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
680 ; AVX1-NEXT: vpsllw $2, %xmm0, %xmm1
681 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
682 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
683 ; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm1
684 ; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2
685 ; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
688 ; AVX2-LABEL: splatvar_shift_v16i8:
690 ; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
691 ; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
692 ; AVX2-NEXT: vpsllw $4, %xmm0, %xmm2
693 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
694 ; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
695 ; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2
696 ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
697 ; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
698 ; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
699 ; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2
700 ; AVX2-NEXT: vpaddb %xmm1, %xmm1, %xmm1
701 ; AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
704 ; XOPAVX1-LABEL: splatvar_shift_v16i8:
706 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
707 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
708 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
711 ; XOPAVX2-LABEL: splatvar_shift_v16i8:
713 ; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
714 ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
717 ; AVX512DQ-LABEL: splatvar_shift_v16i8:
719 ; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
720 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
721 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
722 ; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
723 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
724 ; AVX512DQ-NEXT: vzeroupper
725 ; AVX512DQ-NEXT: retq
727 ; AVX512BW-LABEL: splatvar_shift_v16i8:
729 ; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
730 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
731 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
732 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
733 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
734 ; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
735 ; AVX512BW-NEXT: vzeroupper
736 ; AVX512BW-NEXT: retq
738 ; AVX512DQVL-LABEL: splatvar_shift_v16i8:
739 ; AVX512DQVL: # %bb.0:
740 ; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
741 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
742 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
743 ; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
744 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
745 ; AVX512DQVL-NEXT: vzeroupper
746 ; AVX512DQVL-NEXT: retq
748 ; AVX512BWVL-LABEL: splatvar_shift_v16i8:
749 ; AVX512BWVL: # %bb.0:
750 ; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
751 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
752 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
753 ; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
754 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
755 ; AVX512BWVL-NEXT: vzeroupper
756 ; AVX512BWVL-NEXT: retq
758 ; X32-SSE-LABEL: splatvar_shift_v16i8:
760 ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
761 ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
762 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
763 ; X32-SSE-NEXT: psllw $5, %xmm2
764 ; X32-SSE-NEXT: pxor %xmm1, %xmm1
765 ; X32-SSE-NEXT: pxor %xmm3, %xmm3
766 ; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
767 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4
768 ; X32-SSE-NEXT: pandn %xmm0, %xmm4
769 ; X32-SSE-NEXT: psllw $4, %xmm0
770 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
771 ; X32-SSE-NEXT: pand %xmm3, %xmm0
772 ; X32-SSE-NEXT: por %xmm4, %xmm0
773 ; X32-SSE-NEXT: paddb %xmm2, %xmm2
774 ; X32-SSE-NEXT: pxor %xmm3, %xmm3
775 ; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
776 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4
777 ; X32-SSE-NEXT: pandn %xmm0, %xmm4
778 ; X32-SSE-NEXT: psllw $2, %xmm0
779 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
780 ; X32-SSE-NEXT: pand %xmm3, %xmm0
781 ; X32-SSE-NEXT: por %xmm4, %xmm0
782 ; X32-SSE-NEXT: paddb %xmm2, %xmm2
783 ; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm1
784 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2
785 ; X32-SSE-NEXT: pandn %xmm0, %xmm2
786 ; X32-SSE-NEXT: paddb %xmm0, %xmm0
787 ; X32-SSE-NEXT: pand %xmm1, %xmm0
788 ; X32-SSE-NEXT: por %xmm2, %xmm0
790 %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
791 %shift = shl <16 x i8> %a, %splat
; Constant non-uniform shift of a 2 x i64 vector. Lane 0 of %a is shifted
; left by 1 and lane 1 by 7 (the constant vector operand of the shl).
; Lowering expected by the autogenerated checks below
;   - SSE2, SSE4.1, AVX1 and the 32-bit run issue two immediate shifts
;     (psllq $1 and psllq $7) and merge the halves with movsd or pblendw
;   - AVX2, XOPAVX2 and all AVX512 runs use vpsllvq with the shift amounts
;     read from a RIP-relative memory operand
;   - XOPAVX1 uses vpshlq with a RIP-relative memory operand
799 define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
800 ; SSE2-LABEL: constant_shift_v2i64:
802 ; SSE2-NEXT: movdqa %xmm0, %xmm1
803 ; SSE2-NEXT: psllq $1, %xmm1
804 ; SSE2-NEXT: psllq $7, %xmm0
805 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
808 ; SSE41-LABEL: constant_shift_v2i64:
810 ; SSE41-NEXT: movdqa %xmm0, %xmm1
811 ; SSE41-NEXT: psllq $7, %xmm1
812 ; SSE41-NEXT: psllq $1, %xmm0
813 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
816 ; AVX1-LABEL: constant_shift_v2i64:
818 ; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1
819 ; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
820 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
823 ; AVX2-LABEL: constant_shift_v2i64:
825 ; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
828 ; XOPAVX1-LABEL: constant_shift_v2i64:
830 ; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
833 ; XOPAVX2-LABEL: constant_shift_v2i64:
835 ; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
838 ; AVX512-LABEL: constant_shift_v2i64:
840 ; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
843 ; AVX512VL-LABEL: constant_shift_v2i64:
845 ; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
846 ; AVX512VL-NEXT: retq
848 ; X32-SSE-LABEL: constant_shift_v2i64:
850 ; X32-SSE-NEXT: movdqa %xmm0, %xmm1
851 ; X32-SSE-NEXT: psllq $1, %xmm1
852 ; X32-SSE-NEXT: psllq $7, %xmm0
853 ; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
855 %shift = shl <2 x i64> %a, <i64 1, i64 7>
; Left-shifts the four i32 lanes of %a by the constant amounts 4, 5, 6 and 7.
; Some of the expected sequences below use a multiply in place of a shift; the
; visible multiplier [16,32,64,128] is the matching set of powers of two.
; The expectations are autogenerated (see the NOTE at the top of the file).
859 define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
860 ; SSE2-LABEL: constant_shift_v4i32:
862 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
863 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
864 ; SSE2-NEXT: pmuludq %xmm1, %xmm0
865 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
866 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
867 ; SSE2-NEXT: pmuludq %xmm2, %xmm1
868 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
869 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
872 ; SSE41-LABEL: constant_shift_v4i32:
874 ; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
877 ; AVX1-LABEL: constant_shift_v4i32:
879 ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
882 ; AVX2-LABEL: constant_shift_v4i32:
884 ; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
887 ; XOPAVX1-LABEL: constant_shift_v4i32:
889 ; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
892 ; XOPAVX2-LABEL: constant_shift_v4i32:
894 ; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
897 ; AVX512-LABEL: constant_shift_v4i32:
899 ; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
902 ; AVX512VL-LABEL: constant_shift_v4i32:
904 ; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
905 ; AVX512VL-NEXT: retq
907 ; X32-SSE-LABEL: constant_shift_v4i32:
909 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
910 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
911 ; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
912 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
913 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
914 ; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
915 ; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
916 ; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
918 %shift = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
; Left-shifts the eight i16 lanes of %a by the constant amounts 0 through 7
; (lane N is shifted by N). The expected sequences below are either a multiply
; by a constant-pool operand or a variable word shift, depending on the run
; configuration. The expectations are autogenerated (see the NOTE at the top
; of the file).
922 define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
923 ; SSE-LABEL: constant_shift_v8i16:
925 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
928 ; AVX-LABEL: constant_shift_v8i16:
930 ; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
933 ; XOP-LABEL: constant_shift_v8i16:
935 ; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
938 ; AVX512DQ-LABEL: constant_shift_v8i16:
940 ; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
941 ; AVX512DQ-NEXT: retq
943 ; AVX512BW-LABEL: constant_shift_v8i16:
945 ; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
946 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
947 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
948 ; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
949 ; AVX512BW-NEXT: vzeroupper
950 ; AVX512BW-NEXT: retq
952 ; AVX512DQVL-LABEL: constant_shift_v8i16:
953 ; AVX512DQVL: # %bb.0:
954 ; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
955 ; AVX512DQVL-NEXT: retq
957 ; AVX512BWVL-LABEL: constant_shift_v8i16:
958 ; AVX512BWVL: # %bb.0:
959 ; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
960 ; AVX512BWVL-NEXT: retq
962 ; X32-SSE-LABEL: constant_shift_v8i16:
964 ; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
966 %shift = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
; Left-shifts the sixteen i8 lanes of %a by the constant amounts
; 0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0 and returns the result.
; Several expected sequences below apply conditional shift steps of 4, 2 and 1
; bits, selected by a mask constant that is doubled between steps; others widen
; the bytes, do a variable shift on the wider lanes, and truncate back.
; The expectations are autogenerated (see the NOTE at the top of the file).
970 define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
971 ; SSE2-LABEL: constant_shift_v16i8:
973 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,49376,32928,16480,32]
974 ; SSE2-NEXT: pxor %xmm1, %xmm1
975 ; SSE2-NEXT: pxor %xmm3, %xmm3
976 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
977 ; SSE2-NEXT: movdqa %xmm3, %xmm4
978 ; SSE2-NEXT: pandn %xmm0, %xmm4
979 ; SSE2-NEXT: psllw $4, %xmm0
980 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
981 ; SSE2-NEXT: pand %xmm3, %xmm0
982 ; SSE2-NEXT: por %xmm4, %xmm0
983 ; SSE2-NEXT: paddb %xmm2, %xmm2
984 ; SSE2-NEXT: pxor %xmm3, %xmm3
985 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
986 ; SSE2-NEXT: movdqa %xmm3, %xmm4
987 ; SSE2-NEXT: pandn %xmm0, %xmm4
988 ; SSE2-NEXT: psllw $2, %xmm0
989 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
990 ; SSE2-NEXT: pand %xmm3, %xmm0
991 ; SSE2-NEXT: por %xmm4, %xmm0
992 ; SSE2-NEXT: paddb %xmm2, %xmm2
993 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
994 ; SSE2-NEXT: movdqa %xmm1, %xmm2
995 ; SSE2-NEXT: pandn %xmm0, %xmm2
996 ; SSE2-NEXT: paddb %xmm0, %xmm0
997 ; SSE2-NEXT: pand %xmm1, %xmm0
998 ; SSE2-NEXT: por %xmm2, %xmm0
1001 ; SSE41-LABEL: constant_shift_v16i8:
1003 ; SSE41-NEXT: movdqa %xmm0, %xmm1
1004 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1005 ; SSE41-NEXT: psllw $4, %xmm2
1006 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
1007 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [8192,24640,41088,57536,49376,32928,16480,32]
1008 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
1009 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1010 ; SSE41-NEXT: psllw $2, %xmm2
1011 ; SSE41-NEXT: pand {{.*}}(%rip), %xmm2
1012 ; SSE41-NEXT: paddb %xmm0, %xmm0
1013 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
1014 ; SSE41-NEXT: movdqa %xmm1, %xmm2
1015 ; SSE41-NEXT: paddb %xmm2, %xmm2
1016 ; SSE41-NEXT: paddb %xmm0, %xmm0
1017 ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
1018 ; SSE41-NEXT: movdqa %xmm1, %xmm0
1021 ; AVX-LABEL: constant_shift_v16i8:
1023 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm1
1024 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
1025 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,49376,32928,16480,32]
1026 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1027 ; AVX-NEXT: vpsllw $2, %xmm0, %xmm1
1028 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
1029 ; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm2
1030 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1031 ; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm1
1032 ; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm2
1033 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
1036 ; XOP-LABEL: constant_shift_v16i8:
1038 ; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
1041 ; AVX512DQ-LABEL: constant_shift_v16i8:
1042 ; AVX512DQ: # %bb.0:
1043 ; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1044 ; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
1045 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
1046 ; AVX512DQ-NEXT: vzeroupper
1047 ; AVX512DQ-NEXT: retq
1049 ; AVX512BW-LABEL: constant_shift_v16i8:
1050 ; AVX512BW: # %bb.0:
1051 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
1052 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1053 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
1054 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
1055 ; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
1056 ; AVX512BW-NEXT: vzeroupper
1057 ; AVX512BW-NEXT: retq
1059 ; AVX512DQVL-LABEL: constant_shift_v16i8:
1060 ; AVX512DQVL: # %bb.0:
1061 ; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1062 ; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
1063 ; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
1064 ; AVX512DQVL-NEXT: vzeroupper
1065 ; AVX512DQVL-NEXT: retq
1067 ; AVX512BWVL-LABEL: constant_shift_v16i8:
1068 ; AVX512BWVL: # %bb.0:
1069 ; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1070 ; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
1071 ; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
1072 ; AVX512BWVL-NEXT: vzeroupper
1073 ; AVX512BWVL-NEXT: retq
1075 ; X32-SSE-LABEL: constant_shift_v16i8:
1077 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,49376,32928,16480,32]
1078 ; X32-SSE-NEXT: pxor %xmm1, %xmm1
1079 ; X32-SSE-NEXT: pxor %xmm3, %xmm3
1080 ; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
1081 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4
1082 ; X32-SSE-NEXT: pandn %xmm0, %xmm4
1083 ; X32-SSE-NEXT: psllw $4, %xmm0
1084 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
1085 ; X32-SSE-NEXT: pand %xmm3, %xmm0
1086 ; X32-SSE-NEXT: por %xmm4, %xmm0
1087 ; X32-SSE-NEXT: paddb %xmm2, %xmm2
1088 ; X32-SSE-NEXT: pxor %xmm3, %xmm3
1089 ; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm3
1090 ; X32-SSE-NEXT: movdqa %xmm3, %xmm4
1091 ; X32-SSE-NEXT: pandn %xmm0, %xmm4
1092 ; X32-SSE-NEXT: psllw $2, %xmm0
1093 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
1094 ; X32-SSE-NEXT: pand %xmm3, %xmm0
1095 ; X32-SSE-NEXT: por %xmm4, %xmm0
1096 ; X32-SSE-NEXT: paddb %xmm2, %xmm2
1097 ; X32-SSE-NEXT: pcmpgtb %xmm2, %xmm1
1098 ; X32-SSE-NEXT: movdqa %xmm1, %xmm2
1099 ; X32-SSE-NEXT: pandn %xmm0, %xmm2
1100 ; X32-SSE-NEXT: paddb %xmm0, %xmm0
1101 ; X32-SSE-NEXT: pand %xmm1, %xmm0
1102 ; X32-SSE-NEXT: por %xmm2, %xmm0
1103 ; X32-SSE-NEXT: retl
1104 %shift = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
1105 ret <16 x i8> %shift
1109 ; Uniform Constant Shifts
; Left-shifts both i64 lanes of %a by the same constant amount, 7, and returns
; the result. Every expected sequence below is a single immediate-count shift.
; The expectations are autogenerated (see the NOTE at the top of the file).
1112 define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
1113 ; SSE-LABEL: splatconstant_shift_v2i64:
1115 ; SSE-NEXT: psllq $7, %xmm0
1118 ; AVX-LABEL: splatconstant_shift_v2i64:
1120 ; AVX-NEXT: vpsllq $7, %xmm0, %xmm0
1123 ; XOP-LABEL: splatconstant_shift_v2i64:
1125 ; XOP-NEXT: vpsllq $7, %xmm0, %xmm0
1128 ; AVX512-LABEL: splatconstant_shift_v2i64:
1130 ; AVX512-NEXT: vpsllq $7, %xmm0, %xmm0
1133 ; AVX512VL-LABEL: splatconstant_shift_v2i64:
1134 ; AVX512VL: # %bb.0:
1135 ; AVX512VL-NEXT: vpsllq $7, %xmm0, %xmm0
1136 ; AVX512VL-NEXT: retq
1138 ; X32-SSE-LABEL: splatconstant_shift_v2i64:
1140 ; X32-SSE-NEXT: psllq $7, %xmm0
1141 ; X32-SSE-NEXT: retl
1142 %shift = shl <2 x i64> %a, <i64 7, i64 7>
1143 ret <2 x i64> %shift
; Left-shifts all four i32 lanes of %a by the same constant amount, 5, and
; returns the result. Every expected sequence below is a single
; immediate-count shift.
; The expectations are autogenerated (see the NOTE at the top of the file).
1146 define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
1147 ; SSE-LABEL: splatconstant_shift_v4i32:
1149 ; SSE-NEXT: pslld $5, %xmm0
1152 ; AVX-LABEL: splatconstant_shift_v4i32:
1154 ; AVX-NEXT: vpslld $5, %xmm0, %xmm0
1157 ; XOP-LABEL: splatconstant_shift_v4i32:
1159 ; XOP-NEXT: vpslld $5, %xmm0, %xmm0
1162 ; AVX512-LABEL: splatconstant_shift_v4i32:
1164 ; AVX512-NEXT: vpslld $5, %xmm0, %xmm0
1167 ; AVX512VL-LABEL: splatconstant_shift_v4i32:
1168 ; AVX512VL: # %bb.0:
1169 ; AVX512VL-NEXT: vpslld $5, %xmm0, %xmm0
1170 ; AVX512VL-NEXT: retq
1172 ; X32-SSE-LABEL: splatconstant_shift_v4i32:
1174 ; X32-SSE-NEXT: pslld $5, %xmm0
1175 ; X32-SSE-NEXT: retl
1176 %shift = shl <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
1177 ret <4 x i32> %shift
; Left-shifts all eight i16 lanes of %a by the same constant amount, 3, and
; returns the result. Every expected sequence below is a single
; immediate-count shift.
; The expectations are autogenerated (see the NOTE at the top of the file).
1180 define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
1181 ; SSE-LABEL: splatconstant_shift_v8i16:
1183 ; SSE-NEXT: psllw $3, %xmm0
1186 ; AVX-LABEL: splatconstant_shift_v8i16:
1188 ; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
1191 ; XOP-LABEL: splatconstant_shift_v8i16:
1193 ; XOP-NEXT: vpsllw $3, %xmm0, %xmm0
1196 ; AVX512-LABEL: splatconstant_shift_v8i16:
1198 ; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
1201 ; AVX512VL-LABEL: splatconstant_shift_v8i16:
1202 ; AVX512VL: # %bb.0:
1203 ; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
1204 ; AVX512VL-NEXT: retq
1206 ; X32-SSE-LABEL: splatconstant_shift_v8i16:
1208 ; X32-SSE-NEXT: psllw $3, %xmm0
1209 ; X32-SSE-NEXT: retl
1210 %shift = shl <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
1211 ret <8 x i16> %shift
; Left-shifts all sixteen i8 lanes of %a by the same constant amount, 3, and
; returns the result. Most expected sequences below are a 16-bit-lane shift by
; 3 followed by an AND with a constant-pool mask; the XOP run expects a single
; per-byte shift instead.
; NOTE(review): the mask presumably clears bits that the word-wide shift
; carried across byte boundaries; its value is not visible here, so confirm.
; The expectations are autogenerated (see the NOTE at the top of the file).
1214 define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
1215 ; SSE-LABEL: splatconstant_shift_v16i8:
1217 ; SSE-NEXT: psllw $3, %xmm0
1218 ; SSE-NEXT: pand {{.*}}(%rip), %xmm0
1221 ; AVX-LABEL: splatconstant_shift_v16i8:
1223 ; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
1224 ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1227 ; XOP-LABEL: splatconstant_shift_v16i8:
1229 ; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
1232 ; AVX512-LABEL: splatconstant_shift_v16i8:
1234 ; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
1235 ; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1238 ; AVX512VL-LABEL: splatconstant_shift_v16i8:
1239 ; AVX512VL: # %bb.0:
1240 ; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
1241 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
1242 ; AVX512VL-NEXT: retq
1244 ; X32-SSE-LABEL: splatconstant_shift_v16i8:
1246 ; X32-SSE-NEXT: psllw $3, %xmm0
1247 ; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
1248 ; X32-SSE-NEXT: retl
1249 %shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
1250 ret <16 x i8> %shift