; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=X64

; AVX2 Logical Shift Left
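
; A splat shift by zero should fold away entirely, a shift left by one should
; lower to an add, and any other constant amount should use the immediate form
; of the shift instruction (vpsllw/vpslld/vpsllq).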

define <16 x i16> @test_sllw_1(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_sllw_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    retq
entry:
  %shl = shl <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_2(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sllw_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sllw_3(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_3:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    vpsllw $15, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sllw_3:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    vpsllw $15, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_slld_1(<8 x i32> %InVec) {
; X32-LABEL: test_slld_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_slld_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    retq
entry:
  %shl = shl <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_slld_2(<8 x i32> %InVec) {
; X32-LABEL: test_slld_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_slld_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}
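
; A uniform but non-constant shift amount is moved into the low element of an
; xmm register and used with the register form of vpslld.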

define <8 x i32> @test_vpslld_var(i32 %shift) {
; X32-LABEL: test_vpslld_var:
; X32:       ## BB#0:
; X32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X32-NEXT:    vpslld %xmm0, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_vpslld_var:
; X64:       ## BB#0:
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X64-NEXT:    vpslld %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %amt = insertelement <8 x i32> undef, i32 %shift, i32 0
  %tmp = shl <8 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199>, %amt
  ret <8 x i32> %tmp
}

define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
; X32-LABEL: test_slld_3:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    vpslld $31, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_slld_3:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    vpslld $31, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

define <4 x i64> @test_sllq_1(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_sllq_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    retq
entry:
  %shl = shl <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_2(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sllq_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %shl
}

define <4 x i64> @test_sllq_3(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_3:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    vpsllq $63, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sllq_3:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    vpsllq $63, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = shl <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %shl
}

; AVX2 Arithmetic Shift
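
; Arithmetic right shifts by a constant splat: a shift by zero folds away,
; everything else uses vpsraw/vpsrad. AVX2 has no packed arithmetic right
; shift for 64-bit elements, so only the word and doubleword cases are tested.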

define <16 x i16> @test_sraw_1(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_sraw_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    retq
entry:
  %shl = ashr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sraw_2(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    vpsraw $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sraw_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    vpsraw $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = ashr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_sraw_3(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_3:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    vpsraw $15, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_sraw_3:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    vpsraw $15, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = ashr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_srad_1(<8 x i32> %InVec) {
; X32-LABEL: test_srad_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_srad_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    retq
entry:
  %shl = ashr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srad_2(<8 x i32> %InVec) {
; X32-LABEL: test_srad_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    vpsrad $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srad_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    vpsrad $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = ashr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srad_3(<8 x i32> %InVec) {
; X32-LABEL: test_srad_3:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    vpsrad $31, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srad_3:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    vpsrad $31, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = ashr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

; AVX2 Logical Shift Right
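
; Logical right shifts mirror the shift-left cases: a shift by zero folds
; away, and constant splats use the vpsrlw/vpsrld/vpsrlq immediate forms.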

define <16 x i16> @test_srlw_1(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_srlw_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    retq
entry:
  %shl = lshr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %shl
}

define <16 x i16> @test_srlw_2(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    vpsrlw $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srlw_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    vpsrlw $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test_srlw_3(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_3:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    vpsrlw $15, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srlw_3:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    vpsrlw $15, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  ret <16 x i16> %shl
}

define <8 x i32> @test_srld_1(<8 x i32> %InVec) {
; X32-LABEL: test_srld_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_srld_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    retq
entry:
  %shl = lshr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srld_2(<8 x i32> %InVec) {
; X32-LABEL: test_srld_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    vpsrld $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srld_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    vpsrld $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %shl
}

define <8 x i32> @test_srld_3(<8 x i32> %InVec) {
; X32-LABEL: test_srld_3:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    vpsrld $31, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srld_3:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    vpsrld $31, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  ret <8 x i32> %shl
}

define <4 x i64> @test_srlq_1(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_1:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    retl
;
; X64-LABEL: test_srlq_1:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    retq
entry:
  %shl = lshr <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
  ret <4 x i64> %shl
}

define <4 x i64> @test_srlq_2(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_2:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    vpsrlq $1, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srlq_2:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    vpsrlq $1, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %shl
}

define <4 x i64> @test_srlq_3(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_3:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    vpsrlq $63, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_srlq_3:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    vpsrlq $63, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %shl = lshr <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
  ret <4 x i64> %shl
}
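
; Per-element shift amounts produced by masking and truncating a <4 x i64>
; should still select the variable-shift instruction vpsrlvd; the and/truncate
; pair lowers to a shuffle plus a vpand with a broadcast constant.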

define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
; X32-LABEL: srl_trunc_and_v4i64:
; X32:       ## BB#0:
; X32-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; X32-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; X32-NEXT:    vpbroadcastd LCPI25_0, %xmm2
; X32-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X32-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: srl_trunc_and_v4i64:
; X64:       ## BB#0:
; X64-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; X64-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; X64-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
; X64-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X64-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
  %trunc = trunc <4 x i64> %and to <4 x i32>
  %sra = lshr <4 x i32> %x, %trunc
  ret <4 x i32> %sra
}

; Vectorized byte and word shifts
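
; AVX2 only has variable per-element shifts for 32- and 64-bit elements, so
; smaller element types must be emulated: word vectors are widened to
; doublewords, shifted, and narrowed back, while byte vectors use vpsllw $5
; to move the relevant shift-amount bit into each byte's MSB and select the
; partially shifted value with a ladder of vpblendvb blends.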

define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: shl_8i16:
; X32:       ## BB#0:
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: shl_8i16:
; X64:       ## BB#0:
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %shl = shl <8 x i16> %r, %a
  ret <8 x i16> %shl
}
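
; A full 256-bit vector of words cannot be widened in one register, so each
; half is interleaved with zeros into doubleword lanes, shifted with vpsllvd,
; shifted back down by 16, and repacked with vpackusdw.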

define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: shl_16i16:
; X32:       ## BB#0:
; X32-NEXT:    vpxor %ymm2, %ymm2, %ymm2
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
; X32-NEXT:    vpsrld $16, %ymm3, %ymm3
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpsrld $16, %ymm0, %ymm0
; X32-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: shl_16i16:
; X64:       ## BB#0:
; X64-NEXT:    vpxor %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
; X64-NEXT:    vpsrld $16, %ymm3, %ymm3
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpsrld $16, %ymm0, %ymm0
; X64-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X64-NEXT:    retq
  %shl = shl <16 x i16> %r, %a
  ret <16 x i16> %shl
}
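
; Byte shifts left are built from vpsllw by 4, 2, and 1 within word lanes;
; vpand masks off bits that crossed a byte boundary, and vpblendvb keeps the
; new value only for bytes whose shift amount has the corresponding bit set.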

define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: shl_32i8:
; X32:       ## BB#0:
; X32-NEXT:    vpsllw $5, %ymm1, %ymm1
; X32-NEXT:    vpsllw $4, %ymm0, %ymm2
; X32-NEXT:    vpand LCPI28_0, %ymm2, %ymm2
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    vpsllw $2, %ymm0, %ymm2
; X32-NEXT:    vpand LCPI28_1, %ymm2, %ymm2
; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: shl_32i8:
; X64:       ## BB#0:
; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
; X64-NEXT:    vpsllw $4, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsllw $2, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %shl = shl <32 x i8> %r, %a
  ret <32 x i8> %shl
}
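
; The arithmetic variant widens the value with sign extension (vpmovsxwd)
; while the shift amounts are still zero-extended.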

define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: ashr_8i16:
; X32:       ## BB#0:
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT:    vpmovsxwd %xmm0, %ymm0
; X32-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: ashr_8i16:
; X64:       ## BB#0:
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT:    vpmovsxwd %xmm0, %ymm0
; X64-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %ashr = ashr <8 x i16> %r, %a
  ret <8 x i16> %ashr
}

define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: ashr_16i16:
; X32:       ## BB#0:
; X32-NEXT:    vpxor %ymm2, %ymm2, %ymm2
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
; X32-NEXT:    vpsrld $16, %ymm3, %ymm3
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpsrld $16, %ymm0, %ymm0
; X32-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: ashr_16i16:
; X64:       ## BB#0:
; X64-NEXT:    vpxor %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
; X64-NEXT:    vpsrld $16, %ymm3, %ymm3
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpsrld $16, %ymm0, %ymm0
; X64-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X64-NEXT:    retq
  %ashr = ashr <16 x i16> %r, %a
  ret <16 x i16> %ashr
}
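
; For byte arithmetic shifts the bytes are unpacked into the high half of
; word lanes, so vpsraw shifts in copies of each byte's sign bit; vpsrlw $8
; and vpackuswb then recover the byte results.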

define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: ashr_32i8:
; X32:       ## BB#0:
; X32-NEXT:    vpsllw $5, %ymm1, %ymm1
; X32-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X32-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X32-NEXT:    vpsraw $4, %ymm3, %ymm4
; X32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X32-NEXT:    vpsraw $2, %ymm3, %ymm4
; X32-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; X32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X32-NEXT:    vpsraw $1, %ymm3, %ymm4
; X32-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; X32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X32-NEXT:    vpsrlw $8, %ymm2, %ymm2
; X32-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X32-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X32-NEXT:    vpsraw $4, %ymm0, %ymm3
; X32-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X32-NEXT:    vpsraw $2, %ymm0, %ymm3
; X32-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X32-NEXT:    vpsraw $1, %ymm0, %ymm3
; X32-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X32-NEXT:    vpsrlw $8, %ymm0, %ymm0
; X32-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: ashr_32i8:
; X64:       ## BB#0:
; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
; X64-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X64-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X64-NEXT:    vpsraw $4, %ymm3, %ymm4
; X64-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X64-NEXT:    vpsraw $2, %ymm3, %ymm4
; X64-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
; X64-NEXT:    vpsraw $1, %ymm3, %ymm4
; X64-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
; X64-NEXT:    vpsrlw $8, %ymm2, %ymm2
; X64-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; X64-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X64-NEXT:    vpsraw $4, %ymm0, %ymm3
; X64-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT:    vpsraw $2, %ymm0, %ymm3
; X64-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT:    vpsraw $1, %ymm0, %ymm3
; X64-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; X64-NEXT:    vpsrlw $8, %ymm0, %ymm0
; X64-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %ashr = ashr <32 x i8> %r, %a
  ret <32 x i8> %ashr
}

define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: lshr_8i16:
; X32:       ## BB#0:
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; X32-NEXT:    vzeroupper
; X32-NEXT:    retl
;
; X64-LABEL: lshr_8i16:
; X64:       ## BB#0:
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lshr = lshr <8 x i16> %r, %a
  ret <8 x i16> %lshr
}

define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: lshr_16i16:
; X32:       ## BB#0:
; X32-NEXT:    vpxor %ymm2, %ymm2, %ymm2
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; X32-NEXT:    vpsrld $16, %ymm3, %ymm3
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X32-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X32-NEXT:    vpsrld $16, %ymm0, %ymm0
; X32-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: lshr_16i16:
; X64:       ## BB#0:
; X64-NEXT:    vpxor %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; X64-NEXT:    vpsrld $16, %ymm3, %ymm3
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X64-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vpsrld $16, %ymm0, %ymm0
; X64-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X64-NEXT:    retq
  %lshr = lshr <16 x i16> %r, %a
  ret <16 x i16> %lshr
}

define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: lshr_32i8:
; X32:       ## BB#0:
; X32-NEXT:    vpsllw $5, %ymm1, %ymm1
; X32-NEXT:    vpsrlw $4, %ymm0, %ymm2
; X32-NEXT:    vpand LCPI34_0, %ymm2, %ymm2
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    vpsrlw $2, %ymm0, %ymm2
; X32-NEXT:    vpand LCPI34_1, %ymm2, %ymm2
; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    vpsrlw $1, %ymm0, %ymm2
; X32-NEXT:    vpand LCPI34_2, %ymm2, %ymm2
; X32-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X32-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: lshr_32i8:
; X64:       ## BB#0:
; X64-NEXT:    vpsllw $5, %ymm1, %ymm1
; X64-NEXT:    vpsrlw $4, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsrlw $2, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsrlw $1, %ymm0, %ymm2
; X64-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
; X64-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X64-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %lshr = lshr <32 x i8> %r, %a
  ret <32 x i8> %lshr
}