1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
6 ; Unary shuffle indices from registers
; Shuffle a <4 x double> by four variable i64 indices. Expected codegen spills
; %x to a 32-byte-aligned stack slot (rbp frame + andq $-32) and reloads each
; selected lane; each index is masked with 'andl $3' to stay in range.
9 define <4 x double> @var_shuffle_v4f64_v4f64_xxxx_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
10 ; ALL-LABEL: var_shuffle_v4f64_v4f64_xxxx_i64:
12 ; ALL-NEXT: pushq %rbp
13 ; ALL-NEXT: movq %rsp, %rbp
14 ; ALL-NEXT: andq $-32, %rsp
15 ; ALL-NEXT: subq $64, %rsp
16 ; ALL-NEXT: andl $3, %ecx
17 ; ALL-NEXT: andl $3, %edx
18 ; ALL-NEXT: andl $3, %esi
19 ; ALL-NEXT: andl $3, %edi
20 ; ALL-NEXT: vmovaps %ymm0, (%rsp)
21 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
22 ; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
23 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
24 ; ALL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
25 ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
26 ; ALL-NEXT: movq %rbp, %rsp
29 %x0 = extractelement <4 x double> %x, i64 %i0
30 %x1 = extractelement <4 x double> %x, i64 %i1
31 %x2 = extractelement <4 x double> %x, i64 %i2
32 %x3 = extractelement <4 x double> %x, i64 %i3
33 %r0 = insertelement <4 x double> undef, double %x0, i32 0
34 %r1 = insertelement <4 x double> %r0, double %x1, i32 1
35 %r2 = insertelement <4 x double> %r1, double %x2, i32 2
36 %r3 = insertelement <4 x double> %r2, double %x3, i32 3
; Same variable shuffle but lane 0 is undef and lane 3 is constant 0.0, so
; only %i1/%i2 need masking; lane 0 is satisfied by a vmovddup broadcast and
; lane 3 by the implicitly-zeroed upper half of the vinsertf128 source.
40 define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
41 ; ALL-LABEL: var_shuffle_v4f64_v4f64_uxx0_i64:
43 ; ALL-NEXT: pushq %rbp
44 ; ALL-NEXT: movq %rsp, %rbp
45 ; ALL-NEXT: andq $-32, %rsp
46 ; ALL-NEXT: subq $64, %rsp
47 ; ALL-NEXT: andl $3, %edx
48 ; ALL-NEXT: andl $3, %esi
49 ; ALL-NEXT: vmovaps %ymm0, (%rsp)
50 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
51 ; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
52 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
53 ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
54 ; ALL-NEXT: movq %rbp, %rsp
57 %x0 = extractelement <4 x double> %x, i64 %i0
58 %x1 = extractelement <4 x double> %x, i64 %i1
59 %x2 = extractelement <4 x double> %x, i64 %i2
60 %x3 = extractelement <4 x double> %x, i64 %i3
61 %r0 = insertelement <4 x double> undef, double undef, i32 0
62 %r1 = insertelement <4 x double> %r0, double %x1, i32 1
63 %r2 = insertelement <4 x double> %r1, double %x2, i32 2
64 %r3 = insertelement <4 x double> %r2, double 0.0, i32 3
; Widening shuffle: <2 x double> source into a <4 x double> result. The xmm
; source fits a 16-byte stack spill (no 32-byte realignment frame needed) and
; the indices are masked with 'andl $1' since only two lanes exist.
68 define <4 x double> @var_shuffle_v4f64_v2f64_xxxx_i64(<2 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
69 ; ALL-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64:
71 ; ALL-NEXT: andl $1, %ecx
72 ; ALL-NEXT: andl $1, %edx
73 ; ALL-NEXT: andl $1, %esi
74 ; ALL-NEXT: andl $1, %edi
75 ; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
76 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
77 ; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
78 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
79 ; ALL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
80 ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
82 %x0 = extractelement <2 x double> %x, i64 %i0
83 %x1 = extractelement <2 x double> %x, i64 %i1
84 %x2 = extractelement <2 x double> %x, i64 %i2
85 %x3 = extractelement <2 x double> %x, i64 %i3
86 %r0 = insertelement <4 x double> undef, double %x0, i32 0
87 %r1 = insertelement <4 x double> %r0, double %x1, i32 1
88 %r2 = insertelement <4 x double> %r1, double %x2, i32 2
89 %r3 = insertelement <4 x double> %r2, double %x3, i32 3
; Integer <4 x i64> variant of the variable shuffle. AVX1 and AVX2 differ only
; in the final 128-bit insert (vinsertf128 vs vinserti128); both rebuild the
; result with vmovq loads + vpunpcklqdq from the aligned stack spill.
93 define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
94 ; AVX1-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
96 ; AVX1-NEXT: pushq %rbp
97 ; AVX1-NEXT: movq %rsp, %rbp
98 ; AVX1-NEXT: andq $-32, %rsp
99 ; AVX1-NEXT: subq $64, %rsp
100 ; AVX1-NEXT: andl $3, %ecx
101 ; AVX1-NEXT: andl $3, %edx
102 ; AVX1-NEXT: andl $3, %esi
103 ; AVX1-NEXT: andl $3, %edi
104 ; AVX1-NEXT: vmovaps %ymm0, (%rsp)
105 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
106 ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
107 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
108 ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
109 ; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
110 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
111 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
112 ; AVX1-NEXT: movq %rbp, %rsp
113 ; AVX1-NEXT: popq %rbp
116 ; AVX2-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
118 ; AVX2-NEXT: pushq %rbp
119 ; AVX2-NEXT: movq %rsp, %rbp
120 ; AVX2-NEXT: andq $-32, %rsp
121 ; AVX2-NEXT: subq $64, %rsp
122 ; AVX2-NEXT: andl $3, %ecx
123 ; AVX2-NEXT: andl $3, %edx
124 ; AVX2-NEXT: andl $3, %esi
125 ; AVX2-NEXT: andl $3, %edi
126 ; AVX2-NEXT: vmovaps %ymm0, (%rsp)
127 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
128 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
129 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
130 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
131 ; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
132 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
133 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
134 ; AVX2-NEXT: movq %rbp, %rsp
135 ; AVX2-NEXT: popq %rbp
137 %x0 = extractelement <4 x i64> %x, i64 %i0
138 %x1 = extractelement <4 x i64> %x, i64 %i1
139 %x2 = extractelement <4 x i64> %x, i64 %i2
140 %x3 = extractelement <4 x i64> %x, i64 %i3
141 %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
142 %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
143 %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
144 %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
; Upper two result lanes are constant zero, so only %i0/%i1 are masked and the
; high 128 bits come from a vpxor-zeroed register inserted with vinsert{f,i}128.
148 define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
149 ; AVX1-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
151 ; AVX1-NEXT: pushq %rbp
152 ; AVX1-NEXT: movq %rsp, %rbp
153 ; AVX1-NEXT: andq $-32, %rsp
154 ; AVX1-NEXT: subq $64, %rsp
155 ; AVX1-NEXT: andl $3, %esi
156 ; AVX1-NEXT: andl $3, %edi
157 ; AVX1-NEXT: vmovaps %ymm0, (%rsp)
158 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
159 ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
160 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
161 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
162 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
163 ; AVX1-NEXT: movq %rbp, %rsp
164 ; AVX1-NEXT: popq %rbp
167 ; AVX2-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
169 ; AVX2-NEXT: pushq %rbp
170 ; AVX2-NEXT: movq %rsp, %rbp
171 ; AVX2-NEXT: andq $-32, %rsp
172 ; AVX2-NEXT: subq $64, %rsp
173 ; AVX2-NEXT: andl $3, %esi
174 ; AVX2-NEXT: andl $3, %edi
175 ; AVX2-NEXT: vmovaps %ymm0, (%rsp)
176 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
177 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
178 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
179 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
180 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
181 ; AVX2-NEXT: movq %rbp, %rsp
182 ; AVX2-NEXT: popq %rbp
184 %x0 = extractelement <4 x i64> %x, i64 %i0
185 %x1 = extractelement <4 x i64> %x, i64 %i1
186 %x2 = extractelement <4 x i64> %x, i64 %i2
187 %x3 = extractelement <4 x i64> %x, i64 %i3
188 %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
189 %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
190 %r2 = insertelement <4 x i64> %r1, i64 0, i32 2
191 %r3 = insertelement <4 x i64> %r2, i64 0, i32 3
; Widening integer shuffle: <2 x i64> source into <4 x i64>. 16-byte xmm spill
; below rsp (no frame setup); indices masked with 'andl $1'.
195 define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
196 ; AVX1-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
198 ; AVX1-NEXT: andl $1, %ecx
199 ; AVX1-NEXT: andl $1, %edx
200 ; AVX1-NEXT: andl $1, %esi
201 ; AVX1-NEXT: andl $1, %edi
202 ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
203 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
204 ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
205 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
206 ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
207 ; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
208 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
209 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
212 ; AVX2-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
214 ; AVX2-NEXT: andl $1, %ecx
215 ; AVX2-NEXT: andl $1, %edx
216 ; AVX2-NEXT: andl $1, %esi
217 ; AVX2-NEXT: andl $1, %edi
218 ; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
219 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
220 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
221 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
222 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
223 ; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
224 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
225 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
227 %x0 = extractelement <2 x i64> %x, i64 %i0
228 %x1 = extractelement <2 x i64> %x, i64 %i1
229 %x2 = extractelement <2 x i64> %x, i64 %i2
230 %x3 = extractelement <2 x i64> %x, i64 %i3
231 %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
232 %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
233 %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
234 %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
; Eight variable i32 indices into <8 x float>. AVX1 goes through a stack spill
; (the last two indices arrive on the caller stack via 16(%rbp)/24(%rbp));
; AVX2 instead broadcasts each index into a vector and uses one vpermps per
; lane, then stitches lanes together with vinsertps/vinsertf128.
238 define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
239 ; AVX1-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
241 ; AVX1-NEXT: pushq %rbp
242 ; AVX1-NEXT: movq %rsp, %rbp
243 ; AVX1-NEXT: andq $-32, %rsp
244 ; AVX1-NEXT: subq $64, %rsp
245 ; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
246 ; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
247 ; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
248 ; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
249 ; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
250 ; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
251 ; AVX1-NEXT: andl $7, %edi
252 ; AVX1-NEXT: andl $7, %esi
253 ; AVX1-NEXT: andl $7, %edx
254 ; AVX1-NEXT: andl $7, %ecx
255 ; AVX1-NEXT: andl $7, %r8d
256 ; AVX1-NEXT: vmovaps %ymm0, (%rsp)
257 ; AVX1-NEXT: andl $7, %r9d
258 ; AVX1-NEXT: movl 16(%rbp), %r10d
259 ; AVX1-NEXT: andl $7, %r10d
260 ; AVX1-NEXT: movl 24(%rbp), %eax
261 ; AVX1-NEXT: andl $7, %eax
262 ; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
263 ; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
264 ; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
265 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
266 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
267 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
268 ; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
269 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
270 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
271 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
272 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
273 ; AVX1-NEXT: movq %rbp, %rsp
274 ; AVX1-NEXT: popq %rbp
277 ; AVX2-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
279 ; AVX2-NEXT: vmovd %edi, %xmm1
280 ; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm1
281 ; AVX2-NEXT: vmovd %esi, %xmm2
282 ; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm2
283 ; AVX2-NEXT: vmovd %edx, %xmm3
284 ; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm3
285 ; AVX2-NEXT: vmovd %ecx, %xmm4
286 ; AVX2-NEXT: vpermps %ymm0, %ymm4, %ymm4
287 ; AVX2-NEXT: vmovd %r8d, %xmm5
288 ; AVX2-NEXT: vpermps %ymm0, %ymm5, %ymm5
289 ; AVX2-NEXT: vmovd %r9d, %xmm6
290 ; AVX2-NEXT: vpermps %ymm0, %ymm6, %ymm6
291 ; AVX2-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
292 ; AVX2-NEXT: vpermps %ymm0, %ymm7, %ymm7
293 ; AVX2-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
294 ; AVX2-NEXT: vpermps %ymm0, %ymm8, %ymm0
295 ; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3]
296 ; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3]
297 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0]
298 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
299 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
300 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
301 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
303 %x0 = extractelement <8 x float> %x, i32 %i0
304 %x1 = extractelement <8 x float> %x, i32 %i1
305 %x2 = extractelement <8 x float> %x, i32 %i2
306 %x3 = extractelement <8 x float> %x, i32 %i3
307 %x4 = extractelement <8 x float> %x, i32 %i4
308 %x5 = extractelement <8 x float> %x, i32 %i5
309 %x6 = extractelement <8 x float> %x, i32 %i6
310 %x7 = extractelement <8 x float> %x, i32 %i7
311 %r0 = insertelement <8 x float> undef, float %x0, i32 0
312 %r1 = insertelement <8 x float> %r0, float %x1, i32 1
313 %r2 = insertelement <8 x float> %r1, float %x2, i32 2
314 %r3 = insertelement <8 x float> %r2, float %x3, i32 3
315 %r4 = insertelement <8 x float> %r3, float %x4, i32 4
316 %r5 = insertelement <8 x float> %r4, float %x5, i32 5
317 %r6 = insertelement <8 x float> %r5, float %x6, i32 6
318 %r7 = insertelement <8 x float> %r6, float %x7, i32 7
; Widening float shuffle: <4 x float> source into <8 x float>. Both subtargets
; share one sequence (ALL prefix): 16-byte xmm spill, indices masked with
; 'andl $3', result assembled with vmovss/vinsertps + vinsertf128.
322 define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
323 ; ALL-LABEL: var_shuffle_v8f32_v4f32_xxxxxxxx_i32:
325 ; ALL-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
326 ; ALL-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
327 ; ALL-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
328 ; ALL-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
329 ; ALL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
330 ; ALL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
331 ; ALL-NEXT: andl $3, %edi
332 ; ALL-NEXT: andl $3, %esi
333 ; ALL-NEXT: andl $3, %edx
334 ; ALL-NEXT: andl $3, %ecx
335 ; ALL-NEXT: andl $3, %r8d
336 ; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
337 ; ALL-NEXT: andl $3, %r9d
338 ; ALL-NEXT: movl {{[0-9]+}}(%rsp), %r10d
339 ; ALL-NEXT: andl $3, %r10d
340 ; ALL-NEXT: movl {{[0-9]+}}(%rsp), %eax
341 ; ALL-NEXT: andl $3, %eax
342 ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
343 ; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
344 ; ALL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
345 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
346 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
347 ; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
348 ; ALL-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
349 ; ALL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
350 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
351 ; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
352 ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
354 %x0 = extractelement <4 x float> %x, i32 %i0
355 %x1 = extractelement <4 x float> %x, i32 %i1
356 %x2 = extractelement <4 x float> %x, i32 %i2
357 %x3 = extractelement <4 x float> %x, i32 %i3
358 %x4 = extractelement <4 x float> %x, i32 %i4
359 %x5 = extractelement <4 x float> %x, i32 %i5
360 %x6 = extractelement <4 x float> %x, i32 %i6
361 %x7 = extractelement <4 x float> %x, i32 %i7
362 %r0 = insertelement <8 x float> undef, float %x0, i32 0
363 %r1 = insertelement <8 x float> %r0, float %x1, i32 1
364 %r2 = insertelement <8 x float> %r1, float %x2, i32 2
365 %r3 = insertelement <8 x float> %r2, float %x3, i32 3
366 %r4 = insertelement <8 x float> %r3, float %x4, i32 4
367 %r5 = insertelement <8 x float> %r4, float %x5, i32 5
368 %r6 = insertelement <8 x float> %r5, float %x6, i32 6
369 %r7 = insertelement <8 x float> %r6, float %x7, i32 7
; Sixteen variable i32 indices into <16 x i16>. Six indices arrive in registers
; and ten on the caller stack (16(%rbp)..88(%rbp)); each is masked with
; 'andl $15', the matching word is loaded from the ymm spill at (%rsp,%r,2),
; and lanes are assembled via vmovd/vpinsrw. AVX1/AVX2 differ only in the
; final vinsertf128 vs vinserti128.
373 define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
374 ; AVX1-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
376 ; AVX1-NEXT: pushq %rbp
377 ; AVX1-NEXT: movq %rsp, %rbp
378 ; AVX1-NEXT: andq $-32, %rsp
379 ; AVX1-NEXT: subq $64, %rsp
380 ; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
381 ; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
382 ; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
383 ; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
384 ; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
385 ; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
386 ; AVX1-NEXT: vmovaps %ymm0, (%rsp)
387 ; AVX1-NEXT: movl 32(%rbp), %eax
388 ; AVX1-NEXT: andl $15, %eax
389 ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
390 ; AVX1-NEXT: vmovd %eax, %xmm0
391 ; AVX1-NEXT: movl 40(%rbp), %eax
392 ; AVX1-NEXT: andl $15, %eax
393 ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
394 ; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
395 ; AVX1-NEXT: movl 48(%rbp), %eax
396 ; AVX1-NEXT: andl $15, %eax
397 ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
398 ; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
399 ; AVX1-NEXT: movl 56(%rbp), %eax
400 ; AVX1-NEXT: andl $15, %eax
401 ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
402 ; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
403 ; AVX1-NEXT: movl 64(%rbp), %eax
404 ; AVX1-NEXT: andl $15, %eax
405 ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
406 ; AVX1-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
407 ; AVX1-NEXT: movl 72(%rbp), %eax
408 ; AVX1-NEXT: andl $15, %eax
409 ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
410 ; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
411 ; AVX1-NEXT: movl 80(%rbp), %eax
412 ; AVX1-NEXT: andl $15, %eax
413 ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
414 ; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
415 ; AVX1-NEXT: movl 88(%rbp), %eax
416 ; AVX1-NEXT: andl $15, %eax
417 ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
418 ; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
419 ; AVX1-NEXT: andl $15, %edi
420 ; AVX1-NEXT: movzwl (%rsp,%rdi,2), %eax
421 ; AVX1-NEXT: vmovd %eax, %xmm1
422 ; AVX1-NEXT: andl $15, %esi
423 ; AVX1-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1
424 ; AVX1-NEXT: andl $15, %edx
425 ; AVX1-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1
426 ; AVX1-NEXT: andl $15, %ecx
427 ; AVX1-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1
428 ; AVX1-NEXT: andl $15, %r8d
429 ; AVX1-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1
430 ; AVX1-NEXT: andl $15, %r9d
431 ; AVX1-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1
432 ; AVX1-NEXT: movl 16(%rbp), %eax
433 ; AVX1-NEXT: andl $15, %eax
434 ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
435 ; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
436 ; AVX1-NEXT: movl 24(%rbp), %eax
437 ; AVX1-NEXT: andl $15, %eax
438 ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
439 ; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
440 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
441 ; AVX1-NEXT: movq %rbp, %rsp
442 ; AVX1-NEXT: popq %rbp
445 ; AVX2-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
447 ; AVX2-NEXT: pushq %rbp
448 ; AVX2-NEXT: movq %rsp, %rbp
449 ; AVX2-NEXT: andq $-32, %rsp
450 ; AVX2-NEXT: subq $64, %rsp
451 ; AVX2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
452 ; AVX2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
453 ; AVX2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
454 ; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
455 ; AVX2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
456 ; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
457 ; AVX2-NEXT: vmovaps %ymm0, (%rsp)
458 ; AVX2-NEXT: movl 32(%rbp), %eax
459 ; AVX2-NEXT: andl $15, %eax
460 ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
461 ; AVX2-NEXT: vmovd %eax, %xmm0
462 ; AVX2-NEXT: movl 40(%rbp), %eax
463 ; AVX2-NEXT: andl $15, %eax
464 ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
465 ; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
466 ; AVX2-NEXT: movl 48(%rbp), %eax
467 ; AVX2-NEXT: andl $15, %eax
468 ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
469 ; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
470 ; AVX2-NEXT: movl 56(%rbp), %eax
471 ; AVX2-NEXT: andl $15, %eax
472 ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
473 ; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
474 ; AVX2-NEXT: movl 64(%rbp), %eax
475 ; AVX2-NEXT: andl $15, %eax
476 ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
477 ; AVX2-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
478 ; AVX2-NEXT: movl 72(%rbp), %eax
479 ; AVX2-NEXT: andl $15, %eax
480 ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
481 ; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
482 ; AVX2-NEXT: movl 80(%rbp), %eax
483 ; AVX2-NEXT: andl $15, %eax
484 ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
485 ; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
486 ; AVX2-NEXT: movl 88(%rbp), %eax
487 ; AVX2-NEXT: andl $15, %eax
488 ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
489 ; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
490 ; AVX2-NEXT: andl $15, %edi
491 ; AVX2-NEXT: movzwl (%rsp,%rdi,2), %eax
492 ; AVX2-NEXT: vmovd %eax, %xmm1
493 ; AVX2-NEXT: andl $15, %esi
494 ; AVX2-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1
495 ; AVX2-NEXT: andl $15, %edx
496 ; AVX2-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1
497 ; AVX2-NEXT: andl $15, %ecx
498 ; AVX2-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1
499 ; AVX2-NEXT: andl $15, %r8d
500 ; AVX2-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1
501 ; AVX2-NEXT: andl $15, %r9d
502 ; AVX2-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1
503 ; AVX2-NEXT: movl 16(%rbp), %eax
504 ; AVX2-NEXT: andl $15, %eax
505 ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
506 ; AVX2-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
507 ; AVX2-NEXT: movl 24(%rbp), %eax
508 ; AVX2-NEXT: andl $15, %eax
509 ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
510 ; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
511 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
512 ; AVX2-NEXT: movq %rbp, %rsp
513 ; AVX2-NEXT: popq %rbp
515 %x0 = extractelement <16 x i16> %x, i32 %i0
516 %x1 = extractelement <16 x i16> %x, i32 %i1
517 %x2 = extractelement <16 x i16> %x, i32 %i2
518 %x3 = extractelement <16 x i16> %x, i32 %i3
519 %x4 = extractelement <16 x i16> %x, i32 %i4
520 %x5 = extractelement <16 x i16> %x, i32 %i5
521 %x6 = extractelement <16 x i16> %x, i32 %i6
522 %x7 = extractelement <16 x i16> %x, i32 %i7
523 %x8 = extractelement <16 x i16> %x, i32 %i8
524 %x9 = extractelement <16 x i16> %x, i32 %i9
525 %x10 = extractelement <16 x i16> %x, i32 %i10
526 %x11 = extractelement <16 x i16> %x, i32 %i11
527 %x12 = extractelement <16 x i16> %x, i32 %i12
528 %x13 = extractelement <16 x i16> %x, i32 %i13
529 %x14 = extractelement <16 x i16> %x, i32 %i14
530 %x15 = extractelement <16 x i16> %x, i32 %i15
531 %r0 = insertelement <16 x i16> undef, i16 %x0 , i32 0
532 %r1 = insertelement <16 x i16> %r0 , i16 %x1 , i32 1
533 %r2 = insertelement <16 x i16> %r1 , i16 %x2 , i32 2
534 %r3 = insertelement <16 x i16> %r2 , i16 %x3 , i32 3
535 %r4 = insertelement <16 x i16> %r3 , i16 %x4 , i32 4
536 %r5 = insertelement <16 x i16> %r4 , i16 %x5 , i32 5
537 %r6 = insertelement <16 x i16> %r5 , i16 %x6 , i32 6
538 %r7 = insertelement <16 x i16> %r6 , i16 %x7 , i32 7
539 %r8 = insertelement <16 x i16> %r7 , i16 %x8 , i32 8
540 %r9 = insertelement <16 x i16> %r8 , i16 %x9 , i32 9
541 %r10 = insertelement <16 x i16> %r9 , i16 %x10, i32 10
542 %r11 = insertelement <16 x i16> %r10, i16 %x11, i32 11
543 %r12 = insertelement <16 x i16> %r11, i16 %x12, i32 12
544 %r13 = insertelement <16 x i16> %r12, i16 %x13, i32 13
545 %r14 = insertelement <16 x i16> %r13, i16 %x14, i32 14
546 %r15 = insertelement <16 x i16> %r14, i16 %x15, i32 15
; Widening word shuffle: <8 x i16> source into <16 x i16>. The xmm source is
; spilled below rsp (no frame), each index is masked with 'andl $7', and the
; word at -24(%rsp,%r,2) is inserted with vmovd/vpinsrw. AVX1/AVX2 differ only
; in the final vinsertf128 vs vinserti128.
550 define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
551 ; AVX1-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
553 ; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
554 ; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
555 ; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
556 ; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
557 ; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
558 ; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
559 ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
560 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
561 ; AVX1-NEXT: andl $7, %eax
562 ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
563 ; AVX1-NEXT: vmovd %eax, %xmm0
564 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
565 ; AVX1-NEXT: andl $7, %eax
566 ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
567 ; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
568 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
569 ; AVX1-NEXT: andl $7, %eax
570 ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
571 ; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
572 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
573 ; AVX1-NEXT: andl $7, %eax
574 ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
575 ; AVX1-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
576 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
577 ; AVX1-NEXT: andl $7, %eax
578 ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
579 ; AVX1-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
580 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
581 ; AVX1-NEXT: andl $7, %eax
582 ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
583 ; AVX1-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
584 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
585 ; AVX1-NEXT: andl $7, %eax
586 ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
587 ; AVX1-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
588 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
589 ; AVX1-NEXT: andl $7, %eax
590 ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
591 ; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
592 ; AVX1-NEXT: andl $7, %edi
593 ; AVX1-NEXT: movzwl -24(%rsp,%rdi,2), %eax
594 ; AVX1-NEXT: vmovd %eax, %xmm1
595 ; AVX1-NEXT: andl $7, %esi
596 ; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1
597 ; AVX1-NEXT: andl $7, %edx
598 ; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1
599 ; AVX1-NEXT: andl $7, %ecx
600 ; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1
601 ; AVX1-NEXT: andl $7, %r8d
602 ; AVX1-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1
603 ; AVX1-NEXT: andl $7, %r9d
604 ; AVX1-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1
605 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
606 ; AVX1-NEXT: andl $7, %eax
607 ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
608 ; AVX1-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
609 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
610 ; AVX1-NEXT: andl $7, %eax
611 ; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
612 ; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
613 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
616 ; AVX2-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
618 ; AVX2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
619 ; AVX2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
620 ; AVX2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
621 ; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
622 ; AVX2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
623 ; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
624 ; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
625 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
626 ; AVX2-NEXT: andl $7, %eax
627 ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
628 ; AVX2-NEXT: vmovd %eax, %xmm0
629 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
630 ; AVX2-NEXT: andl $7, %eax
631 ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
632 ; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
633 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
634 ; AVX2-NEXT: andl $7, %eax
635 ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
636 ; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
637 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
638 ; AVX2-NEXT: andl $7, %eax
639 ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
640 ; AVX2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
641 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
642 ; AVX2-NEXT: andl $7, %eax
643 ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
644 ; AVX2-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
645 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
646 ; AVX2-NEXT: andl $7, %eax
647 ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
648 ; AVX2-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
649 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
650 ; AVX2-NEXT: andl $7, %eax
651 ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
652 ; AVX2-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
653 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
654 ; AVX2-NEXT: andl $7, %eax
655 ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
656 ; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
657 ; AVX2-NEXT: andl $7, %edi
658 ; AVX2-NEXT: movzwl -24(%rsp,%rdi,2), %eax
659 ; AVX2-NEXT: vmovd %eax, %xmm1
660 ; AVX2-NEXT: andl $7, %esi
661 ; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1
662 ; AVX2-NEXT: andl $7, %edx
663 ; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1
664 ; AVX2-NEXT: andl $7, %ecx
665 ; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1
666 ; AVX2-NEXT: andl $7, %r8d
667 ; AVX2-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1
668 ; AVX2-NEXT: andl $7, %r9d
669 ; AVX2-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1
670 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
671 ; AVX2-NEXT: andl $7, %eax
672 ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
673 ; AVX2-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
674 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
675 ; AVX2-NEXT: andl $7, %eax
676 ; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
677 ; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
678 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
680 %x0 = extractelement <8 x i16> %x, i32 %i0
681 %x1 = extractelement <8 x i16> %x, i32 %i1
682 %x2 = extractelement <8 x i16> %x, i32 %i2
683 %x3 = extractelement <8 x i16> %x, i32 %i3
684 %x4 = extractelement <8 x i16> %x, i32 %i4
685 %x5 = extractelement <8 x i16> %x, i32 %i5
686 %x6 = extractelement <8 x i16> %x, i32 %i6
687 %x7 = extractelement <8 x i16> %x, i32 %i7
688 %x8 = extractelement <8 x i16> %x, i32 %i8
689 %x9 = extractelement <8 x i16> %x, i32 %i9
690 %x10 = extractelement <8 x i16> %x, i32 %i10
691 %x11 = extractelement <8 x i16> %x, i32 %i11
692 %x12 = extractelement <8 x i16> %x, i32 %i12
693 %x13 = extractelement <8 x i16> %x, i32 %i13
694 %x14 = extractelement <8 x i16> %x, i32 %i14
695 %x15 = extractelement <8 x i16> %x, i32 %i15
696 %r0 = insertelement <16 x i16> undef, i16 %x0 , i32 0
697 %r1 = insertelement <16 x i16> %r0 , i16 %x1 , i32 1
698 %r2 = insertelement <16 x i16> %r1 , i16 %x2 , i32 2
699 %r3 = insertelement <16 x i16> %r2 , i16 %x3 , i32 3
700 %r4 = insertelement <16 x i16> %r3 , i16 %x4 , i32 4
701 %r5 = insertelement <16 x i16> %r4 , i16 %x5 , i32 5
702 %r6 = insertelement <16 x i16> %r5 , i16 %x6 , i32 6
703 %r7 = insertelement <16 x i16> %r6 , i16 %x7 , i32 7
704 %r8 = insertelement <16 x i16> %r7 , i16 %x8 , i32 8
705 %r9 = insertelement <16 x i16> %r8 , i16 %x9 , i32 9
706 %r10 = insertelement <16 x i16> %r9 , i16 %x10, i32 10
707 %r11 = insertelement <16 x i16> %r10, i16 %x11, i32 11
708 %r12 = insertelement <16 x i16> %r11, i16 %x12, i32 12
709 %r13 = insertelement <16 x i16> %r12, i16 %x13, i32 13
710 %r14 = insertelement <16 x i16> %r13, i16 %x14, i32 14
711 %r15 = insertelement <16 x i16> %r14, i16 %x15, i32 15
716 ; Unary shuffle indices from memory
; Indices loaded from memory (i64* %i) instead of passed in registers: four
; i64 loads from 0/8/16/24(%rdi), each masked with 'andl $3', then the same
; spill-and-reload lane assembly as the register-index tests.
719 define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwind {
720 ; AVX1-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
722 ; AVX1-NEXT: pushq %rbp
723 ; AVX1-NEXT: movq %rsp, %rbp
724 ; AVX1-NEXT: andq $-32, %rsp
725 ; AVX1-NEXT: subq $64, %rsp
726 ; AVX1-NEXT: movq (%rdi), %rax
727 ; AVX1-NEXT: movq 8(%rdi), %rcx
728 ; AVX1-NEXT: andl $3, %eax
729 ; AVX1-NEXT: andl $3, %ecx
730 ; AVX1-NEXT: movq 16(%rdi), %rdx
731 ; AVX1-NEXT: andl $3, %edx
732 ; AVX1-NEXT: movq 24(%rdi), %rsi
733 ; AVX1-NEXT: andl $3, %esi
734 ; AVX1-NEXT: vmovaps %ymm0, (%rsp)
735 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
736 ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
737 ; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
738 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
739 ; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
740 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
741 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
742 ; AVX1-NEXT: movq %rbp, %rsp
743 ; AVX1-NEXT: popq %rbp
746 ; AVX2-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
748 ; AVX2-NEXT: pushq %rbp
749 ; AVX2-NEXT: movq %rsp, %rbp
750 ; AVX2-NEXT: andq $-32, %rsp
751 ; AVX2-NEXT: subq $64, %rsp
752 ; AVX2-NEXT: movq (%rdi), %rax
753 ; AVX2-NEXT: movq 8(%rdi), %rcx
754 ; AVX2-NEXT: andl $3, %eax
755 ; AVX2-NEXT: andl $3, %ecx
756 ; AVX2-NEXT: movq 16(%rdi), %rdx
757 ; AVX2-NEXT: andl $3, %edx
758 ; AVX2-NEXT: movq 24(%rdi), %rsi
759 ; AVX2-NEXT: andl $3, %esi
760 ; AVX2-NEXT: vmovaps %ymm0, (%rsp)
761 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
762 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
763 ; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
764 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
765 ; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
766 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
767 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
768 ; AVX2-NEXT: movq %rbp, %rsp
769 ; AVX2-NEXT: popq %rbp
771 %p0 = getelementptr inbounds i64, i64* %i, i32 0
772 %p1 = getelementptr inbounds i64, i64* %i, i32 1
773 %p2 = getelementptr inbounds i64, i64* %i, i32 2
774 %p3 = getelementptr inbounds i64, i64* %i, i32 3
775 %i0 = load i64, i64* %p0, align 4
776 %i1 = load i64, i64* %p1, align 4
777 %i2 = load i64, i64* %p2, align 4
778 %i3 = load i64, i64* %p3, align 4
779 %x0 = extractelement <4 x i64> %x, i64 %i0
780 %x1 = extractelement <4 x i64> %x, i64 %i1
781 %x2 = extractelement <4 x i64> %x, i64 %i2
782 %x3 = extractelement <4 x i64> %x, i64 %i3
783 %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
784 %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
785 %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
786 %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3
; Memory-indexed widening shuffle: <2 x i64> source, indices loaded from
; 0/8/16/24(%rdi) and masked with 'andl $1'; the 16-byte xmm spill needs no
; stack realignment frame.
790 define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwind {
791 ; AVX1-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
793 ; AVX1-NEXT: movq (%rdi), %rax
794 ; AVX1-NEXT: movq 8(%rdi), %rcx
795 ; AVX1-NEXT: andl $1, %eax
796 ; AVX1-NEXT: andl $1, %ecx
797 ; AVX1-NEXT: movq 16(%rdi), %rdx
798 ; AVX1-NEXT: andl $1, %edx
799 ; AVX1-NEXT: movq 24(%rdi), %rsi
800 ; AVX1-NEXT: andl $1, %esi
801 ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
802 ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
803 ; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
804 ; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
805 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
806 ; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
807 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
808 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
811 ; AVX2-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
813 ; AVX2-NEXT: movq (%rdi), %rax
814 ; AVX2-NEXT: movq 8(%rdi), %rcx
815 ; AVX2-NEXT: andl $1, %eax
816 ; AVX2-NEXT: andl $1, %ecx
817 ; AVX2-NEXT: movq 16(%rdi), %rdx
818 ; AVX2-NEXT: andl $1, %edx
819 ; AVX2-NEXT: movq 24(%rdi), %rsi
820 ; AVX2-NEXT: andl $1, %esi
821 ; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
822 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
823 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
824 ; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
825 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
826 ; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
827 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
828 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
830 %p0 = getelementptr inbounds i64, i64* %i, i32 0
831 %p1 = getelementptr inbounds i64, i64* %i, i32 1
832 %p2 = getelementptr inbounds i64, i64* %i, i32 2
833 %p3 = getelementptr inbounds i64, i64* %i, i32 3
834 %i0 = load i64, i64* %p0, align 4
835 %i1 = load i64, i64* %p1, align 4
836 %i2 = load i64, i64* %p2, align 4
837 %i3 = load i64, i64* %p3, align 4
838 %x0 = extractelement <2 x i64> %x, i64 %i0
839 %x1 = extractelement <2 x i64> %x, i64 %i1
840 %x2 = extractelement <2 x i64> %x, i64 %i2
841 %x3 = extractelement <2 x i64> %x, i64 %i3
842 %r0 = insertelement <4 x i64> undef, i64 %x0, i32 0
843 %r1 = insertelement <4 x i64> %r0, i64 %x1, i32 1
844 %r2 = insertelement <4 x i64> %r1, i64 %x2, i32 2
845 %r3 = insertelement <4 x i64> %r2, i64 %x3, i32 3