1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; test_blendpd: blends %a0 with %a1 using the constant shuffle mask <0, 3>,
; adds %a1 to the result, then blends that sum with a vector loaded from %a2
; using the same mask. The expectations below therefore contain one
; register-operand blend, one add and one memory-operand blend per CPU model,
; each followed by the "# sched" annotation emitted under -print-schedule.
11 define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
12 ; GENERIC-LABEL: test_blendpd:
14 ; GENERIC-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
15 ; GENERIC-NEXT: addpd %xmm1, %xmm0
16 ; GENERIC-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
19 ; SLM-LABEL: test_blendpd:
21 ; SLM-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:1.00]
22 ; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
23 ; SLM-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [4:1.00]
24 ; SLM-NEXT: retq # sched: [4:1.00]
26 ; SANDY-LABEL: test_blendpd:
28 ; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:1.00]
29 ; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
30 ; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [7:1.00]
31 ; SANDY-NEXT: retq # sched: [1:1.00]
33 ; HASWELL-LABEL: test_blendpd:
35 ; HASWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.33]
36 ; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
37 ; HASWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [5:0.50]
38 ; HASWELL-NEXT: retq # sched: [1:1.00]
40 ; BTVER2-LABEL: test_blendpd:
42 ; BTVER2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.50]
43 ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
44 ; BTVER2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [6:1.00]
45 ; BTVER2-NEXT: retq # sched: [4:1.00]
46 %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 3>
47 %2 = load <2 x double>, <2 x double> *%a2, align 16
48 %3 = fadd <2 x double> %a1, %1
49 %4 = shufflevector <2 x double> %3, <2 x double> %2, <2 x i32> <i32 0, i32 3>
; test_blendps: blends %a0 with %a1 using the shuffle mask <0, 5, 6, 3>, then
; blends the result with a vector loaded from %a2 using the mask <0, 5, 2, 3>.
; The expectations contain a register-operand blendps followed by a
; memory-operand blendps for every CPU model.
53 define <4 x float> @test_blendps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
54 ; GENERIC-LABEL: test_blendps:
56 ; GENERIC-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
57 ; GENERIC-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3]
60 ; SLM-LABEL: test_blendps:
62 ; SLM-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:1.00]
63 ; SLM-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [4:1.00]
64 ; SLM-NEXT: retq # sched: [4:1.00]
66 ; SANDY-LABEL: test_blendps:
68 ; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:1.00]
69 ; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [7:1.00]
70 ; SANDY-NEXT: retq # sched: [1:1.00]
72 ; HASWELL-LABEL: test_blendps:
74 ; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.33]
75 ; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [5:0.50]
76 ; HASWELL-NEXT: retq # sched: [1:1.00]
78 ; BTVER2-LABEL: test_blendps:
80 ; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50]
81 ; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [6:1.00]
82 ; BTVER2-NEXT: retq # sched: [4:1.00]
83 %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
84 %2 = load <4 x float>, <4 x float> *%a2, align 16
85 %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
; test_blendvpd: calls the llvm.x86.sse41.blendvpd intrinsic twice with %a2 as
; the selector operand: first on (%a0, %a1), then on that result and a vector
; loaded from %a3. The non-VEX expectations additionally contain the register
; copies that move %a2 into xmm0 and the result back into xmm0.
89 define <2 x double> @test_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
90 ; GENERIC-LABEL: test_blendvpd:
92 ; GENERIC-NEXT: movapd %xmm0, %xmm3
93 ; GENERIC-NEXT: movaps %xmm2, %xmm0
94 ; GENERIC-NEXT: blendvpd %xmm0, %xmm1, %xmm3
95 ; GENERIC-NEXT: blendvpd %xmm0, (%rdi), %xmm3
96 ; GENERIC-NEXT: movapd %xmm3, %xmm0
99 ; SLM-LABEL: test_blendvpd:
101 ; SLM-NEXT: movapd %xmm0, %xmm3 # sched: [1:1.00]
102 ; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00]
103 ; SLM-NEXT: blendvpd %xmm0, %xmm1, %xmm3 # sched: [1:1.00]
104 ; SLM-NEXT: blendvpd %xmm0, (%rdi), %xmm3 # sched: [4:1.00]
105 ; SLM-NEXT: movapd %xmm3, %xmm0 # sched: [1:1.00]
106 ; SLM-NEXT: retq # sched: [4:1.00]
108 ; SANDY-LABEL: test_blendvpd:
110 ; SANDY-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
111 ; SANDY-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
112 ; SANDY-NEXT: retq # sched: [1:1.00]
114 ; HASWELL-LABEL: test_blendvpd:
116 ; HASWELL-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
117 ; HASWELL-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
118 ; HASWELL-NEXT: retq # sched: [1:1.00]
120 ; BTVER2-LABEL: test_blendvpd:
122 ; BTVER2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
123 ; BTVER2-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
124 ; BTVER2-NEXT: retq # sched: [4:1.00]
125 %1 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
126 %2 = load <2 x double>, <2 x double> *%a3, align 16
127 %3 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %1, <2 x double> %2, <2 x double> %a2)
130 declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
; test_blendvps: single-precision counterpart of the blendvpd test. Calls the
; llvm.x86.sse41.blendvps intrinsic on (%a0, %a1) and then on that result and
; a vector loaded from %a3, with %a2 as the selector operand both times.
; Unlike the neighbouring tests, the load here carries no explicit "align".
132 define <4 x float> @test_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
133 ; GENERIC-LABEL: test_blendvps:
135 ; GENERIC-NEXT: movaps %xmm0, %xmm3
136 ; GENERIC-NEXT: movaps %xmm2, %xmm0
137 ; GENERIC-NEXT: blendvps %xmm0, %xmm1, %xmm3
138 ; GENERIC-NEXT: blendvps %xmm0, (%rdi), %xmm3
139 ; GENERIC-NEXT: movaps %xmm3, %xmm0
142 ; SLM-LABEL: test_blendvps:
144 ; SLM-NEXT: movaps %xmm0, %xmm3 # sched: [1:1.00]
145 ; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00]
146 ; SLM-NEXT: blendvps %xmm0, %xmm1, %xmm3 # sched: [1:1.00]
147 ; SLM-NEXT: blendvps %xmm0, (%rdi), %xmm3 # sched: [4:1.00]
148 ; SLM-NEXT: movaps %xmm3, %xmm0 # sched: [1:1.00]
149 ; SLM-NEXT: retq # sched: [4:1.00]
151 ; SANDY-LABEL: test_blendvps:
153 ; SANDY-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
154 ; SANDY-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
155 ; SANDY-NEXT: retq # sched: [1:1.00]
157 ; HASWELL-LABEL: test_blendvps:
159 ; HASWELL-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
160 ; HASWELL-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
161 ; HASWELL-NEXT: retq # sched: [1:1.00]
163 ; BTVER2-LABEL: test_blendvps:
165 ; BTVER2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
166 ; BTVER2-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
167 ; BTVER2-NEXT: retq # sched: [4:1.00]
168 %1 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
169 %2 = load <4 x float>, <4 x float> *%a3
170 %3 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %1, <4 x float> %2, <4 x float> %a2)
173 declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
; test_dppd: calls the llvm.x86.sse41.dppd intrinsic with immediate 7 on
; (%a0, %a1), then again on that result and a vector loaded from %a2, so the
; expectations contain one register-operand and one memory-operand dppd.
175 define <2 x double> @test_dppd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
176 ; GENERIC-LABEL: test_dppd:
178 ; GENERIC-NEXT: dppd $7, %xmm1, %xmm0
179 ; GENERIC-NEXT: dppd $7, (%rdi), %xmm0
182 ; SLM-LABEL: test_dppd:
184 ; SLM-NEXT: dppd $7, %xmm1, %xmm0 # sched: [3:1.00]
185 ; SLM-NEXT: dppd $7, (%rdi), %xmm0 # sched: [6:1.00]
186 ; SLM-NEXT: retq # sched: [4:1.00]
188 ; SANDY-LABEL: test_dppd:
190 ; SANDY-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
191 ; SANDY-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [15:1.00]
192 ; SANDY-NEXT: retq # sched: [1:1.00]
194 ; HASWELL-LABEL: test_dppd:
196 ; HASWELL-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
197 ; HASWELL-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
198 ; HASWELL-NEXT: retq # sched: [1:1.00]
200 ; BTVER2-LABEL: test_dppd:
202 ; BTVER2-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
203 ; BTVER2-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
204 ; BTVER2-NEXT: retq # sched: [4:1.00]
205 %1 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
206 %2 = load <2 x double>, <2 x double> *%a2, align 16
207 %3 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %1, <2 x double> %2, i8 7)
210 declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
; test_dpps: calls the llvm.x86.sse41.dpps intrinsic with immediate 7 on
; (%a0, %a1), then again on that result and a vector loaded from %a2, so the
; expectations contain one register-operand and one memory-operand dpps.
; NOTE(review): in the Sandy Bridge expectations the memory-operand vdpps is
; annotated [7:1.00] while the register form is annotated [12:2.00]; every
; other CPU model here gives the memory form the larger first number. The
; lines are autogenerated, so this may simply mirror the scheduling model
; rather than be a typo - confirm before changing.
212 define <4 x float> @test_dpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
213 ; GENERIC-LABEL: test_dpps:
215 ; GENERIC-NEXT: dpps $7, %xmm1, %xmm0
216 ; GENERIC-NEXT: dpps $7, (%rdi), %xmm0
219 ; SLM-LABEL: test_dpps:
221 ; SLM-NEXT: dpps $7, %xmm1, %xmm0 # sched: [3:1.00]
222 ; SLM-NEXT: dpps $7, (%rdi), %xmm0 # sched: [6:1.00]
223 ; SLM-NEXT: retq # sched: [4:1.00]
225 ; SANDY-LABEL: test_dpps:
227 ; SANDY-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [12:2.00]
228 ; SANDY-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
229 ; SANDY-NEXT: retq # sched: [1:1.00]
231 ; HASWELL-LABEL: test_dpps:
233 ; HASWELL-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [14:2.00]
234 ; HASWELL-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [18:2.00]
235 ; HASWELL-NEXT: retq # sched: [1:1.00]
237 ; BTVER2-LABEL: test_dpps:
239 ; BTVER2-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
240 ; BTVER2-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
241 ; BTVER2-NEXT: retq # sched: [4:1.00]
242 %1 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
243 %2 = load <4 x float>, <4 x float> *%a2, align 16
244 %3 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %1, <4 x float> %2, i8 7)
247 declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
; test_insertps: calls the llvm.x86.sse41.insertps intrinsic with immediate 17
; on (%a0, %a1), then inserts a float loaded from %a2 into element 3 of the
; result. The expectations contain a register-operand insertps followed by a
; memory-operand insertps.
249 define <4 x float> @test_insertps(<4 x float> %a0, <4 x float> %a1, float *%a2) {
250 ; GENERIC-LABEL: test_insertps:
252 ; GENERIC-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3]
253 ; GENERIC-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
256 ; SLM-LABEL: test_insertps:
258 ; SLM-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
259 ; SLM-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [4:1.00]
260 ; SLM-NEXT: retq # sched: [4:1.00]
262 ; SANDY-LABEL: test_insertps:
264 ; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
265 ; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:1.00]
266 ; SANDY-NEXT: retq # sched: [1:1.00]
268 ; HASWELL-LABEL: test_insertps:
270 ; HASWELL-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
271 ; HASWELL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [5:1.00]
272 ; HASWELL-NEXT: retq # sched: [1:1.00]
274 ; BTVER2-LABEL: test_insertps:
276 ; BTVER2-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:0.50]
277 ; BTVER2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [6:1.00]
278 ; BTVER2-NEXT: retq # sched: [4:1.00]
279 %1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 17)
280 %2 = load float, float *%a2
281 %3 = insertelement <4 x float> %1, float %2, i32 3
284 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
; test_movntdqa: calls the llvm.x86.sse41.movntdqa intrinsic on the pointer
; %a0. Each CPU model expects a single (v)movntdqa load from (%rdi).
286 define <2 x i64> @test_movntdqa(i8* %a0) {
287 ; GENERIC-LABEL: test_movntdqa:
289 ; GENERIC-NEXT: movntdqa (%rdi), %xmm0
292 ; SLM-LABEL: test_movntdqa:
294 ; SLM-NEXT: movntdqa (%rdi), %xmm0 # sched: [3:1.00]
295 ; SLM-NEXT: retq # sched: [4:1.00]
297 ; SANDY-LABEL: test_movntdqa:
299 ; SANDY-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [6:0.50]
300 ; SANDY-NEXT: retq # sched: [1:1.00]
302 ; HASWELL-LABEL: test_movntdqa:
304 ; HASWELL-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [4:0.50]
305 ; HASWELL-NEXT: retq # sched: [1:1.00]
307 ; BTVER2-LABEL: test_movntdqa:
309 ; BTVER2-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [5:1.00]
310 ; BTVER2-NEXT: retq # sched: [4:1.00]
311 %1 = call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %a0)
314 declare <2 x i64> @llvm.x86.sse41.movntdqa(i8*) nounwind readnone
; test_mpsadbw: calls the llvm.x86.sse41.mpsadbw intrinsic with immediate 7 on
; (%a0, %a1), bitcasts the <8 x i16> result back to <16 x i8>, and calls the
; intrinsic again with a vector loaded from %a2 as the second operand.
; NOTE(review): the Haswell expectations annotate the register and memory
; forms identically ([6:2.00]), unlike the other CPU models in this test. The
; lines are autogenerated, so this presumably mirrors the scheduling model -
; confirm before changing.
316 define <8 x i16> @test_mpsadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
317 ; GENERIC-LABEL: test_mpsadbw:
319 ; GENERIC-NEXT: mpsadbw $7, %xmm1, %xmm0
320 ; GENERIC-NEXT: mpsadbw $7, (%rdi), %xmm0
323 ; SLM-LABEL: test_mpsadbw:
325 ; SLM-NEXT: mpsadbw $7, %xmm1, %xmm0 # sched: [7:1.00]
326 ; SLM-NEXT: mpsadbw $7, (%rdi), %xmm0 # sched: [10:1.00]
327 ; SLM-NEXT: retq # sched: [4:1.00]
329 ; SANDY-LABEL: test_mpsadbw:
331 ; SANDY-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
332 ; SANDY-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
333 ; SANDY-NEXT: retq # sched: [1:1.00]
335 ; HASWELL-LABEL: test_mpsadbw:
337 ; HASWELL-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [6:2.00]
338 ; HASWELL-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
339 ; HASWELL-NEXT: retq # sched: [1:1.00]
341 ; BTVER2-LABEL: test_mpsadbw:
343 ; BTVER2-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
344 ; BTVER2-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
345 ; BTVER2-NEXT: retq # sched: [4:1.00]
346 %1 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7)
347 %2 = bitcast <8 x i16> %1 to <16 x i8>
348 %3 = load <16 x i8>, <16 x i8> *%a2, align 16
349 %4 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %2, <16 x i8> %3, i8 7)
352 declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
; test_packusdw: calls the llvm.x86.sse41.packusdw intrinsic on (%a0, %a1),
; bitcasts the <8 x i16> result to <4 x i32>, and calls the intrinsic again
; with a vector loaded from %a2, giving one register-operand and one
; memory-operand packusdw in the expectations.
354 define <8 x i16> @test_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
355 ; GENERIC-LABEL: test_packusdw:
357 ; GENERIC-NEXT: packusdw %xmm1, %xmm0
358 ; GENERIC-NEXT: packusdw (%rdi), %xmm0
361 ; SLM-LABEL: test_packusdw:
363 ; SLM-NEXT: packusdw %xmm1, %xmm0 # sched: [1:1.00]
364 ; SLM-NEXT: packusdw (%rdi), %xmm0 # sched: [4:1.00]
365 ; SLM-NEXT: retq # sched: [4:1.00]
367 ; SANDY-LABEL: test_packusdw:
369 ; SANDY-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
370 ; SANDY-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
371 ; SANDY-NEXT: retq # sched: [1:1.00]
373 ; HASWELL-LABEL: test_packusdw:
375 ; HASWELL-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
376 ; HASWELL-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
377 ; HASWELL-NEXT: retq # sched: [1:1.00]
379 ; BTVER2-LABEL: test_packusdw:
381 ; BTVER2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
382 ; BTVER2-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
383 ; BTVER2-NEXT: retq # sched: [4:1.00]
384 %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
385 %2 = bitcast <8 x i16> %1 to <4 x i32>
386 %3 = load <4 x i32>, <4 x i32> *%a2, align 16
387 %4 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %2, <4 x i32> %3)
390 declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
; test_pblendvb: calls the llvm.x86.sse41.pblendvb intrinsic on (%a0, %a1) and
; then on that result and a vector loaded from %a3, with %a2 as the selector
; operand both times. The non-VEX expectations additionally contain the
; register copies that move %a2 into xmm0 and the result back into xmm0.
392 define <16 x i8> @test_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16 x i8> *%a3) {
393 ; GENERIC-LABEL: test_pblendvb:
395 ; GENERIC-NEXT: movdqa %xmm0, %xmm3
396 ; GENERIC-NEXT: movaps %xmm2, %xmm0
397 ; GENERIC-NEXT: pblendvb %xmm0, %xmm1, %xmm3
398 ; GENERIC-NEXT: pblendvb %xmm0, (%rdi), %xmm3
399 ; GENERIC-NEXT: movdqa %xmm3, %xmm0
402 ; SLM-LABEL: test_pblendvb:
404 ; SLM-NEXT: movdqa %xmm0, %xmm3 # sched: [1:0.50]
405 ; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00]
406 ; SLM-NEXT: pblendvb %xmm0, %xmm1, %xmm3 # sched: [1:1.00]
407 ; SLM-NEXT: pblendvb %xmm0, (%rdi), %xmm3 # sched: [4:1.00]
408 ; SLM-NEXT: movdqa %xmm3, %xmm0 # sched: [1:0.50]
409 ; SLM-NEXT: retq # sched: [4:1.00]
411 ; SANDY-LABEL: test_pblendvb:
413 ; SANDY-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
414 ; SANDY-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
415 ; SANDY-NEXT: retq # sched: [1:1.00]
417 ; HASWELL-LABEL: test_pblendvb:
419 ; HASWELL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
420 ; HASWELL-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
421 ; HASWELL-NEXT: retq # sched: [1:1.00]
423 ; BTVER2-LABEL: test_pblendvb:
425 ; BTVER2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
426 ; BTVER2-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
427 ; BTVER2-NEXT: retq # sched: [4:1.00]
428 %1 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2)
429 %2 = load <16 x i8>, <16 x i8> *%a3, align 16
430 %3 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %1, <16 x i8> %2, <16 x i8> %a2)
433 declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
; test_pblendw: blends %a0 with %a1 using the shuffle mask
; <0, 9, 2, 11, 4, 13, 6, 15> (alternating elements), then blends the result
; with a vector loaded from %a2 using the mask <0, 1, 10, 11, 4, 5, 6, 15>.
; The expectations contain a register-operand pblendw followed by a
; memory-operand pblendw.
435 define <8 x i16> @test_pblendw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
436 ; GENERIC-LABEL: test_pblendw:
438 ; GENERIC-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
439 ; GENERIC-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7]
442 ; SLM-LABEL: test_pblendw:
444 ; SLM-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:1.00]
445 ; SLM-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [4:1.00]
446 ; SLM-NEXT: retq # sched: [4:1.00]
448 ; SANDY-LABEL: test_pblendw:
450 ; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
451 ; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [7:0.50]
452 ; SANDY-NEXT: retq # sched: [1:1.00]
454 ; HASWELL-LABEL: test_pblendw:
456 ; HASWELL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:1.00]
457 ; HASWELL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [4:1.00]
458 ; HASWELL-NEXT: retq # sched: [1:1.00]
460 ; BTVER2-LABEL: test_pblendw:
462 ; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
463 ; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [6:1.00]
464 ; BTVER2-NEXT: retq # sched: [4:1.00]
465 %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
466 %2 = load <8 x i16>, <8 x i16> *%a2, align 16
467 %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
; test_pcmpeqq: compares %a0 with %a1 for equality and sign-extends the
; <2 x i1> result to <2 x i64>, then compares that value with a vector loaded
; from %a2 and sign-extends again. The expectations contain a
; register-operand pcmpeqq followed by a memory-operand pcmpeqq.
471 define <2 x i64> @test_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
472 ; GENERIC-LABEL: test_pcmpeqq:
474 ; GENERIC-NEXT: pcmpeqq %xmm1, %xmm0
475 ; GENERIC-NEXT: pcmpeqq (%rdi), %xmm0
478 ; SLM-LABEL: test_pcmpeqq:
480 ; SLM-NEXT: pcmpeqq %xmm1, %xmm0 # sched: [1:0.50]
481 ; SLM-NEXT: pcmpeqq (%rdi), %xmm0 # sched: [4:1.00]
482 ; SLM-NEXT: retq # sched: [4:1.00]
484 ; SANDY-LABEL: test_pcmpeqq:
486 ; SANDY-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
487 ; SANDY-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
488 ; SANDY-NEXT: retq # sched: [1:1.00]
490 ; HASWELL-LABEL: test_pcmpeqq:
492 ; HASWELL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
493 ; HASWELL-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
494 ; HASWELL-NEXT: retq # sched: [1:1.00]
496 ; BTVER2-LABEL: test_pcmpeqq:
498 ; BTVER2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
499 ; BTVER2-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
500 ; BTVER2-NEXT: retq # sched: [4:1.00]
501 %1 = icmp eq <2 x i64> %a0, %a1
502 %2 = sext <2 x i1> %1 to <2 x i64>
503 %3 = load <2 x i64>, <2 x i64>*%a2, align 16
504 %4 = icmp eq <2 x i64> %2, %3
505 %5 = sext <2 x i1> %4 to <2 x i64>
; test_pextrb: extracts byte 3 of %a0 (zero-extended to i32 for the return
; value) and byte 1 of %a0 (stored through %a1). The expectations therefore
; contain a register-destination pextrb with immediate 3 and a
; memory-destination pextrb with immediate 1 writing to (%rdi).
509 define i32 @test_pextrb(<16 x i8> %a0, i8 *%a1) {
510 ; GENERIC-LABEL: test_pextrb:
512 ; GENERIC-NEXT: pextrb $3, %xmm0, %eax
513 ; GENERIC-NEXT: pextrb $1, %xmm0, (%rdi)
516 ; SLM-LABEL: test_pextrb:
518 ; SLM-NEXT: pextrb $3, %xmm0, %eax # sched: [1:1.00]
519 ; SLM-NEXT: pextrb $1, %xmm0, (%rdi) # sched: [4:2.00]
520 ; SLM-NEXT: retq # sched: [4:1.00]
522 ; SANDY-LABEL: test_pextrb:
524 ; SANDY-NEXT: vpextrb $3, %xmm0, %eax # sched: [3:1.00]
525 ; SANDY-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [5:1.00]
526 ; SANDY-NEXT: retq # sched: [1:1.00]
528 ; HASWELL-LABEL: test_pextrb:
530 ; HASWELL-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:1.00]
531 ; HASWELL-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [5:1.00]
532 ; HASWELL-NEXT: retq # sched: [1:1.00]
534 ; BTVER2-LABEL: test_pextrb:
536 ; BTVER2-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.50]
537 ; BTVER2-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [6:1.00]
538 ; BTVER2-NEXT: retq # sched: [4:1.00]
539 %1 = extractelement <16 x i8> %a0, i32 3
540 %2 = extractelement <16 x i8> %a0, i32 1
; Storing element 1 through %a1 is what yields the memory-destination pextrb
; expected above; without it %2 and %a1 are dead and no store is emitted.
541 store i8 %2, i8 *%a1
542 %3 = zext i8 %1 to i32
; test_pextrd: extracts element 3 of %a0 into %1 and stores element 1 of %a0
; through %a1. The expectations contain a register-destination pextrd with
; immediate 3 and a memory-destination pextrd with immediate 1.
546 define i32 @test_pextrd(<4 x i32> %a0, i32 *%a1) {
547 ; GENERIC-LABEL: test_pextrd:
549 ; GENERIC-NEXT: pextrd $3, %xmm0, %eax
550 ; GENERIC-NEXT: pextrd $1, %xmm0, (%rdi)
553 ; SLM-LABEL: test_pextrd:
555 ; SLM-NEXT: pextrd $3, %xmm0, %eax # sched: [1:1.00]
556 ; SLM-NEXT: pextrd $1, %xmm0, (%rdi) # sched: [4:2.00]
557 ; SLM-NEXT: retq # sched: [4:1.00]
559 ; SANDY-LABEL: test_pextrd:
561 ; SANDY-NEXT: vpextrd $3, %xmm0, %eax # sched: [3:1.00]
562 ; SANDY-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [5:1.00]
563 ; SANDY-NEXT: retq # sched: [1:1.00]
565 ; HASWELL-LABEL: test_pextrd:
567 ; HASWELL-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:1.00]
568 ; HASWELL-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [5:1.00]
569 ; HASWELL-NEXT: retq # sched: [1:1.00]
571 ; BTVER2-LABEL: test_pextrd:
573 ; BTVER2-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.50]
574 ; BTVER2-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [6:1.00]
575 ; BTVER2-NEXT: retq # sched: [4:1.00]
576 %1 = extractelement <4 x i32> %a0, i32 3
577 %2 = extractelement <4 x i32> %a0, i32 1
578 store i32 %2, i32 *%a1
; test_pextrq: extracts element 1 of %a0 twice; the second copy is stored
; through %a2. Both expected pextrq instructions therefore use immediate 1,
; one with a register destination and one with a memory destination.
; %a1 is not referenced by the visible body.
582 define i64 @test_pextrq(<2 x i64> %a0, <2 x i64> %a1, i64 *%a2) {
583 ; GENERIC-LABEL: test_pextrq:
585 ; GENERIC-NEXT: pextrq $1, %xmm0, %rax
586 ; GENERIC-NEXT: pextrq $1, %xmm0, (%rdi)
589 ; SLM-LABEL: test_pextrq:
591 ; SLM-NEXT: pextrq $1, %xmm0, %rax # sched: [1:1.00]
592 ; SLM-NEXT: pextrq $1, %xmm0, (%rdi) # sched: [4:2.00]
593 ; SLM-NEXT: retq # sched: [4:1.00]
595 ; SANDY-LABEL: test_pextrq:
597 ; SANDY-NEXT: vpextrq $1, %xmm0, %rax # sched: [3:1.00]
598 ; SANDY-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
599 ; SANDY-NEXT: retq # sched: [1:1.00]
601 ; HASWELL-LABEL: test_pextrq:
603 ; HASWELL-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:1.00]
604 ; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
605 ; HASWELL-NEXT: retq # sched: [1:1.00]
607 ; BTVER2-LABEL: test_pextrq:
609 ; BTVER2-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.50]
610 ; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [6:1.00]
611 ; BTVER2-NEXT: retq # sched: [4:1.00]
612 %1 = extractelement <2 x i64> %a0, i32 1
613 %2 = extractelement <2 x i64> %a0, i32 1
614 store i64 %2, i64 *%a2
; test_pextrw: extracts element 3 of %a0 (zero-extended to i32) and stores
; element 1 of %a0 through %a1. The expectations contain a
; register-destination pextrw with immediate 3 and a memory-destination
; pextrw with immediate 1.
618 define i32 @test_pextrw(<8 x i16> %a0, i16 *%a1) {
619 ; GENERIC-LABEL: test_pextrw:
621 ; GENERIC-NEXT: pextrw $3, %xmm0, %eax
622 ; GENERIC-NEXT: pextrw $1, %xmm0, (%rdi)
625 ; SLM-LABEL: test_pextrw:
627 ; SLM-NEXT: pextrw $3, %xmm0, %eax # sched: [4:1.00]
628 ; SLM-NEXT: pextrw $1, %xmm0, (%rdi) # sched: [4:2.00]
629 ; SLM-NEXT: retq # sched: [4:1.00]
631 ; SANDY-LABEL: test_pextrw:
633 ; SANDY-NEXT: vpextrw $3, %xmm0, %eax # sched: [3:1.00]
634 ; SANDY-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [5:1.00]
635 ; SANDY-NEXT: retq # sched: [1:1.00]
637 ; HASWELL-LABEL: test_pextrw:
639 ; HASWELL-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:1.00]
640 ; HASWELL-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [5:1.00]
641 ; HASWELL-NEXT: retq # sched: [1:1.00]
643 ; BTVER2-LABEL: test_pextrw:
645 ; BTVER2-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.50]
646 ; BTVER2-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [6:1.00]
647 ; BTVER2-NEXT: retq # sched: [4:1.00]
648 %1 = extractelement <8 x i16> %a0, i32 3
649 %2 = extractelement <8 x i16> %a0, i32 1
650 store i16 %2, i16 *%a1
651 %3 = zext i16 %1 to i32
; test_phminposuw: loads a vector from %a0, passes it to the
; llvm.x86.sse41.phminposuw intrinsic, and feeds that result to the intrinsic
; a second time. The memory-operand form therefore comes first in the
; expectations, followed by the register-operand form.
655 define <8 x i16> @test_phminposuw(<8 x i16> *%a0) {
656 ; GENERIC-LABEL: test_phminposuw:
658 ; GENERIC-NEXT: phminposuw (%rdi), %xmm0
659 ; GENERIC-NEXT: phminposuw %xmm0, %xmm0
662 ; SLM-LABEL: test_phminposuw:
664 ; SLM-NEXT: phminposuw (%rdi), %xmm0 # sched: [7:1.00]
665 ; SLM-NEXT: phminposuw %xmm0, %xmm0 # sched: [4:1.00]
666 ; SLM-NEXT: retq # sched: [4:1.00]
668 ; SANDY-LABEL: test_phminposuw:
670 ; SANDY-NEXT: vphminposuw (%rdi), %xmm0 # sched: [11:1.00]
671 ; SANDY-NEXT: vphminposuw %xmm0, %xmm0 # sched: [5:1.00]
672 ; SANDY-NEXT: retq # sched: [1:1.00]
674 ; HASWELL-LABEL: test_phminposuw:
676 ; HASWELL-NEXT: vphminposuw (%rdi), %xmm0 # sched: [9:1.00]
677 ; HASWELL-NEXT: vphminposuw %xmm0, %xmm0 # sched: [5:1.00]
678 ; HASWELL-NEXT: retq # sched: [1:1.00]
680 ; BTVER2-LABEL: test_phminposuw:
682 ; BTVER2-NEXT: vphminposuw (%rdi), %xmm0 # sched: [7:1.00]
683 ; BTVER2-NEXT: vphminposuw %xmm0, %xmm0 # sched: [2:1.00]
684 ; BTVER2-NEXT: retq # sched: [4:1.00]
685 %1 = load <8 x i16>, <8 x i16> *%a0, align 16
686 %2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %1)
687 %3 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %2)
690 declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
; test_pinsrb: inserts the scalar %a1 into element 1 of %a0, then inserts a
; byte loaded from %a2 into element 3. The expectations contain a
; register-source pinsrb with immediate 1 and a memory-source pinsrb with
; immediate 3.
692 define <16 x i8> @test_pinsrb(<16 x i8> %a0, i8 %a1, i8 *%a2) {
693 ; GENERIC-LABEL: test_pinsrb:
695 ; GENERIC-NEXT: pinsrb $1, %edi, %xmm0
696 ; GENERIC-NEXT: pinsrb $3, (%rsi), %xmm0
699 ; SLM-LABEL: test_pinsrb:
701 ; SLM-NEXT: pinsrb $1, %edi, %xmm0 # sched: [1:1.00]
702 ; SLM-NEXT: pinsrb $3, (%rsi), %xmm0 # sched: [4:1.00]
703 ; SLM-NEXT: retq # sched: [4:1.00]
705 ; SANDY-LABEL: test_pinsrb:
707 ; SANDY-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
708 ; SANDY-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
709 ; SANDY-NEXT: retq # sched: [1:1.00]
711 ; HASWELL-LABEL: test_pinsrb:
713 ; HASWELL-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:1.00]
714 ; HASWELL-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00]
715 ; HASWELL-NEXT: retq # sched: [1:1.00]
717 ; BTVER2-LABEL: test_pinsrb:
719 ; BTVER2-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
720 ; BTVER2-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
721 ; BTVER2-NEXT: retq # sched: [4:1.00]
722 %1 = insertelement <16 x i8> %a0, i8 %a1, i32 1
723 %2 = load i8, i8 *%a2
724 %3 = insertelement <16 x i8> %1, i8 %2, i32 3
; test_pinsrd: inserts the scalar %a1 into element 1 of %a0, then inserts an
; i32 loaded from %a2 into element 3. The expectations contain a
; register-source pinsrd with immediate 1 and a memory-source pinsrd with
; immediate 3.
728 define <4 x i32> @test_pinsrd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
729 ; GENERIC-LABEL: test_pinsrd:
731 ; GENERIC-NEXT: pinsrd $1, %edi, %xmm0
732 ; GENERIC-NEXT: pinsrd $3, (%rsi), %xmm0
735 ; SLM-LABEL: test_pinsrd:
737 ; SLM-NEXT: pinsrd $1, %edi, %xmm0 # sched: [1:1.00]
738 ; SLM-NEXT: pinsrd $3, (%rsi), %xmm0 # sched: [4:1.00]
739 ; SLM-NEXT: retq # sched: [4:1.00]
741 ; SANDY-LABEL: test_pinsrd:
743 ; SANDY-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
744 ; SANDY-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
745 ; SANDY-NEXT: retq # sched: [1:1.00]
747 ; HASWELL-LABEL: test_pinsrd:
749 ; HASWELL-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:1.00]
750 ; HASWELL-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00]
751 ; HASWELL-NEXT: retq # sched: [1:1.00]
753 ; BTVER2-LABEL: test_pinsrd:
755 ; BTVER2-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
756 ; BTVER2-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
757 ; BTVER2-NEXT: retq # sched: [4:1.00]
758 %1 = insertelement <4 x i32> %a0, i32 %a1, i32 1
759 %2 = load i32, i32 *%a2
760 %3 = insertelement <4 x i32> %1, i32 %2, i32 3
; test_pinsrq: inserts the scalar %a2 into element 1 of %a0 and an i64 loaded
; from %a3 into element 1 of %a1, then adds the two vectors. The two inserts
; are independent of each other, and the expected order differs per CPU
; model: the Silvermont and btver2 expectations list the memory-source insert
; first, the others list the register-source insert first.
764 define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) {
765 ; GENERIC-LABEL: test_pinsrq:
767 ; GENERIC-NEXT: pinsrq $1, %rdi, %xmm0
768 ; GENERIC-NEXT: pinsrq $1, (%rsi), %xmm1
769 ; GENERIC-NEXT: paddq %xmm1, %xmm0
772 ; SLM-LABEL: test_pinsrq:
774 ; SLM-NEXT: pinsrq $1, (%rsi), %xmm1 # sched: [4:1.00]
775 ; SLM-NEXT: pinsrq $1, %rdi, %xmm0 # sched: [1:1.00]
776 ; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
777 ; SLM-NEXT: retq # sched: [4:1.00]
779 ; SANDY-LABEL: test_pinsrq:
781 ; SANDY-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:1.00]
782 ; SANDY-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [7:0.50]
783 ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
784 ; SANDY-NEXT: retq # sched: [1:1.00]
786 ; HASWELL-LABEL: test_pinsrq:
788 ; HASWELL-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:1.00]
789 ; HASWELL-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [5:1.00]
790 ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
791 ; HASWELL-NEXT: retq # sched: [1:1.00]
793 ; BTVER2-LABEL: test_pinsrq:
795 ; BTVER2-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [6:1.00]
796 ; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.50]
797 ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
798 ; BTVER2-NEXT: retq # sched: [4:1.00]
799 %1 = insertelement <2 x i64> %a0, i64 %a2, i32 1
800 %2 = load i64, i64 *%a3
801 %3 = insertelement <2 x i64> %a1, i64 %2, i32 1
802 %4 = add <2 x i64> %1, %3
; test_pmaxsb: calls the llvm.x86.sse41.pmaxsb intrinsic on (%a0, %a1), then
; again on that result and a vector loaded from %a2, giving one
; register-operand and one memory-operand pmaxsb in the expectations.
806 define <16 x i8> @test_pmaxsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
807 ; GENERIC-LABEL: test_pmaxsb:
809 ; GENERIC-NEXT: pmaxsb %xmm1, %xmm0
810 ; GENERIC-NEXT: pmaxsb (%rdi), %xmm0
813 ; SLM-LABEL: test_pmaxsb:
815 ; SLM-NEXT: pmaxsb %xmm1, %xmm0 # sched: [1:0.50]
816 ; SLM-NEXT: pmaxsb (%rdi), %xmm0 # sched: [4:1.00]
817 ; SLM-NEXT: retq # sched: [4:1.00]
819 ; SANDY-LABEL: test_pmaxsb:
821 ; SANDY-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
822 ; SANDY-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
823 ; SANDY-NEXT: retq # sched: [1:1.00]
825 ; HASWELL-LABEL: test_pmaxsb:
827 ; HASWELL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
828 ; HASWELL-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
829 ; HASWELL-NEXT: retq # sched: [1:1.00]
831 ; BTVER2-LABEL: test_pmaxsb:
833 ; BTVER2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
834 ; BTVER2-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
835 ; BTVER2-NEXT: retq # sched: [4:1.00]
836 %1 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1)
837 %2 = load <16 x i8>, <16 x i8> *%a2, align 16
838 %3 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %1, <16 x i8> %2)
841 declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone
; test_pmaxsd: calls the llvm.x86.sse41.pmaxsd intrinsic on (%a0, %a1), then
; again on that result and a vector loaded from %a2, giving one
; register-operand and one memory-operand pmaxsd in the expectations.
843 define <4 x i32> @test_pmaxsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
844 ; GENERIC-LABEL: test_pmaxsd:
846 ; GENERIC-NEXT: pmaxsd %xmm1, %xmm0
847 ; GENERIC-NEXT: pmaxsd (%rdi), %xmm0
850 ; SLM-LABEL: test_pmaxsd:
852 ; SLM-NEXT: pmaxsd %xmm1, %xmm0 # sched: [1:0.50]
853 ; SLM-NEXT: pmaxsd (%rdi), %xmm0 # sched: [4:1.00]
854 ; SLM-NEXT: retq # sched: [4:1.00]
856 ; SANDY-LABEL: test_pmaxsd:
858 ; SANDY-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
859 ; SANDY-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
860 ; SANDY-NEXT: retq # sched: [1:1.00]
862 ; HASWELL-LABEL: test_pmaxsd:
864 ; HASWELL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
865 ; HASWELL-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
866 ; HASWELL-NEXT: retq # sched: [1:1.00]
868 ; BTVER2-LABEL: test_pmaxsd:
870 ; BTVER2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
871 ; BTVER2-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
872 ; BTVER2-NEXT: retq # sched: [4:1.00]
873 %1 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1)
874 %2 = load <4 x i32>, <4 x i32> *%a2, align 16
875 %3 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %1, <4 x i32> %2)
878 declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
; test_pmaxud: applies @llvm.x86.sse41.pmaxud to %a0/%a1, then to that result
; and a <4 x i32> loaded from %a2 (align 16). Every prefix expects one
; register-operand and one memory-operand ((%rdi)) pmaxud/vpmaxud.
; The "# sched: [x:y]" suffixes come from the -print-schedule flag on the RUN
; lines (presumably latency:reciprocal-throughput -- TODO confirm); the GENERIC
; checks do not match any sched suffix.
880 define <4 x i32> @test_pmaxud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
881 ; GENERIC-LABEL: test_pmaxud:
883 ; GENERIC-NEXT: pmaxud %xmm1, %xmm0
884 ; GENERIC-NEXT: pmaxud (%rdi), %xmm0
887 ; SLM-LABEL: test_pmaxud:
889 ; SLM-NEXT: pmaxud %xmm1, %xmm0 # sched: [1:0.50]
890 ; SLM-NEXT: pmaxud (%rdi), %xmm0 # sched: [4:1.00]
891 ; SLM-NEXT: retq # sched: [4:1.00]
893 ; SANDY-LABEL: test_pmaxud:
895 ; SANDY-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
896 ; SANDY-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
897 ; SANDY-NEXT: retq # sched: [1:1.00]
899 ; HASWELL-LABEL: test_pmaxud:
901 ; HASWELL-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
902 ; HASWELL-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
903 ; HASWELL-NEXT: retq # sched: [1:1.00]
905 ; BTVER2-LABEL: test_pmaxud:
907 ; BTVER2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
908 ; BTVER2-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
909 ; BTVER2-NEXT: retq # sched: [4:1.00]
; IR under test: the first call takes both register arguments, the second
; consumes the value loaded from %a2.
910 %1 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1)
911 %2 = load <4 x i32>, <4 x i32> *%a2, align 16
912 %3 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %1, <4 x i32> %2)
; Declaration of the intrinsic called by test_pmaxud.
915 declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
; test_pmaxuw: applies @llvm.x86.sse41.pmaxuw to %a0/%a1, then to that result
; and an <8 x i16> loaded from %a2 (align 16). Every prefix expects one
; register-operand and one memory-operand ((%rdi)) pmaxuw/vpmaxuw.
; The "# sched: [x:y]" suffixes come from the -print-schedule flag on the RUN
; lines (presumably latency:reciprocal-throughput -- TODO confirm); the GENERIC
; checks do not match any sched suffix.
917 define <8 x i16> @test_pmaxuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
918 ; GENERIC-LABEL: test_pmaxuw:
920 ; GENERIC-NEXT: pmaxuw %xmm1, %xmm0
921 ; GENERIC-NEXT: pmaxuw (%rdi), %xmm0
924 ; SLM-LABEL: test_pmaxuw:
926 ; SLM-NEXT: pmaxuw %xmm1, %xmm0 # sched: [1:0.50]
927 ; SLM-NEXT: pmaxuw (%rdi), %xmm0 # sched: [4:1.00]
928 ; SLM-NEXT: retq # sched: [4:1.00]
930 ; SANDY-LABEL: test_pmaxuw:
932 ; SANDY-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
933 ; SANDY-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
934 ; SANDY-NEXT: retq # sched: [1:1.00]
936 ; HASWELL-LABEL: test_pmaxuw:
938 ; HASWELL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
939 ; HASWELL-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
940 ; HASWELL-NEXT: retq # sched: [1:1.00]
942 ; BTVER2-LABEL: test_pmaxuw:
944 ; BTVER2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
945 ; BTVER2-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
946 ; BTVER2-NEXT: retq # sched: [4:1.00]
; IR under test: the first call takes both register arguments, the second
; consumes the value loaded from %a2.
947 %1 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1)
948 %2 = load <8 x i16>, <8 x i16> *%a2, align 16
949 %3 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %1, <8 x i16> %2)
; Declaration of the intrinsic called by test_pmaxuw.
952 declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone
; test_pminsb: applies @llvm.x86.sse41.pminsb to %a0/%a1, then to that result
; and a <16 x i8> loaded from %a2 (align 16). Every prefix expects one
; register-operand and one memory-operand ((%rdi)) pminsb/vpminsb.
; The "# sched: [x:y]" suffixes come from the -print-schedule flag on the RUN
; lines (presumably latency:reciprocal-throughput -- TODO confirm); the GENERIC
; checks do not match any sched suffix.
954 define <16 x i8> @test_pminsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
955 ; GENERIC-LABEL: test_pminsb:
957 ; GENERIC-NEXT: pminsb %xmm1, %xmm0
958 ; GENERIC-NEXT: pminsb (%rdi), %xmm0
961 ; SLM-LABEL: test_pminsb:
963 ; SLM-NEXT: pminsb %xmm1, %xmm0 # sched: [1:0.50]
964 ; SLM-NEXT: pminsb (%rdi), %xmm0 # sched: [4:1.00]
965 ; SLM-NEXT: retq # sched: [4:1.00]
967 ; SANDY-LABEL: test_pminsb:
969 ; SANDY-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
970 ; SANDY-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
971 ; SANDY-NEXT: retq # sched: [1:1.00]
973 ; HASWELL-LABEL: test_pminsb:
975 ; HASWELL-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
976 ; HASWELL-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
977 ; HASWELL-NEXT: retq # sched: [1:1.00]
979 ; BTVER2-LABEL: test_pminsb:
981 ; BTVER2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
982 ; BTVER2-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
983 ; BTVER2-NEXT: retq # sched: [4:1.00]
; IR under test: the first call takes both register arguments, the second
; consumes the value loaded from %a2.
984 %1 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1)
985 %2 = load <16 x i8>, <16 x i8> *%a2, align 16
986 %3 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %1, <16 x i8> %2)
; Declaration of the intrinsic called by test_pminsb.
989 declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone
; test_pminsd: applies @llvm.x86.sse41.pminsd to %a0/%a1, then to that result
; and a <4 x i32> loaded from %a2 (align 16). Every prefix expects one
; register-operand and one memory-operand ((%rdi)) pminsd/vpminsd.
; The "# sched: [x:y]" suffixes come from the -print-schedule flag on the RUN
; lines (presumably latency:reciprocal-throughput -- TODO confirm); the GENERIC
; checks do not match any sched suffix.
991 define <4 x i32> @test_pminsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
992 ; GENERIC-LABEL: test_pminsd:
994 ; GENERIC-NEXT: pminsd %xmm1, %xmm0
995 ; GENERIC-NEXT: pminsd (%rdi), %xmm0
998 ; SLM-LABEL: test_pminsd:
1000 ; SLM-NEXT: pminsd %xmm1, %xmm0 # sched: [1:0.50]
1001 ; SLM-NEXT: pminsd (%rdi), %xmm0 # sched: [4:1.00]
1002 ; SLM-NEXT: retq # sched: [4:1.00]
1004 ; SANDY-LABEL: test_pminsd:
1006 ; SANDY-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1007 ; SANDY-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
1008 ; SANDY-NEXT: retq # sched: [1:1.00]
1010 ; HASWELL-LABEL: test_pminsd:
1012 ; HASWELL-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1013 ; HASWELL-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
1014 ; HASWELL-NEXT: retq # sched: [1:1.00]
1016 ; BTVER2-LABEL: test_pminsd:
1018 ; BTVER2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1019 ; BTVER2-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
1020 ; BTVER2-NEXT: retq # sched: [4:1.00]
; IR under test: the first call takes both register arguments, the second
; consumes the value loaded from %a2.
1021 %1 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1)
1022 %2 = load <4 x i32>, <4 x i32> *%a2, align 16
1023 %3 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %1, <4 x i32> %2)
; Declaration of the intrinsic called by test_pminsd.
1026 declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
; test_pminud: applies @llvm.x86.sse41.pminud to %a0/%a1, then to that result
; and a <4 x i32> loaded from %a2 (align 16). Every prefix expects one
; register-operand and one memory-operand ((%rdi)) pminud/vpminud.
; The "# sched: [x:y]" suffixes come from the -print-schedule flag on the RUN
; lines (presumably latency:reciprocal-throughput -- TODO confirm); the GENERIC
; checks do not match any sched suffix.
1028 define <4 x i32> @test_pminud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
1029 ; GENERIC-LABEL: test_pminud:
1031 ; GENERIC-NEXT: pminud %xmm1, %xmm0
1032 ; GENERIC-NEXT: pminud (%rdi), %xmm0
1033 ; GENERIC-NEXT: retq
1035 ; SLM-LABEL: test_pminud:
1037 ; SLM-NEXT: pminud %xmm1, %xmm0 # sched: [1:0.50]
1038 ; SLM-NEXT: pminud (%rdi), %xmm0 # sched: [4:1.00]
1039 ; SLM-NEXT: retq # sched: [4:1.00]
1041 ; SANDY-LABEL: test_pminud:
1043 ; SANDY-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1044 ; SANDY-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
1045 ; SANDY-NEXT: retq # sched: [1:1.00]
1047 ; HASWELL-LABEL: test_pminud:
1049 ; HASWELL-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1050 ; HASWELL-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
1051 ; HASWELL-NEXT: retq # sched: [1:1.00]
1053 ; BTVER2-LABEL: test_pminud:
1055 ; BTVER2-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1056 ; BTVER2-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
1057 ; BTVER2-NEXT: retq # sched: [4:1.00]
; IR under test: the first call takes both register arguments, the second
; consumes the value loaded from %a2.
1058 %1 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1)
1059 %2 = load <4 x i32>, <4 x i32> *%a2, align 16
1060 %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %1, <4 x i32> %2)
; Declaration of the intrinsic called by test_pminud.
1063 declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
; test_pminuw: applies @llvm.x86.sse41.pminuw to %a0/%a1, then to that result
; and an <8 x i16> loaded from %a2 (align 16). Every prefix expects one
; register-operand and one memory-operand ((%rdi)) pminuw/vpminuw.
; The "# sched: [x:y]" suffixes come from the -print-schedule flag on the RUN
; lines (presumably latency:reciprocal-throughput -- TODO confirm); the GENERIC
; checks do not match any sched suffix.
1065 define <8 x i16> @test_pminuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
1066 ; GENERIC-LABEL: test_pminuw:
1068 ; GENERIC-NEXT: pminuw %xmm1, %xmm0
1069 ; GENERIC-NEXT: pminuw (%rdi), %xmm0
1070 ; GENERIC-NEXT: retq
1072 ; SLM-LABEL: test_pminuw:
1074 ; SLM-NEXT: pminuw %xmm1, %xmm0 # sched: [1:0.50]
1075 ; SLM-NEXT: pminuw (%rdi), %xmm0 # sched: [4:1.00]
1076 ; SLM-NEXT: retq # sched: [4:1.00]
1078 ; SANDY-LABEL: test_pminuw:
1080 ; SANDY-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1081 ; SANDY-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
1082 ; SANDY-NEXT: retq # sched: [1:1.00]
1084 ; HASWELL-LABEL: test_pminuw:
1086 ; HASWELL-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1087 ; HASWELL-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
1088 ; HASWELL-NEXT: retq # sched: [1:1.00]
1090 ; BTVER2-LABEL: test_pminuw:
1092 ; BTVER2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1093 ; BTVER2-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
1094 ; BTVER2-NEXT: retq # sched: [4:1.00]
; IR under test: the first call takes both register arguments, the second
; consumes the value loaded from %a2.
1095 %1 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)
1096 %2 = load <8 x i16>, <8 x i16> *%a2, align 16
1097 %3 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %1, <8 x i16> %2)
; Declaration of the intrinsic called by test_pminuw.
1100 declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
; test_pmovsxbw: sign-extends the low 8 bytes of %a0 (selected by the
; shufflevector) and 8 bytes loaded from %a1 (align 1) to <8 x i16>, then adds
; the two. Each prefix expects a register-source and a memory-source
; pmovsxbw/vpmovsxbw followed by paddw/vpaddw. SLM and BTVER2 expect the
; memory form first, and SLM also expects a movdqa copying the sum into xmm0.
; The "# sched: [x:y]" suffixes come from the -print-schedule flag on the RUN
; lines (presumably latency:reciprocal-throughput -- TODO confirm).
1102 define <8 x i16> @test_pmovsxbw(<16 x i8> %a0, <8 x i8> *%a1) {
1103 ; GENERIC-LABEL: test_pmovsxbw:
1105 ; GENERIC-NEXT: pmovsxbw %xmm0, %xmm1
1106 ; GENERIC-NEXT: pmovsxbw (%rdi), %xmm0
1107 ; GENERIC-NEXT: paddw %xmm1, %xmm0
1108 ; GENERIC-NEXT: retq
1110 ; SLM-LABEL: test_pmovsxbw:
1112 ; SLM-NEXT: pmovsxbw (%rdi), %xmm1 # sched: [4:1.00]
1113 ; SLM-NEXT: pmovsxbw %xmm0, %xmm0 # sched: [1:1.00]
1114 ; SLM-NEXT: paddw %xmm0, %xmm1 # sched: [1:0.50]
1115 ; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
1116 ; SLM-NEXT: retq # sched: [4:1.00]
1118 ; SANDY-LABEL: test_pmovsxbw:
1120 ; SANDY-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50]
1121 ; SANDY-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [7:0.50]
1122 ; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
1123 ; SANDY-NEXT: retq # sched: [1:1.00]
1125 ; HASWELL-LABEL: test_pmovsxbw:
1127 ; HASWELL-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:1.00]
1128 ; HASWELL-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [5:1.00]
1129 ; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1130 ; HASWELL-NEXT: retq # sched: [1:1.00]
1132 ; BTVER2-LABEL: test_pmovsxbw:
1134 ; BTVER2-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [6:1.00]
1135 ; BTVER2-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50]
1136 ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1137 ; BTVER2-NEXT: retq # sched: [4:1.00]
; IR under test: %1/%2 extend the register argument, %3/%4 extend the loaded
; value, and %5 adds them so both extensions are used.
1138 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1139 %2 = sext <8 x i8> %1 to <8 x i16>
1140 %3 = load <8 x i8>, <8 x i8>* %a1, align 1
1141 %4 = sext <8 x i8> %3 to <8 x i16>
1142 %5 = add <8 x i16> %2, %4
; test_pmovsxbd: sign-extends the low 4 bytes of %a0 (selected by the
; shufflevector) and 4 bytes loaded from %a1 (align 1) to <4 x i32>, then adds
; the two. Each prefix expects a register-source and a memory-source
; pmovsxbd/vpmovsxbd followed by paddd/vpaddd. SLM and BTVER2 expect the
; memory form first, and SLM also expects a movdqa copying the sum into xmm0.
; The "# sched: [x:y]" suffixes come from the -print-schedule flag on the RUN
; lines (presumably latency:reciprocal-throughput -- TODO confirm).
1146 define <4 x i32> @test_pmovsxbd(<16 x i8> %a0, <4 x i8> *%a1) {
1147 ; GENERIC-LABEL: test_pmovsxbd:
1149 ; GENERIC-NEXT: pmovsxbd %xmm0, %xmm1
1150 ; GENERIC-NEXT: pmovsxbd (%rdi), %xmm0
1151 ; GENERIC-NEXT: paddd %xmm1, %xmm0
1152 ; GENERIC-NEXT: retq
1154 ; SLM-LABEL: test_pmovsxbd:
1156 ; SLM-NEXT: pmovsxbd (%rdi), %xmm1 # sched: [4:1.00]
1157 ; SLM-NEXT: pmovsxbd %xmm0, %xmm0 # sched: [1:1.00]
1158 ; SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
1159 ; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
1160 ; SLM-NEXT: retq # sched: [4:1.00]
1162 ; SANDY-LABEL: test_pmovsxbd:
1164 ; SANDY-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50]
1165 ; SANDY-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [7:0.50]
1166 ; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1167 ; SANDY-NEXT: retq # sched: [1:1.00]
1169 ; HASWELL-LABEL: test_pmovsxbd:
1171 ; HASWELL-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:1.00]
1172 ; HASWELL-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [5:1.00]
1173 ; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1174 ; HASWELL-NEXT: retq # sched: [1:1.00]
1176 ; BTVER2-LABEL: test_pmovsxbd:
1178 ; BTVER2-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [6:1.00]
1179 ; BTVER2-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50]
1180 ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1181 ; BTVER2-NEXT: retq # sched: [4:1.00]
; IR under test: %1/%2 extend the register argument, %3/%4 extend the loaded
; value, and %5 adds them so both extensions are used.
1182 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1183 %2 = sext <4 x i8> %1 to <4 x i32>
1184 %3 = load <4 x i8>, <4 x i8>* %a1, align 1
1185 %4 = sext <4 x i8> %3 to <4 x i32>
1186 %5 = add <4 x i32> %2, %4
; test_pmovsxbq: sign-extends the low 2 bytes of %a0 (selected by the
; shufflevector) and 2 bytes loaded from %a1 (align 1) to <2 x i64>, then adds
; the two. Each prefix expects a register-source and a memory-source
; pmovsxbq/vpmovsxbq followed by paddq/vpaddq. SLM and BTVER2 expect the
; memory form first, and SLM also expects a movdqa copying the sum into xmm0.
; The "# sched: [x:y]" suffixes come from the -print-schedule flag on the RUN
; lines (presumably latency:reciprocal-throughput -- TODO confirm).
1190 define <2 x i64> @test_pmovsxbq(<16 x i8> %a0, <2 x i8> *%a1) {
1191 ; GENERIC-LABEL: test_pmovsxbq:
1193 ; GENERIC-NEXT: pmovsxbq %xmm0, %xmm1
1194 ; GENERIC-NEXT: pmovsxbq (%rdi), %xmm0
1195 ; GENERIC-NEXT: paddq %xmm1, %xmm0
1196 ; GENERIC-NEXT: retq
1198 ; SLM-LABEL: test_pmovsxbq:
1200 ; SLM-NEXT: pmovsxbq (%rdi), %xmm1 # sched: [4:1.00]
1201 ; SLM-NEXT: pmovsxbq %xmm0, %xmm0 # sched: [1:1.00]
1202 ; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
1203 ; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
1204 ; SLM-NEXT: retq # sched: [4:1.00]
1206 ; SANDY-LABEL: test_pmovsxbq:
1208 ; SANDY-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50]
1209 ; SANDY-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [7:0.50]
1210 ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1211 ; SANDY-NEXT: retq # sched: [1:1.00]
1213 ; HASWELL-LABEL: test_pmovsxbq:
1215 ; HASWELL-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:1.00]
1216 ; HASWELL-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [5:1.00]
1217 ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1218 ; HASWELL-NEXT: retq # sched: [1:1.00]
1220 ; BTVER2-LABEL: test_pmovsxbq:
1222 ; BTVER2-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [6:1.00]
1223 ; BTVER2-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50]
1224 ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1225 ; BTVER2-NEXT: retq # sched: [4:1.00]
; IR under test: %1/%2 extend the register argument, %3/%4 extend the loaded
; value, and %5 adds them so both extensions are used.
1226 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
1227 %2 = sext <2 x i8> %1 to <2 x i64>
1228 %3 = load <2 x i8>, <2 x i8>* %a1, align 1
1229 %4 = sext <2 x i8> %3 to <2 x i64>
1230 %5 = add <2 x i64> %2, %4
; test_pmovsxdq: sign-extends the low 2 i32 lanes of %a0 (selected by the
; shufflevector) and 2 i32 values loaded from %a1 (align 1) to <2 x i64>, then
; adds the two. Each prefix expects a register-source and a memory-source
; pmovsxdq/vpmovsxdq followed by paddq/vpaddq. SLM and BTVER2 expect the
; memory form first, and SLM also expects a movdqa copying the sum into xmm0.
; The "# sched: [x:y]" suffixes come from the -print-schedule flag on the RUN
; lines (presumably latency:reciprocal-throughput -- TODO confirm).
1234 define <2 x i64> @test_pmovsxdq(<4 x i32> %a0, <2 x i32> *%a1) {
1235 ; GENERIC-LABEL: test_pmovsxdq:
1237 ; GENERIC-NEXT: pmovsxdq %xmm0, %xmm1
1238 ; GENERIC-NEXT: pmovsxdq (%rdi), %xmm0
1239 ; GENERIC-NEXT: paddq %xmm1, %xmm0
1240 ; GENERIC-NEXT: retq
1242 ; SLM-LABEL: test_pmovsxdq:
1244 ; SLM-NEXT: pmovsxdq (%rdi), %xmm1 # sched: [4:1.00]
1245 ; SLM-NEXT: pmovsxdq %xmm0, %xmm0 # sched: [1:1.00]
1246 ; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
1247 ; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
1248 ; SLM-NEXT: retq # sched: [4:1.00]
1250 ; SANDY-LABEL: test_pmovsxdq:
1252 ; SANDY-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50]
1253 ; SANDY-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [7:0.50]
1254 ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1255 ; SANDY-NEXT: retq # sched: [1:1.00]
1257 ; HASWELL-LABEL: test_pmovsxdq:
1259 ; HASWELL-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:1.00]
1260 ; HASWELL-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [5:1.00]
1261 ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1262 ; HASWELL-NEXT: retq # sched: [1:1.00]
1264 ; BTVER2-LABEL: test_pmovsxdq:
1266 ; BTVER2-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [6:1.00]
1267 ; BTVER2-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50]
1268 ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1269 ; BTVER2-NEXT: retq # sched: [4:1.00]
; IR under test: %1/%2 extend the register argument, %3/%4 extend the loaded
; value, and %5 adds them so both extensions are used.
1270 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1271 %2 = sext <2 x i32> %1 to <2 x i64>
1272 %3 = load <2 x i32>, <2 x i32>* %a1, align 1
1273 %4 = sext <2 x i32> %3 to <2 x i64>
1274 %5 = add <2 x i64> %2, %4
; test_pmovsxwd: sign-extends the low 4 i16 lanes of %a0 (selected by the
; shufflevector) and 4 i16 values loaded from %a1 (align 1) to <4 x i32>, then
; adds the two. Each prefix expects a register-source and a memory-source
; pmovsxwd/vpmovsxwd followed by paddd/vpaddd. SLM and BTVER2 expect the
; memory form first, and SLM also expects a movdqa copying the sum into xmm0.
; The "# sched: [x:y]" suffixes come from the -print-schedule flag on the RUN
; lines (presumably latency:reciprocal-throughput -- TODO confirm).
1278 define <4 x i32> @test_pmovsxwd(<8 x i16> %a0, <4 x i16> *%a1) {
1279 ; GENERIC-LABEL: test_pmovsxwd:
1281 ; GENERIC-NEXT: pmovsxwd %xmm0, %xmm1
1282 ; GENERIC-NEXT: pmovsxwd (%rdi), %xmm0
1283 ; GENERIC-NEXT: paddd %xmm1, %xmm0
1284 ; GENERIC-NEXT: retq
1286 ; SLM-LABEL: test_pmovsxwd:
1288 ; SLM-NEXT: pmovsxwd (%rdi), %xmm1 # sched: [4:1.00]
1289 ; SLM-NEXT: pmovsxwd %xmm0, %xmm0 # sched: [1:1.00]
1290 ; SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
1291 ; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
1292 ; SLM-NEXT: retq # sched: [4:1.00]
1294 ; SANDY-LABEL: test_pmovsxwd:
1296 ; SANDY-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50]
1297 ; SANDY-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [7:0.50]
1298 ; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1299 ; SANDY-NEXT: retq # sched: [1:1.00]
1301 ; HASWELL-LABEL: test_pmovsxwd:
1303 ; HASWELL-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:1.00]
1304 ; HASWELL-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [5:1.00]
1305 ; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1306 ; HASWELL-NEXT: retq # sched: [1:1.00]
1308 ; BTVER2-LABEL: test_pmovsxwd:
1310 ; BTVER2-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [6:1.00]
1311 ; BTVER2-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50]
1312 ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1313 ; BTVER2-NEXT: retq # sched: [4:1.00]
; IR under test: %1/%2 extend the register argument, %3/%4 extend the loaded
; value, and %5 adds them so both extensions are used.
1314 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1315 %2 = sext <4 x i16> %1 to <4 x i32>
1316 %3 = load <4 x i16>, <4 x i16>* %a1, align 1
1317 %4 = sext <4 x i16> %3 to <4 x i32>
1318 %5 = add <4 x i32> %2, %4
; test_pmovsxwq: sign-extends the low 2 i16 lanes of %a0 (selected by the
; shufflevector) and 2 i16 values loaded from %a1 (align 1) to <2 x i64>, then
; adds the two. Each prefix expects a register-source and a memory-source
; pmovsxwq/vpmovsxwq followed by paddq/vpaddq. SLM and BTVER2 expect the
; memory form first, and SLM also expects a movdqa copying the sum into xmm0.
; The "# sched: [x:y]" suffixes come from the -print-schedule flag on the RUN
; lines (presumably latency:reciprocal-throughput -- TODO confirm).
1322 define <2 x i64> @test_pmovsxwq(<8 x i16> %a0, <2 x i16> *%a1) {
1323 ; GENERIC-LABEL: test_pmovsxwq:
1325 ; GENERIC-NEXT: pmovsxwq %xmm0, %xmm1
1326 ; GENERIC-NEXT: pmovsxwq (%rdi), %xmm0
1327 ; GENERIC-NEXT: paddq %xmm1, %xmm0
1328 ; GENERIC-NEXT: retq
1330 ; SLM-LABEL: test_pmovsxwq:
1332 ; SLM-NEXT: pmovsxwq (%rdi), %xmm1 # sched: [4:1.00]
1333 ; SLM-NEXT: pmovsxwq %xmm0, %xmm0 # sched: [1:1.00]
1334 ; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
1335 ; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
1336 ; SLM-NEXT: retq # sched: [4:1.00]
1338 ; SANDY-LABEL: test_pmovsxwq:
1340 ; SANDY-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50]
1341 ; SANDY-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [7:0.50]
1342 ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1343 ; SANDY-NEXT: retq # sched: [1:1.00]
1345 ; HASWELL-LABEL: test_pmovsxwq:
1347 ; HASWELL-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:1.00]
1348 ; HASWELL-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [5:1.00]
1349 ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1350 ; HASWELL-NEXT: retq # sched: [1:1.00]
1352 ; BTVER2-LABEL: test_pmovsxwq:
1354 ; BTVER2-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [6:1.00]
1355 ; BTVER2-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50]
1356 ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1357 ; BTVER2-NEXT: retq # sched: [4:1.00]
; IR under test: %1/%2 extend the register argument, %3/%4 extend the loaded
; value, and %5 adds them so both extensions are used.
1358 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
1359 %2 = sext <2 x i16> %1 to <2 x i64>
1360 %3 = load <2 x i16>, <2 x i16>* %a1, align 1
1361 %4 = sext <2 x i16> %3 to <2 x i64>
1362 %5 = add <2 x i64> %2, %4
; test_pmovzxbw: zero-extends the low 8 bytes of %a0 (selected by the
; shufflevector) and 8 bytes loaded from %a1 (align 1) to <8 x i16>, then adds
; the two. Each prefix expects a register-source and a memory-source
; pmovzxbw/vpmovzxbw followed by paddw/vpaddw. SLM and BTVER2 expect the
; memory form first, and SLM also expects a movdqa copying the sum into xmm0.
; On the pmovzx lines, {{.*#+}} is a FileCheck regex that matches any text
; ending in one or more '#' before the "xmmN = ..." lane description.
; The "sched: [x:y]" suffixes come from the -print-schedule flag on the RUN
; lines (presumably latency:reciprocal-throughput -- TODO confirm).
1366 define <8 x i16> @test_pmovzxbw(<16 x i8> %a0, <8 x i8> *%a1) {
1367 ; GENERIC-LABEL: test_pmovzxbw:
1369 ; GENERIC-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1370 ; GENERIC-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
1371 ; GENERIC-NEXT: paddw %xmm1, %xmm0
1372 ; GENERIC-NEXT: retq
1374 ; SLM-LABEL: test_pmovzxbw:
1376 ; SLM-NEXT: pmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [4:1.00]
1377 ; SLM-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
1378 ; SLM-NEXT: paddw %xmm0, %xmm1 # sched: [1:0.50]
1379 ; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
1380 ; SLM-NEXT: retq # sched: [4:1.00]
1382 ; SANDY-LABEL: test_pmovzxbw:
1384 ; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
1385 ; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [7:0.50]
1386 ; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
1387 ; SANDY-NEXT: retq # sched: [1:1.00]
1389 ; HASWELL-LABEL: test_pmovzxbw:
1391 ; HASWELL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
1392 ; HASWELL-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:1.00]
1393 ; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1394 ; HASWELL-NEXT: retq # sched: [1:1.00]
1396 ; BTVER2-LABEL: test_pmovzxbw:
1398 ; BTVER2-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [6:1.00]
1399 ; BTVER2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
1400 ; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1401 ; BTVER2-NEXT: retq # sched: [4:1.00]
; IR under test: %1/%2 extend the register argument, %3/%4 extend the loaded
; value, and %5 adds them so both extensions are used.
1402 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1403 %2 = zext <8 x i8> %1 to <8 x i16>
1404 %3 = load <8 x i8>, <8 x i8>* %a1, align 1
1405 %4 = zext <8 x i8> %3 to <8 x i16>
1406 %5 = add <8 x i16> %2, %4
; test_pmovzxbd: zero-extends the low 4 bytes of %a0 (selected by the
; shufflevector) and 4 bytes loaded from %a1 (align 1) to <4 x i32>, then adds
; the two. Each prefix expects a register-source and a memory-source
; pmovzxbd/vpmovzxbd followed by paddd/vpaddd. SLM and BTVER2 expect the
; memory form first, and SLM also expects a movdqa copying the sum into xmm0.
; On the pmovzx lines, {{.*#+}} is a FileCheck regex that matches any text
; ending in one or more '#' before the "xmmN = ..." lane description.
; The "sched: [x:y]" suffixes come from the -print-schedule flag on the RUN
; lines (presumably latency:reciprocal-throughput -- TODO confirm).
1410 define <4 x i32> @test_pmovzxbd(<16 x i8> %a0, <4 x i8> *%a1) {
1411 ; GENERIC-LABEL: test_pmovzxbd:
1413 ; GENERIC-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1414 ; GENERIC-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
1415 ; GENERIC-NEXT: paddd %xmm1, %xmm0
1416 ; GENERIC-NEXT: retq
1418 ; SLM-LABEL: test_pmovzxbd:
1420 ; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [4:1.00]
1421 ; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00]
1422 ; SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
1423 ; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
1424 ; SLM-NEXT: retq # sched: [4:1.00]
1426 ; SANDY-LABEL: test_pmovzxbd:
1428 ; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
1429 ; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [7:0.50]
1430 ; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1431 ; SANDY-NEXT: retq # sched: [1:1.00]
1433 ; HASWELL-LABEL: test_pmovzxbd:
1435 ; HASWELL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00]
1436 ; HASWELL-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:1.00]
1437 ; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1438 ; HASWELL-NEXT: retq # sched: [1:1.00]
1440 ; BTVER2-LABEL: test_pmovzxbd:
1442 ; BTVER2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [6:1.00]
1443 ; BTVER2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
1444 ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1445 ; BTVER2-NEXT: retq # sched: [4:1.00]
; IR under test: %1/%2 extend the register argument, %3/%4 extend the loaded
; value, and %5 adds them so both extensions are used.
1446 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1447 %2 = zext <4 x i8> %1 to <4 x i32>
1448 %3 = load <4 x i8>, <4 x i8>* %a1, align 1
1449 %4 = zext <4 x i8> %3 to <4 x i32>
1450 %5 = add <4 x i32> %2, %4
; test_pmovzxbq: zero-extends the low 2 bytes of %a0 (selected by the
; shufflevector) and 2 bytes loaded from %a1 (align 1) to <2 x i64>, then adds
; the two. Each prefix expects a register-source and a memory-source
; pmovzxbq/vpmovzxbq followed by paddq/vpaddq. SLM and BTVER2 expect the
; memory form first, and SLM also expects a movdqa copying the sum into xmm0.
; On the pmovzx lines, {{.*#+}} is a FileCheck regex that matches any text
; ending in one or more '#' before the "xmmN = ..." lane description.
; The "sched: [x:y]" suffixes come from the -print-schedule flag on the RUN
; lines (presumably latency:reciprocal-throughput -- TODO confirm).
1454 define <2 x i64> @test_pmovzxbq(<16 x i8> %a0, <2 x i8> *%a1) {
1455 ; GENERIC-LABEL: test_pmovzxbq:
1457 ; GENERIC-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1458 ; GENERIC-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
1459 ; GENERIC-NEXT: paddq %xmm1, %xmm0
1460 ; GENERIC-NEXT: retq
1462 ; SLM-LABEL: test_pmovzxbq:
1464 ; SLM-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [4:1.00]
1465 ; SLM-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
1466 ; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
1467 ; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
1468 ; SLM-NEXT: retq # sched: [4:1.00]
1470 ; SANDY-LABEL: test_pmovzxbq:
1472 ; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
1473 ; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [7:0.50]
1474 ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1475 ; SANDY-NEXT: retq # sched: [1:1.00]
1477 ; HASWELL-LABEL: test_pmovzxbq:
1479 ; HASWELL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
1480 ; HASWELL-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00]
1481 ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1482 ; HASWELL-NEXT: retq # sched: [1:1.00]
1484 ; BTVER2-LABEL: test_pmovzxbq:
1486 ; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [6:1.00]
1487 ; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
1488 ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1489 ; BTVER2-NEXT: retq # sched: [4:1.00]
; IR under test: %1/%2 extend the register argument, %3/%4 extend the loaded
; value, and %5 adds them so both extensions are used.
1490 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
1491 %2 = zext <2 x i8> %1 to <2 x i64>
1492 %3 = load <2 x i8>, <2 x i8>* %a1, align 1
1493 %4 = zext <2 x i8> %3 to <2 x i64>
1494 %5 = add <2 x i64> %2, %4
; test_pmovzxdq: zero-extends the low 2 i32 lanes of %a0 (selected by the
; shufflevector) and 2 i32 values loaded from %a1 (align 1) to <2 x i64>, then
; adds the two. Each prefix expects a register-source and a memory-source
; pmovzxdq/vpmovzxdq followed by paddq/vpaddq. SLM and BTVER2 expect the
; memory form first, and SLM also expects a movdqa copying the sum into xmm0.
; On the pmovzx lines, {{.*#+}} is a FileCheck regex that matches any text
; ending in one or more '#' before the "xmmN = ..." lane description.
; The "sched: [x:y]" suffixes come from the -print-schedule flag on the RUN
; lines (presumably latency:reciprocal-throughput -- TODO confirm).
1498 define <2 x i64> @test_pmovzxdq(<4 x i32> %a0, <2 x i32> *%a1) {
1499 ; GENERIC-LABEL: test_pmovzxdq:
1501 ; GENERIC-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
1502 ; GENERIC-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
1503 ; GENERIC-NEXT: paddq %xmm1, %xmm0
1504 ; GENERIC-NEXT: retq
1506 ; SLM-LABEL: test_pmovzxdq:
1508 ; SLM-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [4:1.00]
1509 ; SLM-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:1.00]
1510 ; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
1511 ; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
1512 ; SLM-NEXT: retq # sched: [4:1.00]
1514 ; SANDY-LABEL: test_pmovzxdq:
1516 ; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50]
1517 ; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [7:0.50]
1518 ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1519 ; SANDY-NEXT: retq # sched: [1:1.00]
1521 ; HASWELL-LABEL: test_pmovzxdq:
1523 ; HASWELL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:1.00]
1524 ; HASWELL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [5:1.00]
1525 ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1526 ; HASWELL-NEXT: retq # sched: [1:1.00]
1528 ; BTVER2-LABEL: test_pmovzxdq:
1530 ; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [6:1.00]
1531 ; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50]
1532 ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1533 ; BTVER2-NEXT: retq # sched: [4:1.00]
; IR under test: %1/%2 extend the register argument, %3/%4 extend the loaded
; value, and %5 adds them so both extensions are used.
1534 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
1535 %2 = zext <2 x i32> %1 to <2 x i64>
1536 %3 = load <2 x i32>, <2 x i32>* %a1, align 1
1537 %4 = zext <2 x i32> %3 to <2 x i64>
1538 %5 = add <2 x i64> %2, %4
; test_pmovzxwd: zero-extends the low 4 i16 lanes of %a0 (selected by the
; shufflevector) and 4 i16 values loaded from %a1 (align 1) to <4 x i32>, then
; adds the two. Each prefix expects a register-source and a memory-source
; pmovzxwd/vpmovzxwd followed by paddd/vpaddd. SLM and BTVER2 expect the
; memory form first, and SLM also expects a movdqa copying the sum into xmm0.
; On the pmovzx lines, {{.*#+}} is a FileCheck regex that matches any text
; ending in one or more '#' before the "xmmN = ..." lane description.
; The "sched: [x:y]" suffixes come from the -print-schedule flag on the RUN
; lines (presumably latency:reciprocal-throughput -- TODO confirm).
1542 define <4 x i32> @test_pmovzxwd(<8 x i16> %a0, <4 x i16> *%a1) {
1543 ; GENERIC-LABEL: test_pmovzxwd:
1545 ; GENERIC-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1546 ; GENERIC-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
1547 ; GENERIC-NEXT: paddd %xmm1, %xmm0
1548 ; GENERIC-NEXT: retq
1550 ; SLM-LABEL: test_pmovzxwd:
1552 ; SLM-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [4:1.00]
1553 ; SLM-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
1554 ; SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
1555 ; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
1556 ; SLM-NEXT: retq # sched: [4:1.00]
1558 ; SANDY-LABEL: test_pmovzxwd:
1560 ; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
1561 ; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [7:0.50]
1562 ; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1563 ; SANDY-NEXT: retq # sched: [1:1.00]
1565 ; HASWELL-LABEL: test_pmovzxwd:
1567 ; HASWELL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
1568 ; HASWELL-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:1.00]
1569 ; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1570 ; HASWELL-NEXT: retq # sched: [1:1.00]
1572 ; BTVER2-LABEL: test_pmovzxwd:
1574 ; BTVER2-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [6:1.00]
1575 ; BTVER2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
1576 ; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1577 ; BTVER2-NEXT: retq # sched: [4:1.00]
; IR under test: %1/%2 extend the register argument, %3/%4 extend the loaded
; value, and %5 adds them so both extensions are used.
1578 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1579 %2 = zext <4 x i16> %1 to <4 x i32>
1580 %3 = load <4 x i16>, <4 x i16>* %a1, align 1
1581 %4 = zext <4 x i16> %3 to <4 x i32>
1582 %5 = add <4 x i32> %2, %4
; test_pmovzxwq: zero-extends the low 2 i16 lanes of %a0 (selected by the
; shufflevector) and 2 i16 values loaded from %a1 (align 1) to <2 x i64>, then
; adds the two. Each prefix expects a register-source and a memory-source
; pmovzxwq/vpmovzxwq followed by paddq/vpaddq. SLM and BTVER2 expect the
; memory form first, and SLM also expects a movdqa copying the sum into xmm0.
; On the pmovzx lines, {{.*#+}} is a FileCheck regex that matches any text
; ending in one or more '#' before the "xmmN = ..." lane description.
; The "sched: [x:y]" suffixes come from the -print-schedule flag on the RUN
; lines (presumably latency:reciprocal-throughput -- TODO confirm).
1586 define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) {
1587 ; GENERIC-LABEL: test_pmovzxwq:
1589 ; GENERIC-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1590 ; GENERIC-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
1591 ; GENERIC-NEXT: paddq %xmm1, %xmm0
1592 ; GENERIC-NEXT: retq
1594 ; SLM-LABEL: test_pmovzxwq:
1596 ; SLM-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [4:1.00]
1597 ; SLM-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:1.00]
1598 ; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
1599 ; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
1600 ; SLM-NEXT: retq # sched: [4:1.00]
1602 ; SANDY-LABEL: test_pmovzxwq:
1604 ; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50]
1605 ; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [7:0.50]
1606 ; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1607 ; SANDY-NEXT: retq # sched: [1:1.00]
1609 ; HASWELL-LABEL: test_pmovzxwq:
1611 ; HASWELL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:1.00]
1612 ; HASWELL-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [5:1.00]
1613 ; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1614 ; HASWELL-NEXT: retq # sched: [1:1.00]
1616 ; BTVER2-LABEL: test_pmovzxwq:
1618 ; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [6:1.00]
1619 ; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50]
1620 ; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
1621 ; BTVER2-NEXT: retq # sched: [4:1.00]
; IR under test: %1/%2 extend the register argument, %3/%4 extend the loaded
; value, and %5 adds them so both extensions are used.
1622 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
1623 %2 = zext <2 x i16> %1 to <2 x i64>
1624 %3 = load <2 x i16>, <2 x i16>* %a1, align 1
1625 %4 = zext <2 x i16> %3 to <2 x i64>
1626 %5 = add <2 x i64> %2, %4
; test_pmuldq: applies @llvm.x86.sse41.pmuldq to %a0/%a1, bitcasts the
; <2 x i64> result back to <4 x i32>, and applies the intrinsic again to that
; value and a <4 x i32> loaded from %a2 (align 16). Every prefix expects one
; register-operand and one memory-operand ((%rdi)) pmuldq/vpmuldq.
; The "# sched: [x:y]" suffixes come from the -print-schedule flag on the RUN
; lines (presumably latency:reciprocal-throughput -- TODO confirm); the GENERIC
; checks do not match any sched suffix.
1630 define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
1631 ; GENERIC-LABEL: test_pmuldq:
1633 ; GENERIC-NEXT: pmuldq %xmm1, %xmm0
1634 ; GENERIC-NEXT: pmuldq (%rdi), %xmm0
1635 ; GENERIC-NEXT: retq
1637 ; SLM-LABEL: test_pmuldq:
1639 ; SLM-NEXT: pmuldq %xmm1, %xmm0 # sched: [4:1.00]
1640 ; SLM-NEXT: pmuldq (%rdi), %xmm0 # sched: [7:1.00]
1641 ; SLM-NEXT: retq # sched: [4:1.00]
1643 ; SANDY-LABEL: test_pmuldq:
1645 ; SANDY-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
1646 ; SANDY-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
1647 ; SANDY-NEXT: retq # sched: [1:1.00]
1649 ; HASWELL-LABEL: test_pmuldq:
1651 ; HASWELL-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
1652 ; HASWELL-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
1653 ; HASWELL-NEXT: retq # sched: [1:1.00]
1655 ; BTVER2-LABEL: test_pmuldq:
1657 ; BTVER2-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
1658 ; BTVER2-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
1659 ; BTVER2-NEXT: retq # sched: [4:1.00]
; IR under test: the bitcast only reinterprets the first product so it can be
; passed back in as a <4 x i32> operand of the second call.
1660 %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1)
1661 %2 = bitcast <2 x i64> %1 to <4 x i32>
1662 %3 = load <4 x i32>, <4 x i32> *%a2, align 16
1663 %4 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %2, <4 x i32> %3)
; Declaration of the intrinsic called by test_pmuldq.
1666 declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
; test_pmulld - scheduling test for the SSE4.1 pmulld instruction.
; Two <4 x i32> multiplies are performed: %a0 by %a1, then that product by a
; vector loaded from %a2 with 16-byte alignment. The expected assembly
; therefore holds a register form and a (%rdi) memory form of pmulld
; (vpmulld under the SANDY, HASWELL and BTVER2 prefixes).
; Every prefix except GENERIC pins the "sched" annotation printed by
; llc -print-schedule for each instruction.
; NOTE(review): under the HASWELL prefix the memory form carries the same
; [10:2.00] annotation as the register form, whereas the other models show a
; larger first number for the load - presumably a property of the scheduler
; model when the checks were generated; confirm before relying on it.
; The assertions are autogenerated by utils/update_llc_test_checks.py, so
; regenerate them with that script rather than editing them by hand.
1668 define <4 x i32> @test_pmulld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
1669 ; GENERIC-LABEL: test_pmulld:
1671 ; GENERIC-NEXT: pmulld %xmm1, %xmm0
1672 ; GENERIC-NEXT: pmulld (%rdi), %xmm0
1673 ; GENERIC-NEXT: retq
1675 ; SLM-LABEL: test_pmulld:
1677 ; SLM-NEXT: pmulld %xmm1, %xmm0 # sched: [4:1.00]
1678 ; SLM-NEXT: pmulld (%rdi), %xmm0 # sched: [7:1.00]
1679 ; SLM-NEXT: retq # sched: [4:1.00]
1681 ; SANDY-LABEL: test_pmulld:
1683 ; SANDY-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
1684 ; SANDY-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
1685 ; SANDY-NEXT: retq # sched: [1:1.00]
1687 ; HASWELL-LABEL: test_pmulld:
1689 ; HASWELL-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [10:2.00]
1690 ; HASWELL-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
1691 ; HASWELL-NEXT: retq # sched: [1:1.00]
1693 ; BTVER2-LABEL: test_pmulld:
1695 ; BTVER2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
1696 ; BTVER2-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
1697 ; BTVER2-NEXT: retq # sched: [4:1.00]
; Plain IR mul is used here (no intrinsic); the second mul takes the loaded
; vector as its right-hand operand.
1698 %1 = mul <4 x i32> %a0, %a1
1699 %2 = load <4 x i32>, <4 x i32> *%a2, align 16
1700 %3 = mul <4 x i32> %1, %2
; test_ptest - scheduling test for the SSE4.1 ptest instruction.
; The ptestc intrinsic is called twice with %a0 as the first operand: once
; against %a1 and once against a vector loaded from %a2 with 16-byte
; alignment. The expected assembly for every prefix is a register ptest
; followed by setb, a (%rdi) memory ptest followed by setb, an andb of the two
; byte results and a movzbl into %eax (vptest under the SANDY, HASWELL and
; BTVER2 prefixes).
; Every prefix except GENERIC pins the "sched" annotation printed by
; llc -print-schedule for each instruction.
; NOTE(review): under the HASWELL prefix the memory form of vptest carries the
; same [2:1.00] annotation as the register form, whereas the other models
; show a larger first number for the load - presumably a property of the
; scheduler model when the checks were generated; confirm before relying on it.
; The assertions are autogenerated by utils/update_llc_test_checks.py, so
; regenerate them with that script rather than editing them by hand.
1704 define i32 @test_ptest(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
1705 ; GENERIC-LABEL: test_ptest:
1707 ; GENERIC-NEXT: ptest %xmm1, %xmm0
1708 ; GENERIC-NEXT: setb %al
1709 ; GENERIC-NEXT: ptest (%rdi), %xmm0
1710 ; GENERIC-NEXT: setb %cl
1711 ; GENERIC-NEXT: andb %al, %cl
1712 ; GENERIC-NEXT: movzbl %cl, %eax
1713 ; GENERIC-NEXT: retq
1715 ; SLM-LABEL: test_ptest:
1717 ; SLM-NEXT: ptest %xmm1, %xmm0 # sched: [1:0.50]
1718 ; SLM-NEXT: setb %al # sched: [1:0.50]
1719 ; SLM-NEXT: ptest (%rdi), %xmm0 # sched: [4:1.00]
1720 ; SLM-NEXT: setb %cl # sched: [1:0.50]
1721 ; SLM-NEXT: andb %al, %cl # sched: [1:0.50]
1722 ; SLM-NEXT: movzbl %cl, %eax # sched: [1:0.50]
1723 ; SLM-NEXT: retq # sched: [4:1.00]
1725 ; SANDY-LABEL: test_ptest:
1727 ; SANDY-NEXT: vptest %xmm1, %xmm0 # sched: [2:1.00]
1728 ; SANDY-NEXT: setb %al # sched: [1:1.00]
1729 ; SANDY-NEXT: vptest (%rdi), %xmm0 # sched: [8:1.00]
1730 ; SANDY-NEXT: setb %cl # sched: [1:1.00]
1731 ; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
1732 ; SANDY-NEXT: movzbl %cl, %eax # sched: [1:0.33]
1733 ; SANDY-NEXT: retq # sched: [1:1.00]
1735 ; HASWELL-LABEL: test_ptest:
1737 ; HASWELL-NEXT: vptest %xmm1, %xmm0 # sched: [2:1.00]
1738 ; HASWELL-NEXT: setb %al # sched: [1:0.50]
1739 ; HASWELL-NEXT: vptest (%rdi), %xmm0 # sched: [2:1.00]
1740 ; HASWELL-NEXT: setb %cl # sched: [1:0.50]
1741 ; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25]
1742 ; HASWELL-NEXT: movzbl %cl, %eax # sched: [1:0.25]
1743 ; HASWELL-NEXT: retq # sched: [1:1.00]
1745 ; BTVER2-LABEL: test_ptest:
1747 ; BTVER2-NEXT: vptest %xmm1, %xmm0 # sched: [1:0.50]
1748 ; BTVER2-NEXT: setb %al # sched: [1:0.50]
1749 ; BTVER2-NEXT: vptest (%rdi), %xmm0 # sched: [6:1.00]
1750 ; BTVER2-NEXT: setb %cl # sched: [1:0.50]
1751 ; BTVER2-NEXT: andb %al, %cl # sched: [1:0.50]
1752 ; BTVER2-NEXT: movzbl %cl, %eax # sched: [1:0.50]
1753 ; BTVER2-NEXT: retq # sched: [4:1.00]
; Both ptestc calls test %a0; the second one takes its other operand from
; memory so that the load can be folded into the instruction.
1754 %1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
1755 %2 = load <2 x i64>, <2 x i64> *%a2, align 16
1756 %3 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %2)
1760 declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
; test_roundpd - scheduling test for the SSE4.1 roundpd instruction.
; The round.pd intrinsic is called with immediate 7 on %a0 and on a vector
; loaded from %a1 with 16-byte alignment; the two results are summed with
; fadd. The expected assembly holds a register form and a (%rdi) memory form
; of roundpd plus an addpd (vroundpd / vaddpd under the SANDY, HASWELL and
; BTVER2 prefixes).
; Instruction order is part of what is checked: the SLM and BTVER2 sequences
; issue the memory form first, and the SLM sequence ends with a movapd copy
; into %xmm0.
; Every prefix except GENERIC pins the "sched" annotation printed by
; llc -print-schedule for each instruction.
; The assertions are autogenerated by utils/update_llc_test_checks.py, so
; regenerate them with that script rather than editing them by hand.
1762 define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) {
1763 ; GENERIC-LABEL: test_roundpd:
1765 ; GENERIC-NEXT: roundpd $7, %xmm0, %xmm1
1766 ; GENERIC-NEXT: roundpd $7, (%rdi), %xmm0
1767 ; GENERIC-NEXT: addpd %xmm1, %xmm0
1768 ; GENERIC-NEXT: retq
1770 ; SLM-LABEL: test_roundpd:
1772 ; SLM-NEXT: roundpd $7, (%rdi), %xmm1 # sched: [6:1.00]
1773 ; SLM-NEXT: roundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
1774 ; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00]
1775 ; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00]
1776 ; SLM-NEXT: retq # sched: [4:1.00]
1778 ; SANDY-LABEL: test_roundpd:
1780 ; SANDY-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
1781 ; SANDY-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [9:1.00]
1782 ; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
1783 ; SANDY-NEXT: retq # sched: [1:1.00]
1785 ; HASWELL-LABEL: test_roundpd:
1787 ; HASWELL-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [6:2.00]
1788 ; HASWELL-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [10:2.00]
1789 ; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
1790 ; HASWELL-NEXT: retq # sched: [1:1.00]
1792 ; BTVER2-LABEL: test_roundpd:
1794 ; BTVER2-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [8:1.00]
1795 ; BTVER2-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
1796 ; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
1797 ; BTVER2-NEXT: retq # sched: [4:1.00]
; The fadd consumes both rounded vectors, so both roundpd forms must appear
; in the generated code.
1798 %1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)
1799 %2 = load <2 x double>, <2 x double> *%a1, align 16
1800 %3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %2, i32 7)
1801 %4 = fadd <2 x double> %1, %3
1804 declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
; test_roundps - scheduling test for the SSE4.1 roundps instruction.
; The round.ps intrinsic is called with immediate 7 on %a0 and on a vector
; loaded from %a1 with 16-byte alignment; the two results are summed with
; fadd. The expected assembly holds a register form and a (%rdi) memory form
; of roundps plus an addps (vroundps / vaddps under the SANDY, HASWELL and
; BTVER2 prefixes).
; Instruction order is part of what is checked: the SLM and BTVER2 sequences
; issue the memory form first, and the SLM sequence ends with a movaps copy
; into %xmm0.
; Every prefix except GENERIC pins the "sched" annotation printed by
; llc -print-schedule for each instruction.
; The assertions are autogenerated by utils/update_llc_test_checks.py, so
; regenerate them with that script rather than editing them by hand.
1806 define <4 x float> @test_roundps(<4 x float> %a0, <4 x float> *%a1) {
1807 ; GENERIC-LABEL: test_roundps:
1809 ; GENERIC-NEXT: roundps $7, %xmm0, %xmm1
1810 ; GENERIC-NEXT: roundps $7, (%rdi), %xmm0
1811 ; GENERIC-NEXT: addps %xmm1, %xmm0
1812 ; GENERIC-NEXT: retq
1814 ; SLM-LABEL: test_roundps:
1816 ; SLM-NEXT: roundps $7, (%rdi), %xmm1 # sched: [6:1.00]
1817 ; SLM-NEXT: roundps $7, %xmm0, %xmm0 # sched: [3:1.00]
1818 ; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00]
1819 ; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00]
1820 ; SLM-NEXT: retq # sched: [4:1.00]
1822 ; SANDY-LABEL: test_roundps:
1824 ; SANDY-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00]
1825 ; SANDY-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [9:1.00]
1826 ; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
1827 ; SANDY-NEXT: retq # sched: [1:1.00]
1829 ; HASWELL-LABEL: test_roundps:
1831 ; HASWELL-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [6:2.00]
1832 ; HASWELL-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [10:2.00]
1833 ; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
1834 ; HASWELL-NEXT: retq # sched: [1:1.00]
1836 ; BTVER2-LABEL: test_roundps:
1838 ; BTVER2-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [8:1.00]
1839 ; BTVER2-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00]
1840 ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
1841 ; BTVER2-NEXT: retq # sched: [4:1.00]
; The fadd consumes both rounded vectors, so both roundps forms must appear
; in the generated code.
1842 %1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)
1843 %2 = load <4 x float>, <4 x float> *%a1, align 16
1844 %3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %2, i32 7)
1845 %4 = fadd <4 x float> %1, %3
1848 declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
; test_roundsd - scheduling test for the SSE4.1 roundsd instruction.
; The round.sd intrinsic is called twice with %a0 as its first operand and
; immediate 7: once with %a1 and once with a vector loaded from %a2 with
; 16-byte alignment; the two results are summed with fadd. The expected
; assembly holds a register form and a (%rdi) memory form of roundsd plus an
; addpd (vroundsd / vaddpd under the SANDY, HASWELL and BTVER2 prefixes).
; The GENERIC and SLM sequences start with a movaps copy of %xmm0 into %xmm2;
; the v-prefixed three-operand sequences contain no such copy. The SLM
; sequence issues the memory form before the register form.
; Every prefix except GENERIC pins the "sched" annotation printed by
; llc -print-schedule for each instruction.
; The assertions are autogenerated by utils/update_llc_test_checks.py, so
; regenerate them with that script rather than editing them by hand.
1850 define <2 x double> @test_roundsd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
1851 ; GENERIC-LABEL: test_roundsd:
1853 ; GENERIC-NEXT: movaps %xmm0, %xmm2
1854 ; GENERIC-NEXT: roundsd $7, %xmm1, %xmm2
1855 ; GENERIC-NEXT: roundsd $7, (%rdi), %xmm0
1856 ; GENERIC-NEXT: addpd %xmm2, %xmm0
1857 ; GENERIC-NEXT: retq
1859 ; SLM-LABEL: test_roundsd:
1861 ; SLM-NEXT: movaps %xmm0, %xmm2 # sched: [1:1.00]
1862 ; SLM-NEXT: roundsd $7, (%rdi), %xmm0 # sched: [6:1.00]
1863 ; SLM-NEXT: roundsd $7, %xmm1, %xmm2 # sched: [3:1.00]
1864 ; SLM-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00]
1865 ; SLM-NEXT: retq # sched: [4:1.00]
1867 ; SANDY-LABEL: test_roundsd:
1869 ; SANDY-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
1870 ; SANDY-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
1871 ; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
1872 ; SANDY-NEXT: retq # sched: [1:1.00]
1874 ; HASWELL-LABEL: test_roundsd:
1876 ; HASWELL-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [6:2.00]
1877 ; HASWELL-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
1878 ; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
1879 ; HASWELL-NEXT: retq # sched: [1:1.00]
1881 ; BTVER2-LABEL: test_roundsd:
1883 ; BTVER2-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
1884 ; BTVER2-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
1885 ; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
1886 ; BTVER2-NEXT: retq # sched: [4:1.00]
; Both calls pass %a0 as the first operand; only the second operand differs
; (register argument versus value loaded from %a2).
1887 %1 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7)
1888 %2 = load <2 x double>, <2 x double>* %a2, align 16
1889 %3 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %2, i32 7)
1890 %4 = fadd <2 x double> %1, %3
1893 declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
; test_roundss - scheduling test for the SSE4.1 roundss instruction.
; The round.ss intrinsic is called twice with %a0 as its first operand and
; immediate 7: once with %a1 and once with a vector loaded from %a2 with
; 16-byte alignment; the two results are summed with fadd. The expected
; assembly holds a register form and a (%rdi) memory form of roundss plus an
; addps (vroundss / vaddps under the SANDY, HASWELL and BTVER2 prefixes).
; The GENERIC and SLM sequences start with a movaps copy of %xmm0 into %xmm2;
; the v-prefixed three-operand sequences contain no such copy. The SLM
; sequence issues the memory form before the register form.
; Every prefix except GENERIC pins the "sched" annotation printed by
; llc -print-schedule for each instruction.
; The assertions are autogenerated by utils/update_llc_test_checks.py, so
; regenerate them with that script rather than editing them by hand.
1895 define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
1896 ; GENERIC-LABEL: test_roundss:
1898 ; GENERIC-NEXT: movaps %xmm0, %xmm2
1899 ; GENERIC-NEXT: roundss $7, %xmm1, %xmm2
1900 ; GENERIC-NEXT: roundss $7, (%rdi), %xmm0
1901 ; GENERIC-NEXT: addps %xmm2, %xmm0
1902 ; GENERIC-NEXT: retq
1904 ; SLM-LABEL: test_roundss:
1906 ; SLM-NEXT: movaps %xmm0, %xmm2 # sched: [1:1.00]
1907 ; SLM-NEXT: roundss $7, (%rdi), %xmm0 # sched: [6:1.00]
1908 ; SLM-NEXT: roundss $7, %xmm1, %xmm2 # sched: [3:1.00]
1909 ; SLM-NEXT: addps %xmm2, %xmm0 # sched: [3:1.00]
1910 ; SLM-NEXT: retq # sched: [4:1.00]
1912 ; SANDY-LABEL: test_roundss:
1914 ; SANDY-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
1915 ; SANDY-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
1916 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
1917 ; SANDY-NEXT: retq # sched: [1:1.00]
1919 ; HASWELL-LABEL: test_roundss:
1921 ; HASWELL-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [6:2.00]
1922 ; HASWELL-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
1923 ; HASWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
1924 ; HASWELL-NEXT: retq # sched: [1:1.00]
1926 ; BTVER2-LABEL: test_roundss:
1928 ; BTVER2-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
1929 ; BTVER2-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
1930 ; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
1931 ; BTVER2-NEXT: retq # sched: [4:1.00]
; Both calls pass %a0 as the first operand; only the second operand differs
; (register argument versus value loaded from %a2).
1932 %1 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7)
1933 %2 = load <4 x float>, <4 x float> *%a2, align 16
1934 %3 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %2, i32 7)
1935 %4 = fadd <4 x float> %1, %3
1938 declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone