1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
5 ; 32-bit tests to make sure we're not doing anything stupid.
6 ; RUN: llc < %s -mtriple=i686-unknown-unknown
7 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse
8 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2
11 ; Signed Integer to Double
14 define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
; i64 lanes have no packed int->fp instruction on these subtargets, so each lane
; is moved to a GPR, converted with scalar cvtsi2sd, and repacked via unpcklpd.
; NOTE(review): this chunk looks truncated — the autogenerated '# BB#0:' / 'retq'
; check lines, the IR 'ret', and the closing '}' are missing, and a stale line
; number is fused onto the front of every line. Regenerate this test with
; utils/update_llc_test_checks.py rather than hand-editing the CHECK lines.
15 ; SSE-LABEL: sitofp_2i64_to_2f64:
17 ; SSE-NEXT: movd %xmm0, %rax
18 ; SSE-NEXT: cvtsi2sdq %rax, %xmm1
19 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
20 ; SSE-NEXT: movd %xmm0, %rax
21 ; SSE-NEXT: xorps %xmm0, %xmm0
22 ; SSE-NEXT: cvtsi2sdq %rax, %xmm0
23 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
24 ; SSE-NEXT: movapd %xmm1, %xmm0
27 ; AVX-LABEL: sitofp_2i64_to_2f64:
29 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
30 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
31 ; AVX-NEXT: vmovq %xmm0, %rax
32 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
33 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
34 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
36 %cvt = sitofp <2 x i64> %a to <2 x double>
40 define <2 x double> @sitofp_2i32_to_2f64(<4 x i32> %a) {
; Low two i32 lanes convert directly with a single cvtdq2pd on both SSE and AVX.
41 ; SSE-LABEL: sitofp_2i32_to_2f64:
43 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
46 ; AVX-LABEL: sitofp_2i32_to_2f64:
48 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
50 %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
51 %cvt = sitofp <2 x i32> %shuf to <2 x double>
55 define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
; Full 4xi32 convert whose upper half is discarded: AVX widens to ymm (hence the
; vzeroupper check) while SSE only needs the xmm conversion.
56 ; SSE-LABEL: sitofp_4i32_to_2f64:
58 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
61 ; AVX-LABEL: sitofp_4i32_to_2f64:
63 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
65 ; AVX-NEXT: vzeroupper
67 %cvt = sitofp <4 x i32> %a to <4 x double>
68 %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
69 ret <2 x double> %shuf
72 define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) {
; i16 lanes are sign-extended to i32 first (punpcklwd+psrad on SSE, vpmovsxwd on
; AVX) and then converted with cvtdq2pd.
73 ; SSE-LABEL: sitofp_2i16_to_2f64:
75 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
76 ; SSE-NEXT: psrad $16, %xmm0
77 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
80 ; AVX-LABEL: sitofp_2i16_to_2f64:
82 ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
83 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
85 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
86 %cvt = sitofp <2 x i16> %shuf to <2 x double>
90 define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
; 8xi16 convert with only the low two doubles kept; AVX1 and AVX2 diverge in how
; wide the sign-extension is, hence the split check prefixes.
91 ; SSE-LABEL: sitofp_8i16_to_2f64:
93 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
94 ; SSE-NEXT: psrad $16, %xmm0
95 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
98 ; AVX1-LABEL: sitofp_8i16_to_2f64:
100 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
101 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
103 ; AVX1-NEXT: vzeroupper
106 ; AVX2-LABEL: sitofp_8i16_to_2f64:
108 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
109 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
111 ; AVX2-NEXT: vzeroupper
113 %cvt = sitofp <8 x i16> %a to <8 x double>
114 %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
115 ret <2 x double> %shuf
118 define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) {
; i8 lanes sign-extend via two unpacks + psrad $24 on SSE; AVX uses vpmovsxbd.
119 ; SSE-LABEL: sitofp_2i8_to_2f64:
121 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
122 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
123 ; SSE-NEXT: psrad $24, %xmm0
124 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
127 ; AVX-LABEL: sitofp_2i8_to_2f64:
129 ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
130 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
132 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
133 %cvt = sitofp <2 x i8> %shuf to <2 x double>
134 ret <2 x double> %cvt
137 define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
; Wide 16xi8 convert with only the low two doubles kept; checks how much of the
; full conversion each subtarget actually emits.
138 ; SSE-LABEL: sitofp_16i8_to_2f64:
140 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
141 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
142 ; SSE-NEXT: psrad $24, %xmm0
143 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
146 ; AVX1-LABEL: sitofp_16i8_to_2f64:
148 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
149 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
151 ; AVX1-NEXT: vzeroupper
154 ; AVX2-LABEL: sitofp_16i8_to_2f64:
156 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
157 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
158 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
160 ; AVX2-NEXT: vzeroupper
162 %cvt = sitofp <16 x i8> %a to <16 x double>
163 %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
164 ret <2 x double> %shuf
167 define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
; 4xi64 -> 4xf64 is fully scalarized: four GPR round-trips through cvtsi2sd,
; repacked per 128-bit half; AVX1/AVX2 differ only in vextractf128/vextracti128.
168 ; SSE-LABEL: sitofp_4i64_to_4f64:
170 ; SSE-NEXT: movd %xmm0, %rax
171 ; SSE-NEXT: cvtsi2sdq %rax, %xmm2
172 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
173 ; SSE-NEXT: movd %xmm0, %rax
174 ; SSE-NEXT: xorps %xmm0, %xmm0
175 ; SSE-NEXT: cvtsi2sdq %rax, %xmm0
176 ; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
177 ; SSE-NEXT: movd %xmm1, %rax
178 ; SSE-NEXT: cvtsi2sdq %rax, %xmm3
179 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
180 ; SSE-NEXT: movd %xmm0, %rax
181 ; SSE-NEXT: xorps %xmm0, %xmm0
182 ; SSE-NEXT: cvtsi2sdq %rax, %xmm0
183 ; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
184 ; SSE-NEXT: movapd %xmm2, %xmm0
185 ; SSE-NEXT: movapd %xmm3, %xmm1
188 ; AVX1-LABEL: sitofp_4i64_to_4f64:
190 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
191 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax
192 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
193 ; AVX1-NEXT: vmovq %xmm1, %rax
194 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
195 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
196 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
197 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
198 ; AVX1-NEXT: vmovq %xmm0, %rax
199 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
200 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
201 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
202 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
205 ; AVX2-LABEL: sitofp_4i64_to_4f64:
207 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
208 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax
209 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
210 ; AVX2-NEXT: vmovq %xmm1, %rax
211 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
212 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
213 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
214 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
215 ; AVX2-NEXT: vmovq %xmm0, %rax
216 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
217 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
218 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
219 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
221 %cvt = sitofp <4 x i64> %a to <4 x double>
222 ret <4 x double> %cvt
225 define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE needs two cvtdq2pd (two doubles per xmm); AVX does it in one ymm convert.
226 ; SSE-LABEL: sitofp_4i32_to_4f64:
228 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
229 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
230 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
231 ; SSE-NEXT: movaps %xmm2, %xmm0
234 ; AVX-LABEL: sitofp_4i32_to_4f64:
236 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
238 %cvt = sitofp <4 x i32> %a to <4 x double>
239 ret <4 x double> %cvt
242 define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
; Sign-extend i16->i32, then split into two cvtdq2pd on SSE / one ymm on AVX.
243 ; SSE-LABEL: sitofp_4i16_to_4f64:
245 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
246 ; SSE-NEXT: psrad $16, %xmm1
247 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
248 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
249 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
252 ; AVX-LABEL: sitofp_4i16_to_4f64:
254 ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
255 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
257 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
258 %cvt = sitofp <4 x i16> %shuf to <4 x double>
259 ret <4 x double> %cvt
262 define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
; 8xi16 convert with only the low four doubles kept; AVX2 sign-extends to ymm.
263 ; SSE-LABEL: sitofp_8i16_to_4f64:
265 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
266 ; SSE-NEXT: psrad $16, %xmm1
267 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
268 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
269 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
272 ; AVX1-LABEL: sitofp_8i16_to_4f64:
274 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
275 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
278 ; AVX2-LABEL: sitofp_8i16_to_4f64:
280 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
281 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
283 %cvt = sitofp <8 x i16> %a to <8 x double>
284 %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
285 ret <4 x double> %shuf
288 define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
; i8 lanes sign-extend via unpack chain + psrad $24 on SSE, vpmovsxbd on AVX.
289 ; SSE-LABEL: sitofp_4i8_to_4f64:
291 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
292 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
293 ; SSE-NEXT: psrad $24, %xmm1
294 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
295 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
296 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
299 ; AVX-LABEL: sitofp_4i8_to_4f64:
301 ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
302 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
304 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
305 %cvt = sitofp <4 x i8> %shuf to <4 x double>
306 ret <4 x double> %cvt
309 define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
; 16xi8 convert with only the low four doubles kept.
310 ; SSE-LABEL: sitofp_16i8_to_4f64:
312 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
313 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
314 ; SSE-NEXT: psrad $24, %xmm1
315 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
316 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
317 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
320 ; AVX1-LABEL: sitofp_16i8_to_4f64:
322 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
323 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
326 ; AVX2-LABEL: sitofp_16i8_to_4f64:
328 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
329 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
330 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
332 %cvt = sitofp <16 x i8> %a to <16 x double>
333 %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
334 ret <4 x double> %shuf
338 ; Unsigned Integer to Double
341 define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
; u64->f64 uses the classic magic-constant trick: interleave the halves with
; exponent bit patterns (1127219200/1160773632 = 2^52/2^84 biased exponents),
; subtract the matching double constants, and sum the two partial results.
342 ; SSE-LABEL: uitofp_2i64_to_2f64:
344 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
345 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
346 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
347 ; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
348 ; SSE-NEXT: subpd %xmm3, %xmm0
349 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
350 ; SSE-NEXT: addpd %xmm4, %xmm0
351 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
352 ; SSE-NEXT: subpd %xmm3, %xmm2
353 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
354 ; SSE-NEXT: addpd %xmm2, %xmm1
355 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
358 ; AVX-LABEL: uitofp_2i64_to_2f64:
360 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
361 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
362 ; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
363 ; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
364 ; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
365 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
366 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
367 ; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
368 ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
369 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
371 %cvt = uitofp <2 x i64> %a to <2 x double>
372 ret <2 x double> %cvt
375 define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
; u32 lanes are zero-extended to u64 (punpckldq with zero / vpmovzxdq) and then
; run through the same magic-constant u64->f64 sequence as above.
376 ; SSE-LABEL: uitofp_2i32_to_2f64:
378 ; SSE-NEXT: pxor %xmm1, %xmm1
379 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
380 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
381 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
382 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
383 ; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
384 ; SSE-NEXT: subpd %xmm3, %xmm0
385 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
386 ; SSE-NEXT: addpd %xmm4, %xmm0
387 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
388 ; SSE-NEXT: subpd %xmm3, %xmm2
389 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
390 ; SSE-NEXT: addpd %xmm2, %xmm1
391 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
394 ; AVX-LABEL: uitofp_2i32_to_2f64:
396 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
397 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
398 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
399 ; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
400 ; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
401 ; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
402 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
403 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
404 ; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
405 ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
406 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
408 %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
409 %cvt = uitofp <2 x i32> %shuf to <2 x double>
410 ret <2 x double> %cvt
413 define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
; Full 4xu32 convert with the top half discarded. AVX splits each u32 into a
; high-16/low-16 pair, converts both signed, and recombines with mul+add.
414 ; SSE-LABEL: uitofp_4i32_to_2f64:
416 ; SSE-NEXT: pxor %xmm1, %xmm1
417 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
418 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
419 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
420 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
421 ; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
422 ; SSE-NEXT: subpd %xmm3, %xmm0
423 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
424 ; SSE-NEXT: addpd %xmm4, %xmm0
425 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
426 ; SSE-NEXT: subpd %xmm3, %xmm2
427 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
428 ; SSE-NEXT: addpd %xmm2, %xmm1
429 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
432 ; AVX1-LABEL: uitofp_4i32_to_2f64:
434 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
435 ; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
436 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
437 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
438 ; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
439 ; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
441 ; AVX1-NEXT: vzeroupper
444 ; AVX2-LABEL: uitofp_4i32_to_2f64:
446 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
447 ; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
448 ; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
449 ; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
450 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
451 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
452 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
453 ; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
455 ; AVX2-NEXT: vzeroupper
457 %cvt = uitofp <4 x i32> %a to <4 x double>
458 %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
459 ret <2 x double> %shuf
462 define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) {
; u16 fits in i32, so zero-extend (punpcklwd with zero / vpmovzxwd) + cvtdq2pd.
463 ; SSE-LABEL: uitofp_2i16_to_2f64:
465 ; SSE-NEXT: pxor %xmm1, %xmm1
466 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
467 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
470 ; AVX-LABEL: uitofp_2i16_to_2f64:
472 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
473 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
475 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
476 %cvt = uitofp <2 x i16> %shuf to <2 x double>
477 ret <2 x double> %cvt
480 define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
; 8xu16 convert with only the low two doubles kept; AVX2 zero-extends to ymm.
481 ; SSE-LABEL: uitofp_8i16_to_2f64:
483 ; SSE-NEXT: pxor %xmm1, %xmm1
484 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
485 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
488 ; AVX1-LABEL: uitofp_8i16_to_2f64:
490 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
491 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
493 ; AVX1-NEXT: vzeroupper
496 ; AVX2-LABEL: uitofp_8i16_to_2f64:
498 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
499 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
501 ; AVX2-NEXT: vzeroupper
503 %cvt = uitofp <8 x i16> %a to <8 x double>
504 %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
505 ret <2 x double> %shuf
508 define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) {
; u8 lanes zero-extend via two unpacks with zero on SSE, vpmovzxbd on AVX.
509 ; SSE-LABEL: uitofp_2i8_to_2f64:
511 ; SSE-NEXT: pxor %xmm1, %xmm1
512 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
513 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
514 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
517 ; AVX-LABEL: uitofp_2i8_to_2f64:
519 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
520 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
522 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
523 %cvt = uitofp <2 x i8> %shuf to <2 x double>
524 ret <2 x double> %cvt
527 define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
; 16xu8 convert with only the low two doubles kept.
528 ; SSE-LABEL: uitofp_16i8_to_2f64:
530 ; SSE-NEXT: pxor %xmm1, %xmm1
531 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
532 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
533 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
536 ; AVX1-LABEL: uitofp_16i8_to_2f64:
538 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
539 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
541 ; AVX1-NEXT: vzeroupper
544 ; AVX2-LABEL: uitofp_16i8_to_2f64:
546 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
547 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
548 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
550 ; AVX2-NEXT: vzeroupper
552 %cvt = uitofp <16 x i8> %a to <16 x double>
553 %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
554 ret <2 x double> %shuf
557 define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
; Magic-constant u64->f64 applied per element across both 128-bit halves;
; AVX1/AVX2 differ only in the 128-bit extract instruction used.
558 ; SSE-LABEL: uitofp_4i64_to_4f64:
560 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
561 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
562 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
563 ; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
564 ; SSE-NEXT: subpd %xmm4, %xmm0
565 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
566 ; SSE-NEXT: addpd %xmm5, %xmm0
567 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
568 ; SSE-NEXT: subpd %xmm4, %xmm3
569 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
570 ; SSE-NEXT: addpd %xmm3, %xmm5
571 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
572 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
573 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
574 ; SSE-NEXT: subpd %xmm4, %xmm1
575 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
576 ; SSE-NEXT: addpd %xmm5, %xmm1
577 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
578 ; SSE-NEXT: subpd %xmm4, %xmm3
579 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
580 ; SSE-NEXT: addpd %xmm3, %xmm2
581 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
584 ; AVX1-LABEL: uitofp_4i64_to_4f64:
586 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
587 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
588 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
589 ; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
590 ; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
591 ; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
592 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
593 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
594 ; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1
595 ; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
596 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
597 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
598 ; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
599 ; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
600 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
601 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
602 ; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0
603 ; AVX1-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
604 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
605 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
608 ; AVX2-LABEL: uitofp_4i64_to_4f64:
610 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
611 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
612 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
613 ; AVX2-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
614 ; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
615 ; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
616 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
617 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
618 ; AVX2-NEXT: vsubpd %xmm4, %xmm1, %xmm1
619 ; AVX2-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
620 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
621 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
622 ; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
623 ; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
624 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
625 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
626 ; AVX2-NEXT: vsubpd %xmm4, %xmm0, %xmm0
627 ; AVX2-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
628 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
629 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
631 %cvt = uitofp <4 x i64> %a to <4 x double>
632 ret <4 x double> %cvt
635 define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
; Full 4xu32->4xf64. SSE does the magic-constant dance twice (low and high
; pairs); AVX splits each u32 into hi16*65536 + lo16 using two signed converts.
636 ; SSE-LABEL: uitofp_4i32_to_4f64:
638 ; SSE-NEXT: movdqa %xmm0, %xmm2
639 ; SSE-NEXT: pxor %xmm1, %xmm1
640 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
641 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
642 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
643 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
644 ; SSE-NEXT: movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25]
645 ; SSE-NEXT: subpd %xmm5, %xmm0
646 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
647 ; SSE-NEXT: addpd %xmm6, %xmm0
648 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
649 ; SSE-NEXT: subpd %xmm5, %xmm4
650 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
651 ; SSE-NEXT: addpd %xmm4, %xmm6
652 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm6[0]
653 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
654 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
655 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
656 ; SSE-NEXT: subpd %xmm5, %xmm2
657 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
658 ; SSE-NEXT: addpd %xmm2, %xmm1
659 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
660 ; SSE-NEXT: subpd %xmm5, %xmm4
661 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
662 ; SSE-NEXT: addpd %xmm4, %xmm2
663 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
666 ; AVX1-LABEL: uitofp_4i32_to_4f64:
668 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
669 ; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
670 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
671 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
672 ; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
673 ; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
676 ; AVX2-LABEL: uitofp_4i32_to_4f64:
678 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
679 ; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
680 ; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
681 ; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
682 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
683 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
684 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
685 ; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
687 %cvt = uitofp <4 x i32> %a to <4 x double>
688 ret <4 x double> %cvt
691 define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
; u16 fits in i32: zero-extend then signed cvtdq2pd (split in two on SSE).
692 ; SSE-LABEL: uitofp_4i16_to_4f64:
694 ; SSE-NEXT: pxor %xmm1, %xmm1
695 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
696 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
697 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
698 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
699 ; SSE-NEXT: movaps %xmm2, %xmm0
702 ; AVX-LABEL: uitofp_4i16_to_4f64:
704 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
705 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
707 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
708 %cvt = uitofp <4 x i16> %shuf to <4 x double>
709 ret <4 x double> %cvt
712 define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
; 8xu16 convert with only the low four doubles kept.
713 ; SSE-LABEL: uitofp_8i16_to_4f64:
715 ; SSE-NEXT: pxor %xmm1, %xmm1
716 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
717 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
718 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
719 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
720 ; SSE-NEXT: movaps %xmm2, %xmm0
723 ; AVX1-LABEL: uitofp_8i16_to_4f64:
725 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
726 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
729 ; AVX2-LABEL: uitofp_8i16_to_4f64:
731 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
732 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
734 %cvt = uitofp <8 x i16> %a to <8 x double>
735 %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
736 ret <4 x double> %shuf
739 define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
; u8 lanes zero-extend via two unpacks with zero on SSE, vpmovzxbd on AVX.
740 ; SSE-LABEL: uitofp_4i8_to_4f64:
742 ; SSE-NEXT: pxor %xmm1, %xmm1
743 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
744 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
745 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
746 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
747 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
748 ; SSE-NEXT: movaps %xmm2, %xmm0
751 ; AVX-LABEL: uitofp_4i8_to_4f64:
753 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
754 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
756 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
757 %cvt = uitofp <4 x i8> %shuf to <4 x double>
758 ret <4 x double> %cvt
761 define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
; 16xu8 convert with only the low four doubles kept.
762 ; SSE-LABEL: uitofp_16i8_to_4f64:
764 ; SSE-NEXT: pxor %xmm1, %xmm1
765 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
766 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
767 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
768 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
769 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
770 ; SSE-NEXT: movaps %xmm2, %xmm0
773 ; AVX1-LABEL: uitofp_16i8_to_4f64:
775 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
776 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
779 ; AVX2-LABEL: uitofp_16i8_to_4f64:
781 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
782 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
783 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
785 %cvt = uitofp <16 x i8> %a to <16 x double>
786 %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
787 ret <4 x double> %shuf
791 ; Signed Integer to Float
794 define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
; i64->f32 lanes are scalarized through cvtsi2ss and repacked with
; unpcklps/vinsertps.
795 ; SSE-LABEL: sitofp_2i64_to_4f32:
797 ; SSE-NEXT: movd %xmm0, %rax
798 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
799 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
800 ; SSE-NEXT: movd %xmm0, %rax
801 ; SSE-NEXT: xorps %xmm0, %xmm0
802 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
803 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
804 ; SSE-NEXT: movaps %xmm1, %xmm0
807 ; AVX-LABEL: sitofp_2i64_to_4f32:
809 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
810 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
811 ; AVX-NEXT: vmovq %xmm0, %rax
812 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
813 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
814 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
815 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
816 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
817 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
819 %cvt = sitofp <2 x i64> %a to <2 x float>
820 %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
824 define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; Widened-then-converted variant: the upper two i64 lanes are undef, so only
; the two defined lanes produce meaningful cvtsi2ss results.
825 ; SSE-LABEL: sitofp_4i64_to_4f32_undef:
827 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2
828 ; SSE-NEXT: movd %xmm0, %rax
829 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
830 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
831 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
832 ; SSE-NEXT: movd %xmm0, %rax
833 ; SSE-NEXT: xorps %xmm0, %xmm0
834 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
835 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
836 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
837 ; SSE-NEXT: movaps %xmm1, %xmm0
840 ; AVX-LABEL: sitofp_4i64_to_4f32_undef:
842 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
843 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
844 ; AVX-NEXT: vmovq %xmm0, %rax
845 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
846 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
847 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
848 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
849 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
850 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
852 %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
853 %cvt = sitofp <4 x i64> %ext to <4 x float>
857 define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) {
; Direct mapping: packed i32->f32 is a single cvtdq2ps on all subtargets.
858 ; SSE-LABEL: sitofp_4i32_to_4f32:
860 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
863 ; AVX-LABEL: sitofp_4i32_to_4f32:
865 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
867 %cvt = sitofp <4 x i32> %a to <4 x float>
871 define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) {
; Sign-extend i16->i32 (punpcklwd+psrad / vpmovsxwd) then cvtdq2ps.
872 ; SSE-LABEL: sitofp_4i16_to_4f32:
874 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
875 ; SSE-NEXT: psrad $16, %xmm0
876 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
879 ; AVX-LABEL: sitofp_4i16_to_4f32:
881 ; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
882 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
884 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
885 %cvt = sitofp <4 x i16> %shuf to <4 x float>
; Test: sitofp <8 x i16> -> <8 x float> then shuffle down to the low 4 floats.
; SSE only converts the needed low half; AVX widens to ymm and drops the upper
; half (vzeroupper before return, per the AVX/SSE transition convention).
889 define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
890 ; SSE-LABEL: sitofp_8i16_to_4f32:
892 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
893 ; SSE-NEXT: psrad $16, %xmm0
894 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
897 ; AVX1-LABEL: sitofp_8i16_to_4f32:
899 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
900 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
901 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
902 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
903 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
905 ; AVX1-NEXT: vzeroupper
908 ; AVX2-LABEL: sitofp_8i16_to_4f32:
910 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
911 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
913 ; AVX2-NEXT: vzeroupper
915 %cvt = sitofp <8 x i16> %a to <8 x float>
916 %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
917 ret <4 x float> %shuf
; Test: sitofp of the low 4 x i8 lanes to <4 x float>. SSE sign-extends with
; two unpacks plus psrad $24; AVX uses a single vpmovsxbd.
920 define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) {
921 ; SSE-LABEL: sitofp_4i8_to_4f32:
923 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
924 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
925 ; SSE-NEXT: psrad $24, %xmm0
926 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
929 ; AVX-LABEL: sitofp_4i8_to_4f32:
931 ; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
932 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
934 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
935 %cvt = sitofp <4 x i8> %shuf to <4 x float>
; Test: sitofp <16 x i8> -> <16 x float> then keep only the low 4 floats.
; SSE converts just the low lanes; AVX2 goes through vpmovsxbw/vpmovsxwd to a
; ymm conversion before discarding the upper lanes.
939 define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
940 ; SSE-LABEL: sitofp_16i8_to_4f32:
942 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
943 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
944 ; SSE-NEXT: psrad $24, %xmm0
945 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
948 ; AVX1-LABEL: sitofp_16i8_to_4f32:
950 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
951 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
952 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
953 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
954 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
956 ; AVX1-NEXT: vzeroupper
959 ; AVX2-LABEL: sitofp_16i8_to_4f32:
961 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
962 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
963 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
965 ; AVX2-NEXT: vzeroupper
967 %cvt = sitofp <16 x i8> %a to <16 x float>
968 %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
969 ret <4 x float> %shuf
; Test: full sitofp <4 x i64> -> <4 x float>. There is no packed i64->f32
; conversion pre-AVX512DQ, so each element is extracted to a GPR and converted
; with (v)cvtsi2ssq, then the four scalars are reassembled via unpcklps (SSE)
; or vinsertps (AVX).
972 define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
973 ; SSE-LABEL: sitofp_4i64_to_4f32:
975 ; SSE-NEXT: movd %xmm1, %rax
976 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3
977 ; SSE-NEXT: movd %xmm0, %rax
978 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2
979 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
980 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
981 ; SSE-NEXT: movd %xmm1, %rax
982 ; SSE-NEXT: xorps %xmm1, %xmm1
983 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
984 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
985 ; SSE-NEXT: movd %xmm0, %rax
986 ; SSE-NEXT: xorps %xmm0, %xmm0
987 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
988 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
989 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
990 ; SSE-NEXT: movaps %xmm2, %xmm0
993 ; AVX1-LABEL: sitofp_4i64_to_4f32:
995 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
996 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
997 ; AVX1-NEXT: vmovq %xmm0, %rax
998 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
999 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1000 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1001 ; AVX1-NEXT: vmovq %xmm0, %rax
1002 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1003 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1004 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
1005 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
1006 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1007 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1008 ; AVX1-NEXT: vzeroupper
1011 ; AVX2-LABEL: sitofp_4i64_to_4f32:
1013 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
1014 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1015 ; AVX2-NEXT: vmovq %xmm0, %rax
1016 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1017 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1018 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1019 ; AVX2-NEXT: vmovq %xmm0, %rax
1020 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1021 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1022 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
1023 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
1024 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1025 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1026 ; AVX2-NEXT: vzeroupper
1028 %cvt = sitofp <4 x i64> %a to <4 x float>
1029 ret <4 x float> %cvt
; Test: sitofp <8 x i32> -> <8 x float>; SSE needs two cvtdq2ps (one per xmm
; half), AVX does it in a single ymm cvtdq2ps.
1032 define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) {
1033 ; SSE-LABEL: sitofp_8i32_to_8f32:
1035 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1036 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
1039 ; AVX-LABEL: sitofp_8i32_to_8f32:
1041 ; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
1043 %cvt = sitofp <8 x i32> %a to <8 x float>
1044 ret <8 x float> %cvt
; Test: sitofp <8 x i16> -> <8 x float>. SSE sign-extends low/high halves
; separately (punpcklwd/punpckhwd + psrad $16); AVX1 builds a ymm from two
; vpmovsxwd halves; AVX2 uses one vpmovsxwd to ymm.
1047 define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
1048 ; SSE-LABEL: sitofp_8i16_to_8f32:
1050 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1051 ; SSE-NEXT: psrad $16, %xmm1
1052 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
1053 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1054 ; SSE-NEXT: psrad $16, %xmm0
1055 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
1056 ; SSE-NEXT: movaps %xmm2, %xmm0
1059 ; AVX1-LABEL: sitofp_8i16_to_8f32:
1061 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
1062 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1063 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
1064 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1065 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1068 ; AVX2-LABEL: sitofp_8i16_to_8f32:
1070 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
1071 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1073 %cvt = sitofp <8 x i16> %a to <8 x float>
1074 ret <8 x float> %cvt
; Test: sitofp of the low 8 x i8 lanes to <8 x float>. SSE sign-extends each
; half through byte/word unpacks plus psrad $24; AVX uses vpmovsxbd.
1077 define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
1078 ; SSE-LABEL: sitofp_8i8_to_8f32:
1080 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1081 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1082 ; SSE-NEXT: psrad $24, %xmm1
1083 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
1084 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1085 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1086 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1087 ; SSE-NEXT: psrad $24, %xmm0
1088 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
1089 ; SSE-NEXT: movaps %xmm2, %xmm0
1092 ; AVX1-LABEL: sitofp_8i8_to_8f32:
1094 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
1095 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1096 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
1097 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1098 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1101 ; AVX2-LABEL: sitofp_8i8_to_8f32:
1103 ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
1104 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1106 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1107 %cvt = sitofp <8 x i8> %shuf to <8 x float>
1108 ret <8 x float> %cvt
; Test: sitofp <16 x i8> -> <16 x float> then keep the low 8 floats. Expected
; codegen matches sitofp_8i8_to_8f32 — the unused upper conversions are
; eliminated (AVX2 still widens via vpmovsxbw/vpmovsxwd).
1111 define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
1112 ; SSE-LABEL: sitofp_16i8_to_8f32:
1114 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1115 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
1116 ; SSE-NEXT: psrad $24, %xmm1
1117 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
1118 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1119 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1120 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1121 ; SSE-NEXT: psrad $24, %xmm0
1122 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
1123 ; SSE-NEXT: movaps %xmm2, %xmm0
1126 ; AVX1-LABEL: sitofp_16i8_to_8f32:
1128 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
1129 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1130 ; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
1131 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1132 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1135 ; AVX2-LABEL: sitofp_16i8_to_8f32:
1137 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
1138 ; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
1139 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1141 %cvt = sitofp <16 x i8> %a to <16 x float>
1142 %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1143 ret <8 x float> %shuf
1147 ; Unsigned Integer to Float
; Test: uitofp <2 x i64> -> <2 x float>, widened to <4 x float> with undef
; upper lanes. With no unsigned i64->f32 instruction, each element takes the
; standard branchy sequence: if the value is non-negative, convert directly
; with cvtsi2ssq; if the sign bit is set, halve it (shr, or-ing back the low
; bit for round-to-nearest correctness), convert, then double with addss.
1150 define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
1151 ; SSE-LABEL: uitofp_2i64_to_4f32:
1153 ; SSE-NEXT: movdqa %xmm0, %xmm1
1154 ; SSE-NEXT: movd %xmm1, %rax
1155 ; SSE-NEXT: movl %eax, %ecx
1156 ; SSE-NEXT: andl $1, %ecx
1157 ; SSE-NEXT: testq %rax, %rax
1158 ; SSE-NEXT: js .LBB38_1
1160 ; SSE-NEXT: xorps %xmm0, %xmm0
1161 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
1162 ; SSE-NEXT: jmp .LBB38_3
1163 ; SSE-NEXT: .LBB38_1:
1164 ; SSE-NEXT: shrq %rax
1165 ; SSE-NEXT: orq %rax, %rcx
1166 ; SSE-NEXT: xorps %xmm0, %xmm0
1167 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
1168 ; SSE-NEXT: addss %xmm0, %xmm0
1169 ; SSE-NEXT: .LBB38_3:
1170 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1171 ; SSE-NEXT: movd %xmm1, %rax
1172 ; SSE-NEXT: movl %eax, %ecx
1173 ; SSE-NEXT: andl $1, %ecx
1174 ; SSE-NEXT: testq %rax, %rax
1175 ; SSE-NEXT: js .LBB38_4
1177 ; SSE-NEXT: xorps %xmm1, %xmm1
1178 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
1179 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1181 ; SSE-NEXT: .LBB38_4:
1182 ; SSE-NEXT: shrq %rax
1183 ; SSE-NEXT: orq %rax, %rcx
1184 ; SSE-NEXT: xorps %xmm1, %xmm1
1185 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
1186 ; SSE-NEXT: addss %xmm1, %xmm1
1187 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1190 ; AVX-LABEL: uitofp_2i64_to_4f32:
1192 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
1193 ; AVX-NEXT: movl %eax, %ecx
1194 ; AVX-NEXT: andl $1, %ecx
1195 ; AVX-NEXT: testq %rax, %rax
1196 ; AVX-NEXT: js .LBB38_1
1198 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1199 ; AVX-NEXT: jmp .LBB38_3
1200 ; AVX-NEXT: .LBB38_1:
1201 ; AVX-NEXT: shrq %rax
1202 ; AVX-NEXT: orq %rax, %rcx
1203 ; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1204 ; AVX-NEXT: vaddss %xmm1, %xmm1, %xmm1
1205 ; AVX-NEXT: .LBB38_3:
1206 ; AVX-NEXT: vmovq %xmm0, %rax
1207 ; AVX-NEXT: movl %eax, %ecx
1208 ; AVX-NEXT: andl $1, %ecx
1209 ; AVX-NEXT: testq %rax, %rax
1210 ; AVX-NEXT: js .LBB38_4
1212 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1213 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1214 ; AVX-NEXT: jmp .LBB38_6
1215 ; AVX-NEXT: .LBB38_4:
1216 ; AVX-NEXT: shrq %rax
1217 ; AVX-NEXT: orq %rax, %rcx
1218 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1219 ; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1220 ; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0
1221 ; AVX-NEXT: .LBB38_6:
1222 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
1223 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1224 ; AVX-NEXT: testq %rax, %rax
1225 ; AVX-NEXT: js .LBB38_8
1227 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1228 ; AVX-NEXT: .LBB38_8:
1229 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
1230 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
1232 %cvt = uitofp <2 x i64> %a to <2 x float>
1233 %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1234 ret <4 x float> %ext
; Test: widen <2 x i64> to <4 x i64> (undef upper lanes) then uitofp to
; <4 x float>. Uses the same sign-test + shift/or/convert/double pattern per
; element as uitofp_2i64_to_4f32; some interior CHECK lines are elided in
; this excerpt.
1237 define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
1238 ; SSE-LABEL: uitofp_4i64_to_4f32_undef:
1240 ; SSE-NEXT: movdqa %xmm0, %xmm1
1241 ; SSE-NEXT: testq %rax, %rax
1242 ; SSE-NEXT: xorps %xmm2, %xmm2
1243 ; SSE-NEXT: js .LBB39_2
1245 ; SSE-NEXT: xorps %xmm2, %xmm2
1246 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2
1247 ; SSE-NEXT: .LBB39_2:
1248 ; SSE-NEXT: movd %xmm1, %rax
1249 ; SSE-NEXT: movl %eax, %ecx
1250 ; SSE-NEXT: andl $1, %ecx
1251 ; SSE-NEXT: testq %rax, %rax
1252 ; SSE-NEXT: js .LBB39_3
1254 ; SSE-NEXT: xorps %xmm0, %xmm0
1255 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
1256 ; SSE-NEXT: jmp .LBB39_5
1257 ; SSE-NEXT: .LBB39_3:
1258 ; SSE-NEXT: shrq %rax
1259 ; SSE-NEXT: orq %rax, %rcx
1260 ; SSE-NEXT: xorps %xmm0, %xmm0
1261 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
1262 ; SSE-NEXT: addss %xmm0, %xmm0
1263 ; SSE-NEXT: .LBB39_5:
1264 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1265 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1266 ; SSE-NEXT: movd %xmm1, %rax
1267 ; SSE-NEXT: movl %eax, %ecx
1268 ; SSE-NEXT: andl $1, %ecx
1269 ; SSE-NEXT: testq %rax, %rax
1270 ; SSE-NEXT: js .LBB39_6
1272 ; SSE-NEXT: xorps %xmm1, %xmm1
1273 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
1274 ; SSE-NEXT: jmp .LBB39_8
1275 ; SSE-NEXT: .LBB39_6:
1276 ; SSE-NEXT: shrq %rax
1277 ; SSE-NEXT: orq %rax, %rcx
1278 ; SSE-NEXT: xorps %xmm1, %xmm1
1279 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
1280 ; SSE-NEXT: addss %xmm1, %xmm1
1281 ; SSE-NEXT: .LBB39_8:
1282 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1283 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1286 ; AVX-LABEL: uitofp_4i64_to_4f32_undef:
1288 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
1289 ; AVX-NEXT: movl %eax, %ecx
1290 ; AVX-NEXT: andl $1, %ecx
1291 ; AVX-NEXT: testq %rax, %rax
1292 ; AVX-NEXT: js .LBB39_1
1294 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1295 ; AVX-NEXT: jmp .LBB39_3
1296 ; AVX-NEXT: .LBB39_1:
1297 ; AVX-NEXT: shrq %rax
1298 ; AVX-NEXT: orq %rax, %rcx
1299 ; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1300 ; AVX-NEXT: vaddss %xmm1, %xmm1, %xmm1
1301 ; AVX-NEXT: .LBB39_3:
1302 ; AVX-NEXT: vmovq %xmm0, %rax
1303 ; AVX-NEXT: movl %eax, %ecx
1304 ; AVX-NEXT: andl $1, %ecx
1305 ; AVX-NEXT: testq %rax, %rax
1306 ; AVX-NEXT: js .LBB39_4
1308 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1309 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1310 ; AVX-NEXT: jmp .LBB39_6
1311 ; AVX-NEXT: .LBB39_4:
1312 ; AVX-NEXT: shrq %rax
1313 ; AVX-NEXT: orq %rax, %rcx
1314 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1315 ; AVX-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1316 ; AVX-NEXT: vaddss %xmm0, %xmm0, %xmm0
1317 ; AVX-NEXT: .LBB39_6:
1318 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
1319 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
1320 ; AVX-NEXT: testq %rax, %rax
1321 ; AVX-NEXT: js .LBB39_8
1323 ; AVX-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1324 ; AVX-NEXT: .LBB39_8:
1325 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
1326 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
1328 %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1329 %cvt = uitofp <4 x i64> %ext to <4 x float>
1330 ret <4 x float> %cvt
; Test: uitofp <4 x i32> -> <4 x float> via the magic-constant trick: split
; each u32 into low/high 16-bit halves, OR each half with a float exponent
; pattern (values loaded from constant pool, hence the {{.*}}(%rip) matches),
; bias-correct with addps, and sum the two partial conversions.
1333 define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
1334 ; SSE-LABEL: uitofp_4i32_to_4f32:
1336 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
1337 ; SSE-NEXT: pand %xmm0, %xmm1
1338 ; SSE-NEXT: por {{.*}}(%rip), %xmm1
1339 ; SSE-NEXT: psrld $16, %xmm0
1340 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
1341 ; SSE-NEXT: addps {{.*}}(%rip), %xmm0
1342 ; SSE-NEXT: addps %xmm1, %xmm0
1345 ; AVX1-LABEL: uitofp_4i32_to_4f32:
1347 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
1348 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
1349 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
1350 ; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
1351 ; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
1354 ; AVX2-LABEL: uitofp_4i32_to_4f32:
1356 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
1357 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
1358 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
1359 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
1360 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
1361 ; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
1362 ; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0
1363 ; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
1365 %cvt = uitofp <4 x i32> %a to <4 x float>
1366 ret <4 x float> %cvt
; Test: uitofp of the low 4 x i16 lanes. Zero-extension (punpcklwd with zero /
; vpmovzxwd) makes the values valid signed i32s, so a plain cvtdq2ps suffices.
1369 define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) {
1370 ; SSE-LABEL: uitofp_4i16_to_4f32:
1372 ; SSE-NEXT: pxor %xmm1, %xmm1
1373 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1374 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1377 ; AVX-LABEL: uitofp_4i16_to_4f32:
1379 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1380 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
1382 %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1383 %cvt = uitofp <4 x i16> %shuf to <4 x float>
1384 ret <4 x float> %cvt
; Test: uitofp <8 x i16> -> <8 x float> then keep only the low 4 floats.
; SSE converts just the needed half after zero-extension; AVX widens to ymm
; (vpmovzxwd) and drops the upper half, ending with vzeroupper.
1387 define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
1388 ; SSE-LABEL: uitofp_8i16_to_4f32:
1390 ; SSE-NEXT: pxor %xmm1, %xmm1
1391 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1392 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1395 ; AVX1-LABEL: uitofp_8i16_to_4f32:
1397 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1398 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1399 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1400 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1401 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1403 ; AVX1-NEXT: vzeroupper
1406 ; AVX2-LABEL: uitofp_8i16_to_4f32:
1408 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1409 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1411 ; AVX2-NEXT: vzeroupper
1413 %cvt = uitofp <8 x i16> %a to <8 x float>
1414 %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1415 ret <4 x float> %shuf
; Test: uitofp of the low 4 x i8 lanes. Zero-extends to i32 (two unpacks with
; zero on SSE, vpmovzxbd on AVX) then converts with cvtdq2ps.
1418 define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) {
1419 ; SSE-LABEL: uitofp_4i8_to_4f32:
1421 ; SSE-NEXT: pxor %xmm1, %xmm1
1422 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1423 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1424 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1427 ; AVX-LABEL: uitofp_4i8_to_4f32:
1429 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1430 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
1432 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1433 %cvt = uitofp <4 x i8> %shuf to <4 x float>
1434 ret <4 x float> %cvt
; Test: uitofp <16 x i8> -> <16 x float> then keep only the low 4 floats.
; SSE converts just the low lanes; AVX2 still widens through
; vpmovzxbw/vpmovzxwd to a ymm conversion before discarding the upper lanes.
1437 define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
1438 ; SSE-LABEL: uitofp_16i8_to_4f32:
1440 ; SSE-NEXT: pxor %xmm1, %xmm1
1441 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1442 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1443 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1446 ; AVX1-LABEL: uitofp_16i8_to_4f32:
1448 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1449 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1450 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1451 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1452 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1454 ; AVX1-NEXT: vzeroupper
1457 ; AVX2-LABEL: uitofp_16i8_to_4f32:
1459 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1460 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1461 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1463 ; AVX2-NEXT: vzeroupper
1465 %cvt = uitofp <16 x i8> %a to <16 x float>
1466 %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1467 ret <4 x float> %shuf
; Test: full uitofp <4 x i64> -> <4 x float>. Each of the four elements takes
; the branchy unsigned-i64 sequence (test sign; either direct cvtsi2ssq or
; shift/or low bit/convert/double via addss), then the scalars are merged
; with unpcklps (SSE) or vinsertps (AVX). On AVX the tail duplicates the
; final vinsertps + vzeroupper on both sides of the last branch.
1470 define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
1471 ; SSE-LABEL: uitofp_4i64_to_4f32:
1473 ; SSE-NEXT: movd %xmm1, %rax
1474 ; SSE-NEXT: movl %eax, %ecx
1475 ; SSE-NEXT: andl $1, %ecx
1476 ; SSE-NEXT: testq %rax, %rax
1477 ; SSE-NEXT: js .LBB45_1
1479 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3
1480 ; SSE-NEXT: jmp .LBB45_3
1481 ; SSE-NEXT: .LBB45_1:
1482 ; SSE-NEXT: shrq %rax
1483 ; SSE-NEXT: orq %rax, %rcx
1484 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm3
1485 ; SSE-NEXT: addss %xmm3, %xmm3
1486 ; SSE-NEXT: .LBB45_3:
1487 ; SSE-NEXT: movd %xmm0, %rax
1488 ; SSE-NEXT: movl %eax, %ecx
1489 ; SSE-NEXT: andl $1, %ecx
1490 ; SSE-NEXT: testq %rax, %rax
1491 ; SSE-NEXT: js .LBB45_4
1493 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2
1494 ; SSE-NEXT: jmp .LBB45_6
1495 ; SSE-NEXT: .LBB45_4:
1496 ; SSE-NEXT: shrq %rax
1497 ; SSE-NEXT: orq %rax, %rcx
1498 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm2
1499 ; SSE-NEXT: addss %xmm2, %xmm2
1500 ; SSE-NEXT: .LBB45_6:
1501 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1502 ; SSE-NEXT: movd %xmm1, %rax
1503 ; SSE-NEXT: movl %eax, %ecx
1504 ; SSE-NEXT: andl $1, %ecx
1505 ; SSE-NEXT: testq %rax, %rax
1506 ; SSE-NEXT: js .LBB45_7
1508 ; SSE-NEXT: xorps %xmm1, %xmm1
1509 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
1510 ; SSE-NEXT: jmp .LBB45_9
1511 ; SSE-NEXT: .LBB45_7:
1512 ; SSE-NEXT: shrq %rax
1513 ; SSE-NEXT: orq %rax, %rcx
1514 ; SSE-NEXT: xorps %xmm1, %xmm1
1515 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
1516 ; SSE-NEXT: addss %xmm1, %xmm1
1517 ; SSE-NEXT: .LBB45_9:
1518 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1519 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1520 ; SSE-NEXT: movd %xmm0, %rax
1521 ; SSE-NEXT: movl %eax, %ecx
1522 ; SSE-NEXT: andl $1, %ecx
1523 ; SSE-NEXT: testq %rax, %rax
1524 ; SSE-NEXT: js .LBB45_10
1525 ; SSE-NEXT: # BB#11:
1526 ; SSE-NEXT: xorps %xmm0, %xmm0
1527 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
1528 ; SSE-NEXT: jmp .LBB45_12
1529 ; SSE-NEXT: .LBB45_10:
1530 ; SSE-NEXT: shrq %rax
1531 ; SSE-NEXT: orq %rax, %rcx
1532 ; SSE-NEXT: xorps %xmm0, %xmm0
1533 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
1534 ; SSE-NEXT: addss %xmm0, %xmm0
1535 ; SSE-NEXT: .LBB45_12:
1536 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1537 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1538 ; SSE-NEXT: movaps %xmm2, %xmm0
1541 ; AVX1-LABEL: uitofp_4i64_to_4f32:
1543 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
1544 ; AVX1-NEXT: movl %eax, %ecx
1545 ; AVX1-NEXT: andl $1, %ecx
1546 ; AVX1-NEXT: testq %rax, %rax
1547 ; AVX1-NEXT: js .LBB45_1
1548 ; AVX1-NEXT: # BB#2:
1549 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1550 ; AVX1-NEXT: jmp .LBB45_3
1551 ; AVX1-NEXT: .LBB45_1:
1552 ; AVX1-NEXT: shrq %rax
1553 ; AVX1-NEXT: orq %rax, %rcx
1554 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1555 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
1556 ; AVX1-NEXT: .LBB45_3:
1557 ; AVX1-NEXT: vmovq %xmm0, %rax
1558 ; AVX1-NEXT: movl %eax, %ecx
1559 ; AVX1-NEXT: andl $1, %ecx
1560 ; AVX1-NEXT: testq %rax, %rax
1561 ; AVX1-NEXT: js .LBB45_4
1562 ; AVX1-NEXT: # BB#5:
1563 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1564 ; AVX1-NEXT: jmp .LBB45_6
1565 ; AVX1-NEXT: .LBB45_4:
1566 ; AVX1-NEXT: shrq %rax
1567 ; AVX1-NEXT: orq %rax, %rcx
1568 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1569 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
1570 ; AVX1-NEXT: .LBB45_6:
1571 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1572 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1573 ; AVX1-NEXT: vmovq %xmm0, %rax
1574 ; AVX1-NEXT: movl %eax, %ecx
1575 ; AVX1-NEXT: andl $1, %ecx
1576 ; AVX1-NEXT: testq %rax, %rax
1577 ; AVX1-NEXT: js .LBB45_7
1578 ; AVX1-NEXT: # BB#8:
1579 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1580 ; AVX1-NEXT: jmp .LBB45_9
1581 ; AVX1-NEXT: .LBB45_7:
1582 ; AVX1-NEXT: shrq %rax
1583 ; AVX1-NEXT: orq %rax, %rcx
1584 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1585 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
1586 ; AVX1-NEXT: .LBB45_9:
1587 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1588 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
1589 ; AVX1-NEXT: movl %eax, %ecx
1590 ; AVX1-NEXT: andl $1, %ecx
1591 ; AVX1-NEXT: testq %rax, %rax
1592 ; AVX1-NEXT: js .LBB45_10
1593 ; AVX1-NEXT: # BB#11:
1594 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
1595 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1596 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1597 ; AVX1-NEXT: vzeroupper
1599 ; AVX1-NEXT: .LBB45_10:
1600 ; AVX1-NEXT: shrq %rax
1601 ; AVX1-NEXT: orq %rax, %rcx
1602 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
1603 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1604 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
1605 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1606 ; AVX1-NEXT: vzeroupper
1609 ; AVX2-LABEL: uitofp_4i64_to_4f32:
1611 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
1612 ; AVX2-NEXT: movl %eax, %ecx
1613 ; AVX2-NEXT: andl $1, %ecx
1614 ; AVX2-NEXT: testq %rax, %rax
1615 ; AVX2-NEXT: js .LBB45_1
1616 ; AVX2-NEXT: # BB#2:
1617 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
1618 ; AVX2-NEXT: jmp .LBB45_3
1619 ; AVX2-NEXT: .LBB45_1:
1620 ; AVX2-NEXT: shrq %rax
1621 ; AVX2-NEXT: orq %rax, %rcx
1622 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
1623 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
1624 ; AVX2-NEXT: .LBB45_3:
1625 ; AVX2-NEXT: vmovq %xmm0, %rax
1626 ; AVX2-NEXT: movl %eax, %ecx
1627 ; AVX2-NEXT: andl $1, %ecx
1628 ; AVX2-NEXT: testq %rax, %rax
1629 ; AVX2-NEXT: js .LBB45_4
1630 ; AVX2-NEXT: # BB#5:
1631 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1632 ; AVX2-NEXT: jmp .LBB45_6
1633 ; AVX2-NEXT: .LBB45_4:
1634 ; AVX2-NEXT: shrq %rax
1635 ; AVX2-NEXT: orq %rax, %rcx
1636 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1637 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
1638 ; AVX2-NEXT: .LBB45_6:
1639 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
1640 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
1641 ; AVX2-NEXT: vmovq %xmm0, %rax
1642 ; AVX2-NEXT: movl %eax, %ecx
1643 ; AVX2-NEXT: andl $1, %ecx
1644 ; AVX2-NEXT: testq %rax, %rax
1645 ; AVX2-NEXT: js .LBB45_7
1646 ; AVX2-NEXT: # BB#8:
1647 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
1648 ; AVX2-NEXT: jmp .LBB45_9
1649 ; AVX2-NEXT: .LBB45_7:
1650 ; AVX2-NEXT: shrq %rax
1651 ; AVX2-NEXT: orq %rax, %rcx
1652 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
1653 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
1654 ; AVX2-NEXT: .LBB45_9:
1655 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1656 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
1657 ; AVX2-NEXT: movl %eax, %ecx
1658 ; AVX2-NEXT: andl $1, %ecx
1659 ; AVX2-NEXT: testq %rax, %rax
1660 ; AVX2-NEXT: js .LBB45_10
1661 ; AVX2-NEXT: # BB#11:
1662 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
1663 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
1664 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1665 ; AVX2-NEXT: vzeroupper
1667 ; AVX2-NEXT: .LBB45_10:
1668 ; AVX2-NEXT: shrq %rax
1669 ; AVX2-NEXT: orq %rax, %rcx
1670 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
1671 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
1672 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
1673 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1674 ; AVX2-NEXT: vzeroupper
1676 %cvt = uitofp <4 x i64> %a to <4 x float>
1677 ret <4 x float> %cvt
; Test: uitofp <8 x i32> -> <8 x float> via the split-halves magic-constant
; trick per 128-bit lane on SSE; AVX1 mixes an and/cvtdq2ps for the low bits
; with a shifted, scaled (vmulps) conversion of the high bits; AVX2 uses
; broadcasted constants with vpblendw.
1680 define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) {
1681 ; SSE-LABEL: uitofp_8i32_to_8f32:
1683 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
1684 ; SSE-NEXT: movdqa %xmm0, %xmm3
1685 ; SSE-NEXT: pand %xmm2, %xmm3
1686 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
1687 ; SSE-NEXT: por %xmm4, %xmm3
1688 ; SSE-NEXT: psrld $16, %xmm0
1689 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
1690 ; SSE-NEXT: por %xmm5, %xmm0
1691 ; SSE-NEXT: movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
1692 ; SSE-NEXT: addps %xmm6, %xmm0
1693 ; SSE-NEXT: addps %xmm3, %xmm0
1694 ; SSE-NEXT: pand %xmm1, %xmm2
1695 ; SSE-NEXT: por %xmm4, %xmm2
1696 ; SSE-NEXT: psrld $16, %xmm1
1697 ; SSE-NEXT: por %xmm5, %xmm1
1698 ; SSE-NEXT: addps %xmm6, %xmm1
1699 ; SSE-NEXT: addps %xmm2, %xmm1
1702 ; AVX1-LABEL: uitofp_8i32_to_8f32:
1704 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
1705 ; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
1706 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
1707 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
1708 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
1709 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
1710 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1711 ; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
1712 ; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0
1715 ; AVX2-LABEL: uitofp_8i32_to_8f32:
1717 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
1718 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
1719 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
1720 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
1721 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
1722 ; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
1723 ; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
1724 ; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
1726 %cvt = uitofp <8 x i32> %a to <8 x float>
1727 ret <8 x float> %cvt
; Test: uitofp <8 x i16> -> <8 x float>. Zero-extends each half (unpack with
; zero on SSE, vpmovzxwd on AVX) then converts with cvtdq2ps; AVX2 does it in
; one ymm conversion.
1730 define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
1731 ; SSE-LABEL: uitofp_8i16_to_8f32:
1733 ; SSE-NEXT: pxor %xmm1, %xmm1
1734 ; SSE-NEXT: movdqa %xmm0, %xmm2
1735 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1736 ; SSE-NEXT: cvtdq2ps %xmm2, %xmm2
1737 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1738 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
1739 ; SSE-NEXT: movaps %xmm2, %xmm0
1742 ; AVX1-LABEL: uitofp_8i16_to_8f32:
1744 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
1745 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1746 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
1747 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1748 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1751 ; AVX2-LABEL: uitofp_8i16_to_8f32:
1753 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1754 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1756 %cvt = uitofp <8 x i16> %a to <8 x float>
1757 ret <8 x float> %cvt
; Test: uitofp of the low 8 x i8 lanes to <8 x float>. SSE zero-extends via
; byte then word unpacks with a zero register; AVX uses vpmovzxbd (AVX2 in a
; single ymm step).
1760 define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
1761 ; SSE-LABEL: uitofp_8i8_to_8f32:
1763 ; SSE-NEXT: pxor %xmm1, %xmm1
1764 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1765 ; SSE-NEXT: movdqa %xmm0, %xmm2
1766 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1767 ; SSE-NEXT: cvtdq2ps %xmm2, %xmm2
1768 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1769 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
1770 ; SSE-NEXT: movaps %xmm2, %xmm0
1773 ; AVX1-LABEL: uitofp_8i8_to_8f32:
1775 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1776 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1777 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1778 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1779 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1782 ; AVX2-LABEL: uitofp_8i8_to_8f32:
1784 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
1785 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1787 %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1788 %cvt = uitofp <8 x i8> %shuf to <8 x float>
1789 ret <8 x float> %cvt
; uitofp <16 x i8> -> <16 x float> where only the low 8 results are kept:
; checks that demanded-elements analysis narrows the work to 8 lanes
; (zero-extend + signed cvtdq2ps, same as the 8i8 case above).
1792 define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
1793 ; SSE-LABEL: uitofp_16i8_to_8f32:
1795 ; SSE-NEXT: pxor %xmm1, %xmm1
1796 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1797 ; SSE-NEXT: movdqa %xmm0, %xmm2
1798 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1799 ; SSE-NEXT: cvtdq2ps %xmm2, %xmm2
1800 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1801 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
1802 ; SSE-NEXT: movaps %xmm2, %xmm0
1805 ; AVX1-LABEL: uitofp_16i8_to_8f32:
1807 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1808 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
1809 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1810 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1811 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
1814 ; AVX2-LABEL: uitofp_16i8_to_8f32:
1816 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1817 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1818 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
1820 %cvt = uitofp <16 x i8> %a to <16 x float>
1821 %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1822 ret <8 x float> %shuf
1826 ; Load Signed Integer to Double
; Loaded sitofp <2 x i64> -> <2 x double>: no packed i64->f64 instruction
; at these feature levels, so each element goes through scalar cvtsi2sdq
; and the results are repacked with unpcklpd.
1829 define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
1830 ; SSE-LABEL: sitofp_load_2i64_to_2f64:
1832 ; SSE-NEXT: movdqa (%rdi), %xmm1
1833 ; SSE-NEXT: movd %xmm1, %rax
1834 ; SSE-NEXT: cvtsi2sdq %rax, %xmm0
1835 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1836 ; SSE-NEXT: movd %xmm1, %rax
1837 ; SSE-NEXT: xorps %xmm1, %xmm1
1838 ; SSE-NEXT: cvtsi2sdq %rax, %xmm1
1839 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1842 ; AVX-LABEL: sitofp_load_2i64_to_2f64:
1844 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1845 ; AVX-NEXT: vpextrq $1, %xmm0, %rax
1846 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
1847 ; AVX-NEXT: vmovq %xmm0, %rax
1848 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
1849 ; AVX-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
1850 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1852 %ld = load <2 x i64>, <2 x i64> *%a
1853 %cvt = sitofp <2 x i64> %ld to <2 x double>
1854 ret <2 x double> %cvt
; Loaded sitofp <2 x i32> -> <2 x double>: the load should fold straight
; into a single (v)cvtdq2pd with a memory operand.
1857 define <2 x double> @sitofp_load_2i32_to_2f64(<2 x i32> *%a) {
1858 ; SSE-LABEL: sitofp_load_2i32_to_2f64:
1860 ; SSE-NEXT: cvtdq2pd (%rdi), %xmm0
1863 ; AVX-LABEL: sitofp_load_2i32_to_2f64:
1865 ; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0
1867 %ld = load <2 x i32>, <2 x i32> *%a
1868 %cvt = sitofp <2 x i32> %ld to <2 x double>
1869 ret <2 x double> %cvt
; Loaded sitofp <2 x i16> -> <2 x double>: SSE sign-extends via
; punpcklwd + psrad $16; AVX uses vpmovsxwq from memory, then cvtdq2pd.
1872 define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) {
1873 ; SSE-LABEL: sitofp_load_2i16_to_2f64:
1875 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1876 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1877 ; SSE-NEXT: psrad $16, %xmm0
1878 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
1881 ; AVX-LABEL: sitofp_load_2i16_to_2f64:
1883 ; AVX-NEXT: vpmovsxwq (%rdi), %xmm0
1884 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1885 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
1887 %ld = load <2 x i16>, <2 x i16> *%a
1888 %cvt = sitofp <2 x i16> %ld to <2 x double>
1889 ret <2 x double> %cvt
; Loaded sitofp <2 x i8> -> <2 x double>: the two bytes load as one
; movzwl; SSE sign-extends with unpacks + psrad $24, AVX with vpmovsxbq.
1892 define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) {
1893 ; SSE-LABEL: sitofp_load_2i8_to_2f64:
1895 ; SSE-NEXT: movzwl (%rdi), %eax
1896 ; SSE-NEXT: movd %eax, %xmm0
1897 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1898 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1899 ; SSE-NEXT: psrad $24, %xmm0
1900 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
1903 ; AVX-LABEL: sitofp_load_2i8_to_2f64:
1905 ; AVX-NEXT: vpmovsxbq (%rdi), %xmm0
1906 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1907 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
1909 %ld = load <2 x i8>, <2 x i8> *%a
1910 %cvt = sitofp <2 x i8> %ld to <2 x double>
1911 ret <2 x double> %cvt
; Loaded sitofp <4 x i64> -> <4 x double>: four scalar cvtsi2sdq
; conversions repacked with unpcklpd (and vinsertf128 for the ymm result
; on AVX). AVX1 and AVX2 differ only in extractf128 vs extracti128.
1914 define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
1915 ; SSE-LABEL: sitofp_load_4i64_to_4f64:
1917 ; SSE-NEXT: movdqa (%rdi), %xmm1
1918 ; SSE-NEXT: movdqa 16(%rdi), %xmm2
1919 ; SSE-NEXT: movd %xmm1, %rax
1920 ; SSE-NEXT: cvtsi2sdq %rax, %xmm0
1921 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1922 ; SSE-NEXT: movd %xmm1, %rax
1923 ; SSE-NEXT: xorps %xmm1, %xmm1
1924 ; SSE-NEXT: cvtsi2sdq %rax, %xmm1
1925 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1926 ; SSE-NEXT: movd %xmm2, %rax
1927 ; SSE-NEXT: xorps %xmm1, %xmm1
1928 ; SSE-NEXT: cvtsi2sdq %rax, %xmm1
1929 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
1930 ; SSE-NEXT: movd %xmm2, %rax
1931 ; SSE-NEXT: xorps %xmm2, %xmm2
1932 ; SSE-NEXT: cvtsi2sdq %rax, %xmm2
1933 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1936 ; AVX1-LABEL: sitofp_load_4i64_to_4f64:
1938 ; AVX1-NEXT: vmovaps (%rdi), %ymm0
1939 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
1940 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax
1941 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
1942 ; AVX1-NEXT: vmovq %xmm1, %rax
1943 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
1944 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1945 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
1946 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
1947 ; AVX1-NEXT: vmovq %xmm0, %rax
1948 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
1949 ; AVX1-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
1950 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
1951 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1954 ; AVX2-LABEL: sitofp_load_4i64_to_4f64:
1956 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1957 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1958 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax
1959 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
1960 ; AVX2-NEXT: vmovq %xmm1, %rax
1961 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm1
1962 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1963 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
1964 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm2
1965 ; AVX2-NEXT: vmovq %xmm0, %rax
1966 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
1967 ; AVX2-NEXT: vcvtsi2sdq %rax, %xmm0, %xmm0
1968 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
1969 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1971 %ld = load <4 x i64>, <4 x i64> *%a
1972 %cvt = sitofp <4 x i64> %ld to <4 x double>
1973 ret <4 x double> %cvt
; Loaded sitofp <4 x i32> -> <4 x double>: SSE needs two cvtdq2pd
; (low/high pairs); AVX folds the load into one ymm vcvtdq2pd.
1976 define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) {
1977 ; SSE-LABEL: sitofp_load_4i32_to_4f64:
1979 ; SSE-NEXT: movdqa (%rdi), %xmm1
1980 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
1981 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1982 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
1985 ; AVX-LABEL: sitofp_load_4i32_to_4f64:
1987 ; AVX-NEXT: vcvtdq2pd (%rdi), %ymm0
1989 %ld = load <4 x i32>, <4 x i32> *%a
1990 %cvt = sitofp <4 x i32> %ld to <4 x double>
1991 ret <4 x double> %cvt
; Loaded sitofp <4 x i16> -> <4 x double>: sign-extend to i32 first
; (punpcklwd + psrad $16 on SSE, vpmovsxwd from memory on AVX).
1994 define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) {
1995 ; SSE-LABEL: sitofp_load_4i16_to_4f64:
1997 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
1998 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1999 ; SSE-NEXT: psrad $16, %xmm1
2000 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
2001 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2002 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
2005 ; AVX-LABEL: sitofp_load_4i16_to_4f64:
2007 ; AVX-NEXT: vpmovsxwd (%rdi), %xmm0
2008 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
2010 %ld = load <4 x i16>, <4 x i16> *%a
2011 %cvt = sitofp <4 x i16> %ld to <4 x double>
2012 ret <4 x double> %cvt
; Loaded sitofp <4 x i8> -> <4 x double>: sign-extend i8 to i32 (unpacks
; + psrad $24 on SSE, vpmovsxbd from memory on AVX), then cvtdq2pd.
2015 define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
2016 ; SSE-LABEL: sitofp_load_4i8_to_4f64:
2018 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2019 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2020 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2021 ; SSE-NEXT: psrad $24, %xmm1
2022 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
2023 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2024 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
2027 ; AVX-LABEL: sitofp_load_4i8_to_4f64:
2029 ; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
2030 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
2032 %ld = load <4 x i8>, <4 x i8> *%a
2033 %cvt = sitofp <4 x i8> %ld to <4 x double>
2034 ret <4 x double> %cvt
2038 ; Load Unsigned Integer to Double
; Loaded uitofp <2 x i64> -> <2 x double>: the magic-constant bias trick.
; Each u64 is split into 32-bit halves interleaved with exponent words
; (0x43300000 = 2^52, 0x45300000 = 2^84), the biases are subtracted with
; subpd, and the two partial doubles are summed (addpd / vhaddpd).
2041 define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) {
2042 ; SSE-LABEL: uitofp_load_2i64_to_2f64:
2044 ; SSE-NEXT: movdqa (%rdi), %xmm1
2045 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
2046 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
2047 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2048 ; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
2049 ; SSE-NEXT: subpd %xmm4, %xmm1
2050 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
2051 ; SSE-NEXT: addpd %xmm1, %xmm0
2052 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2053 ; SSE-NEXT: subpd %xmm4, %xmm3
2054 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
2055 ; SSE-NEXT: addpd %xmm3, %xmm1
2056 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2059 ; AVX-LABEL: uitofp_load_2i64_to_2f64:
2061 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2062 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
2063 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2064 ; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
2065 ; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
2066 ; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
2067 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2068 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2069 ; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
2070 ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
2071 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
2073 %ld = load <2 x i64>, <2 x i64> *%a
2074 %cvt = uitofp <2 x i64> %ld to <2 x double>
2075 ret <2 x double> %cvt
; Loaded uitofp <2 x i32> -> <2 x double>: zero-extends the two u32s to
; u64 lanes and reuses the same exponent-bias (subpd + add) sequence as
; the 2i64 case above.
2078 define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
2079 ; SSE-LABEL: uitofp_load_2i32_to_2f64:
2081 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
2082 ; SSE-NEXT: pxor %xmm0, %xmm0
2083 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2084 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
2085 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
2086 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2087 ; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
2088 ; SSE-NEXT: subpd %xmm4, %xmm1
2089 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
2090 ; SSE-NEXT: addpd %xmm1, %xmm0
2091 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2092 ; SSE-NEXT: subpd %xmm4, %xmm3
2093 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
2094 ; SSE-NEXT: addpd %xmm3, %xmm1
2095 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2098 ; AVX-LABEL: uitofp_load_2i32_to_2f64:
2100 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
2101 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
2102 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2103 ; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
2104 ; AVX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
2105 ; AVX-NEXT: vhaddpd %xmm2, %xmm2, %xmm2
2106 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2107 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2108 ; AVX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
2109 ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
2110 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
2112 %ld = load <2 x i32>, <2 x i32> *%a
2113 %cvt = uitofp <2 x i32> %ld to <2 x double>
2114 ret <2 x double> %cvt
; Loaded uitofp <2 x i16> -> <2 x double>: u16 fits in i32, so a simple
; zero-extend followed by the signed cvtdq2pd suffices.
2117 define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) {
2118 ; SSE-LABEL: uitofp_load_2i16_to_2f64:
2120 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2121 ; SSE-NEXT: pxor %xmm1, %xmm1
2122 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2123 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
2126 ; AVX-LABEL: uitofp_load_2i16_to_2f64:
2128 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2129 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
2130 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
2132 %ld = load <2 x i16>, <2 x i16> *%a
2133 %cvt = uitofp <2 x i16> %ld to <2 x double>
2134 ret <2 x double> %cvt
; Loaded uitofp <2 x i8> -> <2 x double>: both bytes load via one movzwl,
; are zero-extended to i32 lanes, then converted with cvtdq2pd.
2137 define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) {
2138 ; SSE-LABEL: uitofp_load_2i8_to_2f64:
2140 ; SSE-NEXT: movzwl (%rdi), %eax
2141 ; SSE-NEXT: movd %eax, %xmm0
2142 ; SSE-NEXT: pxor %xmm1, %xmm1
2143 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2144 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2145 ; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
2148 ; AVX-LABEL: uitofp_load_2i8_to_2f64:
2150 ; AVX-NEXT: movzwl (%rdi), %eax
2151 ; AVX-NEXT: vmovd %eax, %xmm0
2152 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2153 ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
2155 %ld = load <2 x i8>, <2 x i8> *%a
2156 %cvt = uitofp <2 x i8> %ld to <2 x double>
2157 ret <2 x double> %cvt
; Loaded uitofp <4 x i64> -> <4 x double>: the exponent-bias trick from
; uitofp_load_2i64_to_2f64 applied per 128-bit half; AVX halves are
; recombined with vinsertf128.
2160 define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
2161 ; SSE-LABEL: uitofp_load_4i64_to_4f64:
2163 ; SSE-NEXT: movdqa (%rdi), %xmm1
2164 ; SSE-NEXT: movdqa 16(%rdi), %xmm2
2165 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
2166 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
2167 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
2168 ; SSE-NEXT: movapd {{.*#+}} xmm5 = [4.503600e+15,1.934281e+25]
2169 ; SSE-NEXT: subpd %xmm5, %xmm1
2170 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
2171 ; SSE-NEXT: addpd %xmm1, %xmm0
2172 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2173 ; SSE-NEXT: subpd %xmm5, %xmm4
2174 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,0,1]
2175 ; SSE-NEXT: addpd %xmm4, %xmm1
2176 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2177 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
2178 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2179 ; SSE-NEXT: subpd %xmm5, %xmm2
2180 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
2181 ; SSE-NEXT: addpd %xmm2, %xmm1
2182 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2183 ; SSE-NEXT: subpd %xmm5, %xmm4
2184 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
2185 ; SSE-NEXT: addpd %xmm4, %xmm2
2186 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2189 ; AVX1-LABEL: uitofp_load_4i64_to_4f64:
2191 ; AVX1-NEXT: vmovaps (%rdi), %ymm0
2192 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
2193 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
2194 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2195 ; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
2196 ; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
2197 ; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
2198 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2199 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2200 ; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1
2201 ; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
2202 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
2203 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2204 ; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
2205 ; AVX1-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
2206 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2207 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2208 ; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0
2209 ; AVX1-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
2210 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
2211 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2214 ; AVX2-LABEL: uitofp_load_4i64_to_4f64:
2216 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
2217 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
2218 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
2219 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2220 ; AVX2-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
2221 ; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
2222 ; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
2223 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2224 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2225 ; AVX2-NEXT: vsubpd %xmm4, %xmm1, %xmm1
2226 ; AVX2-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
2227 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm1[0]
2228 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2229 ; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
2230 ; AVX2-NEXT: vhaddpd %xmm3, %xmm3, %xmm3
2231 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2232 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2233 ; AVX2-NEXT: vsubpd %xmm4, %xmm0, %xmm0
2234 ; AVX2-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
2235 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm3[0],xmm0[0]
2236 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2238 %ld = load <4 x i64>, <4 x i64> *%a
2239 %cvt = uitofp <4 x i64> %ld to <4 x double>
2240 ret <4 x double> %cvt
; Loaded uitofp <4 x i32> -> <4 x double>: SSE falls back to the u64
; bias trick per pair; AVX splits each u32 into high (psrld $16) and low
; 16-bit parts, converts both with cvtdq2pd, and recombines as
; hi*65536 + lo (vmulpd with a constant, then vaddpd).
2243 define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
2244 ; SSE-LABEL: uitofp_load_4i32_to_4f64:
2246 ; SSE-NEXT: movdqa (%rdi), %xmm2
2247 ; SSE-NEXT: pxor %xmm1, %xmm1
2248 ; SSE-NEXT: movdqa %xmm2, %xmm3
2249 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2250 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1127219200,1160773632,0,0]
2251 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,0,1]
2252 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2253 ; SSE-NEXT: movapd {{.*#+}} xmm6 = [4.503600e+15,1.934281e+25]
2254 ; SSE-NEXT: subpd %xmm6, %xmm3
2255 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
2256 ; SSE-NEXT: addpd %xmm3, %xmm0
2257 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
2258 ; SSE-NEXT: subpd %xmm6, %xmm5
2259 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1]
2260 ; SSE-NEXT: addpd %xmm5, %xmm3
2261 ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
2262 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2263 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
2264 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2265 ; SSE-NEXT: subpd %xmm6, %xmm2
2266 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,0,1]
2267 ; SSE-NEXT: addpd %xmm2, %xmm1
2268 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2269 ; SSE-NEXT: subpd %xmm6, %xmm3
2270 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
2271 ; SSE-NEXT: addpd %xmm3, %xmm2
2272 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
2275 ; AVX1-LABEL: uitofp_load_4i32_to_4f64:
2277 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
2278 ; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
2279 ; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
2280 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
2281 ; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
2282 ; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
2283 ; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
2286 ; AVX2-LABEL: uitofp_load_4i32_to_4f64:
2288 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
2289 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
2290 ; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
2291 ; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
2292 ; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
2293 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
2294 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
2295 ; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
2296 ; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
2298 %ld = load <4 x i32>, <4 x i32> *%a
2299 %cvt = uitofp <4 x i32> %ld to <4 x double>
2300 ret <4 x double> %cvt
; Loaded uitofp <4 x i16> -> <4 x double>: zero-extend to i32 then
; signed cvtdq2pd; AVX folds the extend into vpmovzxwd from memory.
2303 define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) {
2304 ; SSE-LABEL: uitofp_load_4i16_to_4f64:
2306 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
2307 ; SSE-NEXT: pxor %xmm0, %xmm0
2308 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2309 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
2310 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2311 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
2314 ; AVX-LABEL: uitofp_load_4i16_to_4f64:
2316 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2317 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
2319 %ld = load <4 x i16>, <4 x i16> *%a
2320 %cvt = uitofp <4 x i16> %ld to <4 x double>
2321 ret <4 x double> %cvt
; Loaded uitofp <4 x i8> -> <4 x double>: zero-extend bytes to i32 then
; signed cvtdq2pd; AVX folds the extend into vpmovzxbd from memory.
2324 define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) {
2325 ; SSE-LABEL: uitofp_load_4i8_to_4f64:
2327 ; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2328 ; SSE-NEXT: pxor %xmm0, %xmm0
2329 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2330 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2331 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
2332 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2333 ; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
2336 ; AVX-LABEL: uitofp_load_4i8_to_4f64:
2338 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2339 ; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
2341 %ld = load <4 x i8>, <4 x i8> *%a
2342 %cvt = uitofp <4 x i8> %ld to <4 x double>
2343 ret <4 x double> %cvt
2347 ; Load Signed Integer to Float
; Loaded sitofp <4 x i64> -> <4 x float>: four scalar cvtsi2ssq
; conversions gathered with unpcklps (SSE) or vinsertps (AVX); AVX ends
; with vzeroupper since the ymm source is dead and the result is xmm.
2350 define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
2351 ; SSE-LABEL: sitofp_load_4i64_to_4f32:
2353 ; SSE-NEXT: movdqa (%rdi), %xmm1
2354 ; SSE-NEXT: movdqa 16(%rdi), %xmm2
2355 ; SSE-NEXT: movd %xmm2, %rax
2356 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3
2357 ; SSE-NEXT: movd %xmm1, %rax
2358 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
2359 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
2360 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
2361 ; SSE-NEXT: movd %xmm2, %rax
2362 ; SSE-NEXT: xorps %xmm2, %xmm2
2363 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2
2364 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2365 ; SSE-NEXT: movd %xmm1, %rax
2366 ; SSE-NEXT: xorps %xmm1, %xmm1
2367 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
2368 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2369 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2372 ; AVX1-LABEL: sitofp_load_4i64_to_4f32:
2374 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0
2375 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
2376 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
2377 ; AVX1-NEXT: vmovq %xmm0, %rax
2378 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2379 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
2380 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2381 ; AVX1-NEXT: vmovq %xmm0, %rax
2382 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2383 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
2384 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
2385 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
2386 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
2387 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
2388 ; AVX1-NEXT: vzeroupper
2391 ; AVX2-LABEL: sitofp_load_4i64_to_4f32:
2393 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
2394 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
2395 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
2396 ; AVX2-NEXT: vmovq %xmm0, %rax
2397 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2398 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
2399 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
2400 ; AVX2-NEXT: vmovq %xmm0, %rax
2401 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2402 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
2403 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
2404 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
2405 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
2406 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
2407 ; AVX2-NEXT: vzeroupper
2409 %ld = load <4 x i64>, <4 x i64> *%a
2410 %cvt = sitofp <4 x i64> %ld to <4 x float>
2411 ret <4 x float> %cvt
; Loaded sitofp <4 x i32> -> <4 x float>: the load should fold straight
; into a single (v)cvtdq2ps with a memory operand.
2414 define <4 x float> @sitofp_load_4i32_to_4f32(<4 x i32> *%a) {
2415 ; SSE-LABEL: sitofp_load_4i32_to_4f32:
2417 ; SSE-NEXT: cvtdq2ps (%rdi), %xmm0
2420 ; AVX-LABEL: sitofp_load_4i32_to_4f32:
2422 ; AVX-NEXT: vcvtdq2ps (%rdi), %xmm0
2424 %ld = load <4 x i32>, <4 x i32> *%a
2425 %cvt = sitofp <4 x i32> %ld to <4 x float>
2426 ret <4 x float> %cvt
; Loaded sitofp <4 x i16> -> <4 x float>: sign-extend to i32 first
; (punpcklwd + psrad $16 on SSE, vpmovsxwd from memory on AVX).
2429 define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) {
2430 ; SSE-LABEL: sitofp_load_4i16_to_4f32:
2432 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2433 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2434 ; SSE-NEXT: psrad $16, %xmm0
2435 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
2438 ; AVX-LABEL: sitofp_load_4i16_to_4f32:
2440 ; AVX-NEXT: vpmovsxwd (%rdi), %xmm0
2441 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
2443 %ld = load <4 x i16>, <4 x i16> *%a
2444 %cvt = sitofp <4 x i16> %ld to <4 x float>
2445 ret <4 x float> %cvt
; Loaded sitofp <4 x i8> -> <4 x float>: sign-extend bytes to i32
; (unpacks + psrad $24 on SSE, vpmovsxbd from memory on AVX).
2448 define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) {
2449 ; SSE-LABEL: sitofp_load_4i8_to_4f32:
2451 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2452 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2453 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2454 ; SSE-NEXT: psrad $24, %xmm0
2455 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
2458 ; AVX-LABEL: sitofp_load_4i8_to_4f32:
2460 ; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
2461 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
2463 %ld = load <4 x i8>, <4 x i8> *%a
2464 %cvt = sitofp <4 x i8> %ld to <4 x float>
2465 ret <4 x float> %cvt
; Loaded sitofp <8 x i64> -> <8 x float>: eight scalar cvtsi2ssq
; conversions, gathered with unpcklps (SSE) / vinsertps (AVX) and joined
; into the final ymm with vinsertf128.
2468 define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
2469 ; SSE-LABEL: sitofp_load_8i64_to_8f32:
2471 ; SSE-NEXT: movdqa (%rdi), %xmm1
2472 ; SSE-NEXT: movdqa 16(%rdi), %xmm2
2473 ; SSE-NEXT: movdqa 32(%rdi), %xmm3
2474 ; SSE-NEXT: movdqa 48(%rdi), %xmm4
2475 ; SSE-NEXT: movd %xmm2, %rax
2476 ; SSE-NEXT: cvtsi2ssq %rax, %xmm5
2477 ; SSE-NEXT: movd %xmm1, %rax
2478 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
2479 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
2480 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
2481 ; SSE-NEXT: movd %xmm2, %rax
2482 ; SSE-NEXT: xorps %xmm2, %xmm2
2483 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2
2484 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2485 ; SSE-NEXT: movd %xmm1, %rax
2486 ; SSE-NEXT: xorps %xmm1, %xmm1
2487 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
2488 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2489 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2490 ; SSE-NEXT: movd %xmm4, %rax
2491 ; SSE-NEXT: xorps %xmm2, %xmm2
2492 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2
2493 ; SSE-NEXT: movd %xmm3, %rax
2494 ; SSE-NEXT: xorps %xmm1, %xmm1
2495 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
2496 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2497 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1]
2498 ; SSE-NEXT: movd %xmm2, %rax
2499 ; SSE-NEXT: xorps %xmm2, %xmm2
2500 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2
2501 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
2502 ; SSE-NEXT: movd %xmm3, %rax
2503 ; SSE-NEXT: xorps %xmm3, %xmm3
2504 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3
2505 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2506 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
2509 ; AVX1-LABEL: sitofp_load_8i64_to_8f32:
2511 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0
2512 ; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
2513 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax
2514 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2515 ; AVX1-NEXT: vmovq %xmm1, %rax
2516 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
2517 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2518 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
2519 ; AVX1-NEXT: vmovq %xmm1, %rax
2520 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
2521 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
2522 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax
2523 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
2524 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
2525 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
2526 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2527 ; AVX1-NEXT: vmovq %xmm0, %rax
2528 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
2529 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2530 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2531 ; AVX1-NEXT: vmovq %xmm0, %rax
2532 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
2533 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
2534 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
2535 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
2536 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
2537 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
2538 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2541 ; AVX2-LABEL: sitofp_load_8i64_to_8f32:
2543 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
2544 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
2545 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax
2546 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2547 ; AVX2-NEXT: vmovq %xmm1, %rax
2548 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
2549 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2550 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
2551 ; AVX2-NEXT: vmovq %xmm1, %rax
2552 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
2553 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
2554 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax
2555 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
2556 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
2557 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
2558 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2559 ; AVX2-NEXT: vmovq %xmm0, %rax
2560 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
2561 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2562 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
2563 ; AVX2-NEXT: vmovq %xmm0, %rax
2564 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
2565 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
2566 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
2567 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
2568 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
2569 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
2570 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2572 %ld = load <8 x i64>, <8 x i64> *%a
2573 %cvt = sitofp <8 x i64> %ld to <8 x float>
2574 ret <8 x float> %cvt
; Loaded sitofp <8 x i32> -> <8 x float>: two folded cvtdq2ps on SSE,
; one ymm vcvtdq2ps with a memory operand on AVX.
2577 define <8 x float> @sitofp_load_8i32_to_8f32(<8 x i32> *%a) {
2578 ; SSE-LABEL: sitofp_load_8i32_to_8f32:
2580 ; SSE-NEXT: cvtdq2ps (%rdi), %xmm0
2581 ; SSE-NEXT: cvtdq2ps 16(%rdi), %xmm1
2584 ; AVX-LABEL: sitofp_load_8i32_to_8f32:
2586 ; AVX-NEXT: vcvtdq2ps (%rdi), %ymm0
2588 %ld = load <8 x i32>, <8 x i32> *%a
2589 %cvt = sitofp <8 x i32> %ld to <8 x float>
2590 ret <8 x float> %cvt
; Loaded sitofp <8 x i16> -> <8 x float>: sign-extend each half to i32
; then cvtdq2ps; AVX2 does the whole extend in one ymm vpmovsxwd.
2593 define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) {
2594 ; SSE-LABEL: sitofp_load_8i16_to_8f32:
2596 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2597 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2598 ; SSE-NEXT: psrad $16, %xmm0
2599 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
2600 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
2601 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
2602 ; SSE-NEXT: psrad $16, %xmm1
2603 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
2606 ; AVX1-LABEL: sitofp_load_8i16_to_8f32:
2608 ; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0
2609 ; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1
2610 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2611 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
2614 ; AVX2-LABEL: sitofp_load_8i16_to_8f32:
2616 ; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0
2617 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
2619 %ld = load <8 x i16>, <8 x i16> *%a
2620 %cvt = sitofp <8 x i16> %ld to <8 x float>
2621 ret <8 x float> %cvt
; Load <8 x i8> and sitofp to <8 x float>. SSE widens i8->i32 with two unpacks
; then psrad $24 to sign-extend; AVX1 goes i8->i16 (vpmovsxbw) then i16->i32
; per half; AVX2 sign-extends all eight bytes at once with vpmovsxbd.
2624 define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) {
2625 ; SSE-LABEL: sitofp_load_8i8_to_8f32:
2627 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2628 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2629 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
2630 ; SSE-NEXT: psrad $24, %xmm0
2631 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
2632 ; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2633 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2634 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
2635 ; SSE-NEXT: psrad $24, %xmm1
2636 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
2639 ; AVX1-LABEL: sitofp_load_8i8_to_8f32:
2641 ; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0
2642 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
2643 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
2644 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
2645 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2646 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
2649 ; AVX2-LABEL: sitofp_load_8i8_to_8f32:
2651 ; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0
2652 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
2654 %ld = load <8 x i8>, <8 x i8> *%a
2655 %cvt = sitofp <8 x i8> %ld to <8 x float>
2656 ret <8 x float> %cvt
2660 ; Load Unsigned Integer to Float
; Load <4 x i64> and uitofp to <4 x float>. There is no unsigned 64-bit
; scalar convert on these targets, so each element is extracted to a GPR and
; branched on sign: a non-negative value uses cvtsi2ssq directly; a value with
; the top bit set is halved (shrq) with its lost low bit OR'd back in
; (round-to-odd, preserved in ecx via andl $1), converted, then doubled with
; addss. Results are then reassembled with unpcklps (SSE) or vinsertps (AVX).
2663 define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
2664 ; SSE-LABEL: uitofp_load_4i64_to_4f32:
2666 ; SSE-NEXT: movdqa (%rdi), %xmm1
2667 ; SSE-NEXT: movdqa 16(%rdi), %xmm3
2668 ; SSE-NEXT: movd %xmm3, %rax
2669 ; SSE-NEXT: movl %eax, %ecx
2670 ; SSE-NEXT: andl $1, %ecx
2671 ; SSE-NEXT: testq %rax, %rax
2672 ; SSE-NEXT: js .LBB74_1
2674 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2
2675 ; SSE-NEXT: jmp .LBB74_3
2676 ; SSE-NEXT: .LBB74_1:
2677 ; SSE-NEXT: shrq %rax
2678 ; SSE-NEXT: orq %rax, %rcx
2679 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm2
2680 ; SSE-NEXT: addss %xmm2, %xmm2
2681 ; SSE-NEXT: .LBB74_3:
2682 ; SSE-NEXT: movd %xmm1, %rax
2683 ; SSE-NEXT: movl %eax, %ecx
2684 ; SSE-NEXT: andl $1, %ecx
2685 ; SSE-NEXT: testq %rax, %rax
2686 ; SSE-NEXT: js .LBB74_4
2688 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
2689 ; SSE-NEXT: jmp .LBB74_6
2690 ; SSE-NEXT: .LBB74_4:
2691 ; SSE-NEXT: shrq %rax
2692 ; SSE-NEXT: orq %rax, %rcx
2693 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
2694 ; SSE-NEXT: addss %xmm0, %xmm0
2695 ; SSE-NEXT: .LBB74_6:
2696 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
2697 ; SSE-NEXT: movd %xmm3, %rax
2698 ; SSE-NEXT: movl %eax, %ecx
2699 ; SSE-NEXT: andl $1, %ecx
2700 ; SSE-NEXT: testq %rax, %rax
2701 ; SSE-NEXT: js .LBB74_7
2703 ; SSE-NEXT: xorps %xmm3, %xmm3
2704 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3
2705 ; SSE-NEXT: jmp .LBB74_9
2706 ; SSE-NEXT: .LBB74_7:
2707 ; SSE-NEXT: shrq %rax
2708 ; SSE-NEXT: orq %rax, %rcx
2709 ; SSE-NEXT: xorps %xmm3, %xmm3
2710 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm3
2711 ; SSE-NEXT: addss %xmm3, %xmm3
2712 ; SSE-NEXT: .LBB74_9:
2713 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2714 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
2715 ; SSE-NEXT: movd %xmm1, %rax
2716 ; SSE-NEXT: movl %eax, %ecx
2717 ; SSE-NEXT: andl $1, %ecx
2718 ; SSE-NEXT: testq %rax, %rax
2719 ; SSE-NEXT: js .LBB74_10
2720 ; SSE-NEXT: # BB#11:
2721 ; SSE-NEXT: xorps %xmm1, %xmm1
2722 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
2723 ; SSE-NEXT: jmp .LBB74_12
2724 ; SSE-NEXT: .LBB74_10:
2725 ; SSE-NEXT: shrq %rax
2726 ; SSE-NEXT: orq %rax, %rcx
2727 ; SSE-NEXT: xorps %xmm1, %xmm1
2728 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
2729 ; SSE-NEXT: addss %xmm1, %xmm1
2730 ; SSE-NEXT: .LBB74_12:
2731 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
2732 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2735 ; AVX1-LABEL: uitofp_load_4i64_to_4f32:
2737 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0
2738 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
2739 ; AVX1-NEXT: movl %eax, %ecx
2740 ; AVX1-NEXT: andl $1, %ecx
2741 ; AVX1-NEXT: testq %rax, %rax
2742 ; AVX1-NEXT: js .LBB74_1
2743 ; AVX1-NEXT: # BB#2:
2744 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
2745 ; AVX1-NEXT: jmp .LBB74_3
2746 ; AVX1-NEXT: .LBB74_1:
2747 ; AVX1-NEXT: shrq %rax
2748 ; AVX1-NEXT: orq %rax, %rcx
2749 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
2750 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
2751 ; AVX1-NEXT: .LBB74_3:
2752 ; AVX1-NEXT: vmovq %xmm0, %rax
2753 ; AVX1-NEXT: movl %eax, %ecx
2754 ; AVX1-NEXT: andl $1, %ecx
2755 ; AVX1-NEXT: testq %rax, %rax
2756 ; AVX1-NEXT: js .LBB74_4
2757 ; AVX1-NEXT: # BB#5:
2758 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2759 ; AVX1-NEXT: jmp .LBB74_6
2760 ; AVX1-NEXT: .LBB74_4:
2761 ; AVX1-NEXT: shrq %rax
2762 ; AVX1-NEXT: orq %rax, %rcx
2763 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
2764 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
2765 ; AVX1-NEXT: .LBB74_6:
2766 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
2767 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
2768 ; AVX1-NEXT: vmovq %xmm0, %rax
2769 ; AVX1-NEXT: movl %eax, %ecx
2770 ; AVX1-NEXT: andl $1, %ecx
2771 ; AVX1-NEXT: testq %rax, %rax
2772 ; AVX1-NEXT: js .LBB74_7
2773 ; AVX1-NEXT: # BB#8:
2774 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2775 ; AVX1-NEXT: jmp .LBB74_9
2776 ; AVX1-NEXT: .LBB74_7:
2777 ; AVX1-NEXT: shrq %rax
2778 ; AVX1-NEXT: orq %rax, %rcx
2779 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
2780 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
2781 ; AVX1-NEXT: .LBB74_9:
2782 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
2783 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
2784 ; AVX1-NEXT: movl %eax, %ecx
2785 ; AVX1-NEXT: andl $1, %ecx
2786 ; AVX1-NEXT: testq %rax, %rax
2787 ; AVX1-NEXT: js .LBB74_10
2788 ; AVX1-NEXT: # BB#11:
2789 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
2790 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
2791 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
2792 ; AVX1-NEXT: vzeroupper
2794 ; AVX1-NEXT: .LBB74_10:
2795 ; AVX1-NEXT: shrq %rax
2796 ; AVX1-NEXT: orq %rax, %rcx
2797 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
2798 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
2799 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
2800 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
2801 ; AVX1-NEXT: vzeroupper
2804 ; AVX2-LABEL: uitofp_load_4i64_to_4f32:
2806 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
2807 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
2808 ; AVX2-NEXT: movl %eax, %ecx
2809 ; AVX2-NEXT: andl $1, %ecx
2810 ; AVX2-NEXT: testq %rax, %rax
2811 ; AVX2-NEXT: js .LBB74_1
2812 ; AVX2-NEXT: # BB#2:
2813 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
2814 ; AVX2-NEXT: jmp .LBB74_3
2815 ; AVX2-NEXT: .LBB74_1:
2816 ; AVX2-NEXT: shrq %rax
2817 ; AVX2-NEXT: orq %rax, %rcx
2818 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
2819 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
2820 ; AVX2-NEXT: .LBB74_3:
2821 ; AVX2-NEXT: vmovq %xmm0, %rax
2822 ; AVX2-NEXT: movl %eax, %ecx
2823 ; AVX2-NEXT: andl $1, %ecx
2824 ; AVX2-NEXT: testq %rax, %rax
2825 ; AVX2-NEXT: js .LBB74_4
2826 ; AVX2-NEXT: # BB#5:
2827 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2828 ; AVX2-NEXT: jmp .LBB74_6
2829 ; AVX2-NEXT: .LBB74_4:
2830 ; AVX2-NEXT: shrq %rax
2831 ; AVX2-NEXT: orq %rax, %rcx
2832 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
2833 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
2834 ; AVX2-NEXT: .LBB74_6:
2835 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
2836 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
2837 ; AVX2-NEXT: vmovq %xmm0, %rax
2838 ; AVX2-NEXT: movl %eax, %ecx
2839 ; AVX2-NEXT: andl $1, %ecx
2840 ; AVX2-NEXT: testq %rax, %rax
2841 ; AVX2-NEXT: js .LBB74_7
2842 ; AVX2-NEXT: # BB#8:
2843 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
2844 ; AVX2-NEXT: jmp .LBB74_9
2845 ; AVX2-NEXT: .LBB74_7:
2846 ; AVX2-NEXT: shrq %rax
2847 ; AVX2-NEXT: orq %rax, %rcx
2848 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
2849 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
2850 ; AVX2-NEXT: .LBB74_9:
2851 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
2852 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
2853 ; AVX2-NEXT: movl %eax, %ecx
2854 ; AVX2-NEXT: andl $1, %ecx
2855 ; AVX2-NEXT: testq %rax, %rax
2856 ; AVX2-NEXT: js .LBB74_10
2857 ; AVX2-NEXT: # BB#11:
2858 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
2859 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm0
2860 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
2861 ; AVX2-NEXT: vzeroupper
2863 ; AVX2-NEXT: .LBB74_10:
2864 ; AVX2-NEXT: shrq %rax
2865 ; AVX2-NEXT: orq %rax, %rcx
2866 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
2867 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
2868 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
2869 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
2870 ; AVX2-NEXT: vzeroupper
2872 %ld = load <4 x i64>, <4 x i64> *%a
2873 %cvt = uitofp <4 x i64> %ld to <4 x float>
2874 ret <4 x float> %cvt
; Load <4 x i32> and uitofp to <4 x float>. Each lane is split into its low
; and high 16-bit halves, each half is OR'd/blended with a magic exponent
; pattern so it reads as a biased float, the bias is removed with an addps of
; a constant, and the two partial results are summed. AVX2 broadcasts the
; constants instead of loading full vectors.
2877 define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) {
2878 ; SSE-LABEL: uitofp_load_4i32_to_4f32:
2880 ; SSE-NEXT: movdqa (%rdi), %xmm0
2881 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
2882 ; SSE-NEXT: pand %xmm0, %xmm1
2883 ; SSE-NEXT: por {{.*}}(%rip), %xmm1
2884 ; SSE-NEXT: psrld $16, %xmm0
2885 ; SSE-NEXT: por {{.*}}(%rip), %xmm0
2886 ; SSE-NEXT: addps {{.*}}(%rip), %xmm0
2887 ; SSE-NEXT: addps %xmm1, %xmm0
2890 ; AVX1-LABEL: uitofp_load_4i32_to_4f32:
2892 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
2893 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
2894 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
2895 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
2896 ; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
2897 ; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
2900 ; AVX2-LABEL: uitofp_load_4i32_to_4f32:
2902 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
2903 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
2904 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
2905 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
2906 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
2907 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
2908 ; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
2909 ; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0
2910 ; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
2912 %ld = load <4 x i32>, <4 x i32> *%a
2913 %cvt = uitofp <4 x i32> %ld to <4 x float>
2914 ret <4 x float> %cvt
; Load <4 x i16> and uitofp to <4 x float>. Zero-extension to i32 (unpack
; with zero on SSE, folded vpmovzxwd on AVX) keeps values non-negative, so the
; signed cvtdq2ps is exact.
2917 define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) {
2918 ; SSE-LABEL: uitofp_load_4i16_to_4f32:
2920 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
2921 ; SSE-NEXT: pxor %xmm1, %xmm1
2922 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2923 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
2926 ; AVX-LABEL: uitofp_load_4i16_to_4f32:
2928 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
2929 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
2931 %ld = load <4 x i16>, <4 x i16> *%a
2932 %cvt = uitofp <4 x i16> %ld to <4 x float>
2933 ret <4 x float> %cvt
; Load <4 x i8> and uitofp to <4 x float>. Double zero-unpack (SSE) or a
; single folded vpmovzxbd (AVX) widens u8->i32; the signed cvtdq2ps is then
; exact because the values are small and non-negative.
2936 define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) {
2937 ; SSE-LABEL: uitofp_load_4i8_to_4f32:
2939 ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2940 ; SSE-NEXT: pxor %xmm1, %xmm1
2941 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2942 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2943 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
2946 ; AVX-LABEL: uitofp_load_4i8_to_4f32:
2948 ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
2949 ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
2951 %ld = load <4 x i8>, <4 x i8> *%a
2952 %cvt = uitofp <4 x i8> %ld to <4 x float>
2953 ret <4 x float> %cvt
; Load <8 x i64> and uitofp to <8 x float>. Same per-element scheme as
; uitofp_load_4i64_to_4f32, applied eight times: extract to a GPR, branch on
; sign; negative (i.e. >= 2^63) values are halved with the low bit OR'd back
; in (andl $1 into ecx keeps it for round-to-odd), converted with cvtsi2ssq,
; and doubled. The eight scalars are rebuilt into a ymm with unpcklps (SSE)
; or vinsertps + vinsertf128 (AVX).
2956 define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
2957 ; SSE-LABEL: uitofp_load_8i64_to_8f32:
2959 ; SSE-NEXT: movdqa (%rdi), %xmm1
2960 ; SSE-NEXT: movdqa 16(%rdi), %xmm5
2961 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
2962 ; SSE-NEXT: movdqa 48(%rdi), %xmm3
2963 ; SSE-NEXT: movd %xmm5, %rax
2964 ; SSE-NEXT: movl %eax, %ecx
2965 ; SSE-NEXT: andl $1, %ecx
2966 ; SSE-NEXT: testq %rax, %rax
2967 ; SSE-NEXT: js .LBB78_1
2969 ; SSE-NEXT: cvtsi2ssq %rax, %xmm4
2970 ; SSE-NEXT: jmp .LBB78_3
2971 ; SSE-NEXT: .LBB78_1:
2972 ; SSE-NEXT: shrq %rax
2973 ; SSE-NEXT: orq %rax, %rcx
2974 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm4
2975 ; SSE-NEXT: addss %xmm4, %xmm4
2976 ; SSE-NEXT: .LBB78_3:
2977 ; SSE-NEXT: movd %xmm1, %rax
2978 ; SSE-NEXT: movl %eax, %ecx
2979 ; SSE-NEXT: andl $1, %ecx
2980 ; SSE-NEXT: testq %rax, %rax
2981 ; SSE-NEXT: js .LBB78_4
2983 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
2984 ; SSE-NEXT: jmp .LBB78_6
2985 ; SSE-NEXT: .LBB78_4:
2986 ; SSE-NEXT: shrq %rax
2987 ; SSE-NEXT: orq %rax, %rcx
2988 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm0
2989 ; SSE-NEXT: addss %xmm0, %xmm0
2990 ; SSE-NEXT: .LBB78_6:
2991 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
2992 ; SSE-NEXT: movd %xmm5, %rax
2993 ; SSE-NEXT: movl %eax, %ecx
2994 ; SSE-NEXT: andl $1, %ecx
2995 ; SSE-NEXT: testq %rax, %rax
2996 ; SSE-NEXT: js .LBB78_7
2998 ; SSE-NEXT: cvtsi2ssq %rax, %xmm6
2999 ; SSE-NEXT: jmp .LBB78_9
3000 ; SSE-NEXT: .LBB78_7:
3001 ; SSE-NEXT: shrq %rax
3002 ; SSE-NEXT: orq %rax, %rcx
3003 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm6
3004 ; SSE-NEXT: addss %xmm6, %xmm6
3005 ; SSE-NEXT: .LBB78_9:
3006 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
3007 ; SSE-NEXT: movd %xmm1, %rax
3008 ; SSE-NEXT: movl %eax, %ecx
3009 ; SSE-NEXT: andl $1, %ecx
3010 ; SSE-NEXT: testq %rax, %rax
3011 ; SSE-NEXT: js .LBB78_10
3012 ; SSE-NEXT: # BB#11:
3013 ; SSE-NEXT: xorps %xmm5, %xmm5
3014 ; SSE-NEXT: cvtsi2ssq %rax, %xmm5
3015 ; SSE-NEXT: jmp .LBB78_12
3016 ; SSE-NEXT: .LBB78_10:
3017 ; SSE-NEXT: shrq %rax
3018 ; SSE-NEXT: orq %rax, %rcx
3019 ; SSE-NEXT: xorps %xmm5, %xmm5
3020 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm5
3021 ; SSE-NEXT: addss %xmm5, %xmm5
3022 ; SSE-NEXT: .LBB78_12:
3023 ; SSE-NEXT: movd %xmm3, %rax
3024 ; SSE-NEXT: movl %eax, %ecx
3025 ; SSE-NEXT: andl $1, %ecx
3026 ; SSE-NEXT: testq %rax, %rax
3027 ; SSE-NEXT: js .LBB78_13
3028 ; SSE-NEXT: # BB#14:
3029 ; SSE-NEXT: cvtsi2ssq %rax, %xmm7
3030 ; SSE-NEXT: jmp .LBB78_15
3031 ; SSE-NEXT: .LBB78_13:
3032 ; SSE-NEXT: shrq %rax
3033 ; SSE-NEXT: orq %rax, %rcx
3034 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm7
3035 ; SSE-NEXT: addss %xmm7, %xmm7
3036 ; SSE-NEXT: .LBB78_15:
3037 ; SSE-NEXT: movd %xmm2, %rax
3038 ; SSE-NEXT: movl %eax, %ecx
3039 ; SSE-NEXT: andl $1, %ecx
3040 ; SSE-NEXT: testq %rax, %rax
3041 ; SSE-NEXT: js .LBB78_16
3042 ; SSE-NEXT: # BB#17:
3043 ; SSE-NEXT: xorps %xmm1, %xmm1
3044 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
3045 ; SSE-NEXT: jmp .LBB78_18
3046 ; SSE-NEXT: .LBB78_16:
3047 ; SSE-NEXT: shrq %rax
3048 ; SSE-NEXT: orq %rax, %rcx
3049 ; SSE-NEXT: xorps %xmm1, %xmm1
3050 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm1
3051 ; SSE-NEXT: addss %xmm1, %xmm1
3052 ; SSE-NEXT: .LBB78_18:
3053 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
3054 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
3055 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
3056 ; SSE-NEXT: movd %xmm3, %rax
3057 ; SSE-NEXT: movl %eax, %ecx
3058 ; SSE-NEXT: andl $1, %ecx
3059 ; SSE-NEXT: testq %rax, %rax
3060 ; SSE-NEXT: js .LBB78_19
3061 ; SSE-NEXT: # BB#20:
3062 ; SSE-NEXT: xorps %xmm3, %xmm3
3063 ; SSE-NEXT: cvtsi2ssq %rax, %xmm3
3064 ; SSE-NEXT: jmp .LBB78_21
3065 ; SSE-NEXT: .LBB78_19:
3066 ; SSE-NEXT: shrq %rax
3067 ; SSE-NEXT: orq %rax, %rcx
3068 ; SSE-NEXT: xorps %xmm3, %xmm3
3069 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm3
3070 ; SSE-NEXT: addss %xmm3, %xmm3
3071 ; SSE-NEXT: .LBB78_21:
3072 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
3073 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
3074 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
3075 ; SSE-NEXT: movd %xmm2, %rax
3076 ; SSE-NEXT: movl %eax, %ecx
3077 ; SSE-NEXT: andl $1, %ecx
3078 ; SSE-NEXT: testq %rax, %rax
3079 ; SSE-NEXT: js .LBB78_22
3080 ; SSE-NEXT: # BB#23:
3081 ; SSE-NEXT: xorps %xmm2, %xmm2
3082 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2
3083 ; SSE-NEXT: jmp .LBB78_24
3084 ; SSE-NEXT: .LBB78_22:
3085 ; SSE-NEXT: shrq %rax
3086 ; SSE-NEXT: orq %rax, %rcx
3087 ; SSE-NEXT: xorps %xmm2, %xmm2
3088 ; SSE-NEXT: cvtsi2ssq %rcx, %xmm2
3089 ; SSE-NEXT: addss %xmm2, %xmm2
3090 ; SSE-NEXT: .LBB78_24:
3091 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
3092 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
3095 ; AVX1-LABEL: uitofp_load_8i64_to_8f32:
3097 ; AVX1-NEXT: vmovdqa (%rdi), %ymm0
3098 ; AVX1-NEXT: vmovdqa 32(%rdi), %ymm2
3099 ; AVX1-NEXT: vpextrq $1, %xmm2, %rax
3100 ; AVX1-NEXT: movl %eax, %ecx
3101 ; AVX1-NEXT: andl $1, %ecx
3102 ; AVX1-NEXT: testq %rax, %rax
3103 ; AVX1-NEXT: js .LBB78_1
3104 ; AVX1-NEXT: # BB#2:
3105 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
3106 ; AVX1-NEXT: jmp .LBB78_3
3107 ; AVX1-NEXT: .LBB78_1:
3108 ; AVX1-NEXT: shrq %rax
3109 ; AVX1-NEXT: orq %rax, %rcx
3110 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
3111 ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
3112 ; AVX1-NEXT: .LBB78_3:
3113 ; AVX1-NEXT: vmovq %xmm2, %rax
3114 ; AVX1-NEXT: movl %eax, %ecx
3115 ; AVX1-NEXT: andl $1, %ecx
3116 ; AVX1-NEXT: testq %rax, %rax
3117 ; AVX1-NEXT: js .LBB78_4
3118 ; AVX1-NEXT: # BB#5:
3119 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
3120 ; AVX1-NEXT: jmp .LBB78_6
3121 ; AVX1-NEXT: .LBB78_4:
3122 ; AVX1-NEXT: shrq %rax
3123 ; AVX1-NEXT: orq %rax, %rcx
3124 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
3125 ; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
3126 ; AVX1-NEXT: .LBB78_6:
3127 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
3128 ; AVX1-NEXT: vmovq %xmm2, %rax
3129 ; AVX1-NEXT: movl %eax, %ecx
3130 ; AVX1-NEXT: andl $1, %ecx
3131 ; AVX1-NEXT: testq %rax, %rax
3132 ; AVX1-NEXT: js .LBB78_7
3133 ; AVX1-NEXT: # BB#8:
3134 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm4
3135 ; AVX1-NEXT: jmp .LBB78_9
3136 ; AVX1-NEXT: .LBB78_7:
3137 ; AVX1-NEXT: shrq %rax
3138 ; AVX1-NEXT: orq %rax, %rcx
3139 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm4
3140 ; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4
3141 ; AVX1-NEXT: .LBB78_9:
3142 ; AVX1-NEXT: vpextrq $1, %xmm2, %rax
3143 ; AVX1-NEXT: movl %eax, %ecx
3144 ; AVX1-NEXT: andl $1, %ecx
3145 ; AVX1-NEXT: testq %rax, %rax
3146 ; AVX1-NEXT: js .LBB78_10
3147 ; AVX1-NEXT: # BB#11:
3148 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
3149 ; AVX1-NEXT: jmp .LBB78_12
3150 ; AVX1-NEXT: .LBB78_10:
3151 ; AVX1-NEXT: shrq %rax
3152 ; AVX1-NEXT: orq %rax, %rcx
3153 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
3154 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
3155 ; AVX1-NEXT: .LBB78_12:
3156 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
3157 ; AVX1-NEXT: movl %eax, %ecx
3158 ; AVX1-NEXT: andl $1, %ecx
3159 ; AVX1-NEXT: testq %rax, %rax
3160 ; AVX1-NEXT: js .LBB78_13
3161 ; AVX1-NEXT: # BB#14:
3162 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
3163 ; AVX1-NEXT: jmp .LBB78_15
3164 ; AVX1-NEXT: .LBB78_13:
3165 ; AVX1-NEXT: shrq %rax
3166 ; AVX1-NEXT: orq %rax, %rcx
3167 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm5
3168 ; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5
3169 ; AVX1-NEXT: .LBB78_15:
3170 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
3171 ; AVX1-NEXT: vmovq %xmm0, %rax
3172 ; AVX1-NEXT: movl %eax, %ecx
3173 ; AVX1-NEXT: andl $1, %ecx
3174 ; AVX1-NEXT: testq %rax, %rax
3175 ; AVX1-NEXT: js .LBB78_16
3176 ; AVX1-NEXT: # BB#17:
3177 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
3178 ; AVX1-NEXT: jmp .LBB78_18
3179 ; AVX1-NEXT: .LBB78_16:
3180 ; AVX1-NEXT: shrq %rax
3181 ; AVX1-NEXT: orq %rax, %rcx
3182 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
3183 ; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3
3184 ; AVX1-NEXT: .LBB78_18:
3185 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
3186 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
3187 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
3188 ; AVX1-NEXT: vmovq %xmm4, %rax
3189 ; AVX1-NEXT: movl %eax, %ecx
3190 ; AVX1-NEXT: andl $1, %ecx
3191 ; AVX1-NEXT: testq %rax, %rax
3192 ; AVX1-NEXT: js .LBB78_19
3193 ; AVX1-NEXT: # BB#20:
3194 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
3195 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
3196 ; AVX1-NEXT: jmp .LBB78_21
3197 ; AVX1-NEXT: .LBB78_19:
3198 ; AVX1-NEXT: shrq %rax
3199 ; AVX1-NEXT: orq %rax, %rcx
3200 ; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
3201 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
3202 ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5
3203 ; AVX1-NEXT: .LBB78_21:
3204 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
3205 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
3206 ; AVX1-NEXT: vpextrq $1, %xmm4, %rax
3207 ; AVX1-NEXT: movl %eax, %ecx
3208 ; AVX1-NEXT: andl $1, %ecx
3209 ; AVX1-NEXT: testq %rax, %rax
3210 ; AVX1-NEXT: js .LBB78_22
3211 ; AVX1-NEXT: # BB#23:
3212 ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
3213 ; AVX1-NEXT: jmp .LBB78_24
3214 ; AVX1-NEXT: .LBB78_22:
3215 ; AVX1-NEXT: shrq %rax
3216 ; AVX1-NEXT: orq %rax, %rcx
3217 ; AVX1-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
3218 ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
3219 ; AVX1-NEXT: .LBB78_24:
3220 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
3221 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
3224 ; AVX2-LABEL: uitofp_load_8i64_to_8f32:
3226 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
3227 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
3228 ; AVX2-NEXT: vpextrq $1, %xmm2, %rax
3229 ; AVX2-NEXT: movl %eax, %ecx
3230 ; AVX2-NEXT: andl $1, %ecx
3231 ; AVX2-NEXT: testq %rax, %rax
3232 ; AVX2-NEXT: js .LBB78_1
3233 ; AVX2-NEXT: # BB#2:
3234 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm1
3235 ; AVX2-NEXT: jmp .LBB78_3
3236 ; AVX2-NEXT: .LBB78_1:
3237 ; AVX2-NEXT: shrq %rax
3238 ; AVX2-NEXT: orq %rax, %rcx
3239 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm1
3240 ; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
3241 ; AVX2-NEXT: .LBB78_3:
3242 ; AVX2-NEXT: vmovq %xmm2, %rax
3243 ; AVX2-NEXT: movl %eax, %ecx
3244 ; AVX2-NEXT: andl $1, %ecx
3245 ; AVX2-NEXT: testq %rax, %rax
3246 ; AVX2-NEXT: js .LBB78_4
3247 ; AVX2-NEXT: # BB#5:
3248 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
3249 ; AVX2-NEXT: jmp .LBB78_6
3250 ; AVX2-NEXT: .LBB78_4:
3251 ; AVX2-NEXT: shrq %rax
3252 ; AVX2-NEXT: orq %rax, %rcx
3253 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
3254 ; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
3255 ; AVX2-NEXT: .LBB78_6:
3256 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
3257 ; AVX2-NEXT: vmovq %xmm2, %rax
3258 ; AVX2-NEXT: movl %eax, %ecx
3259 ; AVX2-NEXT: andl $1, %ecx
3260 ; AVX2-NEXT: testq %rax, %rax
3261 ; AVX2-NEXT: js .LBB78_7
3262 ; AVX2-NEXT: # BB#8:
3263 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm4
3264 ; AVX2-NEXT: jmp .LBB78_9
3265 ; AVX2-NEXT: .LBB78_7:
3266 ; AVX2-NEXT: shrq %rax
3267 ; AVX2-NEXT: orq %rax, %rcx
3268 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm4
3269 ; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4
3270 ; AVX2-NEXT: .LBB78_9:
3271 ; AVX2-NEXT: vpextrq $1, %xmm2, %rax
3272 ; AVX2-NEXT: movl %eax, %ecx
3273 ; AVX2-NEXT: andl $1, %ecx
3274 ; AVX2-NEXT: testq %rax, %rax
3275 ; AVX2-NEXT: js .LBB78_10
3276 ; AVX2-NEXT: # BB#11:
3277 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
3278 ; AVX2-NEXT: jmp .LBB78_12
3279 ; AVX2-NEXT: .LBB78_10:
3280 ; AVX2-NEXT: shrq %rax
3281 ; AVX2-NEXT: orq %rax, %rcx
3282 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
3283 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
3284 ; AVX2-NEXT: .LBB78_12:
3285 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
3286 ; AVX2-NEXT: movl %eax, %ecx
3287 ; AVX2-NEXT: andl $1, %ecx
3288 ; AVX2-NEXT: testq %rax, %rax
3289 ; AVX2-NEXT: js .LBB78_13
3290 ; AVX2-NEXT: # BB#14:
3291 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
3292 ; AVX2-NEXT: jmp .LBB78_15
3293 ; AVX2-NEXT: .LBB78_13:
3294 ; AVX2-NEXT: shrq %rax
3295 ; AVX2-NEXT: orq %rax, %rcx
3296 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm5
3297 ; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5
3298 ; AVX2-NEXT: .LBB78_15:
3299 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
3300 ; AVX2-NEXT: vmovq %xmm0, %rax
3301 ; AVX2-NEXT: movl %eax, %ecx
3302 ; AVX2-NEXT: andl $1, %ecx
3303 ; AVX2-NEXT: testq %rax, %rax
3304 ; AVX2-NEXT: js .LBB78_16
3305 ; AVX2-NEXT: # BB#17:
3306 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm3
3307 ; AVX2-NEXT: jmp .LBB78_18
3308 ; AVX2-NEXT: .LBB78_16:
3309 ; AVX2-NEXT: shrq %rax
3310 ; AVX2-NEXT: orq %rax, %rcx
3311 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm3
3312 ; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3
3313 ; AVX2-NEXT: .LBB78_18:
3314 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3]
3315 ; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3]
3316 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
3317 ; AVX2-NEXT: vmovq %xmm4, %rax
3318 ; AVX2-NEXT: movl %eax, %ecx
3319 ; AVX2-NEXT: andl $1, %ecx
3320 ; AVX2-NEXT: testq %rax, %rax
3321 ; AVX2-NEXT: js .LBB78_19
3322 ; AVX2-NEXT: # BB#20:
3323 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
3324 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm5
3325 ; AVX2-NEXT: jmp .LBB78_21
3326 ; AVX2-NEXT: .LBB78_19:
3327 ; AVX2-NEXT: shrq %rax
3328 ; AVX2-NEXT: orq %rax, %rcx
3329 ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
3330 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm0
3331 ; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5
3332 ; AVX2-NEXT: .LBB78_21:
3333 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0]
3334 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3]
3335 ; AVX2-NEXT: vpextrq $1, %xmm4, %rax
3336 ; AVX2-NEXT: movl %eax, %ecx
3337 ; AVX2-NEXT: andl $1, %ecx
3338 ; AVX2-NEXT: testq %rax, %rax
3339 ; AVX2-NEXT: js .LBB78_22
3340 ; AVX2-NEXT: # BB#23:
3341 ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm0, %xmm2
3342 ; AVX2-NEXT: jmp .LBB78_24
3343 ; AVX2-NEXT: .LBB78_22:
3344 ; AVX2-NEXT: shrq %rax
3345 ; AVX2-NEXT: orq %rax, %rcx
3346 ; AVX2-NEXT: vcvtsi2ssq %rcx, %xmm0, %xmm2
3347 ; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
3348 ; AVX2-NEXT: .LBB78_24:
3349 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
3350 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
3352 %ld = load <8 x i64>, <8 x i64> *%a
3353 %cvt = uitofp <8 x i64> %ld to <8 x float>
3354 ret <8 x float> %cvt
; Load <8 x i32> and uitofp to <8 x float>. 256-bit version of the 16-bit
; half-splitting trick: low and high halves of each lane are OR'd/blended with
; magic exponent constants, the bias is removed, and the partial results are
; added. AVX1 instead masks the low halves, converts both parts with
; vcvtdq2ps, and rescales the high part with vmulps before adding.
3357 define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) {
3358 ; SSE-LABEL: uitofp_load_8i32_to_8f32:
3360 ; SSE-NEXT: movdqa (%rdi), %xmm0
3361 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
3362 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
3363 ; SSE-NEXT: movdqa %xmm0, %xmm3
3364 ; SSE-NEXT: pand %xmm2, %xmm3
3365 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
3366 ; SSE-NEXT: por %xmm4, %xmm3
3367 ; SSE-NEXT: psrld $16, %xmm0
3368 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
3369 ; SSE-NEXT: por %xmm5, %xmm0
3370 ; SSE-NEXT: movaps {{.*#+}} xmm6 = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11]
3371 ; SSE-NEXT: addps %xmm6, %xmm0
3372 ; SSE-NEXT: addps %xmm3, %xmm0
3373 ; SSE-NEXT: pand %xmm1, %xmm2
3374 ; SSE-NEXT: por %xmm4, %xmm2
3375 ; SSE-NEXT: psrld $16, %xmm1
3376 ; SSE-NEXT: por %xmm5, %xmm1
3377 ; SSE-NEXT: addps %xmm6, %xmm1
3378 ; SSE-NEXT: addps %xmm2, %xmm1
3381 ; AVX1-LABEL: uitofp_load_8i32_to_8f32:
3383 ; AVX1-NEXT: vmovaps (%rdi), %ymm0
3384 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
3385 ; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
3386 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
3387 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
3388 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
3389 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
3390 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
3391 ; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
3392 ; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0
3395 ; AVX2-LABEL: uitofp_load_8i32_to_8f32:
3397 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
3398 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
3399 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
3400 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
3401 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
3402 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
3403 ; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
3404 ; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
3405 ; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
3407 %ld = load <8 x i32>, <8 x i32> *%a
3408 %cvt = uitofp <8 x i32> %ld to <8 x float>
3409 ret <8 x float> %cvt
; Load <8 x i16> and uitofp to <8 x float>. Zero-extend to i32 (unpack with a
; zero register on SSE; vpmovzxwd on AVX) then use the exact signed cvtdq2ps.
3412 define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) {
3413 ; SSE-LABEL: uitofp_load_8i16_to_8f32:
3415 ; SSE-NEXT: movdqa (%rdi), %xmm1
3416 ; SSE-NEXT: pxor %xmm2, %xmm2
3417 ; SSE-NEXT: movdqa %xmm1, %xmm0
3418 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3419 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
3420 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
3421 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
3424 ; AVX1-LABEL: uitofp_load_8i16_to_8f32:
3426 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
3427 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
3428 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
3429 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
3432 ; AVX2-LABEL: uitofp_load_8i16_to_8f32:
3434 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
3435 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
3437 %ld = load <8 x i16>, <8 x i16> *%a
3438 %cvt = uitofp <8 x i16> %ld to <8 x float>
3439 ret <8 x float> %cvt
; Load <8 x i8> and uitofp to <8 x float>. Zero-extend u8->i32 (double unpack
; with zero on SSE; vpmovzxbd on AVX), then convert with the signed cvtdq2ps,
; which is exact for these small non-negative values.
3442 define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) {
3443 ; SSE-LABEL: uitofp_load_8i8_to_8f32:
3445 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
3446 ; SSE-NEXT: pxor %xmm2, %xmm2
3447 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
3448 ; SSE-NEXT: movdqa %xmm1, %xmm0
3449 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3450 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
3451 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
3452 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
3455 ; AVX1-LABEL: uitofp_load_8i8_to_8f32:
3457 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
3458 ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
3459 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
3460 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
3463 ; AVX2-LABEL: uitofp_load_8i8_to_8f32:
3465 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
3466 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
3468 %ld = load <8 x i8>, <8 x i8> *%a
3469 %cvt = uitofp <8 x i8> %ld to <8 x float>
3470 ret <8 x float> %cvt
; Packed (unaligned) aggregate: <8 x i8> at offset 0, <8 x i16> at offset 8,
; <8 x float>* at offset 24 — matching the 8(%rdi) and 24(%rdi) offsets the
; aggregate_sitofp test below checks for.
3477 %Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }>
3478 define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
3479 ; SSE-LABEL: aggregate_sitofp_8i16_to_8f32:
3481 ; SSE-NEXT: movq 24(%rdi), %rax
3482 ; SSE-NEXT: movdqu 8(%rdi), %xmm0
3483 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3484 ; SSE-NEXT: psrad $16, %xmm1
3485 ; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
3486 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
3487 ; SSE-NEXT: psrad $16, %xmm0
3488 ; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
3489 ; SSE-NEXT: movaps %xmm0, 16(%rax)
3490 ; SSE-NEXT: movaps %xmm1, (%rax)
3493 ; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
3495 ; AVX1-NEXT: movq 24(%rdi), %rax
3496 ; AVX1-NEXT: vmovdqu 8(%rdi), %xmm0
3497 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
3498 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
3499 ; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
3500 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
3501 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
3502 ; AVX1-NEXT: vmovaps %ymm0, (%rax)
3503 ; AVX1-NEXT: vzeroupper
3506 ; AVX2-LABEL: aggregate_sitofp_8i16_to_8f32:
3508 ; AVX2-NEXT: movq 24(%rdi), %rax
3509 ; AVX2-NEXT: vpmovsxwd 8(%rdi), %ymm0
3510 ; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
3511 ; AVX2-NEXT: vmovaps %ymm0, (%rax)
3512 ; AVX2-NEXT: vzeroupper
3514 %1 = load %Arguments, %Arguments* %a0, align 1
3515 %2 = extractvalue %Arguments %1, 1
3516 %3 = extractvalue %Arguments %1, 2
3517 %4 = sitofp <8 x i16> %2 to <8 x float>
3518 store <8 x float> %4, <8 x float>* %3, align 32