1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -O2 | FileCheck %s --check-prefix=AVX512
3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl -O2 | FileCheck %s --check-prefix=AVX512NOTDQ
; Loads a <8 x i1> mask, broadcasts bit 4 to a <2 x i1> mask, and uses it to
; select between the two <2 x double> args; result is stored through %a3.
; NOTE(review): assertions below are autogenerated (see NOTE at top of file) —
; do not hand-edit them; regenerate with utils/update_llc_test_checks.py.
; NOTE(review): this chunk has elided lines (inner numbering skips 7, 15-16,
; 34-35) — the first prefix's '# %bb.0' and 'retq' lines and this function's
; 'ret void' / closing '}' are missing; confirm against the pristine file.
5 define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
6 ; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
8 ; AVX512-NEXT: kmovb (%rdi), %k0
9 ; AVX512-NEXT: kshiftrw $4, %k0, %k0
10 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
11 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
12 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
13 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
14 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
17 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1:
18 ; AVX512NOTDQ: # %bb.0:
19 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
20 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
21 ; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k1
22 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
23 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
24 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
25 ; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
26 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
27 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
28 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
29 ; AVX512NOTDQ-NEXT: retq
30 %d0 = load <8 x i1>, <8 x i1>* %a0
31 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
32 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
33 store <2 x double> %d2, <2 x double>* %a3
36 define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
37 ; AVX512-LABEL: load_v8i1_broadcast_7_v2i1:
39 ; AVX512-NEXT: kmovb (%rdi), %k0
40 ; AVX512-NEXT: kshiftrw $6, %k0, %k0
41 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
42 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
43 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
44 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
45 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
48 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1:
49 ; AVX512NOTDQ: # %bb.0:
50 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
51 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
52 ; AVX512NOTDQ-NEXT: kshiftrw $6, %k0, %k1
53 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
54 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
55 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
56 ; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
57 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
58 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
59 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
60 ; AVX512NOTDQ-NEXT: retq
61 %d0 = load <8 x i1>, <8 x i1>* %a0
62 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
63 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
64 store <2 x double> %d2, <2 x double>* %a3
67 define void @load_v16i1_broadcast_8_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
68 ; AVX512-LABEL: load_v16i1_broadcast_8_v2i1:
70 ; AVX512-NEXT: kmovw (%rdi), %k0
71 ; AVX512-NEXT: kshiftrw $8, %k0, %k0
72 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
73 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
74 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
75 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
76 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
79 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1:
80 ; AVX512NOTDQ: # %bb.0:
81 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
82 ; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
83 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
84 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
85 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
86 ; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
87 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
88 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
89 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
90 ; AVX512NOTDQ-NEXT: retq
91 %d0 = load <16 x i1>, <16 x i1>* %a0
92 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
93 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
94 store <2 x double> %d2, <2 x double>* %a3
97 define void @load_v16i1_broadcast_8_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
98 ; AVX512-LABEL: load_v16i1_broadcast_8_v4i1:
100 ; AVX512-NEXT: kmovw (%rdi), %k0
101 ; AVX512-NEXT: kshiftrw $8, %k0, %k0
102 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
103 ; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
104 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
105 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
106 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
109 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1:
110 ; AVX512NOTDQ: # %bb.0:
111 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
112 ; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
113 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
114 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
115 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
116 ; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
117 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
118 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
119 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
120 ; AVX512NOTDQ-NEXT: retq
121 %d0 = load <16 x i1>, <16 x i1>* %a0
122 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
123 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
124 store <4 x float> %d2, <4 x float>* %a3
127 define void @load_v16i1_broadcast_15_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
128 ; AVX512-LABEL: load_v16i1_broadcast_15_v2i1:
130 ; AVX512-NEXT: kmovw (%rdi), %k0
131 ; AVX512-NEXT: kshiftrw $14, %k0, %k0
132 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
133 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
134 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
135 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
136 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
139 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1:
140 ; AVX512NOTDQ: # %bb.0:
141 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
142 ; AVX512NOTDQ-NEXT: kshiftrw $14, %k0, %k1
143 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
144 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
145 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
146 ; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
147 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
148 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
149 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
150 ; AVX512NOTDQ-NEXT: retq
151 %d0 = load <16 x i1>, <16 x i1>* %a0
152 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
153 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
154 store <2 x double> %d2, <2 x double>* %a3
157 define void @load_v16i1_broadcast_15_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
158 ; AVX512-LABEL: load_v16i1_broadcast_15_v4i1:
160 ; AVX512-NEXT: kmovw (%rdi), %k0
161 ; AVX512-NEXT: kshiftrw $12, %k0, %k0
162 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
163 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
164 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
165 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
166 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
169 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1:
170 ; AVX512NOTDQ: # %bb.0:
171 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
172 ; AVX512NOTDQ-NEXT: kshiftrw $12, %k0, %k1
173 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
174 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
175 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
176 ; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
177 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
178 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
179 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
180 ; AVX512NOTDQ-NEXT: retq
181 %d0 = load <16 x i1>, <16 x i1>* %a0
182 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
183 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
184 store <4 x float> %d2, <4 x float>* %a3
187 define void @load_v32i1_broadcast_16_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
188 ; AVX512-LABEL: load_v32i1_broadcast_16_v2i1:
190 ; AVX512-NEXT: kmovd (%rdi), %k0
191 ; AVX512-NEXT: kshiftrd $16, %k0, %k0
192 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
193 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
194 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
195 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
196 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
199 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1:
200 ; AVX512NOTDQ: # %bb.0:
201 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
202 ; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
203 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
204 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
205 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
206 ; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
207 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
208 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
209 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
210 ; AVX512NOTDQ-NEXT: retq
211 %d0 = load <32 x i1>, <32 x i1>* %a0
212 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
213 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
214 store <2 x double> %d2, <2 x double>* %a3
217 define void @load_v32i1_broadcast_16_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
218 ; AVX512-LABEL: load_v32i1_broadcast_16_v4i1:
220 ; AVX512-NEXT: kmovd (%rdi), %k0
221 ; AVX512-NEXT: kshiftrd $16, %k0, %k0
222 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
223 ; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
224 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
225 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
226 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
229 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1:
230 ; AVX512NOTDQ: # %bb.0:
231 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
232 ; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
233 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
234 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
235 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
236 ; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
237 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
238 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
239 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
240 ; AVX512NOTDQ-NEXT: retq
241 %d0 = load <32 x i1>, <32 x i1>* %a0
242 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
243 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
244 store <4 x float> %d2, <4 x float>* %a3
247 define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
248 ; AVX512-LABEL: load_v32i1_broadcast_16_v8i1:
250 ; AVX512-NEXT: kmovd (%rdi), %k0
251 ; AVX512-NEXT: kshiftrd $16, %k0, %k0
252 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
253 ; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2
254 ; AVX512-NEXT: vpmovd2m %ymm2, %k1
255 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
256 ; AVX512-NEXT: vmovaps %ymm1, (%rsi)
257 ; AVX512-NEXT: vzeroupper
260 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1:
261 ; AVX512NOTDQ: # %bb.0:
262 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
263 ; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
264 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
265 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
266 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
267 ; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
268 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
269 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
270 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
271 ; AVX512NOTDQ-NEXT: vzeroupper
272 ; AVX512NOTDQ-NEXT: retq
273 %d0 = load <32 x i1>, <32 x i1>* %a0
274 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
275 %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
276 store <8 x float> %d2, <8 x float>* %a3
279 define void @load_v32i1_broadcast_31_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
280 ; AVX512-LABEL: load_v32i1_broadcast_31_v2i1:
282 ; AVX512-NEXT: kmovd (%rdi), %k0
283 ; AVX512-NEXT: kshiftrd $30, %k0, %k0
284 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
285 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
286 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
287 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
288 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
291 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1:
292 ; AVX512NOTDQ: # %bb.0:
293 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
294 ; AVX512NOTDQ-NEXT: kshiftrd $30, %k0, %k1
295 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
296 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
297 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
298 ; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
299 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
300 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
301 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
302 ; AVX512NOTDQ-NEXT: retq
303 %d0 = load <32 x i1>, <32 x i1>* %a0
304 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
305 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
306 store <2 x double> %d2, <2 x double>* %a3
309 define void @load_v32i1_broadcast_31_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
310 ; AVX512-LABEL: load_v32i1_broadcast_31_v4i1:
312 ; AVX512-NEXT: kmovd (%rdi), %k0
313 ; AVX512-NEXT: kshiftrd $28, %k0, %k0
314 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
315 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
316 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
317 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
318 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
321 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1:
322 ; AVX512NOTDQ: # %bb.0:
323 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
324 ; AVX512NOTDQ-NEXT: kshiftrd $28, %k0, %k1
325 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
326 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
327 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
328 ; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
329 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
330 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
331 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
332 ; AVX512NOTDQ-NEXT: retq
333 %d0 = load <32 x i1>, <32 x i1>* %a0
334 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
335 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
336 store <4 x float> %d2, <4 x float>* %a3
339 define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
340 ; AVX512-LABEL: load_v32i1_broadcast_31_v8i1:
342 ; AVX512-NEXT: kmovd (%rdi), %k0
343 ; AVX512-NEXT: kshiftrd $24, %k0, %k0
344 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
345 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
346 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
347 ; AVX512-NEXT: vpmovd2m %ymm2, %k1
348 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
349 ; AVX512-NEXT: vmovaps %ymm1, (%rsi)
350 ; AVX512-NEXT: vzeroupper
353 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1:
354 ; AVX512NOTDQ: # %bb.0:
355 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
356 ; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1
357 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
358 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
359 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
360 ; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
361 ; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
362 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
363 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
364 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
365 ; AVX512NOTDQ-NEXT: vzeroupper
366 ; AVX512NOTDQ-NEXT: retq
367 %d0 = load <32 x i1>, <32 x i1>* %a0
368 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
369 %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
370 store <8 x float> %d2, <8 x float>* %a3
373 define void @load_v64i1_broadcast_32_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
374 ; AVX512-LABEL: load_v64i1_broadcast_32_v2i1:
376 ; AVX512-NEXT: kmovq (%rdi), %k0
377 ; AVX512-NEXT: kshiftrq $32, %k0, %k0
378 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
379 ; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
380 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
381 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
382 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
385 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1:
386 ; AVX512NOTDQ: # %bb.0:
387 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
388 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
389 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
390 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
391 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
392 ; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
393 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
394 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
395 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
396 ; AVX512NOTDQ-NEXT: retq
397 %d0 = load <64 x i1>, <64 x i1>* %a0
398 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
399 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
400 store <2 x double> %d2, <2 x double>* %a3
403 define void @load_v64i1_broadcast_32_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
404 ; AVX512-LABEL: load_v64i1_broadcast_32_v4i1:
406 ; AVX512-NEXT: kmovq (%rdi), %k0
407 ; AVX512-NEXT: kshiftrq $32, %k0, %k0
408 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
409 ; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
410 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
411 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
412 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
415 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1:
416 ; AVX512NOTDQ: # %bb.0:
417 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
418 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
419 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
420 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
421 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
422 ; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
423 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
424 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
425 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
426 ; AVX512NOTDQ-NEXT: retq
427 %d0 = load <64 x i1>, <64 x i1>* %a0
428 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
429 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
430 store <4 x float> %d2, <4 x float>* %a3
433 define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
434 ; AVX512-LABEL: load_v64i1_broadcast_32_v8i1:
436 ; AVX512-NEXT: kmovq (%rdi), %k0
437 ; AVX512-NEXT: kshiftrq $32, %k0, %k0
438 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
439 ; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2
440 ; AVX512-NEXT: vpmovd2m %ymm2, %k1
441 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
442 ; AVX512-NEXT: vmovaps %ymm1, (%rsi)
443 ; AVX512-NEXT: vzeroupper
446 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1:
447 ; AVX512NOTDQ: # %bb.0:
448 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
449 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
450 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
451 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
452 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
453 ; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
454 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
455 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
456 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
457 ; AVX512NOTDQ-NEXT: vzeroupper
458 ; AVX512NOTDQ-NEXT: retq
459 %d0 = load <64 x i1>, <64 x i1>* %a0
460 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
461 %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
462 store <8 x float> %d2, <8 x float>* %a3
465 define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
466 ; AVX512-LABEL: load_v64i1_broadcast_32_v16i1:
468 ; AVX512-NEXT: kmovq (%rdi), %k0
469 ; AVX512-NEXT: kshiftrq $32, %k0, %k0
470 ; AVX512-NEXT: vpmovm2d %k0, %zmm2
471 ; AVX512-NEXT: vpbroadcastd %xmm2, %zmm2
472 ; AVX512-NEXT: vpmovd2m %zmm2, %k1
473 ; AVX512-NEXT: vmovaps %zmm0, %zmm1 {%k1}
474 ; AVX512-NEXT: vmovaps %zmm1, (%rsi)
475 ; AVX512-NEXT: vzeroupper
478 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1:
479 ; AVX512NOTDQ: # %bb.0:
480 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
481 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
482 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
483 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %zmm2
484 ; AVX512NOTDQ-NEXT: vpslld $31, %zmm2, %zmm2
485 ; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1
486 ; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1}
487 ; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi)
488 ; AVX512NOTDQ-NEXT: vzeroupper
489 ; AVX512NOTDQ-NEXT: retq
490 %d0 = load <64 x i1>, <64 x i1>* %a0
491 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
492 %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
493 store <16 x float> %d2, <16 x float>* %a3
496 define void @load_v64i1_broadcast_63_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
497 ; AVX512-LABEL: load_v64i1_broadcast_63_v2i1:
499 ; AVX512-NEXT: kmovq (%rdi), %k0
500 ; AVX512-NEXT: kshiftrq $62, %k0, %k0
501 ; AVX512-NEXT: vpmovm2q %k0, %xmm2
502 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
503 ; AVX512-NEXT: vpmovq2m %xmm2, %k1
504 ; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
505 ; AVX512-NEXT: vmovapd %xmm1, (%rsi)
508 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1:
509 ; AVX512NOTDQ: # %bb.0:
510 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
511 ; AVX512NOTDQ-NEXT: kshiftrq $62, %k0, %k1
512 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
513 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
514 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
515 ; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
516 ; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
517 ; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
518 ; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
519 ; AVX512NOTDQ-NEXT: retq
520 %d0 = load <64 x i1>, <64 x i1>* %a0
521 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
522 %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
523 store <2 x double> %d2, <2 x double>* %a3
526 define void @load_v64i1_broadcast_63_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
527 ; AVX512-LABEL: load_v64i1_broadcast_63_v4i1:
529 ; AVX512-NEXT: kmovq (%rdi), %k0
530 ; AVX512-NEXT: kshiftrq $60, %k0, %k0
531 ; AVX512-NEXT: vpmovm2d %k0, %xmm2
532 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
533 ; AVX512-NEXT: vpmovd2m %xmm2, %k1
534 ; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
535 ; AVX512-NEXT: vmovaps %xmm1, (%rsi)
538 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1:
539 ; AVX512NOTDQ: # %bb.0:
540 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
541 ; AVX512NOTDQ-NEXT: kshiftrq $60, %k0, %k1
542 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
543 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
544 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
545 ; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
546 ; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
547 ; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
548 ; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
549 ; AVX512NOTDQ-NEXT: retq
550 %d0 = load <64 x i1>, <64 x i1>* %a0
551 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
552 %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
553 store <4 x float> %d2, <4 x float>* %a3
556 define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
557 ; AVX512-LABEL: load_v64i1_broadcast_63_v8i1:
559 ; AVX512-NEXT: kmovq (%rdi), %k0
560 ; AVX512-NEXT: kshiftrq $56, %k0, %k0
561 ; AVX512-NEXT: vpmovm2d %k0, %ymm2
562 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
563 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
564 ; AVX512-NEXT: vpmovd2m %ymm2, %k1
565 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
566 ; AVX512-NEXT: vmovaps %ymm1, (%rsi)
567 ; AVX512-NEXT: vzeroupper
570 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1:
571 ; AVX512NOTDQ: # %bb.0:
572 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
573 ; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1
574 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
575 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
576 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7]
577 ; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2]
578 ; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2
579 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
580 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
581 ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
582 ; AVX512NOTDQ-NEXT: vzeroupper
583 ; AVX512NOTDQ-NEXT: retq
584 %d0 = load <64 x i1>, <64 x i1>* %a0
585 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
586 %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
587 store <8 x float> %d2, <8 x float>* %a3
590 define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
591 ; AVX512-LABEL: load_v64i1_broadcast_63_v16i1:
593 ; AVX512-NEXT: kmovq (%rdi), %k0
594 ; AVX512-NEXT: kshiftrq $48, %k0, %k0
595 ; AVX512-NEXT: vpmovm2d %k0, %zmm2
596 ; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
597 ; AVX512-NEXT: vpermd %zmm2, %zmm3, %zmm2
598 ; AVX512-NEXT: vpmovd2m %zmm2, %k1
599 ; AVX512-NEXT: vmovaps %zmm0, %zmm1 {%k1}
600 ; AVX512-NEXT: vmovaps %zmm1, (%rsi)
601 ; AVX512-NEXT: vzeroupper
604 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1:
605 ; AVX512NOTDQ: # %bb.0:
606 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
607 ; AVX512NOTDQ-NEXT: kshiftrq $48, %k0, %k1
608 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
609 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
610 ; AVX512NOTDQ-NEXT: vpermd %zmm2, %zmm3, %zmm2
611 ; AVX512NOTDQ-NEXT: vpslld $31, %zmm2, %zmm2
612 ; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1
613 ; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1}
614 ; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi)
615 ; AVX512NOTDQ-NEXT: vzeroupper
616 ; AVX512NOTDQ-NEXT: retq
617 %d0 = load <64 x i1>, <64 x i1>* %a0
618 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
619 %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
620 store <16 x float> %d2, <16 x float>* %a3
; Extracts bit 1 of a loaded <2 x i1> and stores it as <1 x i1> through %a1.
; NOTE(review): autogenerated assertions — regenerate, don't hand-edit. Inner
; numbering skips 625, 629-630, 642-643: the first prefix's '# %bb.0'/'retq'
; lines and this function's 'ret void'/'}' are elided in this chunk.
623 define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
624 ; AVX512-LABEL: load_v2i1_broadcast_1_v1i1_store:
626 ; AVX512-NEXT: kmovb (%rdi), %k0
627 ; AVX512-NEXT: kshiftrw $1, %k0, %k0
628 ; AVX512-NEXT: kmovb %k0, (%rsi)
631 ; AVX512NOTDQ-LABEL: load_v2i1_broadcast_1_v1i1_store:
632 ; AVX512NOTDQ: # %bb.0:
633 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
634 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
635 ; AVX512NOTDQ-NEXT: kshiftrw $1, %k0, %k0
636 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
637 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
638 ; AVX512NOTDQ-NEXT: retq
639 %d0 = load <2 x i1>, <2 x i1>* %a0
640 %d1 = shufflevector <2 x i1> %d0,<2 x i1> undef,<1 x i32><i32 1>
641 store <1 x i1> %d1, <1 x i1>* %a1
; Extracts bit 1 of a loaded <3 x i1> and stores it as <1 x i1> through %a1.
; NOTE(review): autogenerated assertions — regenerate, don't hand-edit. This
; chunk elides the first prefix's '# %bb.0'/'retq' lines and the function's
; 'ret void'/'}' terminator (gaps in inner numbering).
644 define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
645 ; AVX512-LABEL: load_v3i1_broadcast_1_v1i1_store:
647 ; AVX512-NEXT: kmovb (%rdi), %k0
648 ; AVX512-NEXT: kshiftrw $1, %k0, %k0
649 ; AVX512-NEXT: kmovb %k0, (%rsi)
652 ; AVX512NOTDQ-LABEL: load_v3i1_broadcast_1_v1i1_store:
653 ; AVX512NOTDQ: # %bb.0:
654 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
655 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
656 ; AVX512NOTDQ-NEXT: kshiftrw $1, %k0, %k0
657 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
658 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
659 ; AVX512NOTDQ-NEXT: retq
660 %d0 = load <3 x i1>, <3 x i1>* %a0
661 %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 1>
662 store <1 x i1> %d1, <1 x i1>* %a1
; Extracts bit 2 of a loaded <3 x i1> and stores it as <1 x i1> through %a1.
; NOTE(review): autogenerated assertions — regenerate, don't hand-edit. This
; chunk elides the first prefix's '# %bb.0'/'retq' lines and the function's
; 'ret void'/'}' terminator (gaps in inner numbering).
665 define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
666 ; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store:
668 ; AVX512-NEXT: kmovb (%rdi), %k0
669 ; AVX512-NEXT: kshiftrw $2, %k0, %k0
670 ; AVX512-NEXT: kmovb %k0, (%rsi)
673 ; AVX512NOTDQ-LABEL: load_v3i1_broadcast_2_v1i1_store:
674 ; AVX512NOTDQ: # %bb.0:
675 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
676 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
677 ; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0
678 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
679 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
680 ; AVX512NOTDQ-NEXT: retq
681 %d0 = load <3 x i1>, <3 x i1>* %a0
682 %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 2>
683 store <1 x i1> %d1, <1 x i1>* %a1
; Extracts bit 2 of a loaded <4 x i1> and stores it as <1 x i1> through %a1.
; NOTE(review): autogenerated assertions — regenerate, don't hand-edit. This
; chunk elides the first prefix's '# %bb.0'/'retq' lines and the function's
; 'ret void'/'}' terminator (gaps in inner numbering).
686 define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
687 ; AVX512-LABEL: load_v4i1_broadcast_2_v1i1_store:
689 ; AVX512-NEXT: kmovb (%rdi), %k0
690 ; AVX512-NEXT: kshiftrw $2, %k0, %k0
691 ; AVX512-NEXT: kmovb %k0, (%rsi)
694 ; AVX512NOTDQ-LABEL: load_v4i1_broadcast_2_v1i1_store:
695 ; AVX512NOTDQ: # %bb.0:
696 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
697 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
698 ; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0
699 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
700 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
701 ; AVX512NOTDQ-NEXT: retq
702 %d0 = load <4 x i1>, <4 x i1>* %a0
703 %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 2>
704 store <1 x i1> %d1, <1 x i1>* %a1
; Extracts bit 3 of a loaded <4 x i1> and stores it as <1 x i1> through %a1.
; NOTE(review): autogenerated assertions — regenerate, don't hand-edit. This
; chunk elides the first prefix's '# %bb.0'/'retq' lines and the function's
; 'ret void'/'}' terminator (gaps in inner numbering).
707 define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
708 ; AVX512-LABEL: load_v4i1_broadcast_3_v1i1_store:
710 ; AVX512-NEXT: kmovb (%rdi), %k0
711 ; AVX512-NEXT: kshiftrw $3, %k0, %k0
712 ; AVX512-NEXT: kmovb %k0, (%rsi)
715 ; AVX512NOTDQ-LABEL: load_v4i1_broadcast_3_v1i1_store:
716 ; AVX512NOTDQ: # %bb.0:
717 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
718 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
719 ; AVX512NOTDQ-NEXT: kshiftrw $3, %k0, %k0
720 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
721 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
722 ; AVX512NOTDQ-NEXT: retq
723 %d0 = load <4 x i1>, <4 x i1>* %a0
724 %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 3>
725 store <1 x i1> %d1, <1 x i1>* %a1
; Extracts bit 4 of a loaded <8 x i1> and stores it as <1 x i1> through %a1.
; NOTE(review): autogenerated assertions — regenerate, don't hand-edit. This
; chunk elides the first prefix's '# %bb.0'/'retq' lines and the function's
; 'ret void'/'}' terminator (gaps in inner numbering).
728 define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
729 ; AVX512-LABEL: load_v8i1_broadcast_4_v1i1_store:
731 ; AVX512-NEXT: kmovb (%rdi), %k0
732 ; AVX512-NEXT: kshiftrw $4, %k0, %k0
733 ; AVX512-NEXT: kmovb %k0, (%rsi)
736 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v1i1_store:
737 ; AVX512NOTDQ: # %bb.0:
738 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
739 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
740 ; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k0
741 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
742 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
743 ; AVX512NOTDQ-NEXT: retq
744 %d0 = load <8 x i1>, <8 x i1>* %a0
745 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 4>
746 store <1 x i1> %d1, <1 x i1>* %a1
; Broadcasts bit 4 of a loaded <8 x i1> to a <2 x i1> and stores it through %a1
; (round-trips the mask through vector regs; result written back with kmovb/movb).
; NOTE(review): autogenerated assertions — regenerate, don't hand-edit. This
; chunk elides the first prefix's '# %bb.0'/'retq' lines and the function's
; 'ret void'/'}' terminator (gaps in inner numbering).
749 define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
750 ; AVX512-LABEL: load_v8i1_broadcast_4_v2i1_store:
752 ; AVX512-NEXT: kmovb (%rdi), %k0
753 ; AVX512-NEXT: kshiftrw $4, %k0, %k0
754 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
755 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
756 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
757 ; AVX512-NEXT: kmovb %k0, (%rsi)
760 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1_store:
761 ; AVX512NOTDQ: # %bb.0:
762 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
763 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
764 ; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k1
765 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
766 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
767 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
768 ; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
769 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
770 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
771 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
772 ; AVX512NOTDQ-NEXT: retq
773 %d0 = load <8 x i1>, <8 x i1>* %a0
774 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
775 store <2 x i1> %d1, <2 x i1>* %a1
778 define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
779 ; AVX512-LABEL: load_v8i1_broadcast_7_v1i1_store:
781 ; AVX512-NEXT: kmovb (%rdi), %k0
782 ; AVX512-NEXT: kshiftrw $7, %k0, %k0
783 ; AVX512-NEXT: kmovb %k0, (%rsi)
786 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v1i1_store:
787 ; AVX512NOTDQ: # %bb.0:
788 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
789 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
790 ; AVX512NOTDQ-NEXT: kshiftrw $7, %k0, %k0
791 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
792 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
793 ; AVX512NOTDQ-NEXT: retq
794 %d0 = load <8 x i1>, <8 x i1>* %a0
795 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 7>
796 store <1 x i1> %d1, <1 x i1>* %a1
799 define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
800 ; AVX512-LABEL: load_v8i1_broadcast_7_v2i1_store:
802 ; AVX512-NEXT: kmovb (%rdi), %k0
803 ; AVX512-NEXT: kshiftrw $6, %k0, %k0
804 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
805 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
806 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
807 ; AVX512-NEXT: kmovb %k0, (%rsi)
810 ; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1_store:
811 ; AVX512NOTDQ: # %bb.0:
812 ; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
813 ; AVX512NOTDQ-NEXT: kmovd %eax, %k0
814 ; AVX512NOTDQ-NEXT: kshiftrw $6, %k0, %k1
815 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
816 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
817 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
818 ; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
819 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
820 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
821 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
822 ; AVX512NOTDQ-NEXT: retq
823 %d0 = load <8 x i1>, <8 x i1>* %a0
824 %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
825 store <2 x i1> %d1, <2 x i1>* %a1
828 define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
829 ; AVX512-LABEL: load_v16i1_broadcast_8_v1i1_store:
831 ; AVX512-NEXT: kmovw (%rdi), %k0
832 ; AVX512-NEXT: kshiftrw $8, %k0, %k0
833 ; AVX512-NEXT: kmovb %k0, (%rsi)
836 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v1i1_store:
837 ; AVX512NOTDQ: # %bb.0:
838 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
839 ; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k0
840 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
841 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
842 ; AVX512NOTDQ-NEXT: retq
843 %d0 = load <16 x i1>, <16 x i1>* %a0
844 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 8>
845 store <1 x i1> %d1, <1 x i1>* %a1
848 define void @load_v16i1_broadcast_8_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
849 ; AVX512-LABEL: load_v16i1_broadcast_8_v2i1_store:
851 ; AVX512-NEXT: kmovw (%rdi), %k0
852 ; AVX512-NEXT: kshiftrw $8, %k0, %k0
853 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
854 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
855 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
856 ; AVX512-NEXT: kmovb %k0, (%rsi)
859 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1_store:
860 ; AVX512NOTDQ: # %bb.0:
861 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
862 ; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
863 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
864 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
865 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
866 ; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
867 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
868 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
869 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
870 ; AVX512NOTDQ-NEXT: retq
871 %d0 = load <16 x i1>, <16 x i1>* %a0
872 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
873 store <2 x i1> %d1, <2 x i1>* %a1
876 define void @load_v16i1_broadcast_8_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
877 ; AVX512-LABEL: load_v16i1_broadcast_8_v4i1_store:
879 ; AVX512-NEXT: kmovw (%rdi), %k0
880 ; AVX512-NEXT: kshiftrw $8, %k0, %k0
881 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
882 ; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
883 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
884 ; AVX512-NEXT: kmovb %k0, (%rsi)
887 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1_store:
888 ; AVX512NOTDQ: # %bb.0:
889 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
890 ; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
891 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
892 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
893 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
894 ; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
895 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
896 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
897 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
898 ; AVX512NOTDQ-NEXT: retq
899 %d0 = load <16 x i1>, <16 x i1>* %a0
900 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
901 store <4 x i1> %d1, <4 x i1>* %a1
904 define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
905 ; AVX512-LABEL: load_v16i1_broadcast_15_v1i1_store:
907 ; AVX512-NEXT: kmovw (%rdi), %k0
908 ; AVX512-NEXT: kshiftrw $15, %k0, %k0
909 ; AVX512-NEXT: kmovb %k0, (%rsi)
912 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v1i1_store:
913 ; AVX512NOTDQ: # %bb.0:
914 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
915 ; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
916 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
917 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
918 ; AVX512NOTDQ-NEXT: retq
919 %d0 = load <16 x i1>, <16 x i1>* %a0
920 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 15>
921 store <1 x i1> %d1, <1 x i1>* %a1
924 define void @load_v16i1_broadcast_15_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
925 ; AVX512-LABEL: load_v16i1_broadcast_15_v2i1_store:
927 ; AVX512-NEXT: kmovw (%rdi), %k0
928 ; AVX512-NEXT: kshiftrw $14, %k0, %k0
929 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
930 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
931 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
932 ; AVX512-NEXT: kmovb %k0, (%rsi)
935 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1_store:
936 ; AVX512NOTDQ: # %bb.0:
937 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
938 ; AVX512NOTDQ-NEXT: kshiftrw $14, %k0, %k1
939 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
940 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
941 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
942 ; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
943 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
944 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
945 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
946 ; AVX512NOTDQ-NEXT: retq
947 %d0 = load <16 x i1>, <16 x i1>* %a0
948 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
949 store <2 x i1> %d1, <2 x i1>* %a1
952 define void @load_v16i1_broadcast_15_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
953 ; AVX512-LABEL: load_v16i1_broadcast_15_v4i1_store:
955 ; AVX512-NEXT: kmovw (%rdi), %k0
956 ; AVX512-NEXT: kshiftrw $12, %k0, %k0
957 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
958 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
959 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
960 ; AVX512-NEXT: kmovb %k0, (%rsi)
963 ; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1_store:
964 ; AVX512NOTDQ: # %bb.0:
965 ; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
966 ; AVX512NOTDQ-NEXT: kshiftrw $12, %k0, %k1
967 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
968 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
969 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
970 ; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
971 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
972 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
973 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
974 ; AVX512NOTDQ-NEXT: retq
975 %d0 = load <16 x i1>, <16 x i1>* %a0
976 %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
977 store <4 x i1> %d1, <4 x i1>* %a1
980 define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
981 ; AVX512-LABEL: load_v32i1_broadcast_16_v1i1_store:
983 ; AVX512-NEXT: kmovd (%rdi), %k0
984 ; AVX512-NEXT: kshiftrd $16, %k0, %k0
985 ; AVX512-NEXT: kmovb %k0, (%rsi)
988 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v1i1_store:
989 ; AVX512NOTDQ: # %bb.0:
990 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
991 ; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k0
992 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
993 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
994 ; AVX512NOTDQ-NEXT: retq
995 %d0 = load <32 x i1>, <32 x i1>* %a0
996 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 16>
997 store <1 x i1> %d1, <1 x i1>* %a1
1000 define void @load_v32i1_broadcast_16_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
1001 ; AVX512-LABEL: load_v32i1_broadcast_16_v2i1_store:
1003 ; AVX512-NEXT: kmovd (%rdi), %k0
1004 ; AVX512-NEXT: kshiftrd $16, %k0, %k0
1005 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
1006 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
1007 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
1008 ; AVX512-NEXT: kmovb %k0, (%rsi)
1011 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1_store:
1012 ; AVX512NOTDQ: # %bb.0:
1013 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
1014 ; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
1015 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1016 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
1017 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
1018 ; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
1019 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
1020 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1021 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1022 ; AVX512NOTDQ-NEXT: retq
1023 %d0 = load <32 x i1>, <32 x i1>* %a0
1024 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
1025 store <2 x i1> %d1, <2 x i1>* %a1
1028 define void @load_v32i1_broadcast_16_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
1029 ; AVX512-LABEL: load_v32i1_broadcast_16_v4i1_store:
1031 ; AVX512-NEXT: kmovd (%rdi), %k0
1032 ; AVX512-NEXT: kshiftrd $16, %k0, %k0
1033 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
1034 ; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
1035 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
1036 ; AVX512-NEXT: kmovb %k0, (%rsi)
1039 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1_store:
1040 ; AVX512NOTDQ: # %bb.0:
1041 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
1042 ; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
1043 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1044 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1045 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
1046 ; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
1047 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
1048 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1049 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1050 ; AVX512NOTDQ-NEXT: retq
1051 %d0 = load <32 x i1>, <32 x i1>* %a0
1052 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
1053 store <4 x i1> %d1, <4 x i1>* %a1
1056 define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
1057 ; AVX512-LABEL: load_v32i1_broadcast_16_v8i1_store:
1059 ; AVX512-NEXT: kmovd (%rdi), %k0
1060 ; AVX512-NEXT: kshiftrd $16, %k0, %k0
1061 ; AVX512-NEXT: vpmovm2d %k0, %ymm0
1062 ; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0
1063 ; AVX512-NEXT: vpmovd2m %ymm0, %k0
1064 ; AVX512-NEXT: kmovb %k0, (%rsi)
1065 ; AVX512-NEXT: vzeroupper
1068 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1_store:
1069 ; AVX512NOTDQ: # %bb.0:
1070 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
1071 ; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
1072 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
1073 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
1074 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
1075 ; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
1076 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
1077 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1078 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1079 ; AVX512NOTDQ-NEXT: vzeroupper
1080 ; AVX512NOTDQ-NEXT: retq
1081 %d0 = load <32 x i1>, <32 x i1>* %a0
1082 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
1083 store <8 x i1> %d1, <8 x i1>* %a1
1086 define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
1087 ; AVX512-LABEL: load_v32i1_broadcast_31_v1i1_store:
1089 ; AVX512-NEXT: kmovd (%rdi), %k0
1090 ; AVX512-NEXT: kshiftrd $31, %k0, %k0
1091 ; AVX512-NEXT: kmovb %k0, (%rsi)
1094 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v1i1_store:
1095 ; AVX512NOTDQ: # %bb.0:
1096 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
1097 ; AVX512NOTDQ-NEXT: kshiftrd $31, %k0, %k0
1098 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1099 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1100 ; AVX512NOTDQ-NEXT: retq
1101 %d0 = load <32 x i1>, <32 x i1>* %a0
1102 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 31>
1103 store <1 x i1> %d1, <1 x i1>* %a1
1106 define void @load_v32i1_broadcast_31_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
1107 ; AVX512-LABEL: load_v32i1_broadcast_31_v2i1_store:
1109 ; AVX512-NEXT: kmovd (%rdi), %k0
1110 ; AVX512-NEXT: kshiftrd $30, %k0, %k0
1111 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
1112 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1113 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
1114 ; AVX512-NEXT: kmovb %k0, (%rsi)
1117 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1_store:
1118 ; AVX512NOTDQ: # %bb.0:
1119 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
1120 ; AVX512NOTDQ-NEXT: kshiftrd $30, %k0, %k1
1121 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1122 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
1123 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1124 ; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
1125 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
1126 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1127 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1128 ; AVX512NOTDQ-NEXT: retq
1129 %d0 = load <32 x i1>, <32 x i1>* %a0
1130 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
1131 store <2 x i1> %d1, <2 x i1>* %a1
1134 define void @load_v32i1_broadcast_31_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
1135 ; AVX512-LABEL: load_v32i1_broadcast_31_v4i1_store:
1137 ; AVX512-NEXT: kmovd (%rdi), %k0
1138 ; AVX512-NEXT: kshiftrd $28, %k0, %k0
1139 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
1140 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1141 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
1142 ; AVX512-NEXT: kmovb %k0, (%rsi)
1145 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1_store:
1146 ; AVX512NOTDQ: # %bb.0:
1147 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
1148 ; AVX512NOTDQ-NEXT: kshiftrd $28, %k0, %k1
1149 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1150 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1151 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1152 ; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
1153 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
1154 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1155 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1156 ; AVX512NOTDQ-NEXT: retq
1157 %d0 = load <32 x i1>, <32 x i1>* %a0
1158 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
1159 store <4 x i1> %d1, <4 x i1>* %a1
1162 define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
1163 ; AVX512-LABEL: load_v32i1_broadcast_31_v8i1_store:
1165 ; AVX512-NEXT: kmovd (%rdi), %k0
1166 ; AVX512-NEXT: kshiftrd $24, %k0, %k0
1167 ; AVX512-NEXT: vpmovm2d %k0, %ymm0
1168 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
1169 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
1170 ; AVX512-NEXT: vpmovd2m %ymm0, %k0
1171 ; AVX512-NEXT: kmovb %k0, (%rsi)
1172 ; AVX512-NEXT: vzeroupper
1175 ; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1_store:
1176 ; AVX512NOTDQ: # %bb.0:
1177 ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
1178 ; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1
1179 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
1180 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
1181 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
1182 ; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
1183 ; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
1184 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
1185 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1186 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1187 ; AVX512NOTDQ-NEXT: vzeroupper
1188 ; AVX512NOTDQ-NEXT: retq
1189 %d0 = load <32 x i1>, <32 x i1>* %a0
1190 %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
1191 store <8 x i1> %d1, <8 x i1>* %a1
1194 define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
1195 ; AVX512-LABEL: load_v64i1_broadcast_32_v1i1_store:
1197 ; AVX512-NEXT: kmovq (%rdi), %k0
1198 ; AVX512-NEXT: kshiftrq $32, %k0, %k0
1199 ; AVX512-NEXT: kmovb %k0, (%rsi)
1202 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v1i1_store:
1203 ; AVX512NOTDQ: # %bb.0:
1204 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
1205 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k0
1206 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1207 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1208 ; AVX512NOTDQ-NEXT: retq
1209 %d0 = load <64 x i1>, <64 x i1>* %a0
1210 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 32>
1211 store <1 x i1> %d1, <1 x i1>* %a1
1214 define void @load_v64i1_broadcast_32_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
1215 ; AVX512-LABEL: load_v64i1_broadcast_32_v2i1_store:
1217 ; AVX512-NEXT: kmovq (%rdi), %k0
1218 ; AVX512-NEXT: kshiftrq $32, %k0, %k0
1219 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
1220 ; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
1221 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
1222 ; AVX512-NEXT: kmovb %k0, (%rsi)
1225 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1_store:
1226 ; AVX512NOTDQ: # %bb.0:
1227 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
1228 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
1229 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1230 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
1231 ; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
1232 ; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
1233 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
1234 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1235 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1236 ; AVX512NOTDQ-NEXT: retq
1237 %d0 = load <64 x i1>, <64 x i1>* %a0
1238 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
1239 store <2 x i1> %d1, <2 x i1>* %a1
1242 define void @load_v64i1_broadcast_32_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
1243 ; AVX512-LABEL: load_v64i1_broadcast_32_v4i1_store:
1245 ; AVX512-NEXT: kmovq (%rdi), %k0
1246 ; AVX512-NEXT: kshiftrq $32, %k0, %k0
1247 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
1248 ; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
1249 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
1250 ; AVX512-NEXT: kmovb %k0, (%rsi)
1253 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1_store:
1254 ; AVX512NOTDQ: # %bb.0:
1255 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
1256 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
1257 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1258 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1259 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
1260 ; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
1261 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
1262 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1263 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1264 ; AVX512NOTDQ-NEXT: retq
1265 %d0 = load <64 x i1>, <64 x i1>* %a0
1266 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
1267 store <4 x i1> %d1, <4 x i1>* %a1
1270 define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
1271 ; AVX512-LABEL: load_v64i1_broadcast_32_v8i1_store:
1273 ; AVX512-NEXT: kmovq (%rdi), %k0
1274 ; AVX512-NEXT: kshiftrq $32, %k0, %k0
1275 ; AVX512-NEXT: vpmovm2d %k0, %ymm0
1276 ; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0
1277 ; AVX512-NEXT: vpmovd2m %ymm0, %k0
1278 ; AVX512-NEXT: kmovb %k0, (%rsi)
1279 ; AVX512-NEXT: vzeroupper
1282 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1_store:
1283 ; AVX512NOTDQ: # %bb.0:
1284 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
1285 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
1286 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
1287 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
1288 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
1289 ; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
1290 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
1291 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1292 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1293 ; AVX512NOTDQ-NEXT: vzeroupper
1294 ; AVX512NOTDQ-NEXT: retq
1295 %d0 = load <64 x i1>, <64 x i1>* %a0
1296 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
1297 store <8 x i1> %d1, <8 x i1>* %a1
1300 define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
1301 ; AVX512-LABEL: load_v64i1_broadcast_32_v16i1_store:
1303 ; AVX512-NEXT: kmovq (%rdi), %k0
1304 ; AVX512-NEXT: kshiftrq $32, %k0, %k0
1305 ; AVX512-NEXT: vpmovm2d %k0, %zmm0
1306 ; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0
1307 ; AVX512-NEXT: vpmovd2m %zmm0, %k0
1308 ; AVX512-NEXT: kmovw %k0, (%rsi)
1309 ; AVX512-NEXT: vzeroupper
1312 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1_store:
1313 ; AVX512NOTDQ: # %bb.0:
1314 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
1315 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
1316 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1317 ; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %zmm0
1318 ; AVX512NOTDQ-NEXT: vpslld $31, %zmm0, %zmm0
1319 ; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0
1320 ; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi)
1321 ; AVX512NOTDQ-NEXT: vzeroupper
1322 ; AVX512NOTDQ-NEXT: retq
1323 %d0 = load <64 x i1>, <64 x i1>* %a0
1324 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
1325 store <16 x i1> %d1, <16 x i1>* %a1
1328 define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
1329 ; AVX512-LABEL: load_v64i1_broadcast_63_v1i1_store:
1331 ; AVX512-NEXT: kmovq (%rdi), %k0
1332 ; AVX512-NEXT: kshiftrq $63, %k0, %k0
1333 ; AVX512-NEXT: kmovb %k0, (%rsi)
1336 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v1i1_store:
1337 ; AVX512NOTDQ: # %bb.0:
1338 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
1339 ; AVX512NOTDQ-NEXT: kshiftrq $63, %k0, %k0
1340 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1341 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1342 ; AVX512NOTDQ-NEXT: retq
1343 %d0 = load <64 x i1>, <64 x i1>* %a0
1344 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 63>
1345 store <1 x i1> %d1, <1 x i1>* %a1
1348 define void @load_v64i1_broadcast_63_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
1349 ; AVX512-LABEL: load_v64i1_broadcast_63_v2i1_store:
1351 ; AVX512-NEXT: kmovq (%rdi), %k0
1352 ; AVX512-NEXT: kshiftrq $62, %k0, %k0
1353 ; AVX512-NEXT: vpmovm2q %k0, %xmm0
1354 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1355 ; AVX512-NEXT: vpmovq2m %xmm0, %k0
1356 ; AVX512-NEXT: kmovb %k0, (%rsi)
1359 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1_store:
1360 ; AVX512NOTDQ: # %bb.0:
1361 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
1362 ; AVX512NOTDQ-NEXT: kshiftrq $62, %k0, %k1
1363 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1364 ; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
1365 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1366 ; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
1367 ; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
1368 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1369 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1370 ; AVX512NOTDQ-NEXT: retq
1371 %d0 = load <64 x i1>, <64 x i1>* %a0
1372 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
1373 store <2 x i1> %d1, <2 x i1>* %a1
1376 define void @load_v64i1_broadcast_63_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
1377 ; AVX512-LABEL: load_v64i1_broadcast_63_v4i1_store:
1379 ; AVX512-NEXT: kmovq (%rdi), %k0
1380 ; AVX512-NEXT: kshiftrq $60, %k0, %k0
1381 ; AVX512-NEXT: vpmovm2d %k0, %xmm0
1382 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1383 ; AVX512-NEXT: vpmovd2m %xmm0, %k0
1384 ; AVX512-NEXT: kmovb %k0, (%rsi)
1387 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1_store:
1388 ; AVX512NOTDQ: # %bb.0:
1389 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
1390 ; AVX512NOTDQ-NEXT: kshiftrq $60, %k0, %k1
1391 ; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
1392 ; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
1393 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1394 ; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
1395 ; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
1396 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1397 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1398 ; AVX512NOTDQ-NEXT: retq
1399 %d0 = load <64 x i1>, <64 x i1>* %a0
1400 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
1401 store <4 x i1> %d1, <4 x i1>* %a1
1404 define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
1405 ; AVX512-LABEL: load_v64i1_broadcast_63_v8i1_store:
1407 ; AVX512-NEXT: kmovq (%rdi), %k0
1408 ; AVX512-NEXT: kshiftrq $56, %k0, %k0
1409 ; AVX512-NEXT: vpmovm2d %k0, %ymm0
1410 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
1411 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
1412 ; AVX512-NEXT: vpmovd2m %ymm0, %k0
1413 ; AVX512-NEXT: kmovb %k0, (%rsi)
1414 ; AVX512-NEXT: vzeroupper
1417 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1_store:
1418 ; AVX512NOTDQ: # %bb.0:
1419 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
1420 ; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1
1421 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
1422 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
1423 ; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7]
1424 ; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
1425 ; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0
1426 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
1427 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax
1428 ; AVX512NOTDQ-NEXT: movb %al, (%rsi)
1429 ; AVX512NOTDQ-NEXT: vzeroupper
1430 ; AVX512NOTDQ-NEXT: retq
1431 %d0 = load <64 x i1>, <64 x i1>* %a0
1432 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
1433 store <8 x i1> %d1, <8 x i1>* %a1
1436 define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
1437 ; AVX512-LABEL: load_v64i1_broadcast_63_v16i1_store:
1439 ; AVX512-NEXT: kmovq (%rdi), %k0
1440 ; AVX512-NEXT: kshiftrq $48, %k0, %k0
1441 ; AVX512-NEXT: vpmovm2d %k0, %zmm0
1442 ; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1443 ; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm0
1444 ; AVX512-NEXT: vpmovd2m %zmm0, %k0
1445 ; AVX512-NEXT: kmovw %k0, (%rsi)
1446 ; AVX512-NEXT: vzeroupper
1449 ; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1_store:
1450 ; AVX512NOTDQ: # %bb.0:
1451 ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
1452 ; AVX512NOTDQ-NEXT: kshiftrq $48, %k0, %k1
1453 ; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1454 ; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1455 ; AVX512NOTDQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
1456 ; AVX512NOTDQ-NEXT: vpslld $31, %zmm0, %zmm0
1457 ; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0
1458 ; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi)
1459 ; AVX512NOTDQ-NEXT: vzeroupper
1460 ; AVX512NOTDQ-NEXT: retq
1461 %d0 = load <64 x i1>, <64 x i1>* %a0
1462 %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
1463 store <16 x i1> %d1, <16 x i1>* %a1