1 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
3 // Test AArch64 NEON by-element (lane-indexed) intrinsics and the vector types they use
7 // CHECK-LABEL: @test_vmla_lane_s16(
8 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
9 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
10 // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
11 // CHECK: ret <4 x i16> [[ADD]]
// Multiply-accumulate by element. Expected IR: splat lane 3 of v, mul by b, add to a.
12 int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
13 return vmla_lane_s16(a, b, v, 3);
16 // CHECK-LABEL: @test_vmlaq_lane_s16(
17 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
18 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
19 // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
20 // CHECK: ret <8 x i16> [[ADD]]
// Multiply-accumulate by element. Expected IR: splat lane 3 of v (4 -> 8 lanes), mul by b, add to a.
21 int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
22 return vmlaq_lane_s16(a, b, v, 3);
25 // CHECK-LABEL: @test_vmla_lane_s32(
26 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
27 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
28 // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
29 // CHECK: ret <2 x i32> [[ADD]]
// Multiply-accumulate by element. Expected IR: splat lane 1 of v, mul by b, add to a.
30 int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
31 return vmla_lane_s32(a, b, v, 1);
34 // CHECK-LABEL: @test_vmlaq_lane_s32(
35 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
36 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
37 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
38 // CHECK: ret <4 x i32> [[ADD]]
// Multiply-accumulate by element. Expected IR: splat lane 1 of v (2 -> 4 lanes), mul by b, add to a.
39 int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
40 return vmlaq_lane_s32(a, b, v, 1);
43 // CHECK-LABEL: @test_vmla_laneq_s16(
44 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
45 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
46 // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
47 // CHECK: ret <4 x i16> [[ADD]]
// Multiply-accumulate by element of a 128-bit v. Expected IR: splat lane 7 of v (8 -> 4 lanes), mul by b, add to a.
48 int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
49 return vmla_laneq_s16(a, b, v, 7);
52 // CHECK-LABEL: @test_vmlaq_laneq_s16(
53 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
54 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
55 // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
56 // CHECK: ret <8 x i16> [[ADD]]
// Multiply-accumulate by element of a 128-bit v. Expected IR: splat lane 7 of v, mul by b, add to a.
57 int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
58 return vmlaq_laneq_s16(a, b, v, 7);
61 // CHECK-LABEL: @test_vmla_laneq_s32(
62 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
63 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
64 // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
65 // CHECK: ret <2 x i32> [[ADD]]
// Multiply-accumulate by element of a 128-bit v. Expected IR: splat lane 3 of v (4 -> 2 lanes), mul by b, add to a.
66 int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
67 return vmla_laneq_s32(a, b, v, 3);
70 // CHECK-LABEL: @test_vmlaq_laneq_s32(
71 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
72 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
73 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
74 // CHECK: ret <4 x i32> [[ADD]]
// Multiply-accumulate by element of a 128-bit v. Expected IR: splat lane 3 of v, mul by b, add to a.
75 int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
76 return vmlaq_laneq_s32(a, b, v, 3);
79 // CHECK-LABEL: @test_vmls_lane_s16(
80 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
81 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
82 // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
83 // CHECK: ret <4 x i16> [[SUB]]
// Multiply-subtract by element. Expected IR: splat lane 3 of v, mul by b, sub from a.
84 int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
85 return vmls_lane_s16(a, b, v, 3);
88 // CHECK-LABEL: @test_vmlsq_lane_s16(
89 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
90 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
91 // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
92 // CHECK: ret <8 x i16> [[SUB]]
// Multiply-subtract by element. Expected IR: splat lane 3 of v (4 -> 8 lanes), mul by b, sub from a.
93 int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
94 return vmlsq_lane_s16(a, b, v, 3);
97 // CHECK-LABEL: @test_vmls_lane_s32(
98 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
99 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
100 // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
101 // CHECK: ret <2 x i32> [[SUB]]
// Multiply-subtract by element. Expected IR: splat lane 1 of v, mul by b, sub from a.
102 int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
103 return vmls_lane_s32(a, b, v, 1);
106 // CHECK-LABEL: @test_vmlsq_lane_s32(
107 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
108 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
109 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
110 // CHECK: ret <4 x i32> [[SUB]]
// Multiply-subtract by element. Expected IR: splat lane 1 of v (2 -> 4 lanes), mul by b, sub from a.
111 int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
112 return vmlsq_lane_s32(a, b, v, 1);
115 // CHECK-LABEL: @test_vmls_laneq_s16(
116 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
117 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
118 // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
119 // CHECK: ret <4 x i16> [[SUB]]
// Multiply-subtract by element of a 128-bit v. Expected IR: splat lane 7 of v (8 -> 4 lanes), mul by b, sub from a.
120 int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
121 return vmls_laneq_s16(a, b, v, 7);
124 // CHECK-LABEL: @test_vmlsq_laneq_s16(
125 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
126 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
127 // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
128 // CHECK: ret <8 x i16> [[SUB]]
// Multiply-subtract by element of a 128-bit v. Expected IR: splat lane 7 of v, mul by b, sub from a.
129 int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
130 return vmlsq_laneq_s16(a, b, v, 7);
133 // CHECK-LABEL: @test_vmls_laneq_s32(
134 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
135 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
136 // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
137 // CHECK: ret <2 x i32> [[SUB]]
// Multiply-subtract by element of a 128-bit v. Expected IR: splat lane 3 of v (4 -> 2 lanes), mul by b, sub from a.
138 int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
139 return vmls_laneq_s32(a, b, v, 3);
142 // CHECK-LABEL: @test_vmlsq_laneq_s32(
143 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
144 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
145 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
146 // CHECK: ret <4 x i32> [[SUB]]
// Multiply-subtract by element of a 128-bit v. Expected IR: splat lane 3 of v, mul by b, sub from a.
147 int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
148 return vmlsq_laneq_s32(a, b, v, 3);
151 // CHECK-LABEL: @test_vmul_lane_s16(
152 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
153 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
154 // CHECK: ret <4 x i16> [[MUL]]
// Multiply by element. Expected IR: splat lane 3 of v, mul by a.
155 int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
156 return vmul_lane_s16(a, v, 3);
159 // CHECK-LABEL: @test_vmulq_lane_s16(
160 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
161 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
162 // CHECK: ret <8 x i16> [[MUL]]
// Multiply by element. Expected IR: splat lane 3 of v (4 -> 8 lanes), mul by a.
163 int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
164 return vmulq_lane_s16(a, v, 3);
167 // CHECK-LABEL: @test_vmul_lane_s32(
168 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
169 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
170 // CHECK: ret <2 x i32> [[MUL]]
// Multiply by element. Expected IR: splat lane 1 of v, mul by a.
171 int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
172 return vmul_lane_s32(a, v, 1);
175 // CHECK-LABEL: @test_vmulq_lane_s32(
176 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
177 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
178 // CHECK: ret <4 x i32> [[MUL]]
// Multiply by element. Expected IR: splat lane 1 of v (2 -> 4 lanes), mul by a.
179 int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
180 return vmulq_lane_s32(a, v, 1);
183 // CHECK-LABEL: @test_vmul_lane_u16(
184 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
185 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
186 // CHECK: ret <4 x i16> [[MUL]]
// Unsigned multiply by element. Expected IR matches the signed variant: splat lane 3 of v, mul by a.
187 uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
188 return vmul_lane_u16(a, v, 3);
191 // CHECK-LABEL: @test_vmulq_lane_u16(
192 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
193 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
194 // CHECK: ret <8 x i16> [[MUL]]
// Unsigned multiply by element. Expected IR: splat lane 3 of v (4 -> 8 lanes), mul by a.
195 uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
196 return vmulq_lane_u16(a, v, 3);
199 // CHECK-LABEL: @test_vmul_lane_u32(
200 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
201 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
202 // CHECK: ret <2 x i32> [[MUL]]
// Unsigned multiply by element. Expected IR: splat lane 1 of v, mul by a.
203 uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
204 return vmul_lane_u32(a, v, 1);
207 // CHECK-LABEL: @test_vmulq_lane_u32(
208 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
209 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
210 // CHECK: ret <4 x i32> [[MUL]]
// Unsigned multiply by element. Expected IR: splat lane 1 of v (2 -> 4 lanes), mul by a.
211 uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
212 return vmulq_lane_u32(a, v, 1);
215 // CHECK-LABEL: @test_vmul_laneq_s16(
216 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
217 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
218 // CHECK: ret <4 x i16> [[MUL]]
// Multiply by element of a 128-bit v. Expected IR: splat lane 7 of v (8 -> 4 lanes), mul by a.
219 int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
220 return vmul_laneq_s16(a, v, 7);
223 // CHECK-LABEL: @test_vmulq_laneq_s16(
224 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
225 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
226 // CHECK: ret <8 x i16> [[MUL]]
// Multiply by element of a 128-bit v. Expected IR: splat lane 7 of v, mul by a.
227 int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
228 return vmulq_laneq_s16(a, v, 7);
231 // CHECK-LABEL: @test_vmul_laneq_s32(
232 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
233 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
234 // CHECK: ret <2 x i32> [[MUL]]
// Multiply by element of a 128-bit v. Expected IR: splat lane 3 of v (4 -> 2 lanes), mul by a.
235 int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
236 return vmul_laneq_s32(a, v, 3);
239 // CHECK-LABEL: @test_vmulq_laneq_s32(
240 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
241 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
242 // CHECK: ret <4 x i32> [[MUL]]
// Multiply by element of a 128-bit v. Expected IR: splat lane 3 of v, mul by a.
243 int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
244 return vmulq_laneq_s32(a, v, 3);
247 // CHECK-LABEL: @test_vmul_laneq_u16(
248 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
249 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
250 // CHECK: ret <4 x i16> [[MUL]]
// Unsigned multiply by element of a 128-bit v. Expected IR: splat lane 7 of v (8 -> 4 lanes), mul by a.
251 uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
252 return vmul_laneq_u16(a, v, 7);
255 // CHECK-LABEL: @test_vmulq_laneq_u16(
256 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
257 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
258 // CHECK: ret <8 x i16> [[MUL]]
// Unsigned multiply by element of a 128-bit v. Expected IR: splat lane 7 of v, mul by a.
259 uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
260 return vmulq_laneq_u16(a, v, 7);
263 // CHECK-LABEL: @test_vmul_laneq_u32(
264 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
265 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
266 // CHECK: ret <2 x i32> [[MUL]]
// Unsigned multiply by element of a 128-bit v. Expected IR: splat lane 3 of v (4 -> 2 lanes), mul by a.
267 uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
268 return vmul_laneq_u32(a, v, 3);
271 // CHECK-LABEL: @test_vmulq_laneq_u32(
272 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
273 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
274 // CHECK: ret <4 x i32> [[MUL]]
// Unsigned multiply by element of a 128-bit v. Expected IR: splat lane 3 of v, mul by a.
275 uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
276 return vmulq_laneq_u32(a, v, 3);
279 // CHECK-LABEL: @test_vfma_lane_f32(
280 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
281 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
282 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
283 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
284 // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
285 // CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
286 // CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
287 // CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
288 // CHECK: ret <2 x float> [[FMLA2]]
// Fused multiply-add by element. Expected IR: splat lane 1 of v, then llvm.fma.v2f32(b, lane, a).
289 float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
290 return vfma_lane_f32(a, b, v, 1);
293 // CHECK-LABEL: @test_vfmaq_lane_f32(
294 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
295 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
296 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
297 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
298 // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
299 // CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
300 // CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
301 // CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
302 // CHECK: ret <4 x float> [[FMLA2]]
// Fused multiply-add by element. Expected IR: splat lane 1 of v (2 -> 4 lanes), then llvm.fma.v4f32(b, lane, a).
303 float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
304 return vfmaq_lane_f32(a, b, v, 1);
307 // CHECK-LABEL: @test_vfma_laneq_f32(
308 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
309 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
310 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
311 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
312 // CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
313 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
314 // CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
315 // CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
316 // CHECK: ret <2 x float> [[TMP6]]
// Fused multiply-add by element of a 128-bit v. Expected IR: splat lane 3 of v (4 -> 2 lanes),
// then llvm.fma.v2f32(lane, b, a) -- note the lane is the first fma operand in the laneq form.
317 float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
318 return vfma_laneq_f32(a, b, v, 3);
321 // CHECK-LABEL: @test_vfmaq_laneq_f32(
322 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
323 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
324 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
325 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
326 // CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
327 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
328 // CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
329 // CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
330 // CHECK: ret <4 x float> [[TMP6]]
// Fused multiply-add by element of a 128-bit v. Expected IR: splat lane 3 of v,
// then llvm.fma.v4f32(lane, b, a) -- note the lane is the first fma operand in the laneq form.
331 float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
332 return vfmaq_laneq_f32(a, b, v, 3);
335 // CHECK-LABEL: @test_vfms_lane_f32(
336 // CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
337 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
338 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
339 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
340 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
341 // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
342 // CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
343 // CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
344 // CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
345 // CHECK: ret <2 x float> [[FMLA2]]
// Fused multiply-subtract by element. Expected IR: negate b (fsub from -0.0), splat lane 1 of v,
// then llvm.fma.v2f32(negated b, lane, a).
346 float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
347 return vfms_lane_f32(a, b, v, 1);
350 // CHECK-LABEL: @test_vfmsq_lane_f32(
351 // CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
352 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
353 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
354 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
355 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
356 // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
357 // CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
358 // CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
359 // CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
360 // CHECK: ret <4 x float> [[FMLA2]]
// Fused multiply-subtract by element. Expected IR: negate b (fsub from -0.0), splat lane 1 of v
// (2 -> 4 lanes), then llvm.fma.v4f32(negated b, lane, a).
361 float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
362 return vfmsq_lane_f32(a, b, v, 1);
365 // CHECK-LABEL: @test_vfms_laneq_f32(
366 // CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
367 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
368 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
369 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
370 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
371 // CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
372 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
373 // CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
374 // CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
375 // CHECK: ret <2 x float> [[TMP6]]
// Fused multiply-subtract by element of a 128-bit v. Expected IR: negate b (fsub from -0.0),
// splat lane 3 of v (4 -> 2 lanes), then llvm.fma.v2f32(lane, negated b, a).
376 float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
377 return vfms_laneq_f32(a, b, v, 3);
380 // CHECK-LABEL: @test_vfmsq_laneq_f32(
381 // CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
382 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
383 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
384 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
385 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
386 // CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
387 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
388 // CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
389 // CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
390 // CHECK: ret <4 x float> [[TMP6]]
// Fused multiply-subtract by element of a 128-bit v. Expected IR: negate b (fsub from -0.0),
// splat lane 3 of v, then llvm.fma.v4f32(lane, negated b, a).
391 float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
392 return vfmsq_laneq_f32(a, b, v, 3);
395 // CHECK-LABEL: @test_vfmaq_lane_f64(
396 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
397 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
398 // CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
399 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
400 // CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
401 // CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
402 // CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
403 // CHECK: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
404 // CHECK: ret <2 x double> [[FMLA2]]
// Fused multiply-add by element. Expected IR: splat the single element of v to 2 lanes
// (zeroinitializer mask), then llvm.fma.v2f64(b, lane, a).
405 float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
406 return vfmaq_lane_f64(a, b, v, 0);
409 // CHECK-LABEL: @test_vfmaq_laneq_f64(
410 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
411 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
412 // CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
413 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
414 // CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
415 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
416 // CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
417 // CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
418 // CHECK: ret <2 x double> [[TMP6]]
// Fused multiply-add by element of a 128-bit v. Expected IR: splat lane 1 of v,
// then llvm.fma.v2f64(lane, b, a).
419 float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
420 return vfmaq_laneq_f64(a, b, v, 1);
423 // CHECK-LABEL: @test_vfmsq_lane_f64(
424 // CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
425 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
426 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
427 // CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
428 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
429 // CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
430 // CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
431 // CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
432 // CHECK: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
433 // CHECK: ret <2 x double> [[FMLA2]]
// Fused multiply-subtract by element. Expected IR: negate b (fsub from -0.0), splat the single
// element of v to 2 lanes, then llvm.fma.v2f64(negated b, lane, a).
434 float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
435 return vfmsq_lane_f64(a, b, v, 0);
438 // CHECK-LABEL: @test_vfmsq_laneq_f64(
439 // CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
440 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
441 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
442 // CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
443 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
444 // CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
445 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
446 // CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
447 // CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
448 // CHECK: ret <2 x double> [[TMP6]]
// Fused multiply-subtract by element of a 128-bit v. Expected IR: negate b (fsub from -0.0),
// splat lane 1 of v, then llvm.fma.v2f64(lane, negated b, a).
449 float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
450 return vfmsq_laneq_f64(a, b, v, 1);
453 // CHECK-LABEL: @test_vfmas_laneq_f32(
454 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
455 // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
456 // CHECK: [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
457 // CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
458 // CHECK: ret float [[TMP2]]
// Scalar fused multiply-add by element. Expected IR: extract element 3 of v,
// then llvm.fma.f32(b, element, a).
459 float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
460 return vfmas_laneq_f32(a, b, v, 3);
463 // CHECK-LABEL: @test_vfmsd_lane_f64(
464 // CHECK: [[SUB:%.*]] = fsub double -0.000000e+00, %b
465 // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %v to <8 x i8>
466 // CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
467 // CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
468 // CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a)
469 // CHECK: ret double [[TMP2]]
// Scalar fused multiply-subtract by element. Expected IR: negate b (fsub from -0.0),
// extract element 0 of v, then llvm.fma.f64(negated b, element, a).
470 float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
471 return vfmsd_lane_f64(a, b, v, 0);
474 // CHECK-LABEL: @test_vfmss_laneq_f32(
475 // CHECK: [[SUB:%.*]] = fsub float -0.000000e+00, %b
476 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
477 // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
478 // CHECK: [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
479 // CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
480 // CHECK: ret float [[TMP2]]
// Scalar fused multiply-subtract by element. Expected IR: negate b (fsub from -0.0),
// extract element 3 of v, then llvm.fma.f32(negated b, element, a).
481 float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
482 return vfmss_laneq_f32(a, b, v, 3);
485 // CHECK-LABEL: @test_vfmsd_laneq_f64(
486 // CHECK: [[SUB:%.*]] = fsub double -0.000000e+00, %b
487 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v to <16 x i8>
488 // CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
489 // CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
490 // CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a)
491 // CHECK: ret double [[TMP2]]
// Scalar fused multiply-subtract by element. Expected IR: negate b (fsub from -0.0),
// extract element 1 of v, then llvm.fma.f64(negated b, element, a).
492 float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
493 return vfmsd_laneq_f64(a, b, v, 1);
496 // CHECK-LABEL: @test_vmlal_lane_s16(
497 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
498 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
499 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
500 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
501 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
502 // CHECK: ret <4 x i32> [[ADD]]
// Widening multiply-accumulate by element. Expected IR: splat lane 3 of v,
// llvm.aarch64.neon.smull.v4i32(b, lane), add the <4 x i32> product to a.
503 int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
504 return vmlal_lane_s16(a, b, v, 3);
507 // CHECK-LABEL: @test_vmlal_lane_s32(
508 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
509 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
510 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
511 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
512 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
513 // CHECK: ret <2 x i64> [[ADD]]
// Widening multiply-accumulate by element. Expected IR: splat lane 1 of v,
// llvm.aarch64.neon.smull.v2i64(b, lane), add the <2 x i64> product to a.
514 int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
515 return vmlal_lane_s32(a, b, v, 1);
518 // CHECK-LABEL: @test_vmlal_laneq_s16(
519 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
520 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
521 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
522 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
523 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
524 // CHECK: ret <4 x i32> [[ADD]]
// Widening multiply-accumulate by element of a 128-bit v. Expected IR: splat lane 7 of v
// (8 -> 4 lanes), llvm.aarch64.neon.smull.v4i32(b, lane), add the product to a.
525 int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
526 return vmlal_laneq_s16(a, b, v, 7);
529 // CHECK-LABEL: @test_vmlal_laneq_s32(
530 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
531 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
532 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
533 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
534 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
535 // CHECK: ret <2 x i64> [[ADD]]
// Widening multiply-accumulate by element of a 128-bit v. Expected IR: splat lane 3 of v
// (4 -> 2 lanes), llvm.aarch64.neon.smull.v2i64(b, lane), add the product to a.
536 int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
537 return vmlal_laneq_s32(a, b, v, 3);
540 // CHECK-LABEL: @test_vmlal_high_lane_s16(
541 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
542 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
543 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
544 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
545 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
546 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
547 // CHECK: ret <4 x i32> [[ADD]]
// Widening multiply-accumulate on the high half of b. Expected IR: take elements 4-7 of b,
// splat lane 3 of v, llvm.aarch64.neon.smull.v4i32(high half, lane), add the product to a.
548 int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
549 return vmlal_high_lane_s16(a, b, v, 3);
552 // CHECK-LABEL: @test_vmlal_high_lane_s32(
553 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
554 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
555 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
556 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
557 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
558 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
559 // CHECK: ret <2 x i64> [[ADD]]
// Widening multiply-accumulate on the high half of b. Expected IR: take elements 2-3 of b,
// splat lane 1 of v, llvm.aarch64.neon.smull.v2i64(high half, lane), add the product to a.
560 int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
561 return vmlal_high_lane_s32(a, b, v, 1);
564 // CHECK-LABEL: @test_vmlal_high_laneq_s16(
565 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
566 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
567 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
568 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
569 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
570 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
571 // CHECK: ret <4 x i32> [[ADD]]
// Widening multiply-accumulate on the high half of b, element from a 128-bit v. Expected IR: take
// elements 4-7 of b, splat lane 7 of v, llvm.aarch64.neon.smull.v4i32(high half, lane), add to a.
572 int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
573 return vmlal_high_laneq_s16(a, b, v, 7);
576 // CHECK-LABEL: @test_vmlal_high_laneq_s32(
577 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
578 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
579 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
580 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
581 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
582 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
583 // CHECK: ret <2 x i64> [[ADD]]
// Widening multiply-accumulate on the high half of b, element from a 128-bit v. Expected IR: take
// elements 2-3 of b, splat lane 3 of v, llvm.aarch64.neon.smull.v2i64(high half, lane), add to a.
584 int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
585 return vmlal_high_laneq_s32(a, b, v, 3);
588 // CHECK-LABEL: @test_vmlsl_lane_s16(
589 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
590 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
591 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
592 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
593 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
594 // CHECK: ret <4 x i32> [[SUB]]
// Widening multiply-subtract by element. Expected IR: splat lane 3 of v,
// llvm.aarch64.neon.smull.v4i32(b, lane), sub the <4 x i32> product from a.
595 int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
596 return vmlsl_lane_s16(a, b, v, 3);
599 // CHECK-LABEL: @test_vmlsl_lane_s32(
600 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
601 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
602 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
603 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
604 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
605 // CHECK: ret <2 x i64> [[SUB]]
// Test wrapper: vmlsl_lane_s32 with lane index 1. Expected IR: %b times a
// splat of element 1 of %v via smull.v2i64, subtracted from %a.
606 int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
607 return vmlsl_lane_s32(a, b, v, 1);
610 // CHECK-LABEL: @test_vmlsl_laneq_s16(
611 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
612 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
613 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
614 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
615 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
616 // CHECK: ret <4 x i32> [[SUB]]
// Test wrapper: vmlsl_laneq_s16 with lane index 7 of the 8-element v.
// Expected IR: %b times that splat via smull.v4i32, subtracted from %a.
617 int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
618 return vmlsl_laneq_s16(a, b, v, 7);
621 // CHECK-LABEL: @test_vmlsl_laneq_s32(
622 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
623 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
624 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
625 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
626 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
627 // CHECK: ret <2 x i64> [[SUB]]
// Test wrapper: vmlsl_laneq_s32 with lane index 3 of the 4-element v.
// Expected IR: %b times that splat via smull.v2i64, subtracted from %a.
628 int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
629 return vmlsl_laneq_s32(a, b, v, 3);
632 // CHECK-LABEL: @test_vmlsl_high_lane_s16(
633 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
634 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
635 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
636 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
637 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
638 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
639 // CHECK: ret <4 x i32> [[SUB]]
// Test wrapper: vmlsl_high_lane_s16 with lane index 3. Expected IR: elements
// 4-7 of %b times a splat of element 3 of %v via smull.v4i32, subtracted from %a.
640 int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
641 return vmlsl_high_lane_s16(a, b, v, 3);
644 // CHECK-LABEL: @test_vmlsl_high_lane_s32(
645 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
646 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
647 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
648 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
649 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
650 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
651 // CHECK: ret <2 x i64> [[SUB]]
// Test wrapper: vmlsl_high_lane_s32 with lane index 1. Expected IR: elements
// 2-3 of %b times a splat of element 1 of %v via smull.v2i64, subtracted from %a.
652 int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
653 return vmlsl_high_lane_s32(a, b, v, 1);
656 // CHECK-LABEL: @test_vmlsl_high_laneq_s16(
657 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
658 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
659 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
660 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
661 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
662 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
663 // CHECK: ret <4 x i32> [[SUB]]
// Test wrapper: vmlsl_high_laneq_s16 with lane index 7. Expected IR: elements
// 4-7 of %b times a splat of element 7 of %v via smull.v4i32, subtracted from %a.
664 int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
665 return vmlsl_high_laneq_s16(a, b, v, 7);
668 // CHECK-LABEL: @test_vmlsl_high_laneq_s32(
669 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
670 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
671 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
672 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
673 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
674 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
675 // CHECK: ret <2 x i64> [[SUB]]
// Test wrapper: vmlsl_high_laneq_s32 with lane index 3. Expected IR: elements
// 2-3 of %b times a splat of element 3 of %v via smull.v2i64, subtracted from %a.
676 int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
677 return vmlsl_high_laneq_s32(a, b, v, 3);
680 // CHECK-LABEL: @test_vmlal_lane_u16(
681 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
682 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
683 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
684 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
685 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
686 // CHECK: ret <4 x i32> [[ADD]]
// Test wrapper: vmlal_lane_u16 with lane index 3. Parameters and return use
// the unsigned vector types of the _u16 intrinsic, so the test does not rely
// on implicit signed/unsigned vector conversions. LLVM vector types are
// signless, so the IR expected by the directives above is unaffected.
687 uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v) {
688 return vmlal_lane_u16(a, b, v, 3);
691 // CHECK-LABEL: @test_vmlal_lane_u32(
692 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
693 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
694 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
695 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
696 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
697 // CHECK: ret <2 x i64> [[ADD]]
// Test wrapper: vmlal_lane_u32 with lane index 1. Parameters and return use
// the unsigned vector types of the _u32 intrinsic, so the test does not rely
// on implicit signed/unsigned vector conversions. LLVM vector types are
// signless, so the IR expected by the directives above is unaffected.
698 uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v) {
699 return vmlal_lane_u32(a, b, v, 1);
702 // CHECK-LABEL: @test_vmlal_laneq_u16(
703 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
704 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
705 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
706 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
707 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
708 // CHECK: ret <4 x i32> [[ADD]]
// Test wrapper: vmlal_laneq_u16 with lane index 7. Parameters and return use
// the unsigned vector types of the _u16 intrinsic, so the test does not rely
// on implicit signed/unsigned vector conversions. LLVM vector types are
// signless, so the IR expected by the directives above is unaffected.
709 uint32x4_t test_vmlal_laneq_u16(uint32x4_t a, uint16x4_t b, uint16x8_t v) {
710 return vmlal_laneq_u16(a, b, v, 7);
713 // CHECK-LABEL: @test_vmlal_laneq_u32(
714 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
715 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
716 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
717 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
718 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
719 // CHECK: ret <2 x i64> [[ADD]]
// Test wrapper: vmlal_laneq_u32 with lane index 3. Parameters and return use
// the unsigned vector types of the _u32 intrinsic, so the test does not rely
// on implicit signed/unsigned vector conversions. LLVM vector types are
// signless, so the IR expected by the directives above is unaffected.
720 uint64x2_t test_vmlal_laneq_u32(uint64x2_t a, uint32x2_t b, uint32x4_t v) {
721 return vmlal_laneq_u32(a, b, v, 3);
724 // CHECK-LABEL: @test_vmlal_high_lane_u16(
725 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
726 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
727 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
728 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
729 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
730 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
731 // CHECK: ret <4 x i32> [[ADD]]
// Test wrapper: vmlal_high_lane_u16 with lane index 3. Parameters and return
// use the unsigned vector types of the _u16 intrinsic, so the test does not
// rely on implicit signed/unsigned vector conversions. LLVM vector types are
// signless, so the IR expected by the directives above is unaffected.
732 uint32x4_t test_vmlal_high_lane_u16(uint32x4_t a, uint16x8_t b, uint16x4_t v) {
733 return vmlal_high_lane_u16(a, b, v, 3);
736 // CHECK-LABEL: @test_vmlal_high_lane_u32(
737 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
738 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
739 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
740 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
741 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
742 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
743 // CHECK: ret <2 x i64> [[ADD]]
// Test wrapper: vmlal_high_lane_u32 with lane index 1. Parameters and return
// use the unsigned vector types of the _u32 intrinsic, so the test does not
// rely on implicit signed/unsigned vector conversions. LLVM vector types are
// signless, so the IR expected by the directives above is unaffected.
744 uint64x2_t test_vmlal_high_lane_u32(uint64x2_t a, uint32x4_t b, uint32x2_t v) {
745 return vmlal_high_lane_u32(a, b, v, 1);
748 // CHECK-LABEL: @test_vmlal_high_laneq_u16(
749 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
750 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
751 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
752 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
753 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
754 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
755 // CHECK: ret <4 x i32> [[ADD]]
// Test wrapper: vmlal_high_laneq_u16 with lane index 7. Parameters and return
// use the unsigned vector types of the _u16 intrinsic, so the test does not
// rely on implicit signed/unsigned vector conversions. LLVM vector types are
// signless, so the IR expected by the directives above is unaffected.
756 uint32x4_t test_vmlal_high_laneq_u16(uint32x4_t a, uint16x8_t b, uint16x8_t v) {
757 return vmlal_high_laneq_u16(a, b, v, 7);
760 // CHECK-LABEL: @test_vmlal_high_laneq_u32(
761 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
762 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
763 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
764 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
765 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
766 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
767 // CHECK: ret <2 x i64> [[ADD]]
// Test wrapper: vmlal_high_laneq_u32 with lane index 3. Parameters and return
// use the unsigned vector types of the _u32 intrinsic, so the test does not
// rely on implicit signed/unsigned vector conversions. LLVM vector types are
// signless, so the IR expected by the directives above is unaffected.
768 uint64x2_t test_vmlal_high_laneq_u32(uint64x2_t a, uint32x4_t b, uint32x4_t v) {
769 return vmlal_high_laneq_u32(a, b, v, 3);
772 // CHECK-LABEL: @test_vmlsl_lane_u16(
773 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
774 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
775 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
776 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
777 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
778 // CHECK: ret <4 x i32> [[SUB]]
// Test wrapper: vmlsl_lane_u16 with lane index 3. Parameters and return use
// the unsigned vector types of the _u16 intrinsic, so the test does not rely
// on implicit signed/unsigned vector conversions. LLVM vector types are
// signless, so the IR expected by the directives above is unaffected.
779 uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v) {
780 return vmlsl_lane_u16(a, b, v, 3);
783 // CHECK-LABEL: @test_vmlsl_lane_u32(
784 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
785 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
786 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
787 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
788 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
789 // CHECK: ret <2 x i64> [[SUB]]
// Test wrapper: vmlsl_lane_u32 with lane index 1. Parameters and return use
// the unsigned vector types of the _u32 intrinsic, so the test does not rely
// on implicit signed/unsigned vector conversions. LLVM vector types are
// signless, so the IR expected by the directives above is unaffected.
790 uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v) {
791 return vmlsl_lane_u32(a, b, v, 1);
794 // CHECK-LABEL: @test_vmlsl_laneq_u16(
795 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
796 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
797 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
798 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
799 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
800 // CHECK: ret <4 x i32> [[SUB]]
// Test wrapper: vmlsl_laneq_u16 with lane index 7. Parameters and return use
// the unsigned vector types of the _u16 intrinsic, so the test does not rely
// on implicit signed/unsigned vector conversions. LLVM vector types are
// signless, so the IR expected by the directives above is unaffected.
801 uint32x4_t test_vmlsl_laneq_u16(uint32x4_t a, uint16x4_t b, uint16x8_t v) {
802 return vmlsl_laneq_u16(a, b, v, 7);
805 // CHECK-LABEL: @test_vmlsl_laneq_u32(
806 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
807 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
808 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
809 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
810 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
811 // CHECK: ret <2 x i64> [[SUB]]
// Test wrapper: vmlsl_laneq_u32 with lane index 3. Parameters and return use
// the unsigned vector types of the _u32 intrinsic, so the test does not rely
// on implicit signed/unsigned vector conversions. LLVM vector types are
// signless, so the IR expected by the directives above is unaffected.
812 uint64x2_t test_vmlsl_laneq_u32(uint64x2_t a, uint32x2_t b, uint32x4_t v) {
813 return vmlsl_laneq_u32(a, b, v, 3);
816 // CHECK-LABEL: @test_vmlsl_high_lane_u16(
817 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
818 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
819 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
820 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
821 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
822 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
823 // CHECK: ret <4 x i32> [[SUB]]
// Test wrapper: vmlsl_high_lane_u16 with lane index 3. Parameters and return
// use the unsigned vector types of the _u16 intrinsic, so the test does not
// rely on implicit signed/unsigned vector conversions. LLVM vector types are
// signless, so the IR expected by the directives above is unaffected.
824 uint32x4_t test_vmlsl_high_lane_u16(uint32x4_t a, uint16x8_t b, uint16x4_t v) {
825 return vmlsl_high_lane_u16(a, b, v, 3);
828 // CHECK-LABEL: @test_vmlsl_high_lane_u32(
829 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
830 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
831 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
832 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
833 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
834 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
835 // CHECK: ret <2 x i64> [[SUB]]
// Test wrapper: vmlsl_high_lane_u32 with lane index 1. Parameters and return
// use the unsigned vector types of the _u32 intrinsic, so the test does not
// rely on implicit signed/unsigned vector conversions. LLVM vector types are
// signless, so the IR expected by the directives above is unaffected.
836 uint64x2_t test_vmlsl_high_lane_u32(uint64x2_t a, uint32x4_t b, uint32x2_t v) {
837 return vmlsl_high_lane_u32(a, b, v, 1);
840 // CHECK-LABEL: @test_vmlsl_high_laneq_u16(
841 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
842 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
843 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
844 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
845 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
846 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
847 // CHECK: ret <4 x i32> [[SUB]]
// Test wrapper: vmlsl_high_laneq_u16 with lane index 7. Parameters and return
// use the unsigned vector types of the _u16 intrinsic, so the test does not
// rely on implicit signed/unsigned vector conversions. LLVM vector types are
// signless, so the IR expected by the directives above is unaffected.
848 uint32x4_t test_vmlsl_high_laneq_u16(uint32x4_t a, uint16x8_t b, uint16x8_t v) {
849 return vmlsl_high_laneq_u16(a, b, v, 7);
852 // CHECK-LABEL: @test_vmlsl_high_laneq_u32(
853 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
854 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
855 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
856 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
857 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
858 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
859 // CHECK: ret <2 x i64> [[SUB]]
// Test wrapper: vmlsl_high_laneq_u32 with lane index 3. Parameters and return
// use the unsigned vector types of the _u32 intrinsic, so the test does not
// rely on implicit signed/unsigned vector conversions. LLVM vector types are
// signless, so the IR expected by the directives above is unaffected.
860 uint64x2_t test_vmlsl_high_laneq_u32(uint64x2_t a, uint32x4_t b, uint32x4_t v) {
861 return vmlsl_high_laneq_u32(a, b, v, 3);
864 // CHECK-LABEL: @test_vmull_lane_s16(
865 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
866 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
867 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
868 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
869 // CHECK: ret <4 x i32> [[VMULL2_I]]
// Test wrapper: vmull_lane_s16 with lane index 3. Expected IR: smull.v4i32 of
// %a with a splat of element 3 of %v, returned directly.
870 int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
871 return vmull_lane_s16(a, v, 3);
874 // CHECK-LABEL: @test_vmull_lane_s32(
875 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
876 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
877 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
878 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
879 // CHECK: ret <2 x i64> [[VMULL2_I]]
// Test wrapper: vmull_lane_s32 with lane index 1. Expected IR: smull.v2i64 of
// %a with a splat of element 1 of %v, returned directly.
880 int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
881 return vmull_lane_s32(a, v, 1);
884 // CHECK-LABEL: @test_vmull_lane_u16(
885 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
886 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
887 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
888 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
889 // CHECK: ret <4 x i32> [[VMULL2_I]]
// Test wrapper: vmull_lane_u16 with lane index 3. Expected IR: umull.v4i32 of
// %a with a splat of element 3 of %v, returned directly.
890 uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
891 return vmull_lane_u16(a, v, 3);
894 // CHECK-LABEL: @test_vmull_lane_u32(
895 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
896 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
897 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
898 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
899 // CHECK: ret <2 x i64> [[VMULL2_I]]
// Test wrapper: vmull_lane_u32 with lane index 1. Expected IR: umull.v2i64 of
// %a with a splat of element 1 of %v, returned directly.
900 uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
901 return vmull_lane_u32(a, v, 1);
904 // CHECK-LABEL: @test_vmull_high_lane_s16(
905 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
906 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
907 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
908 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
909 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
910 // CHECK: ret <4 x i32> [[VMULL2_I]]
// Test wrapper: vmull_high_lane_s16 with lane index 3. Expected IR:
// smull.v4i32 of elements 4-7 of %a with a splat of element 3 of %v.
911 int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
912 return vmull_high_lane_s16(a, v, 3);
915 // CHECK-LABEL: @test_vmull_high_lane_s32(
916 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
917 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
918 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
919 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
920 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
921 // CHECK: ret <2 x i64> [[VMULL2_I]]
// Test wrapper: vmull_high_lane_s32 with lane index 1. Expected IR:
// smull.v2i64 of elements 2-3 of %a with a splat of element 1 of %v.
922 int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
923 return vmull_high_lane_s32(a, v, 1);
926 // CHECK-LABEL: @test_vmull_high_lane_u16(
927 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
928 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
929 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
930 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
931 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
932 // CHECK: ret <4 x i32> [[VMULL2_I]]
// Test wrapper: vmull_high_lane_u16 with lane index 3. Expected IR:
// umull.v4i32 of elements 4-7 of %a with a splat of element 3 of %v.
933 uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
934 return vmull_high_lane_u16(a, v, 3);
937 // CHECK-LABEL: @test_vmull_high_lane_u32(
938 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
939 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
940 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
941 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
942 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
943 // CHECK: ret <2 x i64> [[VMULL2_I]]
// Test wrapper: vmull_high_lane_u32 with lane index 1. Expected IR:
// umull.v2i64 of elements 2-3 of %a with a splat of element 1 of %v.
944 uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
945 return vmull_high_lane_u32(a, v, 1);
948 // CHECK-LABEL: @test_vmull_laneq_s16(
949 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
950 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
951 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
952 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
953 // CHECK: ret <4 x i32> [[VMULL2_I]]
// Test wrapper: vmull_laneq_s16 with lane index 7 of the 8-element v.
// Expected IR: smull.v4i32 of %a with that splat, returned directly.
954 int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
955 return vmull_laneq_s16(a, v, 7);
958 // CHECK-LABEL: @test_vmull_laneq_s32(
959 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
960 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
961 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
962 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
963 // CHECK: ret <2 x i64> [[VMULL2_I]]
// Test wrapper: vmull_laneq_s32 with lane index 3 of the 4-element v.
// Expected IR: smull.v2i64 of %a with that splat, returned directly.
964 int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
965 return vmull_laneq_s32(a, v, 3);
968 // CHECK-LABEL: @test_vmull_laneq_u16(
969 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
970 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
971 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
972 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
973 // CHECK: ret <4 x i32> [[VMULL2_I]]
// Test wrapper: vmull_laneq_u16 with lane index 7 of the 8-element v.
// Expected IR: umull.v4i32 of %a with that splat, returned directly.
974 uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
975 return vmull_laneq_u16(a, v, 7);
978 // CHECK-LABEL: @test_vmull_laneq_u32(
979 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
980 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
981 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
982 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
983 // CHECK: ret <2 x i64> [[VMULL2_I]]
// Test wrapper: vmull_laneq_u32 with lane index 3 of the 4-element v.
// Expected IR: umull.v2i64 of %a with that splat, returned directly.
984 uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
985 return vmull_laneq_u32(a, v, 3);
988 // CHECK-LABEL: @test_vmull_high_laneq_s16(
989 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
990 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
991 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
992 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
993 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
994 // CHECK: ret <4 x i32> [[VMULL2_I]]
// Test wrapper: vmull_high_laneq_s16 with lane index 7. Expected IR:
// smull.v4i32 of elements 4-7 of %a with a splat of element 7 of %v.
995 int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
996 return vmull_high_laneq_s16(a, v, 7);
999 // CHECK-LABEL: @test_vmull_high_laneq_s32(
1000 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1001 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1002 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1003 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1004 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1005 // CHECK: ret <2 x i64> [[VMULL2_I]]
// Test wrapper: vmull_high_laneq_s32 with lane index 3. Expected IR:
// smull.v2i64 of elements 2-3 of %a with a splat of element 3 of %v.
1006 int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
1007 return vmull_high_laneq_s32(a, v, 3);
1010 // CHECK-LABEL: @test_vmull_high_laneq_u16(
1011 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1012 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1013 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1014 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1015 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1016 // CHECK: ret <4 x i32> [[VMULL2_I]]
// Test wrapper: vmull_high_laneq_u16 with lane index 7. Expected IR:
// umull.v4i32 of elements 4-7 of %a with a splat of element 7 of %v.
1017 uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
1018 return vmull_high_laneq_u16(a, v, 7);
1021 // CHECK-LABEL: @test_vmull_high_laneq_u32(
1022 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1023 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1024 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1025 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1026 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1027 // CHECK: ret <2 x i64> [[VMULL2_I]]
// Test wrapper: vmull_high_laneq_u32 with lane index 3. Expected IR:
// umull.v2i64 of elements 2-3 of %a with a splat of element 3 of %v.
1028 uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
1029 return vmull_high_laneq_u32(a, v, 3);
1032 // CHECK-LABEL: @test_vqdmlal_lane_s16(
1033 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1034 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1035 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1036 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1037 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
1038 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
1039 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
// Test wrapper: vqdmlal_lane_s16 with lane index 3. Expected IR: sqdmull.v4i32
// of %b with a splat of element 3 of %v, combined with %a via sqadd.v4i32.
1040 int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
1041 return vqdmlal_lane_s16(a, b, v, 3);
1044 // CHECK-LABEL: @test_vqdmlal_lane_s32(
1045 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1046 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1047 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1048 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1049 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
1050 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
1051 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
// Test wrapper: vqdmlal_lane_s32 with lane index 1. Expected IR: sqdmull.v2i64
// of %b with a splat of element 1 of %v, combined with %a via sqadd.v2i64.
1052 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
1053 return vqdmlal_lane_s32(a, b, v, 1);
1056 // CHECK-LABEL: @test_vqdmlal_high_lane_s16(
1057 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1058 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1059 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1060 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1061 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1062 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1063 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
1064 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
// Test wrapper: vqdmlal_high_lane_s16 with lane index 3. Expected IR:
// sqdmull.v4i32 of elements 4-7 of %b with a splat of element 3 of %v,
// combined with %a via sqadd.v4i32.
1065 int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
1066 return vqdmlal_high_lane_s16(a, b, v, 3);
1069 // CHECK-LABEL: @test_vqdmlal_high_lane_s32(
1070 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
1071 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1072 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1073 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1074 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1075 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1076 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
1077 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
// Test wrapper: vqdmlal_high_lane_s32 with lane index 1. Expected IR:
// sqdmull.v2i64 of elements 2-3 of %b with a splat of element 1 of %v,
// combined with %a via sqadd.v2i64.
1078 int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
1079 return vqdmlal_high_lane_s32(a, b, v, 1);
1082 // CHECK-LABEL: @test_vqdmlsl_lane_s16(
1083 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1084 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1085 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1086 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1087 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
1088 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
1089 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
// Test wrapper: vqdmlsl_lane_s16 with lane index 3. Expected IR: sqdmull.v4i32
// of %b with a splat of element 3 of %v, combined with %a via sqsub.v4i32.
1090 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
1091 return vqdmlsl_lane_s16(a, b, v, 3);
1094 // CHECK-LABEL: @test_vqdmlsl_lane_s32(
1095 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1096 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1097 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1098 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1099 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
1100 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
1101 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
// Test wrapper: vqdmlsl_lane_s32 with lane index 1. Expected IR: sqdmull.v2i64
// of %b with a splat of element 1 of %v, combined with %a via sqsub.v2i64.
1102 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
1103 return vqdmlsl_lane_s32(a, b, v, 1);
1106 // CHECK-LABEL: @test_vqdmlsl_high_lane_s16(
1107 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1108 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1109 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1110 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1111 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1112 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1113 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
1114 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
// Test wrapper: vqdmlsl_high_lane_s16 with lane index 3. Expected IR:
// sqdmull.v4i32 of elements 4-7 of %b with a splat of element 3 of %v,
// combined with %a via sqsub.v4i32.
1115 int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
1116 return vqdmlsl_high_lane_s16(a, b, v, 3);
1119 // CHECK-LABEL: @test_vqdmlsl_high_lane_s32(
1120 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
1121 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1122 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1123 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1124 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1125 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1126 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
1127 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
// Test wrapper: vqdmlsl_high_lane_s32 with lane index 1. Expected IR:
// sqdmull.v2i64 of elements 2-3 of %b with a splat of element 1 of %v,
// combined with %a via sqsub.v2i64.
1128 int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
1129 return vqdmlsl_high_lane_s32(a, b, v, 1);
1132 // CHECK-LABEL: @test_vqdmull_lane_s16(
1133 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1134 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1135 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1136 // CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
1137 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1138 // CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
// Calls vqdmull_lane_s16 with 3 as the constant lane argument. Expected IR above: element 3 of v
// is splatted to <4 x i16> and passed together with a to sqdmull.v4i32.
1139 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
1140 return vqdmull_lane_s16(a, v, 3);
1143 // CHECK-LABEL: @test_vqdmull_lane_s32(
1144 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1145 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1146 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1147 // CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
1148 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1149 // CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
// Calls vqdmull_lane_s32 with 1 as the constant lane argument. Expected IR above: element 1 of v
// is splatted to <2 x i32> and passed together with a to sqdmull.v2i64.
1150 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
1151 return vqdmull_lane_s32(a, v, 1);
1154 // CHECK-LABEL: @test_vqdmull_laneq_s16(
1155 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1156 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1157 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1158 // CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
1159 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1160 // CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
// Calls vqdmull_laneq_s16 with 3 as the constant lane argument; v is the 8-element vector here.
// Expected IR above: element 3 of v is splatted to <4 x i16> and fed with a to sqdmull.v4i32.
1161 int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
1162 return vqdmull_laneq_s16(a, v, 3);
1165 // CHECK-LABEL: @test_vqdmull_laneq_s32(
1166 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1167 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1168 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1169 // CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
1170 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1171 // CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
// Calls vqdmull_laneq_s32 with 3 as the constant lane argument; v is the 4-element vector here.
// Expected IR above: element 3 of v is splatted to <2 x i32> and fed with a to sqdmull.v2i64.
1172 int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
1173 return vqdmull_laneq_s32(a, v, 3);
1176 // CHECK-LABEL: @test_vqdmull_high_lane_s16(
1177 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1178 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1179 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1180 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1181 // CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1182 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1183 // CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
// Calls vqdmull_high_lane_s16 with 3 as the constant lane argument. Expected IR above: elements
// 4-7 of a and a splat of element 3 of v are the two operands of sqdmull.v4i32.
1184 int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
1185 return vqdmull_high_lane_s16(a, v, 3);
1188 // CHECK-LABEL: @test_vqdmull_high_lane_s32(
1189 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1190 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1191 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1192 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1193 // CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1194 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1195 // CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
// Calls vqdmull_high_lane_s32 with 1 as the constant lane argument. Expected IR above: elements
// 2-3 of a and a splat of element 1 of v are the two operands of sqdmull.v2i64.
1196 int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
1197 return vqdmull_high_lane_s32(a, v, 1);
1200 // CHECK-LABEL: @test_vqdmull_high_laneq_s16(
1201 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1202 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1203 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1204 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1205 // CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1206 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1207 // CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
// Calls vqdmull_high_laneq_s16 with 7 as the constant lane argument. Expected IR above: elements
// 4-7 of a and a splat of element 7 of the 8-element v are the operands of sqdmull.v4i32.
1208 int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
1209 return vqdmull_high_laneq_s16(a, v, 7);
1212 // CHECK-LABEL: @test_vqdmull_high_laneq_s32(
1213 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1214 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1215 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1216 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1217 // CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1218 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1219 // CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
// Calls vqdmull_high_laneq_s32 with 3 as the constant lane argument. Expected IR above: elements
// 2-3 of a and a splat of element 3 of the 4-element v are the operands of sqdmull.v2i64.
1220 int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
1221 return vqdmull_high_laneq_s32(a, v, 3);
1224 // CHECK-LABEL: @test_vqdmulh_lane_s16(
1225 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1226 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1227 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1228 // CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
1229 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
1230 // CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
// Calls vqdmulh_lane_s16 with 3 as the constant lane argument. Expected IR above: element 3 of v
// is splatted to <4 x i16> and passed together with a to sqdmulh.v4i16.
1231 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
1232 return vqdmulh_lane_s16(a, v, 3);
1235 // CHECK-LABEL: @test_vqdmulhq_lane_s16(
1236 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1237 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
1238 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
1239 // CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
1240 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
1241 // CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
// Calls vqdmulhq_lane_s16 with 3 as the constant lane argument. Expected IR above: element 3 of
// the 4-element v is splatted to <8 x i16> and passed together with a to sqdmulh.v8i16.
1242 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
1243 return vqdmulhq_lane_s16(a, v, 3);
1246 // CHECK-LABEL: @test_vqdmulh_lane_s32(
1247 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1248 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1249 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1250 // CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
1251 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
1252 // CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
// Calls vqdmulh_lane_s32 with 1 as the constant lane argument. Expected IR above: element 1 of v
// is splatted to <2 x i32> and passed together with a to sqdmulh.v2i32.
1253 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
1254 return vqdmulh_lane_s32(a, v, 1);
1257 // CHECK-LABEL: @test_vqdmulhq_lane_s32(
1258 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1259 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1260 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
1261 // CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
1262 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
1263 // CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
// Calls vqdmulhq_lane_s32 with 1 as the constant lane argument. Expected IR above: element 1 of
// the 2-element v is splatted to <4 x i32> and passed together with a to sqdmulh.v4i32.
1264 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
1265 return vqdmulhq_lane_s32(a, v, 1);
1268 // CHECK-LABEL: @test_vqrdmulh_lane_s16(
1269 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1270 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1271 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1272 // CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
1273 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
1274 // CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
// Calls vqrdmulh_lane_s16 with 3 as the constant lane argument. Expected IR above: element 3 of v
// is splatted to <4 x i16> and passed together with a to sqrdmulh.v4i16.
1275 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
1276 return vqrdmulh_lane_s16(a, v, 3);
1279 // CHECK-LABEL: @test_vqrdmulhq_lane_s16(
1280 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1281 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
1282 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
1283 // CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
1284 // CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
1285 // CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
// Calls vqrdmulhq_lane_s16 with 3 as the constant lane argument. Expected IR above: element 3 of
// the 4-element v is splatted to <8 x i16> and passed together with a to sqrdmulh.v8i16.
1286 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
1287 return vqrdmulhq_lane_s16(a, v, 3);
1290 // CHECK-LABEL: @test_vqrdmulh_lane_s32(
1291 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1292 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1293 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1294 // CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
1295 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
1296 // CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
// Calls vqrdmulh_lane_s32 with 1 as the constant lane argument. Expected IR above: element 1 of v
// is splatted to <2 x i32> and passed together with a to sqrdmulh.v2i32.
1297 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
1298 return vqrdmulh_lane_s32(a, v, 1);
1301 // CHECK-LABEL: @test_vqrdmulhq_lane_s32(
1302 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1303 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1304 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
1305 // CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
1306 // CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
1307 // CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
// Calls vqrdmulhq_lane_s32 with 1 as the constant lane argument. Expected IR above: element 1 of
// the 2-element v is splatted to <4 x i32> and passed together with a to sqrdmulh.v4i32.
1308 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
1309 return vqrdmulhq_lane_s32(a, v, 1);
1312 // CHECK-LABEL: @test_vmul_lane_f32(
1313 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
1314 // CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
1315 // CHECK: ret <2 x float> [[MUL]]
// Calls vmul_lane_f32 with 1 as the constant lane argument. Expected IR above: element 1 of v is
// splatted to <2 x float> and multiplied with a by a plain fmul (no intrinsic call).
1316 float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
1317 return vmul_lane_f32(a, v, 1);
1320 // CHECK-LABEL: @test_vmul_lane_f64(
1321 // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
1322 // CHECK: [[TMP1:%.*]] = bitcast <1 x double> %v to <8 x i8>
1323 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
1324 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
1325 // CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
1326 // CHECK: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
1327 // CHECK: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
1328 // CHECK: ret <1 x double> [[TMP5]]
// Calls vmul_lane_f64 with 0 as the constant lane argument. Expected IR above uses a scalar path:
// a is bitcast to double, element 0 is extracted from v, the two are multiplied with a scalar
// fmul, and the product is bitcast back to <1 x double>.
1330 float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
1331 return vmul_lane_f64(a, v, 0);
1334 // CHECK-LABEL: @test_vmulq_lane_f32(
1335 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1336 // CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
1337 // CHECK: ret <4 x float> [[MUL]]
// Calls vmulq_lane_f32 with 1 as the constant lane argument. Expected IR above: element 1 of the
// 2-element v is splatted to <4 x float> and multiplied with a by a plain fmul.
1339 float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
1340 return vmulq_lane_f32(a, v, 1);
1343 // CHECK-LABEL: @test_vmulq_lane_f64(
1344 // CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
1345 // CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
1346 // CHECK: ret <2 x double> [[MUL]]
// Calls vmulq_lane_f64 with 0 as the constant lane argument. Expected IR above: the single element
// of v is splatted to <2 x double> (all-zero shuffle mask) and multiplied with a by a plain fmul.
1347 float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
1348 return vmulq_lane_f64(a, v, 0);
1351 // CHECK-LABEL: @test_vmul_laneq_f32(
1352 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
1353 // CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
1354 // CHECK: ret <2 x float> [[MUL]]
// Calls vmul_laneq_f32 with 3 as the constant lane argument. Expected IR above: element 3 of the
// 4-element v is splatted to <2 x float> and multiplied with a by a plain fmul.
1355 float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
1356 return vmul_laneq_f32(a, v, 3);
1359 // CHECK-LABEL: @test_vmul_laneq_f64(
1360 // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
1361 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
1362 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
1363 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
1364 // CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
1365 // CHECK: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
1366 // CHECK: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
1367 // CHECK: ret <1 x double> [[TMP5]]
// Calls vmul_laneq_f64 with 1 as the constant lane argument. Expected IR above uses a scalar path:
// a is bitcast to double, element 1 is extracted from the 2-element v, the two are multiplied
// with a scalar fmul, and the product is bitcast back to <1 x double>.
1368 float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
1369 return vmul_laneq_f64(a, v, 1);
1372 // CHECK-LABEL: @test_vmulq_laneq_f32(
1373 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1374 // CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
1375 // CHECK: ret <4 x float> [[MUL]]
// Calls vmulq_laneq_f32 with 3 as the constant lane argument. Expected IR above: element 3 of v
// is splatted to <4 x float> and multiplied with a by a plain fmul.
1377 float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
1378 return vmulq_laneq_f32(a, v, 3);
1381 // CHECK-LABEL: @test_vmulq_laneq_f64(
1382 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
1383 // CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
1384 // CHECK: ret <2 x double> [[MUL]]
// Calls vmulq_laneq_f64 with 1 as the constant lane argument. Expected IR above: element 1 of v
// is splatted to <2 x double> and multiplied with a by a plain fmul.
1385 float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
1386 return vmulq_laneq_f64(a, v, 1);
1389 // CHECK-LABEL: @test_vmulx_lane_f32(
1390 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
1391 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1392 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
1393 // CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]])
1394 // CHECK: ret <2 x float> [[VMULX2_I]]
// Calls vmulx_lane_f32 with 1 as the constant lane argument. Expected IR above: element 1 of v is
// splatted to <2 x float> and passed together with a to the fmulx.v2f32 intrinsic.
1395 float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
1396 return vmulx_lane_f32(a, v, 1);
1399 // CHECK-LABEL: @test_vmulxq_lane_f32(
1400 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1401 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1402 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
1403 // CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]])
1404 // CHECK: ret <4 x float> [[VMULX2_I]]
// Calls vmulxq_lane_f32 with 1 as the constant lane argument. Expected IR above: element 1 of the
// 2-element v is splatted to <4 x float> and passed together with a to fmulx.v4f32.
1405 float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
1406 return vmulxq_lane_f32(a, v, 1);
1409 // CHECK-LABEL: @test_vmulxq_lane_f64(
1410 // CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
1411 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
1412 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
1413 // CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]])
1414 // CHECK: ret <2 x double> [[VMULX2_I]]
// Calls vmulxq_lane_f64 with 0 as the constant lane argument. Expected IR above: the single element
// of v is splatted to <2 x double> (all-zero shuffle mask) and passed with a to fmulx.v2f64.
1415 float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
1416 return vmulxq_lane_f64(a, v, 0);
1419 // CHECK-LABEL: @test_vmulx_laneq_f32(
1420 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
1421 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1422 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
1423 // CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]])
1424 // CHECK: ret <2 x float> [[VMULX2_I]]
// Calls vmulx_laneq_f32 with 3 as the constant lane argument. Expected IR above: element 3 of the
// 4-element v is splatted to <2 x float> and passed together with a to fmulx.v2f32.
1425 float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
1426 return vmulx_laneq_f32(a, v, 3);
1429 // CHECK-LABEL: @test_vmulxq_laneq_f32(
1430 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1431 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1432 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
1433 // CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]])
1434 // CHECK: ret <4 x float> [[VMULX2_I]]
// Calls vmulxq_laneq_f32 with 3 as the constant lane argument. Expected IR above: element 3 of v
// is splatted to <4 x float> and passed together with a to fmulx.v4f32.
1435 float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
1436 return vmulxq_laneq_f32(a, v, 3);
1439 // CHECK-LABEL: @test_vmulxq_laneq_f64(
1440 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
1441 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
1442 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
1443 // CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]])
1444 // CHECK: ret <2 x double> [[VMULX2_I]]
// Calls vmulxq_laneq_f64 with 1 as the constant lane argument. Expected IR above: element 1 of v
// is splatted to <2 x double> and passed together with a to fmulx.v2f64.
1445 float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
1446 return vmulxq_laneq_f64(a, v, 1);
1449 // CHECK-LABEL: @test_vmla_lane_s16_0(
1450 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1451 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1452 // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
1453 // CHECK: ret <4 x i16> [[ADD]]
// Lane-0 variant: calls vmla_lane_s16 with 0 as the constant lane argument. Expected IR above:
// element 0 of v is splatted (all-zero shuffle mask), multiplied with b, and the product added to a.
1454 int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
1455 return vmla_lane_s16(a, b, v, 0);
1458 // CHECK-LABEL: @test_vmlaq_lane_s16_0(
1459 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1460 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1461 // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
1462 // CHECK: ret <8 x i16> [[ADD]]
// Lane-0 variant: calls vmlaq_lane_s16 with 0 as the constant lane argument. Expected IR above:
// element 0 of the 4-element v is splatted to <8 x i16>, multiplied with b, and added to a.
1463 int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
1464 return vmlaq_lane_s16(a, b, v, 0);
1467 // CHECK-LABEL: @test_vmla_lane_s32_0(
1468 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1469 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1470 // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
1471 // CHECK: ret <2 x i32> [[ADD]]
// Lane-0 variant: calls vmla_lane_s32 with 0 as the constant lane argument. Expected IR above:
// element 0 of v is splatted to <2 x i32>, multiplied with b, and the product added to a.
1472 int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
1473 return vmla_lane_s32(a, b, v, 0);
1476 // CHECK-LABEL: @test_vmlaq_lane_s32_0(
1477 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1478 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1479 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
1480 // CHECK: ret <4 x i32> [[ADD]]
// Lane-0 variant: calls vmlaq_lane_s32 with 0 as the constant lane argument. Expected IR above:
// element 0 of the 2-element v is splatted to <4 x i32>, multiplied with b, and added to a.
1481 int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
1482 return vmlaq_lane_s32(a, b, v, 0);
1485 // CHECK-LABEL: @test_vmla_laneq_s16_0(
1486 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1487 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1488 // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
1489 // CHECK: ret <4 x i16> [[ADD]]
// Lane-0 variant: calls vmla_laneq_s16 with 0 as the constant lane argument. Expected IR above:
// element 0 of the 8-element v is splatted to <4 x i16>, multiplied with b, and added to a.
1490 int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
1491 return vmla_laneq_s16(a, b, v, 0);
1494 // CHECK-LABEL: @test_vmlaq_laneq_s16_0(
1495 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1496 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1497 // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
1498 // CHECK: ret <8 x i16> [[ADD]]
// Lane-0 variant: calls vmlaq_laneq_s16 with 0 as the constant lane argument. Expected IR above:
// element 0 of v is splatted to <8 x i16>, multiplied with b, and the product added to a.
1499 int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
1500 return vmlaq_laneq_s16(a, b, v, 0);
1503 // CHECK-LABEL: @test_vmla_laneq_s32_0(
1504 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1505 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1506 // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
1507 // CHECK: ret <2 x i32> [[ADD]]
// Lane-0 variant: calls vmla_laneq_s32 with 0 as the constant lane argument. Expected IR above:
// element 0 of the 4-element v is splatted to <2 x i32>, multiplied with b, and added to a.
1508 int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
1509 return vmla_laneq_s32(a, b, v, 0);
1512 // CHECK-LABEL: @test_vmlaq_laneq_s32_0(
1513 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1514 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1515 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
1516 // CHECK: ret <4 x i32> [[ADD]]
// Lane-0 variant: calls vmlaq_laneq_s32 with 0 as the constant lane argument. Expected IR above:
// element 0 of v is splatted to <4 x i32>, multiplied with b, and the product added to a.
1517 int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
1518 return vmlaq_laneq_s32(a, b, v, 0);
1521 // CHECK-LABEL: @test_vmls_lane_s16_0(
1522 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1523 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1524 // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
1525 // CHECK: ret <4 x i16> [[SUB]]
// Lane-0 variant: calls vmls_lane_s16 with 0 as the constant lane argument. Expected IR above:
// element 0 of v is splatted to <4 x i16>, multiplied with b, and the product subtracted from a.
1526 int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
1527 return vmls_lane_s16(a, b, v, 0);
1530 // CHECK-LABEL: @test_vmlsq_lane_s16_0(
1531 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1532 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1533 // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
1534 // CHECK: ret <8 x i16> [[SUB]]
// Lane-0 variant: calls vmlsq_lane_s16 with 0 as the constant lane argument. Expected IR above:
// element 0 of the 4-element v is splatted to <8 x i16>, multiplied with b, and subtracted from a.
1535 int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
1536 return vmlsq_lane_s16(a, b, v, 0);
1539 // CHECK-LABEL: @test_vmls_lane_s32_0(
1540 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1541 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1542 // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
1543 // CHECK: ret <2 x i32> [[SUB]]
// Lane-0 variant: calls vmls_lane_s32 with 0 as the constant lane argument. Expected IR above:
// element 0 of v is splatted to <2 x i32>, multiplied with b, and the product subtracted from a.
1544 int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
1545 return vmls_lane_s32(a, b, v, 0);
1548 // CHECK-LABEL: @test_vmlsq_lane_s32_0(
1549 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1550 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1551 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
1552 // CHECK: ret <4 x i32> [[SUB]]
// Lane-0 variant: calls vmlsq_lane_s32 with 0 as the constant lane argument. Expected IR above:
// element 0 of the 2-element v is splatted to <4 x i32>, multiplied with b, and subtracted from a.
1553 int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
1554 return vmlsq_lane_s32(a, b, v, 0);
1557 // CHECK-LABEL: @test_vmls_laneq_s16_0(
1558 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1559 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1560 // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
1561 // CHECK: ret <4 x i16> [[SUB]]
// Lane-0 variant: calls vmls_laneq_s16 with 0 as the constant lane argument. Expected IR above:
// element 0 of the 8-element v is splatted to <4 x i16>, multiplied with b, and subtracted from a.
1562 int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
1563 return vmls_laneq_s16(a, b, v, 0);
1566 // CHECK-LABEL: @test_vmlsq_laneq_s16_0(
1567 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1568 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1569 // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
1570 // CHECK: ret <8 x i16> [[SUB]]
// Lane-0 variant: calls vmlsq_laneq_s16 with 0 as the constant lane argument. Expected IR above:
// element 0 of v is splatted to <8 x i16>, multiplied with b, and the product subtracted from a.
1571 int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
1572 return vmlsq_laneq_s16(a, b, v, 0);
1575 // CHECK-LABEL: @test_vmls_laneq_s32_0(
1576 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1577 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1578 // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
1579 // CHECK: ret <2 x i32> [[SUB]]
// Lane-0 variant: calls vmls_laneq_s32 with 0 as the constant lane argument. Expected IR above:
// element 0 of the 4-element v is splatted to <2 x i32>, multiplied with b, and subtracted from a.
1580 int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
1581 return vmls_laneq_s32(a, b, v, 0);
1584 // CHECK-LABEL: @test_vmlsq_laneq_s32_0(
1585 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1586 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1587 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
1588 // CHECK: ret <4 x i32> [[SUB]]
// Lane-0 variant: calls vmlsq_laneq_s32 with 0 as the constant lane argument. Expected IR above:
// element 0 of v is splatted to <4 x i32>, multiplied with b, and the product subtracted from a.
1589 int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
1590 return vmlsq_laneq_s32(a, b, v, 0);
1593 // CHECK-LABEL: @test_vmul_lane_s16_0(
1594 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1595 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1596 // CHECK: ret <4 x i16> [[MUL]]
// Lane-0 variant: calls vmul_lane_s16 with 0 as the constant lane argument. Expected IR above:
// element 0 of v is splatted to <4 x i16> (all-zero shuffle mask) and multiplied with a.
1597 int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
1598 return vmul_lane_s16(a, v, 0);
1601 // CHECK-LABEL: @test_vmulq_lane_s16_0(
1602 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1603 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1604 // CHECK: ret <8 x i16> [[MUL]]
// Lane-0 variant: calls vmulq_lane_s16 with 0 as the constant lane argument. Expected IR above:
// element 0 of the 4-element v is splatted to <8 x i16> and multiplied with a.
1605 int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
1606 return vmulq_lane_s16(a, v, 0);
1609 // CHECK-LABEL: @test_vmul_lane_s32_0(
1610 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1611 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1612 // CHECK: ret <2 x i32> [[MUL]]
// Lane-0 variant: calls vmul_lane_s32 with 0 as the constant lane argument. Expected IR above:
// element 0 of v is splatted to <2 x i32> (all-zero shuffle mask) and multiplied with a.
1613 int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
1614 return vmul_lane_s32(a, v, 0);
1617 // CHECK-LABEL: @test_vmulq_lane_s32_0(
1618 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1619 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1620 // CHECK: ret <4 x i32> [[MUL]]
// Lane-0 variant: calls vmulq_lane_s32 with 0 as the constant lane argument. Expected IR above:
// element 0 of the 2-element v is splatted to <4 x i32> and multiplied with a.
1621 int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
1622 return vmulq_lane_s32(a, v, 0);
1625 // CHECK-LABEL: @test_vmul_lane_u16_0(
1626 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1627 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1628 // CHECK: ret <4 x i16> [[MUL]]
// Lane-0 variant on unsigned 16-bit vectors: calls vmul_lane_u16 with 0 as the constant lane
// argument. Expected IR above: element 0 of v is splatted to <4 x i16> and multiplied with a.
1629 uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
1630 return vmul_lane_u16(a, v, 0);
1633 // CHECK-LABEL: @test_vmulq_lane_u16_0(
1634 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1635 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1636 // CHECK: ret <8 x i16> [[MUL]]
// Lane-0 variant on unsigned 16-bit vectors: calls vmulq_lane_u16 with 0 as the constant lane
// argument. Expected IR above: element 0 of the 4-element v is splatted to <8 x i16> and
// multiplied with a.
1637 uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
1638 return vmulq_lane_u16(a, v, 0);
1641 // CHECK-LABEL: @test_vmul_lane_u32_0(
1642 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1643 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1644 // CHECK: ret <2 x i32> [[MUL]]
// Lane-0 variant on unsigned 32-bit vectors: calls vmul_lane_u32 with 0 as the constant lane
// argument. Expected IR above: element 0 of v is splatted to <2 x i32> and multiplied with a.
1645 uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
1646 return vmul_lane_u32(a, v, 0);
1649 // CHECK-LABEL: @test_vmulq_lane_u32_0(
1650 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1651 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1652 // CHECK: ret <4 x i32> [[MUL]]
// Lane-0 variant on unsigned 32-bit vectors: calls vmulq_lane_u32 with 0 as the constant lane
// argument. Expected IR above: element 0 of the 2-element v is splatted to <4 x i32> and
// multiplied with a.
1653 uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
1654 return vmulq_lane_u32(a, v, 0);
1657 // CHECK-LABEL: @test_vmul_laneq_s16_0(
1658 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1659 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1660 // CHECK: ret <4 x i16> [[MUL]]
// Lane-0 variant: calls vmul_laneq_s16 with 0 as the constant lane argument. Expected IR above:
// element 0 of the 8-element v is splatted to <4 x i16> and multiplied with a.
1661 int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
1662 return vmul_laneq_s16(a, v, 0);
1665 // CHECK-LABEL: @test_vmulq_laneq_s16_0(
1666 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1667 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1668 // CHECK: ret <8 x i16> [[MUL]]
// Lane-0 variant: calls vmulq_laneq_s16 with 0 as the constant lane argument. Expected IR above:
// element 0 of v is splatted to <8 x i16> (all-zero shuffle mask) and multiplied with a.
1669 int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
1670 return vmulq_laneq_s16(a, v, 0);
1673 // CHECK-LABEL: @test_vmul_laneq_s32_0(
1674 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1675 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1676 // CHECK: ret <2 x i32> [[MUL]]
// Lane-0 variant: calls vmul_laneq_s32 with 0 as the constant lane argument. Expected IR above:
// element 0 of the 4-element v is splatted to <2 x i32> and multiplied with a.
1677 int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
1678 return vmul_laneq_s32(a, v, 0);
1681 // CHECK-LABEL: @test_vmulq_laneq_s32_0(
1682 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1683 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1684 // CHECK: ret <4 x i32> [[MUL]]
// Lane-0 variant: calls vmulq_laneq_s32 with 0 as the constant lane argument. Expected IR above:
// element 0 of v is splatted to <4 x i32> (all-zero shuffle mask) and multiplied with a.
1685 int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
1686 return vmulq_laneq_s32(a, v, 0);
1689 // CHECK-LABEL: @test_vmul_laneq_u16_0(
1690 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1691 // CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1692 // CHECK: ret <4 x i16> [[MUL]]
// Lane-0 variant on unsigned 16-bit vectors: calls vmul_laneq_u16 with 0 as the constant lane
// argument. Expected IR above: element 0 of the 8-element v is splatted to <4 x i16> and
// multiplied with a.
1693 uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
1694 return vmul_laneq_u16(a, v, 0);
1697 // CHECK-LABEL: @test_vmulq_laneq_u16_0(
1698 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1699 // CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1700 // CHECK: ret <8 x i16> [[MUL]]
// Lane-0 variant on unsigned 16-bit vectors: calls vmulq_laneq_u16 with 0 as the constant lane
// argument. Expected IR above: element 0 of v is splatted to <8 x i16> and multiplied with a.
1701 uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
1702 return vmulq_laneq_u16(a, v, 0);
1705 // CHECK-LABEL: @test_vmul_laneq_u32_0(
1706 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1707 // CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1708 // CHECK: ret <2 x i32> [[MUL]]
// Lane-0 variant on unsigned 32-bit vectors: calls vmul_laneq_u32 with 0 as the constant lane
// argument. Expected IR above: element 0 of the 4-element v is splatted to <2 x i32> and
// multiplied with a.
1709 uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
1710 return vmul_laneq_u32(a, v, 0);
1713 // CHECK-LABEL: @test_vmulq_laneq_u32_0(
1714 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1715 // CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1716 // CHECK: ret <4 x i32> [[MUL]]
// Lane-0 variant on unsigned 32-bit vectors: calls vmulq_laneq_u32 with 0 as the constant lane
// argument. Expected IR above: element 0 of v is splatted to <4 x i32> and multiplied with a.
1717 uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
1718 return vmulq_laneq_u32(a, v, 0);
1721 // CHECK-LABEL: @test_vfma_lane_f32_0(
1722 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1723 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
1724 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1725 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1726 // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
1727 // CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1728 // CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1729 // CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
1730 // CHECK: ret <2 x float> [[FMLA2]]
// Lane-0 variant: calls vfma_lane_f32 with 0 as the constant lane argument. Expected IR above:
// element 0 of v is splatted to <2 x float> and llvm.fma.v2f32 is called with operands
// (b, splat, a), each reaching the call through a round trip of byte-vector bitcasts.
1731 float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
1732 return vfma_lane_f32(a, b, v, 0);
1735 // CHECK-LABEL: @test_vfmaq_lane_f32_0(
1736 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1737 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
1738 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1739 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1740 // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
1741 // CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1742 // CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1743 // CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
1744 // CHECK: ret <4 x float> [[FMLA2]]
// Lane-0 variant: calls vfmaq_lane_f32 with 0 as the constant lane argument. Expected IR above:
// element 0 of the 2-element v is splatted to <4 x float> and llvm.fma.v4f32 is called with
// operands (b, splat, a), each reaching the call through a round trip of byte-vector bitcasts.
1745 float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
1746 return vfmaq_lane_f32(a, b, v, 0);
// Lane-0 form of vfma_laneq_f32. The expected IR narrows the <4 x float>
// vector v to a two-lane splat of element 0 and calls llvm.fma.v2f32 with
// operands (splat, b, a) -- note the splat comes first here, unlike the
// "_lane" variants where it is the second operand.
1749 // CHECK-LABEL: @test_vfma_laneq_f32_0(
1750 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1751 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
1752 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
1753 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1754 // CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1755 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
1756 // CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
1757 // CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
1758 // CHECK: ret <2 x float> [[TMP6]]
1759 float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
1760 return vfma_laneq_f32(a, b, v, 0);
// Lane-0 form of vfmaq_laneq_f32. All three operands are <4 x float>; the
// expected IR splats element 0 of v and calls llvm.fma.v4f32 with operands
// (splat, b, a).
1763 // CHECK-LABEL: @test_vfmaq_laneq_f32_0(
1764 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1765 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
1766 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
1767 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1768 // CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1769 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
1770 // CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
1771 // CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
1772 // CHECK: ret <4 x float> [[TMP6]]
1773 float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
1774 return vfmaq_laneq_f32(a, b, v, 0);
// Lane-0 form of vfms_lane_f32. The expected IR first negates b (fsub from a
// vector of -0.0), then follows the fused multiply-add shape: splat element 0
// of v and call llvm.fma.v2f32 with operands (negated b, splat, a).
1777 // CHECK-LABEL: @test_vfms_lane_f32_0(
1778 // CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
1779 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1780 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
1781 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1782 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1783 // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
1784 // CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1785 // CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1786 // CHECK: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
1787 // CHECK: ret <2 x float> [[FMLA2]]
1788 float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
1789 return vfms_lane_f32(a, b, v, 0);
// Lane-0 form of vfmsq_lane_f32. The expected IR negates the <4 x float> b
// (fsub from -0.0), widens element 0 of the <2 x float> v into a four-lane
// splat, and calls llvm.fma.v4f32 with operands (negated b, splat, a).
1792 // CHECK-LABEL: @test_vfmsq_lane_f32_0(
1793 // CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
1794 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1795 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
1796 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1797 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1798 // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
1799 // CHECK: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1800 // CHECK: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1801 // CHECK: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
1802 // CHECK: ret <4 x float> [[FMLA2]]
1803 float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
1804 return vfmsq_lane_f32(a, b, v, 0);
// Lane-0 form of vfms_laneq_f32. The expected IR negates b (fsub from -0.0),
// narrows the <4 x float> v to a two-lane splat of element 0, and calls
// llvm.fma.v2f32 with operands (splat, negated b, a).
1807 // CHECK-LABEL: @test_vfms_laneq_f32_0(
1808 // CHECK: [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
1809 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1810 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
1811 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
1812 // CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1813 // CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1814 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
1815 // CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
1816 // CHECK: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
1817 // CHECK: ret <2 x float> [[TMP6]]
1818 float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
1819 return vfms_laneq_f32(a, b, v, 0);
// Lane-0 form of vfmsq_laneq_f32. The expected IR negates b (fsub from -0.0),
// splats element 0 of the <4 x float> v, and calls llvm.fma.v4f32 with
// operands (splat, negated b, a).
1822 // CHECK-LABEL: @test_vfmsq_laneq_f32_0(
1823 // CHECK: [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
1824 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1825 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
1826 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
1827 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1828 // CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1829 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
1830 // CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
1831 // CHECK: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
1832 // CHECK: ret <4 x float> [[TMP6]]
1833 float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
1834 return vfmsq_laneq_f32(a, b, v, 0);
// Lane-0 form of vfmaq_laneq_f64 (double precision). The expected IR splats
// element 0 of the <2 x double> v and calls llvm.fma.v2f64 with operands
// (splat, b, a).
1837 // CHECK-LABEL: @test_vfmaq_laneq_f64_0(
1838 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
1839 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
1840 // CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
1841 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
1842 // CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
1843 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
1844 // CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
1845 // CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
1846 // CHECK: ret <2 x double> [[TMP6]]
1847 float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
1848 return vfmaq_laneq_f64(a, b, v, 0);
// Lane-0 form of vfmsq_laneq_f64 (double precision). The expected IR negates
// b (fsub from -0.0), splats element 0 of v, and calls llvm.fma.v2f64 with
// operands (splat, negated b, a).
1851 // CHECK-LABEL: @test_vfmsq_laneq_f64_0(
1852 // CHECK: [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
1853 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
1854 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
1855 // CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
1856 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
1857 // CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
1858 // CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
1859 // CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
1860 // CHECK: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
1861 // CHECK: ret <2 x double> [[TMP6]]
1862 float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
1863 return vfmsq_laneq_f64(a, b, v, 0);
// Lane-0 form of vmlal_lane_s16 (signed widening multiply-accumulate). The
// expected IR splats element 0 of v, multiplies it with b through the
// aarch64.neon.smull intrinsic, and adds the <4 x i32> product to a.
1866 // CHECK-LABEL: @test_vmlal_lane_s16_0(
1867 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1868 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1869 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1870 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
1871 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
1872 // CHECK: ret <4 x i32> [[ADD]]
1873 int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
1874 return vmlal_lane_s16(a, b, v, 0);
// Lane-0 form of vmlal_lane_s32. The expected IR splats element 0 of v,
// multiplies it with b through aarch64.neon.smull (producing <2 x i64>), and
// adds the product to a.
1877 // CHECK-LABEL: @test_vmlal_lane_s32_0(
1878 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1879 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1880 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1881 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
1882 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
1883 // CHECK: ret <2 x i64> [[ADD]]
1884 int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
1885 return vmlal_lane_s32(a, b, v, 0);
// Lane-0 form of vmlal_laneq_s16. The lane source v is the 8-element vector;
// the expected IR narrows it to a four-lane splat of element 0, multiplies
// with b through aarch64.neon.smull, and adds the <4 x i32> product to a.
1888 // CHECK-LABEL: @test_vmlal_laneq_s16_0(
1889 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1890 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1891 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1892 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
1893 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
1894 // CHECK: ret <4 x i32> [[ADD]]
1895 int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
1896 return vmlal_laneq_s16(a, b, v, 0);
// Lane-0 form of vmlal_laneq_s32. The expected IR narrows the <4 x i32> v to
// a two-lane splat of element 0, multiplies with b through
// aarch64.neon.smull, and adds the <2 x i64> product to a.
1899 // CHECK-LABEL: @test_vmlal_laneq_s32_0(
1900 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1901 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1902 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1903 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
1904 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
1905 // CHECK: ret <2 x i64> [[ADD]]
1906 int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
1907 return vmlal_laneq_s32(a, b, v, 0);
// Lane-0 form of vmlal_high_lane_s16. The expected IR extracts elements 4..7
// of b (first shuffle), splats element 0 of v (second shuffle), multiplies
// the two through aarch64.neon.smull, and adds the <4 x i32> product to a.
1910 // CHECK-LABEL: @test_vmlal_high_lane_s16_0(
1911 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1912 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1913 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1914 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1915 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1916 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
1917 // CHECK: ret <4 x i32> [[ADD]]
1918 int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
1919 return vmlal_high_lane_s16(a, b, v, 0);
// Lane-0 form of vmlal_high_lane_s32. The expected IR extracts elements 2..3
// of b, splats element 0 of v, multiplies the two through
// aarch64.neon.smull, and adds the <2 x i64> product to a.
1922 // CHECK-LABEL: @test_vmlal_high_lane_s32_0(
1923 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
1924 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1925 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1926 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1927 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1928 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
1929 // CHECK: ret <2 x i64> [[ADD]]
1930 int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
1931 return vmlal_high_lane_s32(a, b, v, 0);
// Lane-0 form of vmlal_high_laneq_s16. Both b and v are <8 x i16>; the
// expected IR takes elements 4..7 of b, a four-lane splat of element 0 of v,
// multiplies them through aarch64.neon.smull, and adds the product to a.
1934 // CHECK-LABEL: @test_vmlal_high_laneq_s16_0(
1935 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1936 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1937 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1938 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1939 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1940 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
1941 // CHECK: ret <4 x i32> [[ADD]]
1942 int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
1943 return vmlal_high_laneq_s16(a, b, v, 0);
// Lane-0 form of vmlal_high_laneq_s32. Both b and v are <4 x i32>; the
// expected IR takes elements 2..3 of b, a two-lane splat of element 0 of v,
// multiplies them through aarch64.neon.smull, and adds the product to a.
1946 // CHECK-LABEL: @test_vmlal_high_laneq_s32_0(
1947 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
1948 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1949 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1950 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1951 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1952 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
1953 // CHECK: ret <2 x i64> [[ADD]]
1954 int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
1955 return vmlal_high_laneq_s32(a, b, v, 0);
// Lane-0 form of vmlsl_lane_s16 (signed widening multiply-subtract). The
// expected IR splats element 0 of v, multiplies it with b through
// aarch64.neon.smull, and subtracts the <4 x i32> product from a.
1958 // CHECK-LABEL: @test_vmlsl_lane_s16_0(
1959 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1960 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1961 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1962 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
1963 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
1964 // CHECK: ret <4 x i32> [[SUB]]
1965 int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
1966 return vmlsl_lane_s16(a, b, v, 0);
// Lane-0 form of vmlsl_lane_s32. The expected IR splats element 0 of v,
// multiplies it with b through aarch64.neon.smull, and subtracts the
// <2 x i64> product from a.
1969 // CHECK-LABEL: @test_vmlsl_lane_s32_0(
1970 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1971 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1972 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1973 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
1974 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
1975 // CHECK: ret <2 x i64> [[SUB]]
1976 int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
1977 return vmlsl_lane_s32(a, b, v, 0);
// Lane-0 form of vmlsl_laneq_s16. The expected IR narrows the <8 x i16> v to
// a four-lane splat of element 0, multiplies with b through
// aarch64.neon.smull, and subtracts the <4 x i32> product from a.
1980 // CHECK-LABEL: @test_vmlsl_laneq_s16_0(
1981 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1982 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1983 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1984 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
1985 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
1986 // CHECK: ret <4 x i32> [[SUB]]
1987 int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
1988 return vmlsl_laneq_s16(a, b, v, 0);
// Lane-0 form of vmlsl_laneq_s32. The expected IR narrows the <4 x i32> v to
// a two-lane splat of element 0, multiplies with b through
// aarch64.neon.smull, and subtracts the <2 x i64> product from a.
1991 // CHECK-LABEL: @test_vmlsl_laneq_s32_0(
1992 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1993 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1994 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1995 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
1996 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
1997 // CHECK: ret <2 x i64> [[SUB]]
1998 int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
1999 return vmlsl_laneq_s32(a, b, v, 0);
// Lane-0 form of vmlsl_high_lane_s16. The expected IR extracts elements 4..7
// of b, splats element 0 of v, multiplies the two through
// aarch64.neon.smull, and subtracts the <4 x i32> product from a.
2002 // CHECK-LABEL: @test_vmlsl_high_lane_s16_0(
2003 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2004 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2005 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2006 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2007 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2008 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2009 // CHECK: ret <4 x i32> [[SUB]]
2010 int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2011 return vmlsl_high_lane_s16(a, b, v, 0);
// Lane-0 form of vmlsl_high_lane_s32. The expected IR extracts elements 2..3
// of b, splats element 0 of v, multiplies the two through
// aarch64.neon.smull, and subtracts the <2 x i64> product from a.
2014 // CHECK-LABEL: @test_vmlsl_high_lane_s32_0(
2015 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2016 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2017 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2018 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2019 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2020 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2021 // CHECK: ret <2 x i64> [[SUB]]
2022 int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2023 return vmlsl_high_lane_s32(a, b, v, 0);
// Lane-0 form of vmlsl_high_laneq_s16. Both b and v are <8 x i16>; the
// expected IR takes elements 4..7 of b, a four-lane splat of element 0 of v,
// multiplies them through aarch64.neon.smull, and subtracts the product
// from a.
2026 // CHECK-LABEL: @test_vmlsl_high_laneq_s16_0(
2027 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2028 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2029 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2030 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2031 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2032 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2033 // CHECK: ret <4 x i32> [[SUB]]
2034 int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2035 return vmlsl_high_laneq_s16(a, b, v, 0);
// Lane-0 form of vmlsl_high_laneq_s32. Both b and v are <4 x i32>; the
// expected IR takes elements 2..3 of b, a two-lane splat of element 0 of v,
// multiplies them through aarch64.neon.smull, and subtracts the product
// from a.
2038 // CHECK-LABEL: @test_vmlsl_high_laneq_s32_0(
2039 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2040 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2041 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2042 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2043 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2044 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2045 // CHECK: ret <2 x i64> [[SUB]]
2046 int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2047 return vmlsl_high_laneq_s32(a, b, v, 0);
// Lane-0 form of vmlal_lane_u16 (unsigned widening multiply-accumulate). The
// expected IR splats element 0 of v, multiplies it with b through
// aarch64.neon.umull, and adds the <4 x i32> product to a.
// NOTE(review): the signature below uses signed vector types (int32x4_t,
// int16x4_t) for an unsigned "_u16" intrinsic, whereas test_vmull_lane_u16_0
// in this file uses uint* types -- confirm whether this is intentional.
2050 // CHECK-LABEL: @test_vmlal_lane_u16_0(
2051 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2052 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2053 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2054 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
2055 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2056 // CHECK: ret <4 x i32> [[ADD]]
2057 int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2058 return vmlal_lane_u16(a, b, v, 0);
// Lane-0 form of vmlal_lane_u32. The expected IR splats element 0 of v,
// multiplies it with b through aarch64.neon.umull, and adds the <2 x i64>
// product to a.
// NOTE(review): the signature below uses signed vector types (int64x2_t,
// int32x2_t) for an unsigned "_u32" intrinsic, whereas test_vmull_lane_u32_0
// in this file uses uint* types -- confirm whether this is intentional.
2061 // CHECK-LABEL: @test_vmlal_lane_u32_0(
2062 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2063 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2064 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2065 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
2066 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2067 // CHECK: ret <2 x i64> [[ADD]]
2068 int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2069 return vmlal_lane_u32(a, b, v, 0);
// Lane-0 form of vmlal_laneq_u16. The expected IR narrows the <8 x i16> v to
// a four-lane splat of element 0, multiplies with b through
// aarch64.neon.umull, and adds the <4 x i32> product to a.
// NOTE(review): the signature below uses signed vector types for an unsigned
// "_u16" intrinsic, whereas test_vmull_lane_u16_0 in this file uses uint*
// types -- confirm whether this is intentional.
2072 // CHECK-LABEL: @test_vmlal_laneq_u16_0(
2073 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2074 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2075 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2076 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
2077 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2078 // CHECK: ret <4 x i32> [[ADD]]
2079 int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2080 return vmlal_laneq_u16(a, b, v, 0);
// Lane-0 form of vmlal_laneq_u32. The expected IR narrows the <4 x i32> v to
// a two-lane splat of element 0, multiplies with b through
// aarch64.neon.umull, and adds the <2 x i64> product to a.
// NOTE(review): the signature below uses signed vector types for an unsigned
// "_u32" intrinsic, whereas test_vmull_lane_u32_0 in this file uses uint*
// types -- confirm whether this is intentional.
2083 // CHECK-LABEL: @test_vmlal_laneq_u32_0(
2084 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2085 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2086 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2087 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
2088 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2089 // CHECK: ret <2 x i64> [[ADD]]
2090 int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2091 return vmlal_laneq_u32(a, b, v, 0);
// Lane-0 form of vmlal_high_lane_u16. The expected IR extracts elements 4..7
// of b, splats element 0 of v, multiplies the two through
// aarch64.neon.umull, and adds the <4 x i32> product to a.
// NOTE(review): the signature below uses signed vector types for an unsigned
// "_u16" intrinsic, whereas test_vmull_high_lane_u16_0 in this file uses
// uint* types -- confirm whether this is intentional.
2094 // CHECK-LABEL: @test_vmlal_high_lane_u16_0(
2095 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2096 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2097 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2098 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2099 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2100 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2101 // CHECK: ret <4 x i32> [[ADD]]
2102 int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2103 return vmlal_high_lane_u16(a, b, v, 0);
// Lane-0 form of vmlal_high_lane_u32. The expected IR extracts elements 2..3
// of b, splats element 0 of v, multiplies the two through
// aarch64.neon.umull, and adds the <2 x i64> product to a.
// NOTE(review): the signature below uses signed vector types for an unsigned
// "_u32" intrinsic, whereas test_vmull_lane_u32_0 in this file uses uint*
// types -- confirm whether this is intentional.
2106 // CHECK-LABEL: @test_vmlal_high_lane_u32_0(
2107 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2108 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2109 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2110 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2111 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2112 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2113 // CHECK: ret <2 x i64> [[ADD]]
2114 int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2115 return vmlal_high_lane_u32(a, b, v, 0);
// Lane-0 form of vmlal_high_laneq_u16. Both b and v are <8 x i16>; the
// expected IR takes elements 4..7 of b, a four-lane splat of element 0 of v,
// multiplies them through aarch64.neon.umull, and adds the product to a.
// NOTE(review): the signature below uses signed vector types for an unsigned
// "_u16" intrinsic, whereas test_vmull_high_lane_u16_0 in this file uses
// uint* types -- confirm whether this is intentional.
2118 // CHECK-LABEL: @test_vmlal_high_laneq_u16_0(
2119 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2120 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2121 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2122 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2123 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2124 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2125 // CHECK: ret <4 x i32> [[ADD]]
2126 int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2127 return vmlal_high_laneq_u16(a, b, v, 0);
// Lane-0 form of vmlal_high_laneq_u32. Both b and v are <4 x i32>; the
// expected IR takes elements 2..3 of b, a two-lane splat of element 0 of v,
// multiplies them through aarch64.neon.umull, and adds the product to a.
// NOTE(review): the signature below uses signed vector types for an unsigned
// "_u32" intrinsic, whereas test_vmull_lane_u32_0 in this file uses uint*
// types -- confirm whether this is intentional.
2130 // CHECK-LABEL: @test_vmlal_high_laneq_u32_0(
2131 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2132 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2133 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2134 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2135 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2136 // CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2137 // CHECK: ret <2 x i64> [[ADD]]
2138 int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2139 return vmlal_high_laneq_u32(a, b, v, 0);
// Lane-0 form of vmlsl_lane_u16 (unsigned widening multiply-subtract). The
// expected IR splats element 0 of v, multiplies it with b through
// aarch64.neon.umull, and subtracts the <4 x i32> product from a.
// NOTE(review): the signature below uses signed vector types for an unsigned
// "_u16" intrinsic, whereas test_vmull_lane_u16_0 in this file uses uint*
// types -- confirm whether this is intentional.
2142 // CHECK-LABEL: @test_vmlsl_lane_u16_0(
2143 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2144 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2145 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2146 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
2147 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2148 // CHECK: ret <4 x i32> [[SUB]]
2149 int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2150 return vmlsl_lane_u16(a, b, v, 0);
// Lane-0 form of vmlsl_lane_u32. The expected IR splats element 0 of v,
// multiplies it with b through aarch64.neon.umull, and subtracts the
// <2 x i64> product from a.
// NOTE(review): the signature below uses signed vector types for an unsigned
// "_u32" intrinsic, whereas test_vmull_lane_u32_0 in this file uses uint*
// types -- confirm whether this is intentional.
2153 // CHECK-LABEL: @test_vmlsl_lane_u32_0(
2154 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2155 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2156 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2157 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
2158 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2159 // CHECK: ret <2 x i64> [[SUB]]
2160 int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2161 return vmlsl_lane_u32(a, b, v, 0);
// Lane-0 form of vmlsl_laneq_u16. The expected IR narrows the <8 x i16> v to
// a four-lane splat of element 0, multiplies with b through
// aarch64.neon.umull, and subtracts the <4 x i32> product from a.
// NOTE(review): the signature below uses signed vector types for an unsigned
// "_u16" intrinsic, whereas test_vmull_lane_u16_0 in this file uses uint*
// types -- confirm whether this is intentional.
2164 // CHECK-LABEL: @test_vmlsl_laneq_u16_0(
2165 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2166 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2167 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2168 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
2169 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2170 // CHECK: ret <4 x i32> [[SUB]]
2171 int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2172 return vmlsl_laneq_u16(a, b, v, 0);
// Lane-0 form of vmlsl_laneq_u32. The expected IR narrows the <4 x i32> v to
// a two-lane splat of element 0, multiplies with b through
// aarch64.neon.umull, and subtracts the <2 x i64> product from a.
// NOTE(review): the signature below uses signed vector types for an unsigned
// "_u32" intrinsic, whereas test_vmull_lane_u32_0 in this file uses uint*
// types -- confirm whether this is intentional.
2175 // CHECK-LABEL: @test_vmlsl_laneq_u32_0(
2176 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2177 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2178 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2179 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
2180 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2181 // CHECK: ret <2 x i64> [[SUB]]
2182 int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2183 return vmlsl_laneq_u32(a, b, v, 0);
// Lane-0 form of vmlsl_high_lane_u16. The expected IR extracts elements 4..7
// of b, splats element 0 of v, multiplies the two through
// aarch64.neon.umull, and subtracts the <4 x i32> product from a.
// NOTE(review): the signature below uses signed vector types for an unsigned
// "_u16" intrinsic, whereas test_vmull_high_lane_u16_0 in this file uses
// uint* types -- confirm whether this is intentional.
2186 // CHECK-LABEL: @test_vmlsl_high_lane_u16_0(
2187 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2188 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2189 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2190 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2191 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2192 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2193 // CHECK: ret <4 x i32> [[SUB]]
2194 int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2195 return vmlsl_high_lane_u16(a, b, v, 0);
// Lane-0 form of vmlsl_high_lane_u32. The expected IR extracts elements 2..3
// of b, splats element 0 of v, multiplies the two through
// aarch64.neon.umull, and subtracts the <2 x i64> product from a.
// NOTE(review): the signature below uses signed vector types for an unsigned
// "_u32" intrinsic, whereas test_vmull_lane_u32_0 in this file uses uint*
// types -- confirm whether this is intentional.
2198 // CHECK-LABEL: @test_vmlsl_high_lane_u32_0(
2199 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2200 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2201 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2202 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2203 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2204 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2205 // CHECK: ret <2 x i64> [[SUB]]
2206 int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2207 return vmlsl_high_lane_u32(a, b, v, 0);
// Lane-0 form of vmlsl_high_laneq_u16. Both b and v are <8 x i16>; the
// expected IR takes elements 4..7 of b, a four-lane splat of element 0 of v,
// multiplies them through aarch64.neon.umull, and subtracts the product
// from a.
// NOTE(review): the signature below uses signed vector types for an unsigned
// "_u16" intrinsic, whereas test_vmull_high_lane_u16_0 in this file uses
// uint* types -- confirm whether this is intentional.
2210 // CHECK-LABEL: @test_vmlsl_high_laneq_u16_0(
2211 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2212 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2213 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2214 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2215 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2216 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2217 // CHECK: ret <4 x i32> [[SUB]]
2218 int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2219 return vmlsl_high_laneq_u16(a, b, v, 0);
// Lane-0 form of vmlsl_high_laneq_u32. Both b and v are <4 x i32>; the
// expected IR takes elements 2..3 of b, a two-lane splat of element 0 of v,
// multiplies them through aarch64.neon.umull, and subtracts the product
// from a.
// NOTE(review): the signature below uses signed vector types for an unsigned
// "_u32" intrinsic, whereas test_vmull_lane_u32_0 in this file uses uint*
// types -- confirm whether this is intentional.
2222 // CHECK-LABEL: @test_vmlsl_high_laneq_u32_0(
2223 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2224 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2225 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2226 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2227 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2228 // CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2229 // CHECK: ret <2 x i64> [[SUB]]
2230 int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2231 return vmlsl_high_laneq_u32(a, b, v, 0);
// Lane-0 form of vmull_lane_s16 (signed widening multiply, no accumulator).
// The expected IR splats element 0 of v and returns the <4 x i32> result of
// aarch64.neon.smull applied to a and that splat.
2234 // CHECK-LABEL: @test_vmull_lane_s16_0(
2235 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2236 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2237 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2238 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2239 // CHECK: ret <4 x i32> [[VMULL2_I]]
2240 int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
2241 return vmull_lane_s16(a, v, 0);
// Lane-0 form of vmull_lane_s32. The expected IR splats element 0 of v and
// returns the <2 x i64> result of aarch64.neon.smull applied to a and that
// splat.
2244 // CHECK-LABEL: @test_vmull_lane_s32_0(
2245 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2246 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2247 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2248 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2249 // CHECK: ret <2 x i64> [[VMULL2_I]]
2250 int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
2251 return vmull_lane_s32(a, v, 0);
// Lane-0 form of vmull_lane_u16 (unsigned widening multiply). The expected IR
// splats element 0 of v and returns the <4 x i32> result of
// aarch64.neon.umull applied to a and that splat.
2254 // CHECK-LABEL: @test_vmull_lane_u16_0(
2255 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2256 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2257 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2258 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2259 // CHECK: ret <4 x i32> [[VMULL2_I]]
2260 uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
2261 return vmull_lane_u16(a, v, 0);
// Lane-0 form of vmull_lane_u32. The expected IR splats element 0 of v and
// returns the <2 x i64> result of aarch64.neon.umull applied to a and that
// splat.
2264 // CHECK-LABEL: @test_vmull_lane_u32_0(
2265 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2266 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2267 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2268 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2269 // CHECK: ret <2 x i64> [[VMULL2_I]]
2270 uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
2271 return vmull_lane_u32(a, v, 0);
// Lane-0 form of vmull_high_lane_s16. The expected IR extracts elements 4..7
// of a, splats element 0 of v, and returns the <4 x i32> result of
// aarch64.neon.smull applied to the two.
2274 // CHECK-LABEL: @test_vmull_high_lane_s16_0(
2275 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2276 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2277 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2278 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2279 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2280 // CHECK: ret <4 x i32> [[VMULL2_I]]
2281 int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
2282 return vmull_high_lane_s16(a, v, 0);
// Lane-0 form of vmull_high_lane_s32: the expected IR selects elements 2-3
// of %a, splats element 0 of %v, and passes both to
// @llvm.aarch64.neon.smull.v2i64.
2285 // CHECK-LABEL: @test_vmull_high_lane_s32_0(
2286 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2287 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2288 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2289 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2290 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2291 // CHECK: ret <2 x i64> [[VMULL2_I]]
2292 int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
2293 return vmull_high_lane_s32(a, v, 0);
// Lane-0 form of vmull_high_lane_u16: the expected IR selects elements 4-7
// of %a, splats element 0 of %v, and passes both to
// @llvm.aarch64.neon.umull.v4i32.
2296 // CHECK-LABEL: @test_vmull_high_lane_u16_0(
2297 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2298 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2299 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2300 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2301 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2302 // CHECK: ret <4 x i32> [[VMULL2_I]]
2303 uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
2304 return vmull_high_lane_u16(a, v, 0);
// Lane-0 form of vmull_high_lane_u32: the expected IR selects elements 2-3
// of %a, splats element 0 of %v, and passes both to
// @llvm.aarch64.neon.umull.v2i64.
2307 // CHECK-LABEL: @test_vmull_high_lane_u32_0(
2308 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2309 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2310 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2311 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2312 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2313 // CHECK: ret <2 x i64> [[VMULL2_I]]
2314 uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
2315 return vmull_high_lane_u32(a, v, 0);
// Lane-0 form of vmull_laneq_s16: the expected IR narrows the 8-element %v
// to a 4-element splat of element 0 and passes %a and the splat to
// @llvm.aarch64.neon.smull.v4i32.
2318 // CHECK-LABEL: @test_vmull_laneq_s16_0(
2319 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2320 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2321 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2322 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2323 // CHECK: ret <4 x i32> [[VMULL2_I]]
2324 int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
2325 return vmull_laneq_s16(a, v, 0);
// Lane-0 form of vmull_laneq_s32: the expected IR narrows the 4-element %v
// to a 2-element splat of element 0 and passes %a and the splat to
// @llvm.aarch64.neon.smull.v2i64.
2328 // CHECK-LABEL: @test_vmull_laneq_s32_0(
2329 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2330 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2331 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2332 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2333 // CHECK: ret <2 x i64> [[VMULL2_I]]
2334 int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
2335 return vmull_laneq_s32(a, v, 0);
// Lane-0 form of vmull_laneq_u16: the expected IR narrows the 8-element %v
// to a 4-element splat of element 0 and passes %a and the splat to
// @llvm.aarch64.neon.umull.v4i32.
2338 // CHECK-LABEL: @test_vmull_laneq_u16_0(
2339 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2340 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2341 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2342 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2343 // CHECK: ret <4 x i32> [[VMULL2_I]]
2344 uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
2345 return vmull_laneq_u16(a, v, 0);
// Lane-0 form of vmull_laneq_u32: the expected IR narrows the 4-element %v
// to a 2-element splat of element 0 and passes %a and the splat to
// @llvm.aarch64.neon.umull.v2i64.
2348 // CHECK-LABEL: @test_vmull_laneq_u32_0(
2349 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2350 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2351 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2352 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2353 // CHECK: ret <2 x i64> [[VMULL2_I]]
2354 uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
2355 return vmull_laneq_u32(a, v, 0);
// Lane-0 form of vmull_high_laneq_s16: the expected IR selects elements 4-7
// of %a, builds a 4-element splat of element 0 of the 8-element %v, and
// passes both to @llvm.aarch64.neon.smull.v4i32.
2358 // CHECK-LABEL: @test_vmull_high_laneq_s16_0(
2359 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2360 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2361 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2362 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2363 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2364 // CHECK: ret <4 x i32> [[VMULL2_I]]
2365 int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
2366 return vmull_high_laneq_s16(a, v, 0);
// Lane-0 form of vmull_high_laneq_s32: the expected IR selects elements 2-3
// of %a, builds a 2-element splat of element 0 of the 4-element %v, and
// passes both to @llvm.aarch64.neon.smull.v2i64.
2369 // CHECK-LABEL: @test_vmull_high_laneq_s32_0(
2370 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2371 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2372 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2373 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2374 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2375 // CHECK: ret <2 x i64> [[VMULL2_I]]
2376 int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
2377 return vmull_high_laneq_s32(a, v, 0);
// Lane-0 form of vmull_high_laneq_u16: the expected IR selects elements 4-7
// of %a, builds a 4-element splat of element 0 of the 8-element %v, and
// passes both to @llvm.aarch64.neon.umull.v4i32.
2380 // CHECK-LABEL: @test_vmull_high_laneq_u16_0(
2381 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2382 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2383 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2384 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2385 // CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2386 // CHECK: ret <4 x i32> [[VMULL2_I]]
2387 uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
2388 return vmull_high_laneq_u16(a, v, 0);
// Lane-0 form of vmull_high_laneq_u32: the expected IR selects elements 2-3
// of %a, builds a 2-element splat of element 0 of the 4-element %v, and
// passes both to @llvm.aarch64.neon.umull.v2i64.
2391 // CHECK-LABEL: @test_vmull_high_laneq_u32_0(
2392 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2393 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2394 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2395 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2396 // CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2397 // CHECK: ret <2 x i64> [[VMULL2_I]]
2398 uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
2399 return vmull_high_laneq_u32(a, v, 0);
// Lane-0 form of vqdmlal_lane_s16: the expected IR splats element 0 of %v,
// calls @llvm.aarch64.neon.sqdmull.v4i32 on %b and the splat, then combines
// the product with %a through @llvm.aarch64.neon.sqadd.v4i32.
2402 // CHECK-LABEL: @test_vqdmlal_lane_s16_0(
2403 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2404 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2405 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2406 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2407 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
2408 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
2409 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
2410 int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2411 return vqdmlal_lane_s16(a, b, v, 0);
// Lane-0 form of vqdmlal_lane_s32: the expected IR splats element 0 of %v,
// calls @llvm.aarch64.neon.sqdmull.v2i64 on %b and the splat, then combines
// the product with %a through @llvm.aarch64.neon.sqadd.v2i64.
2414 // CHECK-LABEL: @test_vqdmlal_lane_s32_0(
2415 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2416 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2417 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2418 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2419 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
2420 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
2421 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
2422 int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2423 return vqdmlal_lane_s32(a, b, v, 0);
// Lane-0 form of vqdmlal_high_lane_s16: the expected IR selects elements 4-7
// of %b, splats element 0 of %v, multiplies them with
// @llvm.aarch64.neon.sqdmull.v4i32, and combines the product with %a through
// @llvm.aarch64.neon.sqadd.v4i32.
2426 // CHECK-LABEL: @test_vqdmlal_high_lane_s16_0(
2427 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2428 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2429 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2430 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2431 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2432 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2433 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
2434 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
2435 int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2436 return vqdmlal_high_lane_s16(a, b, v, 0);
// Lane-0 form of vqdmlal_high_lane_s32: the expected IR selects elements 2-3
// of %b, splats element 0 of %v, multiplies them with
// @llvm.aarch64.neon.sqdmull.v2i64, and combines the product with %a through
// @llvm.aarch64.neon.sqadd.v2i64.
2439 // CHECK-LABEL: @test_vqdmlal_high_lane_s32_0(
2440 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2441 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2442 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2443 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2444 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2445 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2446 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
2447 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
2448 int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2449 return vqdmlal_high_lane_s32(a, b, v, 0);
// Lane-0 form of vqdmlsl_lane_s16: the expected IR splats element 0 of %v,
// calls @llvm.aarch64.neon.sqdmull.v4i32 on %b and the splat, then combines
// %a and the product through @llvm.aarch64.neon.sqsub.v4i32.
2452 // CHECK-LABEL: @test_vqdmlsl_lane_s16_0(
2453 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2454 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2455 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2456 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2457 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
2458 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
2459 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
2460 int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2461 return vqdmlsl_lane_s16(a, b, v, 0);
// Lane-0 form of vqdmlsl_lane_s32: the expected IR splats element 0 of %v,
// calls @llvm.aarch64.neon.sqdmull.v2i64 on %b and the splat, then combines
// %a and the product through @llvm.aarch64.neon.sqsub.v2i64.
2464 // CHECK-LABEL: @test_vqdmlsl_lane_s32_0(
2465 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2466 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2467 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2468 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2469 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
2470 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
2471 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
2472 int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2473 return vqdmlsl_lane_s32(a, b, v, 0);
// Lane-0 form of vqdmlsl_high_lane_s16: the expected IR selects elements 4-7
// of %b, splats element 0 of %v, multiplies them with
// @llvm.aarch64.neon.sqdmull.v4i32, and combines %a and the product through
// @llvm.aarch64.neon.sqsub.v4i32.
2476 // CHECK-LABEL: @test_vqdmlsl_high_lane_s16_0(
2477 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2478 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2479 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2480 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2481 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2482 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2483 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
2484 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
2485 int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2486 return vqdmlsl_high_lane_s16(a, b, v, 0);
// Lane-0 form of vqdmlsl_high_lane_s32: the expected IR selects elements 2-3
// of %b, splats element 0 of %v, multiplies them with
// @llvm.aarch64.neon.sqdmull.v2i64, and combines %a and the product through
// @llvm.aarch64.neon.sqsub.v2i64.
2489 // CHECK-LABEL: @test_vqdmlsl_high_lane_s32_0(
2490 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2491 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2492 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2493 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2494 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2495 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2496 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
2497 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
2498 int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2499 return vqdmlsl_high_lane_s32(a, b, v, 0);
// Lane-0 form of vqdmull_lane_s16: the expected IR splats element 0 of %v
// and calls @llvm.aarch64.neon.sqdmull.v4i32 on %a and the splat. A bitcast
// of the result to <16 x i8> is also matched, but the ret uses the call
// result directly.
2502 // CHECK-LABEL: @test_vqdmull_lane_s16_0(
2503 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2504 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2505 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2506 // CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2507 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2508 // CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
2509 int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
2510 return vqdmull_lane_s16(a, v, 0);
// Lane-0 form of vqdmull_lane_s32: the expected IR splats element 0 of %v
// and calls @llvm.aarch64.neon.sqdmull.v2i64 on %a and the splat. A bitcast
// of the result to <16 x i8> is also matched, but the ret uses the call
// result directly.
2513 // CHECK-LABEL: @test_vqdmull_lane_s32_0(
2514 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2515 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2516 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2517 // CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2518 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2519 // CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
2520 int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
2521 return vqdmull_lane_s32(a, v, 0);
// Lane-0 form of vqdmull_laneq_s16: the expected IR narrows the 8-element %v
// to a 4-element splat of element 0 and calls
// @llvm.aarch64.neon.sqdmull.v4i32 on %a and the splat. The ret uses the call
// result, not the matched <16 x i8> bitcast.
2524 // CHECK-LABEL: @test_vqdmull_laneq_s16_0(
2525 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2526 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2527 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2528 // CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2529 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2530 // CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
2531 int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
2532 return vqdmull_laneq_s16(a, v, 0);
// Lane-0 form of vqdmull_laneq_s32: the expected IR narrows the 4-element %v
// to a 2-element splat of element 0 and calls
// @llvm.aarch64.neon.sqdmull.v2i64 on %a and the splat. The ret uses the call
// result, not the matched <16 x i8> bitcast.
2535 // CHECK-LABEL: @test_vqdmull_laneq_s32_0(
2536 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2537 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2538 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2539 // CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2540 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2541 // CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
2542 int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
2543 return vqdmull_laneq_s32(a, v, 0);
// Lane-0 form of vqdmull_high_lane_s16: the expected IR selects elements 4-7
// of %a, splats element 0 of %v, and calls @llvm.aarch64.neon.sqdmull.v4i32
// on the two. The ret uses the call result, not the matched <16 x i8> bitcast.
2546 // CHECK-LABEL: @test_vqdmull_high_lane_s16_0(
2547 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2548 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2549 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2550 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2551 // CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2552 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2553 // CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
2554 int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
2555 return vqdmull_high_lane_s16(a, v, 0);
// Lane-0 form of vqdmull_high_lane_s32: the expected IR selects elements 2-3
// of %a, splats element 0 of %v, and calls @llvm.aarch64.neon.sqdmull.v2i64
// on the two. The ret uses the call result, not the matched <16 x i8> bitcast.
2558 // CHECK-LABEL: @test_vqdmull_high_lane_s32_0(
2559 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2560 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2561 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2562 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2563 // CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2564 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2565 // CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
2566 int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
2567 return vqdmull_high_lane_s32(a, v, 0);
// Lane-0 form of vqdmull_high_laneq_s16: the expected IR selects elements 4-7
// of %a, builds a 4-element splat of element 0 of the 8-element %v, and calls
// @llvm.aarch64.neon.sqdmull.v4i32 on the two. The ret uses the call result,
// not the matched <16 x i8> bitcast.
2570 // CHECK-LABEL: @test_vqdmull_high_laneq_s16_0(
2571 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2572 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2573 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2574 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2575 // CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2576 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2577 // CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
2578 int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
2579 return vqdmull_high_laneq_s16(a, v, 0);
// Lane-0 form of vqdmull_high_laneq_s32: the expected IR selects elements 2-3
// of %a, builds a 2-element splat of element 0 of the 4-element %v, and calls
// @llvm.aarch64.neon.sqdmull.v2i64 on the two. The ret uses the call result,
// not the matched <16 x i8> bitcast.
2582 // CHECK-LABEL: @test_vqdmull_high_laneq_s32_0(
2583 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2584 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2585 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2586 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2587 // CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2588 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2589 // CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
2590 int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
2591 return vqdmull_high_laneq_s32(a, v, 0);
// Lane-0 form of vqdmulh_lane_s16: the expected IR splats element 0 of %v
// and calls @llvm.aarch64.neon.sqdmulh.v4i16 on %a and the splat. The ret
// uses the call result, not the matched <8 x i8> bitcast.
2594 // CHECK-LABEL: @test_vqdmulh_lane_s16_0(
2595 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2596 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2597 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2598 // CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2599 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
2600 // CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
2601 int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
2602 return vqdmulh_lane_s16(a, v, 0);
// Lane-0 form of vqdmulhq_lane_s16: the expected IR widens the 4-element %v
// to an 8-element splat of element 0 and calls
// @llvm.aarch64.neon.sqdmulh.v8i16 on %a and the splat. The ret uses the call
// result, not the matched <16 x i8> bitcast.
2605 // CHECK-LABEL: @test_vqdmulhq_lane_s16_0(
2606 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
2607 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
2608 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
2609 // CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
2610 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
2611 // CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
2612 int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
2613 return vqdmulhq_lane_s16(a, v, 0);
// Lane-0 form of vqdmulh_lane_s32: the expected IR splats element 0 of %v
// and calls @llvm.aarch64.neon.sqdmulh.v2i32 on %a and the splat. The ret
// uses the call result, not the matched <8 x i8> bitcast.
2616 // CHECK-LABEL: @test_vqdmulh_lane_s32_0(
2617 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2618 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2619 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2620 // CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2621 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
2622 // CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
2623 int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
2624 return vqdmulh_lane_s32(a, v, 0);
// Lane-0 form of vqdmulhq_lane_s32: the expected IR widens the 2-element %v
// to a 4-element splat of element 0 and calls
// @llvm.aarch64.neon.sqdmulh.v4i32 on %a and the splat. The ret uses the call
// result, not the matched <16 x i8> bitcast.
2627 // CHECK-LABEL: @test_vqdmulhq_lane_s32_0(
2628 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
2629 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2630 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
2631 // CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
2632 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
2633 // CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
2634 int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
2635 return vqdmulhq_lane_s32(a, v, 0);
// Lane-0 form of vqrdmulh_lane_s16: the expected IR splats element 0 of %v
// and calls @llvm.aarch64.neon.sqrdmulh.v4i16 on %a and the splat. The ret
// uses the call result, not the matched <8 x i8> bitcast.
2638 // CHECK-LABEL: @test_vqrdmulh_lane_s16_0(
2639 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2640 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2641 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2642 // CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2643 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
2644 // CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
2645 int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
2646 return vqrdmulh_lane_s16(a, v, 0);
// Lane-0 form of vqrdmulhq_lane_s16: the expected IR widens the 4-element %v
// to an 8-element splat of element 0 and calls
// @llvm.aarch64.neon.sqrdmulh.v8i16 on %a and the splat.
// NOTE(review): unlike the neighbouring vqrdmulh_lane and vqdmulhq_lane tests,
// no directive here matches a bitcast of the call result to <16 x i8> before
// the ret; confirm whether that omission is intentional.
2649 // CHECK-LABEL: @test_vqrdmulhq_lane_s16_0(
2650 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
2651 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
2652 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
2653 // CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
2654 // CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
2655 int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
2656 return vqrdmulhq_lane_s16(a, v, 0);
// Lane-0 form of vqrdmulh_lane_s32: the expected IR splats element 0 of %v
// and calls @llvm.aarch64.neon.sqrdmulh.v2i32 on %a and the splat. The ret
// uses the call result, not the matched <8 x i8> bitcast.
2659 // CHECK-LABEL: @test_vqrdmulh_lane_s32_0(
2660 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2661 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2662 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2663 // CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2664 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
2665 // CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
2666 int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
2667 return vqrdmulh_lane_s32(a, v, 0);
// Lane-0 form of vqrdmulhq_lane_s32: the expected IR widens the 2-element %v
// to a 4-element splat of element 0 and calls
// @llvm.aarch64.neon.sqrdmulh.v4i32 on %a and the splat.
// NOTE(review): unlike the neighbouring vqrdmulh_lane and vqdmulhq_lane tests,
// no directive here matches a bitcast of the call result to <16 x i8> before
// the ret; confirm whether that omission is intentional.
2670 // CHECK-LABEL: @test_vqrdmulhq_lane_s32_0(
2671 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
2672 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2673 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
2674 // CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
2675 // CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
2676 int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
2677 return vqrdmulhq_lane_s32(a, v, 0);
// Lane-0 form of vmul_lane_f32: the expected IR splats element 0 of %v and
// multiplies it with %a using a plain fmul (no intrinsic call).
2680 // CHECK-LABEL: @test_vmul_lane_f32_0(
2681 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
2682 // CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
2683 // CHECK: ret <2 x float> [[MUL]]
2684 float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
2685 return vmul_lane_f32(a, v, 0);
// Lane-0 form of vmulq_lane_f32: the expected IR widens the 2-element %v to
// a 4-element splat of element 0 and multiplies it with %a using fmul.
2688 // CHECK-LABEL: @test_vmulq_lane_f32_0(
2689 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
2690 // CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
2691 // CHECK: ret <4 x float> [[MUL]]
2692 float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
2693 return vmulq_lane_f32(a, v, 0);
// Lane-0 form of vmul_laneq_f32: the expected IR narrows the 4-element %v to
// a 2-element splat of element 0 and multiplies it with %a using fmul.
2696 // CHECK-LABEL: @test_vmul_laneq_f32_0(
2697 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
2698 // CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
2699 // CHECK: ret <2 x float> [[MUL]]
2700 float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) {
2701 return vmul_laneq_f32(a, v, 0);
// Lane-0 form of vmul_laneq_f64: the expected IR uses a scalar path rather
// than a shuffle. %a is round-tripped through <8 x i8> to a scalar double,
// element 0 of %v is extracted, the two are multiplied with a scalar fmul,
// and the product is bitcast back to <1 x double>.
2704 // CHECK-LABEL: @test_vmul_laneq_f64_0(
2705 // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
2706 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
2707 // CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
2708 // CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
2709 // CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
2710 // CHECK: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
2711 // CHECK: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
2712 // CHECK: ret <1 x double> [[TMP5]]
2713 float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
2714 return vmul_laneq_f64(a, v, 0);
// Lane-0 form of vmulq_laneq_f32: the expected IR splats element 0 of %v
// across four lanes and multiplies it with %a using fmul.
2717 // CHECK-LABEL: @test_vmulq_laneq_f32_0(
2718 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
2719 // CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
2720 // CHECK: ret <4 x float> [[MUL]]
2721 float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
2722 return vmulq_laneq_f32(a, v, 0);
// Lane-0 form of vmulq_laneq_f64: the expected IR splats element 0 of %v
// across two lanes and multiplies it with %a using fmul.
2725 // CHECK-LABEL: @test_vmulq_laneq_f64_0(
2726 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
2727 // CHECK: [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
2728 // CHECK: ret <2 x double> [[MUL]]
2729 float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
2730 return vmulq_laneq_f64(a, v, 0);
// Lane-0 form of vmulx_lane_f32: the expected IR splats element 0 of %v and
// passes %a and the splat to @llvm.aarch64.neon.fmulx.v2f32.
2733 // CHECK-LABEL: @test_vmulx_lane_f32_0(
2734 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
2735 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
2736 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
2737 // CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]])
2738 // CHECK: ret <2 x float> [[VMULX2_I]]
2739 float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
2740 return vmulx_lane_f32(a, v, 0);
// Lane-0 form of vmulxq_lane_f32: the expected IR widens the 2-element %v to
// a 4-element splat of element 0 and passes %a and the splat to
// @llvm.aarch64.neon.fmulx.v4f32.
2743 // CHECK-LABEL: @test_vmulxq_lane_f32_0(
2744 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
2745 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
2746 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
2747 // CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]])
2748 // CHECK: ret <4 x float> [[VMULX2_I]]
2749 float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
2750 return vmulxq_lane_f32(a, v, 0);
// Lane-0 form of vmulxq_lane_f64: the expected IR duplicates the single
// element of the <1 x double> %v into two lanes and passes %a and the splat
// to @llvm.aarch64.neon.fmulx.v2f64.
2753 // CHECK-LABEL: @test_vmulxq_lane_f64_0(
2754 // CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
2755 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
2756 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
2757 // CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]])
2758 // CHECK: ret <2 x double> [[VMULX2_I]]
2759 float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
2760 return vmulxq_lane_f64(a, v, 0);
// Lane-0 form of vmulx_laneq_f32: the expected IR narrows the 4-element %v
// to a 2-element splat of element 0 and passes %a and the splat to
// @llvm.aarch64.neon.fmulx.v2f32.
2763 // CHECK-LABEL: @test_vmulx_laneq_f32_0(
2764 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
2765 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
2766 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
2767 // CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]])
2768 // CHECK: ret <2 x float> [[VMULX2_I]]
2769 float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
2770 return vmulx_laneq_f32(a, v, 0);
// Lane-0 form of vmulxq_laneq_f32: the expected IR splats element 0 of %v
// across four lanes and passes %a and the splat to
// @llvm.aarch64.neon.fmulx.v4f32.
2773 // CHECK-LABEL: @test_vmulxq_laneq_f32_0(
2774 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
2775 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
2776 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
2777 // CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]])
2778 // CHECK: ret <4 x float> [[VMULX2_I]]
2779 float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
2780 return vmulxq_laneq_f32(a, v, 0);
// Lane-0 form of vmulxq_laneq_f64: the expected IR splats element 0 of %v
// across two lanes and passes %a and the splat to
// @llvm.aarch64.neon.fmulx.v2f64.
2783 // CHECK-LABEL: @test_vmulxq_laneq_f64_0(
2784 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
2785 // CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
2786 // CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
2787 // CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]])
2788 // CHECK: ret <2 x double> [[VMULX2_I]]
2789 float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
2790 return vmulxq_laneq_f64(a, v, 0);
// vmull_high_n_s16 with a scalar multiplier: the expected IR selects
// elements 4-7 of %a, builds a 4-element vector by inserting %b into every
// lane, and passes both to @llvm.aarch64.neon.smull.v4i32.
2793 // CHECK-LABEL: @test_vmull_high_n_s16(
2794 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2795 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2796 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
2797 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
2798 // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
2799 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
2800 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2801 // CHECK: [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2802 // CHECK: ret <4 x i32> [[VMULL5_I_I]]
2803 int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
2804 return vmull_high_n_s16(a, b);
2807 // CHECK-LABEL: @test_vmull_high_n_s32(
2808 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2809 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2810 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
2811 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
2812 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2813 // CHECK: [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2814 // CHECK: ret <2 x i64> [[VMULL3_I_I]]
// Exercises vmull_high_n_s32. The FileCheck directives above expect lanes 2-3
// of %a and a two-lane splat of %b to be passed to
// @llvm.aarch64.neon.smull.v2i64, whose result is returned.
2815 int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
2816 return vmull_high_n_s32(a, b);
2819 // CHECK-LABEL: @test_vmull_high_n_u16(
2820 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2821 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2822 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
2823 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
2824 // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
2825 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
2826 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2827 // CHECK: [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2828 // CHECK: ret <4 x i32> [[VMULL5_I_I]]
// Exercises vmull_high_n_u16. The FileCheck directives above expect lanes 4-7
// of %a and a four-lane splat of %b to be passed to
// @llvm.aarch64.neon.umull.v4i32, whose result is returned.
2829 uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
2830 return vmull_high_n_u16(a, b);
2833 // CHECK-LABEL: @test_vmull_high_n_u32(
2834 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2835 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2836 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
2837 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
2838 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2839 // CHECK: [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2840 // CHECK: ret <2 x i64> [[VMULL3_I_I]]
// Exercises vmull_high_n_u32. The FileCheck directives above expect lanes 2-3
// of %a and a two-lane splat of %b to be passed to
// @llvm.aarch64.neon.umull.v2i64, whose result is returned.
2841 uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
2842 return vmull_high_n_u32(a, b);
2845 // CHECK-LABEL: @test_vqdmull_high_n_s16(
2846 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2847 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2848 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
2849 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
2850 // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
2851 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
2852 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2853 // CHECK: [[VQDMULL_V5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2854 // CHECK: [[VQDMULL_V6_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I_I]] to <16 x i8>
2855 // CHECK: ret <4 x i32> [[VQDMULL_V5_I_I]]
// Exercises vqdmull_high_n_s16. The FileCheck directives above expect lanes
// 4-7 of %a and a four-lane splat of %b to be passed to
// @llvm.aarch64.neon.sqdmull.v4i32, whose result is returned.
2856 int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
2857 return vqdmull_high_n_s16(a, b);
2860 // CHECK-LABEL: @test_vqdmull_high_n_s32(
2861 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2862 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2863 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
2864 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
2865 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2866 // CHECK: [[VQDMULL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2867 // CHECK: [[VQDMULL_V4_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I_I]] to <16 x i8>
2868 // CHECK: ret <2 x i64> [[VQDMULL_V3_I_I]]
// Exercises vqdmull_high_n_s32. The FileCheck directives above expect lanes
// 2-3 of %a and a two-lane splat of %b to be passed to
// @llvm.aarch64.neon.sqdmull.v2i64, whose result is returned.
2869 int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
2870 return vqdmull_high_n_s32(a, b);
2873 // CHECK-LABEL: @test_vmlal_high_n_s16(
2874 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2875 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2876 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2877 // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2878 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2879 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2880 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2881 // CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2882 // CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
2883 // CHECK: ret <4 x i32> [[ADD_I_I]]
// Exercises vmlal_high_n_s16. The FileCheck directives above expect lanes 4-7
// of %b and a four-lane splat of %c to be multiplied via
// @llvm.aarch64.neon.smull.v4i32, and the product to be added to %a.
2884 int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
2885 return vmlal_high_n_s16(a, b, c);
2888 // CHECK-LABEL: @test_vmlal_high_n_s32(
2889 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2890 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
2891 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
2892 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2893 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2894 // CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2895 // CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
2896 // CHECK: ret <2 x i64> [[ADD_I_I]]
// Exercises vmlal_high_n_s32. The FileCheck directives above expect lanes 2-3
// of %b and a two-lane splat of %c to be multiplied via
// @llvm.aarch64.neon.smull.v2i64, and the product to be added to %a.
2897 int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
2898 return vmlal_high_n_s32(a, b, c);
2901 // CHECK-LABEL: @test_vmlal_high_n_u16(
2902 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2903 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2904 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2905 // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2906 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2907 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2908 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2909 // CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2910 // CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
2911 // CHECK: ret <4 x i32> [[ADD_I_I]]
// Exercises vmlal_high_n_u16. The FileCheck directives above expect lanes 4-7
// of %b and a four-lane splat of %c to be multiplied via
// @llvm.aarch64.neon.umull.v4i32, and the product to be added to %a.
2912 uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
2913 return vmlal_high_n_u16(a, b, c);
2916 // CHECK-LABEL: @test_vmlal_high_n_u32(
2917 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2918 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
2919 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
2920 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2921 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2922 // CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2923 // CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
2924 // CHECK: ret <2 x i64> [[ADD_I_I]]
// Exercises vmlal_high_n_u32. The FileCheck directives above expect lanes 2-3
// of %b and a two-lane splat of %c to be multiplied via
// @llvm.aarch64.neon.umull.v2i64, and the product to be added to %a.
2925 uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
2926 return vmlal_high_n_u32(a, b, c);
2929 // CHECK-LABEL: @test_vqdmlal_high_n_s16(
2930 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2931 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2932 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2933 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2934 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2935 // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2936 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2937 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2938 // CHECK: [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2939 // CHECK: [[VQDMLAL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I_I]])
2940 // CHECK: ret <4 x i32> [[VQDMLAL_V6_I_I]]
// Exercises vqdmlal_high_n_s16. The FileCheck directives above expect lanes
// 4-7 of %b and a four-lane splat of %c to go through
// @llvm.aarch64.neon.sqdmull.v4i32, and that product to be combined with %a
// by @llvm.aarch64.neon.sqadd.v4i32.
2941 int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
2942 return vqdmlal_high_n_s16(a, b, c);
2945 // CHECK-LABEL: @test_vqdmlal_high_n_s32(
2946 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2947 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2948 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2949 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
2950 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
2951 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2952 // CHECK: [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2953 // CHECK: [[VQDMLAL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I_I]])
2954 // CHECK: ret <2 x i64> [[VQDMLAL_V4_I_I]]
// Exercises vqdmlal_high_n_s32. The FileCheck directives above expect lanes
// 2-3 of %b and a two-lane splat of %c to go through
// @llvm.aarch64.neon.sqdmull.v2i64, and that product to be combined with %a
// by @llvm.aarch64.neon.sqadd.v2i64.
2955 int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
2956 return vqdmlal_high_n_s32(a, b, c);
2959 // CHECK-LABEL: @test_vmlsl_high_n_s16(
2960 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2961 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2962 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2963 // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2964 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2965 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2966 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2967 // CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2968 // CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
2969 // CHECK: ret <4 x i32> [[SUB_I_I]]
// Exercises vmlsl_high_n_s16. The FileCheck directives above expect lanes 4-7
// of %b and a four-lane splat of %c to be multiplied via
// @llvm.aarch64.neon.smull.v4i32, and the product to be subtracted from %a.
2970 int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
2971 return vmlsl_high_n_s16(a, b, c);
2974 // CHECK-LABEL: @test_vmlsl_high_n_s32(
2975 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2976 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
2977 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
2978 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2979 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2980 // CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2981 // CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
2982 // CHECK: ret <2 x i64> [[SUB_I_I]]
// Exercises vmlsl_high_n_s32. The FileCheck directives above expect lanes 2-3
// of %b and a two-lane splat of %c to be multiplied via
// @llvm.aarch64.neon.smull.v2i64, and the product to be subtracted from %a.
2983 int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
2984 return vmlsl_high_n_s32(a, b, c);
2987 // CHECK-LABEL: @test_vmlsl_high_n_u16(
2988 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2989 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2990 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2991 // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2992 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2993 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2994 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2995 // CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2996 // CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
2997 // CHECK: ret <4 x i32> [[SUB_I_I]]
// Exercises vmlsl_high_n_u16. The FileCheck directives above expect lanes 4-7
// of %b and a four-lane splat of %c to be multiplied via
// @llvm.aarch64.neon.umull.v4i32, and the product to be subtracted from %a.
2998 uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
2999 return vmlsl_high_n_u16(a, b, c);
3002 // CHECK-LABEL: @test_vmlsl_high_n_u32(
3003 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
3004 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3005 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
3006 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3007 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
3008 // CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
3009 // CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
3010 // CHECK: ret <2 x i64> [[SUB_I_I]]
// Exercises vmlsl_high_n_u32. The FileCheck directives above expect lanes 2-3
// of %b and a two-lane splat of %c to be multiplied via
// @llvm.aarch64.neon.umull.v2i64, and the product to be subtracted from %a.
3011 uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
3012 return vmlsl_high_n_u32(a, b, c);
3015 // CHECK-LABEL: @test_vqdmlsl_high_n_s16(
3016 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3017 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3018 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3019 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3020 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
3021 // CHECK: [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
3022 // CHECK: [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
3023 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
3024 // CHECK: [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
3025 // CHECK: [[VQDMLSL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I_I]])
3026 // CHECK: ret <4 x i32> [[VQDMLSL_V6_I_I]]
// Exercises vqdmlsl_high_n_s16. The FileCheck directives above expect lanes
// 4-7 of %b and a four-lane splat of %c to go through
// @llvm.aarch64.neon.sqdmull.v4i32, and that product to be combined with %a
// by @llvm.aarch64.neon.sqsub.v4i32.
3027 int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
3028 return vqdmlsl_high_n_s16(a, b, c);
3031 // CHECK-LABEL: @test_vqdmlsl_high_n_s32(
3032 // CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
3033 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3034 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3035 // CHECK: [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3036 // CHECK: [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
3037 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
3038 // CHECK: [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
3039 // CHECK: [[VQDMLSL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I_I]])
3040 // CHECK: ret <2 x i64> [[VQDMLSL_V4_I_I]]
// Exercises vqdmlsl_high_n_s32. The FileCheck directives above expect lanes
// 2-3 of %b and a two-lane splat of %c to go through
// @llvm.aarch64.neon.sqdmull.v2i64, and that product to be combined with %a
// by @llvm.aarch64.neon.sqsub.v2i64.
3041 int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
3042 return vqdmlsl_high_n_s32(a, b, c);
3045 // CHECK-LABEL: @test_vmul_n_f32(
3046 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0
3047 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
3048 // CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
3049 // CHECK: ret <2 x float> [[MUL_I]]
// Exercises vmul_n_f32. The FileCheck directives above expect a two-lane
// splat of %b built with insertelement, followed by an fmul with %a.
3050 float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
3051 return vmul_n_f32(a, b);
3054 // CHECK-LABEL: @test_vmulq_n_f32(
3055 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0
3056 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
3057 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
3058 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
3059 // CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
3060 // CHECK: ret <4 x float> [[MUL_I]]
// Exercises vmulq_n_f32. The FileCheck directives above expect a four-lane
// splat of %b built with insertelement, followed by an fmul with %a.
3061 float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
3062 return vmulq_n_f32(a, b);
3065 // CHECK-LABEL: @test_vmulq_n_f64(
3066 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %b, i32 0
3067 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %b, i32 1
3068 // CHECK: [[MUL_I:%.*]] = fmul <2 x double> %a, [[VECINIT1_I]]
3069 // CHECK: ret <2 x double> [[MUL_I]]
// Exercises vmulq_n_f64. The FileCheck directives above expect a two-lane
// double splat of %b built with insertelement, followed by an fmul with %a.
3070 float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {
3071 return vmulq_n_f64(a, b);
3074 // CHECK-LABEL: @test_vfma_n_f32(
3075 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
3076 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
3077 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3078 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
3079 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
3080 // CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> [[VECINIT1_I]], <2 x float> %a)
3081 // CHECK: ret <2 x float> [[TMP3]]
// Exercises vfma_n_f32. The FileCheck directives above expect a two-lane
// splat of %n and a call to @llvm.fma.v2f32 with operands (%b, splat, %a).
3082 float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
3083 return vfma_n_f32(a, b, n);
3086 // CHECK-LABEL: @test_vfma_n_f64(
3087 // CHECK: [[VECINIT_I:%.*]] = insertelement <1 x double> undef, double %n, i32 0
3088 // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
3089 // CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
3090 // CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
3091 // CHECK: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> %b, <1 x double> [[VECINIT_I]], <1 x double> %a)
3092 // CHECK: ret <1 x double> [[TMP3]]
// Exercises vfma_n_f64. The FileCheck directives above expect %n to be
// inserted into a single-lane <1 x double> vector and a call to
// @llvm.fma.v1f64 with operands (%b, that vector, %a).
3093 float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
3094 return vfma_n_f64(a, b, n);
3097 // CHECK-LABEL: @test_vfmaq_n_f32(
3098 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
3099 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
3100 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
3101 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
3102 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3103 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
3104 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
3105 // CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> [[VECINIT3_I]], <4 x float> %a)
3106 // CHECK: ret <4 x float> [[TMP3]]
// Exercises vfmaq_n_f32. The FileCheck directives above expect a four-lane
// splat of %n and a call to @llvm.fma.v4f32 with operands (%b, splat, %a).
3107 float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
3108 return vfmaq_n_f32(a, b, n);
3111 // CHECK-LABEL: @test_vfms_n_f32(
3112 // CHECK: [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
3113 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
3114 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
3115 // CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3116 // CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
3117 // CHECK: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
3118 // CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> [[VECINIT1_I]], <2 x float> %a)
3119 // CHECK: ret <2 x float> [[TMP3]]
// Exercises vfms_n_f32. The FileCheck directives above expect %b to be
// negated with an fsub from a -0.0 vector, %n to be splatted across two
// lanes, and @llvm.fma.v2f32 to be called with (negated %b, splat, %a).
3120 float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
3121 return vfms_n_f32(a, b, n);
3124 // CHECK-LABEL: @test_vfms_n_f64(
3125 // CHECK: [[SUB_I:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
3126 // CHECK: [[VECINIT_I:%.*]] = insertelement <1 x double> undef, double %n, i32 0
3127 // CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
3128 // CHECK: [[TMP1:%.*]] = bitcast <1 x double> [[SUB_I]] to <8 x i8>
3129 // CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
3130 // CHECK: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[SUB_I]], <1 x double> [[VECINIT_I]], <1 x double> %a)
3131 // CHECK: ret <1 x double> [[TMP3]]
// Exercises vfms_n_f64. The FileCheck directives above expect %b to be
// negated with an fsub from a -0.0 vector, %n to be inserted into a
// single-lane vector, and @llvm.fma.v1f64 to be called with
// (negated %b, that vector, %a).
3132 float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
3133 return vfms_n_f64(a, b, n);
3136 // CHECK-LABEL: @test_vfmsq_n_f32(
3137 // CHECK: [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
3138 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
3139 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
3140 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
3141 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
3142 // CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3143 // CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
3144 // CHECK: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
3145 // CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> [[VECINIT3_I]], <4 x float> %a)
3146 // CHECK: ret <4 x float> [[TMP3]]
// Exercises vfmsq_n_f32. The FileCheck directives above expect %b to be
// negated with an fsub from a -0.0 vector, %n to be splatted across four
// lanes, and @llvm.fma.v4f32 to be called with (negated %b, splat, %a).
3147 float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
3148 return vfmsq_n_f32(a, b, n);
3151 // CHECK-LABEL: @test_vmul_n_s16(
3152 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3153 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3154 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3155 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3156 // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
3157 // CHECK: ret <4 x i16> [[MUL_I]]
// Exercises vmul_n_s16. The FileCheck directives above expect a four-lane
// i16 splat of %b built with insertelement, followed by a mul with %a.
3158 int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
3159 return vmul_n_s16(a, b);
3162 // CHECK-LABEL: @test_vmulq_n_s16(
3163 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3164 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3165 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3166 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3167 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3168 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3169 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3170 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3171 // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
3172 // CHECK: ret <8 x i16> [[MUL_I]]
// Exercises vmulq_n_s16. The FileCheck directives above expect an eight-lane
// i16 splat of %b built with insertelement, followed by a mul with %a.
3173 int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
3174 return vmulq_n_s16(a, b);
3177 // CHECK-LABEL: @test_vmul_n_s32(
3178 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3179 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3180 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
3181 // CHECK: ret <2 x i32> [[MUL_I]]
// Exercises vmul_n_s32. The FileCheck directives above expect a two-lane
// i32 splat of %b built with insertelement, followed by a mul with %a.
3182 int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
3183 return vmul_n_s32(a, b);
3186 // CHECK-LABEL: @test_vmulq_n_s32(
3187 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3188 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3189 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3190 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3191 // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
3192 // CHECK: ret <4 x i32> [[MUL_I]]
// Exercises vmulq_n_s32. The FileCheck directives above expect a four-lane
// i32 splat of %b built with insertelement, followed by a mul with %a.
3193 int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
3194 return vmulq_n_s32(a, b);
3197 // CHECK-LABEL: @test_vmul_n_u16(
3198 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3199 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3200 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3201 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3202 // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
3203 // CHECK: ret <4 x i16> [[MUL_I]]
// Exercises vmul_n_u16. The FileCheck directives above expect a four-lane
// i16 splat of %b built with insertelement, followed by a mul with %a.
3204 uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
3205 return vmul_n_u16(a, b);
3208 // CHECK-LABEL: @test_vmulq_n_u16(
3209 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3210 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3211 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3212 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3213 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3214 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3215 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3216 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3217 // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
3218 // CHECK: ret <8 x i16> [[MUL_I]]
// Exercises vmulq_n_u16. The FileCheck directives above expect an eight-lane
// i16 splat of %b built with insertelement, followed by a mul with %a.
3219 uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
3220 return vmulq_n_u16(a, b);
3223 // CHECK-LABEL: @test_vmul_n_u32(
3224 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3225 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3226 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
3227 // CHECK: ret <2 x i32> [[MUL_I]]
// Exercises vmul_n_u32. The FileCheck directives above expect a two-lane
// i32 splat of %b built with insertelement, followed by a mul with %a.
3228 uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
3229 return vmul_n_u32(a, b);
3232 // CHECK-LABEL: @test_vmulq_n_u32(
3233 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3234 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3235 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3236 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3237 // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
3238 // CHECK: ret <4 x i32> [[MUL_I]]
// Exercises vmulq_n_u32. The FileCheck directives above expect a four-lane
// i32 splat of %b built with insertelement, followed by a mul with %a.
3239 uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
3240 return vmulq_n_u32(a, b);
3243 // CHECK-LABEL: @test_vmull_n_s16(
3244 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3245 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3246 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3247 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3248 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3249 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3250 // CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
3251 // CHECK: ret <4 x i32> [[VMULL5_I]]
// Exercises vmull_n_s16. The FileCheck directives above expect %a and a
// four-lane splat of %b to be passed to @llvm.aarch64.neon.smull.v4i32,
// whose result is returned.
3252 int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
3253 return vmull_n_s16(a, b);
3256 // CHECK-LABEL: @test_vmull_n_s32(
3257 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3258 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3259 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3260 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3261 // CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
3262 // CHECK: ret <2 x i64> [[VMULL3_I]]
// Exercises vmull_n_s32. The FileCheck directives above expect %a and a
// two-lane splat of %b to be passed to @llvm.aarch64.neon.smull.v2i64,
// whose result is returned.
3263 int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
3264 return vmull_n_s32(a, b);
3267 // CHECK-LABEL: @test_vmull_n_u16(
3268 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3269 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3270 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3271 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3272 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3273 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3274 // CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
3275 // CHECK: ret <4 x i32> [[VMULL5_I]]
// Exercises vmull_n_u16. The FileCheck directives above expect %a and a
// four-lane splat of %b to be passed to @llvm.aarch64.neon.umull.v4i32,
// whose result is returned.
3276 uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
3277 return vmull_n_u16(a, b);
3280 // CHECK-LABEL: @test_vmull_n_u32(
3281 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3282 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3283 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3284 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3285 // CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
3286 // CHECK: ret <2 x i64> [[VMULL3_I]]
// Exercises vmull_n_u32. The FileCheck directives above expect %a and a
// two-lane splat of %b to be passed to @llvm.aarch64.neon.umull.v2i64,
// whose result is returned.
3287 uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
3288 return vmull_n_u32(a, b);
3291 // CHECK-LABEL: @test_vqdmull_n_s16(
3292 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3293 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3294 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3295 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3296 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3297 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3298 // CHECK: [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
3299 // CHECK: [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
3300 // CHECK: ret <4 x i32> [[VQDMULL_V5_I]]
// Exercises vqdmull_n_s16. The FileCheck directives above expect %a and a
// four-lane splat of %b to be passed to @llvm.aarch64.neon.sqdmull.v4i32,
// whose result is returned.
3301 int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
3302 return vqdmull_n_s16(a, b);
3305 // CHECK-LABEL: @test_vqdmull_n_s32(
3306 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3307 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3308 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3309 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3310 // CHECK: [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
3311 // CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
3312 // CHECK: ret <2 x i64> [[VQDMULL_V3_I]]
// Exercises vqdmull_n_s32. The FileCheck directives above expect %a and a
// two-lane splat of %b to be passed to @llvm.aarch64.neon.sqdmull.v2i64,
// whose result is returned.
3313 int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
3314 return vqdmull_n_s32(a, b);
3317 // CHECK-LABEL: @test_vqdmulh_n_s16(
3318 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3319 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3320 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3321 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3322 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3323 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3324 // CHECK: [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
3325 // CHECK: [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
3326 // CHECK: ret <4 x i16> [[VQDMULH_V5_I]]
// Exercises vqdmulh_n_s16. The FileCheck directives above expect %a and a
// four-lane splat of %b to be passed to @llvm.aarch64.neon.sqdmulh.v4i16,
// whose result is returned.
3327 int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
3328 return vqdmulh_n_s16(a, b);
3331 // CHECK-LABEL: @test_vqdmulhq_n_s16(
3332 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3333 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3334 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3335 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3336 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3337 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3338 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3339 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3340 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3341 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
3342 // CHECK: [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]])
3343 // CHECK: [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
3344 // CHECK: ret <8 x i16> [[VQDMULHQ_V9_I]]
// Codegen test for vqdmulhq_n_s16(a, b). The expected IR above inserts scalar
// b into all eight lanes of an <8 x i16> vector, passes a and that vector to
// llvm.aarch64.neon.sqdmulh.v8i16, and returns the call's result.
3345 int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
3346 return vqdmulhq_n_s16(a, b);
3349 // CHECK-LABEL: @test_vqdmulh_n_s32(
3350 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3351 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3352 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3353 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3354 // CHECK: [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
3355 // CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
3356 // CHECK: ret <2 x i32> [[VQDMULH_V3_I]]
// Codegen test for vqdmulh_n_s32(a, b). The expected IR above inserts scalar
// b into both lanes of a <2 x i32> vector, passes a and that vector to
// llvm.aarch64.neon.sqdmulh.v2i32, and returns the call's result.
3357 int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
3358 return vqdmulh_n_s32(a, b);
3361 // CHECK-LABEL: @test_vqdmulhq_n_s32(
3362 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3363 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3364 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3365 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3366 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3367 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
3368 // CHECK: [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]])
3369 // CHECK: [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
3370 // CHECK: ret <4 x i32> [[VQDMULHQ_V5_I]]
// Codegen test for vqdmulhq_n_s32(a, b). The expected IR above inserts scalar
// b into all four lanes of a <4 x i32> vector, passes a and that vector to
// llvm.aarch64.neon.sqdmulh.v4i32, and returns the call's result.
3371 int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
3372 return vqdmulhq_n_s32(a, b);
3375 // CHECK-LABEL: @test_vqrdmulh_n_s16(
3376 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3377 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3378 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3379 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3380 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3381 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3382 // CHECK: [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
3383 // CHECK: [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
3384 // CHECK: ret <4 x i16> [[VQRDMULH_V5_I]]
// Codegen test for vqrdmulh_n_s16(a, b). The expected IR above inserts scalar
// b into all four lanes of a <4 x i16> vector, passes a and that vector to
// llvm.aarch64.neon.sqrdmulh.v4i16, and returns the call's result.
3385 int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
3386 return vqrdmulh_n_s16(a, b);
3389 // CHECK-LABEL: @test_vqrdmulhq_n_s16(
3390 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3391 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3392 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3393 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3394 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3395 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3396 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3397 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3398 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3399 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
3400 // CHECK: [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]])
3401 // CHECK: [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
3402 // CHECK: ret <8 x i16> [[VQRDMULHQ_V9_I]]
// Codegen test for vqrdmulhq_n_s16(a, b). The expected IR above inserts
// scalar b into all eight lanes of an <8 x i16> vector, passes a and that
// vector to llvm.aarch64.neon.sqrdmulh.v8i16, and returns the call's result.
3403 int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
3404 return vqrdmulhq_n_s16(a, b);
3407 // CHECK-LABEL: @test_vqrdmulh_n_s32(
3408 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3409 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3410 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3411 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3412 // CHECK: [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
3413 // CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
3414 // CHECK: ret <2 x i32> [[VQRDMULH_V3_I]]
// Codegen test for vqrdmulh_n_s32(a, b). The expected IR above inserts scalar
// b into both lanes of a <2 x i32> vector, passes a and that vector to
// llvm.aarch64.neon.sqrdmulh.v2i32, and returns the call's result.
3415 int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
3416 return vqrdmulh_n_s32(a, b);
3419 // CHECK-LABEL: @test_vqrdmulhq_n_s32(
3420 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3421 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3422 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3423 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3424 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3425 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
3426 // CHECK: [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]])
3427 // CHECK: [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
3428 // CHECK: ret <4 x i32> [[VQRDMULHQ_V5_I]]
// Codegen test for vqrdmulhq_n_s32(a, b). The expected IR above inserts
// scalar b into all four lanes of a <4 x i32> vector, passes a and that
// vector to llvm.aarch64.neon.sqrdmulh.v4i32, and returns the call's result.
3429 int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
3430 return vqrdmulhq_n_s32(a, b);
3433 // CHECK-LABEL: @test_vmla_n_s16(
3434 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3435 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3436 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3437 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3438 // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3439 // CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
3440 // CHECK: ret <4 x i16> [[ADD_I]]
// Codegen test for vmla_n_s16(a, b, c). The expected IR above uses no
// intrinsic call: scalar c is inserted into all four lanes of a <4 x i16>
// vector, multiplied by b with a plain mul, and the product is added to a.
3441 int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
3442 return vmla_n_s16(a, b, c);
3445 // CHECK-LABEL: @test_vmlaq_n_s16(
3446 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3447 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3448 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3449 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3450 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3451 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3452 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3453 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3454 // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3455 // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
3456 // CHECK: ret <8 x i16> [[ADD_I]]
// Codegen test for vmlaq_n_s16(a, b, c). The expected IR above uses no
// intrinsic call: scalar c is inserted into all eight lanes of an <8 x i16>
// vector, multiplied by b with a plain mul, and the product is added to a.
3457 int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
3458 return vmlaq_n_s16(a, b, c);
3461 // CHECK-LABEL: @test_vmla_n_s32(
3462 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3463 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3464 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3465 // CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
3466 // CHECK: ret <2 x i32> [[ADD_I]]
// Codegen test for vmla_n_s32(a, b, c). The expected IR above uses no
// intrinsic call: scalar c is inserted into both lanes of a <2 x i32>
// vector, multiplied by b with a plain mul, and the product is added to a.
3467 int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
3468 return vmla_n_s32(a, b, c);
3471 // CHECK-LABEL: @test_vmlaq_n_s32(
3472 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3473 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3474 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3475 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3476 // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3477 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
3478 // CHECK: ret <4 x i32> [[ADD_I]]
// Codegen test for vmlaq_n_s32(a, b, c). The expected IR above uses no
// intrinsic call: scalar c is inserted into all four lanes of a <4 x i32>
// vector, multiplied by b with a plain mul, and the product is added to a.
3479 int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
3480 return vmlaq_n_s32(a, b, c);
3483 // CHECK-LABEL: @test_vmla_n_u16(
3484 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3485 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3486 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3487 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3488 // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3489 // CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
3490 // CHECK: ret <4 x i16> [[ADD_I]]
// Codegen test for vmla_n_u16(a, b, c). The expected IR above has the same
// shape as the signed 16-bit variant: c is inserted into all four lanes of a
// <4 x i16> vector, multiplied by b with mul, and the product is added to a.
3491 uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
3492 return vmla_n_u16(a, b, c);
3495 // CHECK-LABEL: @test_vmlaq_n_u16(
3496 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3497 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3498 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3499 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3500 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3501 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3502 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3503 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3504 // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3505 // CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
3506 // CHECK: ret <8 x i16> [[ADD_I]]
// Codegen test for vmlaq_n_u16(a, b, c). The expected IR above inserts c
// into all eight lanes of an <8 x i16> vector, multiplies it by b with a
// plain mul, and adds the product to a.
3507 uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
3508 return vmlaq_n_u16(a, b, c);
3511 // CHECK-LABEL: @test_vmla_n_u32(
3512 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3513 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3514 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3515 // CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
3516 // CHECK: ret <2 x i32> [[ADD_I]]
// Codegen test for vmla_n_u32(a, b, c). The expected IR above inserts c
// into both lanes of a <2 x i32> vector, multiplies it by b with a plain
// mul, and adds the product to a.
3517 uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
3518 return vmla_n_u32(a, b, c);
3521 // CHECK-LABEL: @test_vmlaq_n_u32(
3522 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3523 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3524 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3525 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3526 // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3527 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
3528 // CHECK: ret <4 x i32> [[ADD_I]]
// Codegen test for vmlaq_n_u32(a, b, c). The expected IR above inserts c
// into all four lanes of a <4 x i32> vector, multiplies it by b with a plain
// mul, and adds the product to a.
3529 uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
3530 return vmlaq_n_u32(a, b, c);
3533 // CHECK-LABEL: @test_vmlal_n_s16(
3534 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3535 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3536 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3537 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3538 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3539 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3540 // CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
3541 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
3542 // CHECK: ret <4 x i32> [[ADD_I]]
// Codegen test for vmlal_n_s16(a, b, c). The expected IR above inserts c
// into all four lanes of a <4 x i16> vector, calls
// llvm.aarch64.neon.smull.v4i32 on b and that vector, and adds the
// <4 x i32> product to a.
3543 int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
3544 return vmlal_n_s16(a, b, c);
3547 // CHECK-LABEL: @test_vmlal_n_s32(
3548 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3549 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3550 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3551 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3552 // CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
3553 // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
3554 // CHECK: ret <2 x i64> [[ADD_I]]
// Codegen test for vmlal_n_s32(a, b, c). The expected IR above inserts c
// into both lanes of a <2 x i32> vector, calls
// llvm.aarch64.neon.smull.v2i64 on b and that vector, and adds the
// <2 x i64> product to a.
3555 int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
3556 return vmlal_n_s32(a, b, c);
3559 // CHECK-LABEL: @test_vmlal_n_u16(
3560 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3561 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3562 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3563 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3564 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3565 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3566 // CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
3567 // CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
3568 // CHECK: ret <4 x i32> [[ADD_I]]
// Codegen test for vmlal_n_u16(a, b, c). The expected IR above inserts c
// into all four lanes of a <4 x i16> vector, calls
// llvm.aarch64.neon.umull.v4i32 on b and that vector, and adds the
// <4 x i32> product to a.
3569 uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
3570 return vmlal_n_u16(a, b, c);
3573 // CHECK-LABEL: @test_vmlal_n_u32(
3574 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3575 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3576 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3577 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3578 // CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
3579 // CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
3580 // CHECK: ret <2 x i64> [[ADD_I]]
// Codegen test for vmlal_n_u32(a, b, c). The expected IR above inserts c
// into both lanes of a <2 x i32> vector, calls
// llvm.aarch64.neon.umull.v2i64 on b and that vector, and adds the
// <2 x i64> product to a.
3581 uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
3582 return vmlal_n_u32(a, b, c);
3585 // CHECK-LABEL: @test_vqdmlal_n_s16(
3586 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3587 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3588 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3589 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3590 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3591 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3592 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3593 // CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
3594 // CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]])
3595 // CHECK: ret <4 x i32> [[VQDMLAL_V6_I]]
// Codegen test for vqdmlal_n_s16(a, b, c). The expected IR above inserts c
// into all four lanes of a <4 x i16> vector, calls
// llvm.aarch64.neon.sqdmull.v4i32 on b and that vector, then combines a with
// the product through llvm.aarch64.neon.sqadd.v4i32.
3596 int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
3597 return vqdmlal_n_s16(a, b, c);
3600 // CHECK-LABEL: @test_vqdmlal_n_s32(
3601 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3602 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3603 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3604 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3605 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3606 // CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
3607 // CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]])
3608 // CHECK: ret <2 x i64> [[VQDMLAL_V4_I]]
// Codegen test for vqdmlal_n_s32(a, b, c). The expected IR above inserts c
// into both lanes of a <2 x i32> vector, calls
// llvm.aarch64.neon.sqdmull.v2i64 on b and that vector, then combines a with
// the product through llvm.aarch64.neon.sqadd.v2i64.
3609 int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
3610 return vqdmlal_n_s32(a, b, c);
3613 // CHECK-LABEL: @test_vmls_n_s16(
3614 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3615 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3616 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3617 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3618 // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3619 // CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
3620 // CHECK: ret <4 x i16> [[SUB_I]]
// Codegen test for vmls_n_s16(a, b, c). The expected IR above uses no
// intrinsic call: c is inserted into all four lanes of a <4 x i16> vector,
// multiplied by b with a plain mul, and the product is subtracted from a.
3621 int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
3622 return vmls_n_s16(a, b, c);
3625 // CHECK-LABEL: @test_vmlsq_n_s16(
3626 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3627 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3628 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3629 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3630 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3631 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3632 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3633 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3634 // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3635 // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
3636 // CHECK: ret <8 x i16> [[SUB_I]]
// Codegen test for vmlsq_n_s16(a, b, c). The expected IR above uses no
// intrinsic call: c is inserted into all eight lanes of an <8 x i16> vector,
// multiplied by b with a plain mul, and the product is subtracted from a.
3637 int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
3638 return vmlsq_n_s16(a, b, c);
3641 // CHECK-LABEL: @test_vmls_n_s32(
3642 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3643 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3644 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3645 // CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
3646 // CHECK: ret <2 x i32> [[SUB_I]]
// Codegen test for vmls_n_s32(a, b, c). The expected IR above uses no
// intrinsic call: c is inserted into both lanes of a <2 x i32> vector,
// multiplied by b with a plain mul, and the product is subtracted from a.
3647 int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
3648 return vmls_n_s32(a, b, c);
3651 // CHECK-LABEL: @test_vmlsq_n_s32(
3652 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3653 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3654 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3655 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3656 // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3657 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
3658 // CHECK: ret <4 x i32> [[SUB_I]]
// Codegen test for vmlsq_n_s32(a, b, c). The expected IR above uses no
// intrinsic call: c is inserted into all four lanes of a <4 x i32> vector,
// multiplied by b with a plain mul, and the product is subtracted from a.
3659 int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
3660 return vmlsq_n_s32(a, b, c);
3663 // CHECK-LABEL: @test_vmls_n_u16(
3664 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3665 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3666 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3667 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3668 // CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3669 // CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
3670 // CHECK: ret <4 x i16> [[SUB_I]]
// Codegen test for vmls_n_u16(a, b, c). The expected IR above inserts c
// into all four lanes of a <4 x i16> vector, multiplies it by b with a plain
// mul, and subtracts the product from a.
3671 uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
3672 return vmls_n_u16(a, b, c);
3675 // CHECK-LABEL: @test_vmlsq_n_u16(
3676 // CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3677 // CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3678 // CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3679 // CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3680 // CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3681 // CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3682 // CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3683 // CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3684 // CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3685 // CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
3686 // CHECK: ret <8 x i16> [[SUB_I]]
// Codegen test for vmlsq_n_u16(a, b, c). The expected IR above inserts c
// into all eight lanes of an <8 x i16> vector, multiplies it by b with a
// plain mul, and subtracts the product from a.
3687 uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
3688 return vmlsq_n_u16(a, b, c);
3691 // CHECK-LABEL: @test_vmls_n_u32(
3692 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3693 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3694 // CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3695 // CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
3696 // CHECK: ret <2 x i32> [[SUB_I]]
// Codegen test for vmls_n_u32(a, b, c). The expected IR above inserts c
// into both lanes of a <2 x i32> vector, multiplies it by b with a plain
// mul, and subtracts the product from a.
3697 uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
3698 return vmls_n_u32(a, b, c);
3701 // CHECK-LABEL: @test_vmlsq_n_u32(
3702 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3703 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3704 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3705 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3706 // CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3707 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
3708 // CHECK: ret <4 x i32> [[SUB_I]]
// Codegen test for vmlsq_n_u32(a, b, c). The expected IR above inserts c
// into all four lanes of a <4 x i32> vector, multiplies it by b with a plain
// mul, and subtracts the product from a.
3709 uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
3710 return vmlsq_n_u32(a, b, c);
3713 // CHECK-LABEL: @test_vmlsl_n_s16(
3714 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3715 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3716 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3717 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3718 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3719 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3720 // CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
3721 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
3722 // CHECK: ret <4 x i32> [[SUB_I]]
// Codegen test for vmlsl_n_s16(a, b, c). The expected IR above inserts c
// into all four lanes of a <4 x i16> vector, calls
// llvm.aarch64.neon.smull.v4i32 on b and that vector, and subtracts the
// <4 x i32> product from a.
3723 int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
3724 return vmlsl_n_s16(a, b, c);
3727 // CHECK-LABEL: @test_vmlsl_n_s32(
3728 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3729 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3730 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3731 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3732 // CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
3733 // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
3734 // CHECK: ret <2 x i64> [[SUB_I]]
// Codegen test for vmlsl_n_s32(a, b, c). The expected IR above inserts c
// into both lanes of a <2 x i32> vector, calls
// llvm.aarch64.neon.smull.v2i64 on b and that vector, and subtracts the
// <2 x i64> product from a.
3735 int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
3736 return vmlsl_n_s32(a, b, c);
3739 // CHECK-LABEL: @test_vmlsl_n_u16(
3740 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3741 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3742 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3743 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3744 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3745 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3746 // CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
3747 // CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
3748 // CHECK: ret <4 x i32> [[SUB_I]]
// Codegen test for vmlsl_n_u16(a, b, c). The expected IR above inserts c
// into all four lanes of a <4 x i16> vector, calls
// llvm.aarch64.neon.umull.v4i32 on b and that vector, and subtracts the
// <4 x i32> product from a.
3749 uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
3750 return vmlsl_n_u16(a, b, c);
3753 // CHECK-LABEL: @test_vmlsl_n_u32(
3754 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3755 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3756 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3757 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3758 // CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
3759 // CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
3760 // CHECK: ret <2 x i64> [[SUB_I]]
// Codegen test for vmlsl_n_u32(a, b, c). The expected IR above inserts c
// into both lanes of a <2 x i32> vector, calls
// llvm.aarch64.neon.umull.v2i64 on b and that vector, and subtracts the
// <2 x i64> product from a.
3761 uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
3762 return vmlsl_n_u32(a, b, c);
3765 // CHECK-LABEL: @test_vqdmlsl_n_s16(
3766 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3767 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3768 // CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3769 // CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3770 // CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3771 // CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3772 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3773 // CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
3774 // CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]])
3775 // CHECK: ret <4 x i32> [[VQDMLSL_V6_I]]
// Codegen test for vqdmlsl_n_s16(a, b, c). The expected IR above inserts c
// into all four lanes of a <4 x i16> vector, calls
// llvm.aarch64.neon.sqdmull.v4i32 on b and that vector, then combines a with
// the product through llvm.aarch64.neon.sqsub.v4i32.
3776 int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
3777 return vqdmlsl_n_s16(a, b, c);
3780 // CHECK-LABEL: @test_vqdmlsl_n_s32(
3781 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3782 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3783 // CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3784 // CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3785 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3786 // CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
3787 // CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]])
3788 // CHECK: ret <2 x i64> [[VQDMLSL_V4_I]]
// Codegen test for vqdmlsl_n_s32(a, b, c). The expected IR above inserts c
// into both lanes of a <2 x i32> vector, calls
// llvm.aarch64.neon.sqdmull.v2i64 on b and that vector, then combines a with
// the product through llvm.aarch64.neon.sqsub.v2i64.
3789 int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
3790 return vqdmlsl_n_s32(a, b, c);
3793 // CHECK-LABEL: @test_vmla_lane_u16_0(
3794 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
3795 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
3796 // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
3797 // CHECK: ret <4 x i16> [[ADD]]
// Codegen test for vmla_lane_u16 with lane index 0. The expected IR above
// shuffles the <4 x i16> v with an all-zero mask into a <4 x i16> vector,
// multiplies it by b with a plain mul, and adds the product to a.
3798 uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
3799 return vmla_lane_u16(a, b, v, 0);
3802 // CHECK-LABEL: @test_vmlaq_lane_u16_0(
3803 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
3804 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
3805 // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
3806 // CHECK: ret <8 x i16> [[ADD]]
// Codegen test for vmlaq_lane_u16 with lane index 0. The expected IR above
// shuffles the <4 x i16> v with an eight-entry all-zero mask, multiplies the
// result by the <8 x i16> b with a plain mul, and adds the product to a.
3807 uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
3808 return vmlaq_lane_u16(a, b, v, 0);
3811 // CHECK-LABEL: @test_vmla_lane_u32_0(
3812 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
3813 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
3814 // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
3815 // CHECK: ret <2 x i32> [[ADD]]
// Codegen test for vmla_lane_u32 with lane index 0. The expected IR above
// shuffles the <2 x i32> v with an all-zero mask into a <2 x i32> vector,
// multiplies it by b with a plain mul, and adds the product to a.
3816 uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
3817 return vmla_lane_u32(a, b, v, 0);
3820 // CHECK-LABEL: @test_vmlaq_lane_u32_0(
3821 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
3822 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
3823 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
3824 // CHECK: ret <4 x i32> [[ADD]]
// Codegen test for vmlaq_lane_u32 with lane index 0. The expected IR above
// shuffles the <2 x i32> v with a four-entry all-zero mask, multiplies the
// result by the <4 x i32> b with a plain mul, and adds the product to a.
3825 uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
3826 return vmlaq_lane_u32(a, b, v, 0);
3829 // CHECK-LABEL: @test_vmla_laneq_u16_0(
3830 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3831 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
3832 // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
3833 // CHECK: ret <4 x i16> [[ADD]]
// Codegen test for vmla_laneq_u16 with lane index 0. The expected IR above
// shuffles the <8 x i16> v with a four-entry all-zero mask, multiplies the
// result by the <4 x i16> b with a plain mul, and adds the product to a.
3834 uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
3835 return vmla_laneq_u16(a, b, v, 0);
3838 // CHECK-LABEL: @test_vmlaq_laneq_u16_0(
3839 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
3840 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
3841 // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
3842 // CHECK: ret <8 x i16> [[ADD]]
// Codegen test for vmlaq_laneq_u16 with lane index 0. The expected IR above
// shuffles the <8 x i16> v with an eight-entry all-zero mask, multiplies the
// result by b with a plain mul, and adds the product to a.
3843 uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
3844 return vmlaq_laneq_u16(a, b, v, 0);
3847 // CHECK-LABEL: @test_vmla_laneq_u32_0(
3848 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
3849 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
3850 // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
3851 // CHECK: ret <2 x i32> [[ADD]]
// Codegen test for vmla_laneq_u32 with lane index 0. The expected IR above
// shuffles the <4 x i32> v with a two-entry all-zero mask, multiplies the
// result by the <2 x i32> b with a plain mul, and adds the product to a.
3852 uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
3853 return vmla_laneq_u32(a, b, v, 0);
3856 // CHECK-LABEL: @test_vmlaq_laneq_u32_0(
3857 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
3858 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
3859 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
3860 // CHECK: ret <4 x i32> [[ADD]]
// Codegen test for vmlaq_laneq_u32 with lane index 0. The expected IR above
// shuffles the <4 x i32> v with a four-entry all-zero mask, multiplies the
// result by b with a plain mul, and adds the product to a.
3861 uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
3862 return vmlaq_laneq_u32(a, b, v, 0);
3865 // CHECK-LABEL: @test_vqdmlal_laneq_s16_0(
3866 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3867 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3868 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3869 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
3870 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
3871 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
3872 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
// Codegen test for vqdmlal_laneq_s16 with lane index 0. The expected IR
// above shuffles the <8 x i16> v with a four-entry all-zero mask, calls
// llvm.aarch64.neon.sqdmull.v4i32 on b and the shuffled vector, then
// combines a with the product through llvm.aarch64.neon.sqadd.v4i32.
3873 int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
3874 return vqdmlal_laneq_s16(a, b, v, 0);
3877 // CHECK-LABEL: @test_vqdmlal_laneq_s32_0(
3878 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
3879 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3880 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3881 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
3882 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
3883 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
3884 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
// Calls vqdmlal_laneq_s32 with lane 0 of v. The FileCheck expectations above
// require a <2 x i32> broadcast of that lane, an sqdmull.v2i64 call on b and
// the broadcast, and an sqadd.v2i64 call combining a with that product.
3885 int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
3886 return vqdmlal_laneq_s32(a, b, v, 0);
3889 // CHECK-LABEL: @test_vqdmlal_high_laneq_s16_0(
3890 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3891 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3892 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3893 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3894 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
3895 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
3896 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
3897 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
// Calls vqdmlal_high_laneq_s16 with lane 0 of v. The FileCheck expectations
// above require a shuffle selecting elements 4-7 of b, a <4 x i16> broadcast of
// lane 0 of v, an sqdmull.v4i32 call on those two, and an sqadd.v4i32 with a.
3898 int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
3899 return vqdmlal_high_laneq_s16(a, b, v, 0);
3902 // CHECK-LABEL: @test_vqdmlal_high_laneq_s32_0(
3903 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
3904 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
3905 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3906 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3907 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
3908 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
3909 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
3910 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
// Calls vqdmlal_high_laneq_s32 with lane 0 of v. The FileCheck expectations
// above require a shuffle selecting elements 2-3 of b, a <2 x i32> broadcast of
// lane 0 of v, an sqdmull.v2i64 call on those two, and an sqadd.v2i64 with a.
3911 int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
3912 return vqdmlal_high_laneq_s32(a, b, v, 0);
3915 // CHECK-LABEL: @test_vmls_lane_u16_0(
3916 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
3917 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
3918 // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
3919 // CHECK: ret <4 x i16> [[SUB]]
// Calls vmls_lane_u16 with lane 0 of v. The FileCheck expectations above
// require a <4 x i16> broadcast of that lane, a mul with b, and a sub from a.
3920 uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
3921 return vmls_lane_u16(a, b, v, 0);
3924 // CHECK-LABEL: @test_vmlsq_lane_u16_0(
3925 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
3926 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
3927 // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
3928 // CHECK: ret <8 x i16> [[SUB]]
// Calls vmlsq_lane_u16 with lane 0 of the 4-element v. The FileCheck
// expectations above require that lane widened into an 8-element broadcast,
// a mul with b, and a sub from a.
3929 uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
3930 return vmlsq_lane_u16(a, b, v, 0);
3933 // CHECK-LABEL: @test_vmls_lane_u32_0(
3934 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
3935 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
3936 // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
3937 // CHECK: ret <2 x i32> [[SUB]]
// Calls vmls_lane_u32 with lane 0 of v. The FileCheck expectations above
// require a <2 x i32> broadcast of that lane, a mul with b, and a sub from a.
3938 uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
3939 return vmls_lane_u32(a, b, v, 0);
3942 // CHECK-LABEL: @test_vmlsq_lane_u32_0(
3943 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
3944 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
3945 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
3946 // CHECK: ret <4 x i32> [[SUB]]
// Calls vmlsq_lane_u32 with lane 0 of the 2-element v. The FileCheck
// expectations above require that lane widened into a 4-element broadcast,
// a mul with b, and a sub from a.
3947 uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
3948 return vmlsq_lane_u32(a, b, v, 0);
3951 // CHECK-LABEL: @test_vmls_laneq_u16_0(
3952 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3953 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
3954 // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
3955 // CHECK: ret <4 x i16> [[SUB]]
// Calls vmls_laneq_u16 with lane 0 of the 8-element v. The FileCheck
// expectations above require that lane narrowed into a 4-element broadcast,
// a mul with b, and a sub from a.
3956 uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
3957 return vmls_laneq_u16(a, b, v, 0);
3960 // CHECK-LABEL: @test_vmlsq_laneq_u16_0(
3961 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
3962 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
3963 // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
3964 // CHECK: ret <8 x i16> [[SUB]]
// Calls vmlsq_laneq_u16 with lane 0 of v. The FileCheck expectations above
// require an 8-element broadcast of that lane, a mul with b, and a sub from a.
3965 uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
3966 return vmlsq_laneq_u16(a, b, v, 0);
3969 // CHECK-LABEL: @test_vmls_laneq_u32_0(
3970 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
3971 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
3972 // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
3973 // CHECK: ret <2 x i32> [[SUB]]
// Calls vmls_laneq_u32 with lane 0 of the 4-element v. The FileCheck
// expectations above require that lane narrowed into a 2-element broadcast,
// a mul with b, and a sub from a.
3974 uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
3975 return vmls_laneq_u32(a, b, v, 0);
3978 // CHECK-LABEL: @test_vmlsq_laneq_u32_0(
3979 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
3980 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
3981 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
3982 // CHECK: ret <4 x i32> [[SUB]]
// Calls vmlsq_laneq_u32 with lane 0 of v. The FileCheck expectations above
// require a 4-element broadcast of that lane, a mul with b, and a sub from a.
3983 uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
3984 return vmlsq_laneq_u32(a, b, v, 0);
3987 // CHECK-LABEL: @test_vqdmlsl_laneq_s16_0(
3988 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3989 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3990 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3991 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
3992 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
3993 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
3994 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_laneq_s16 with lane 0 of v. The FileCheck expectations above
// require a <4 x i16> broadcast of that lane, an sqdmull.v4i32 call on b and
// the broadcast, and an sqsub.v4i32 call taking a and that product.
3995 int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
3996 return vqdmlsl_laneq_s16(a, b, v, 0);
3999 // CHECK-LABEL: @test_vqdmlsl_laneq_s32_0(
4000 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4001 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4002 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4003 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4004 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
4005 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
4006 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_laneq_s32 with lane 0 of v. The FileCheck expectations above
// require a <2 x i32> broadcast of that lane, an sqdmull.v2i64 call on b and
// the broadcast, and an sqsub.v2i64 call taking a and that product.
4007 int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
4008 return vqdmlsl_laneq_s32(a, b, v, 0);
4011 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16_0(
4012 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
4013 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
4014 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4015 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
4016 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4017 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
4018 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
4019 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_high_laneq_s16 with lane 0 of v. The FileCheck expectations
// above require a shuffle selecting elements 4-7 of b, a <4 x i16> broadcast of
// lane 0 of v, an sqdmull.v4i32 call on those two, and an sqsub.v4i32 with a.
4020 int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
4021 return vqdmlsl_high_laneq_s16(a, b, v, 0);
4024 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32_0(
4025 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
4026 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4027 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4028 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
4029 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4030 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
4031 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
4032 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_high_laneq_s32 with lane 0 of v. The FileCheck expectations
// above require a shuffle selecting elements 2-3 of b, a <2 x i32> broadcast of
// lane 0 of v, an sqdmull.v2i64 call on those two, and an sqsub.v2i64 with a.
4033 int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
4034 return vqdmlsl_high_laneq_s32(a, b, v, 0);
4037 // CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
4038 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
4039 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4040 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4041 // CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
4042 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
4043 // CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
// Calls vqdmulh_laneq_s16 with lane 0 of v. The FileCheck expectations above
// require a <4 x i16> broadcast of that lane and an sqdmulh.v4i16 call on a and
// the broadcast, whose result is returned.
4044 int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
4045 return vqdmulh_laneq_s16(a, v, 0);
4048 // CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
4049 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
4050 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4051 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4052 // CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
4053 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
4054 // CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
// Calls vqdmulhq_laneq_s16 with lane 0 of v. The FileCheck expectations above
// require an <8 x i16> broadcast of that lane and an sqdmulh.v8i16 call on a
// and the broadcast, whose result is returned.
4055 int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
4056 return vqdmulhq_laneq_s16(a, v, 0);
4059 // CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
4060 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4061 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4062 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4063 // CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
4064 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
4065 // CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
// Calls vqdmulh_laneq_s32 with lane 0 of v. The FileCheck expectations above
// require a <2 x i32> broadcast of that lane and an sqdmulh.v2i32 call on a and
// the broadcast, whose result is returned.
4066 int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
4067 return vqdmulh_laneq_s32(a, v, 0);
4070 // CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
4071 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
4072 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4073 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
4074 // CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
4075 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
4076 // CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
// Calls vqdmulhq_laneq_s32 with lane 0 of v. The FileCheck expectations above
// require a <4 x i32> broadcast of that lane and an sqdmulh.v4i32 call on a and
// the broadcast, whose result is returned.
4077 int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
4078 return vqdmulhq_laneq_s32(a, v, 0);
4081 // CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
4082 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
4083 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4084 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4085 // CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
4086 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
4087 // CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
// Calls vqrdmulh_laneq_s16 with lane 0 of v. The FileCheck expectations above
// require a <4 x i16> broadcast of that lane and an sqrdmulh.v4i16 call on a
// and the broadcast, whose result is returned.
4088 int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
4089 return vqrdmulh_laneq_s16(a, v, 0);
4092 // CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
4093 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
4094 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4095 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4096 // CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
4097 // CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
4098 // CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
// Calls vqrdmulhq_laneq_s16 with lane 0 of v. The FileCheck expectations above
// require an <8 x i16> broadcast of that lane and an sqrdmulh.v8i16 call on a
// and the broadcast, whose result is returned.
4099 int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
4100 return vqrdmulhq_laneq_s16(a, v, 0);
4103 // CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
4104 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4105 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4106 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4107 // CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
4108 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
4109 // CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
// Calls vqrdmulh_laneq_s32 with lane 0 of v. The FileCheck expectations above
// require a <2 x i32> broadcast of that lane and an sqrdmulh.v2i32 call on a
// and the broadcast, whose result is returned.
4110 int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
4111 return vqrdmulh_laneq_s32(a, v, 0);
4114 // CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
4115 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
4116 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4117 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
4118 // CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
4119 // CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
4120 // CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
// Calls vqrdmulhq_laneq_s32 with lane 0 of v. The FileCheck expectations above
// require a <4 x i32> broadcast of that lane and an sqrdmulh.v4i32 call on a
// and the broadcast, whose result is returned.
4121 int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
4122 return vqrdmulhq_laneq_s32(a, v, 0);
4125 // CHECK-LABEL: @test_vmla_lane_u16(
4126 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4127 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4128 // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
4129 // CHECK: ret <4 x i16> [[ADD]]
// Calls vmla_lane_u16 with lane 3 of v. The FileCheck expectations above
// require a <4 x i16> broadcast of that lane, a mul with b, and an add to a.
4130 uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
4131 return vmla_lane_u16(a, b, v, 3);
4134 // CHECK-LABEL: @test_vmlaq_lane_u16(
4135 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
4136 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4137 // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
4138 // CHECK: ret <8 x i16> [[ADD]]
// Calls vmlaq_lane_u16 with lane 3 of the 4-element v. The FileCheck
// expectations above require that lane widened into an 8-element broadcast,
// a mul with b, and an add to a.
4139 uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
4140 return vmlaq_lane_u16(a, b, v, 3);
4143 // CHECK-LABEL: @test_vmla_lane_u32(
4144 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
4145 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4146 // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
4147 // CHECK: ret <2 x i32> [[ADD]]
// Calls vmla_lane_u32 with lane 1 of v. The FileCheck expectations above
// require a <2 x i32> broadcast of that lane, a mul with b, and an add to a.
4148 uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
4149 return vmla_lane_u32(a, b, v, 1);
4152 // CHECK-LABEL: @test_vmlaq_lane_u32(
4153 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4154 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4155 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
4156 // CHECK: ret <4 x i32> [[ADD]]
// Calls vmlaq_lane_u32 with lane 1 of the 2-element v. The FileCheck
// expectations above require that lane widened into a 4-element broadcast,
// a mul with b, and an add to a.
4157 uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
4158 return vmlaq_lane_u32(a, b, v, 1);
4161 // CHECK-LABEL: @test_vmla_laneq_u16(
4162 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4163 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4164 // CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
4165 // CHECK: ret <4 x i16> [[ADD]]
// Calls vmla_laneq_u16 with lane 7 of the 8-element v. The FileCheck
// expectations above require that lane narrowed into a 4-element broadcast,
// a mul with b, and an add to a.
4166 uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
4167 return vmla_laneq_u16(a, b, v, 7);
4170 // CHECK-LABEL: @test_vmlaq_laneq_u16(
4171 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4172 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4173 // CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
4174 // CHECK: ret <8 x i16> [[ADD]]
// Calls vmlaq_laneq_u16 with lane 7 of v. The FileCheck expectations above
// require an 8-element broadcast of that lane, a mul with b, and an add to a.
4175 uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
4176 return vmlaq_laneq_u16(a, b, v, 7);
4179 // CHECK-LABEL: @test_vmla_laneq_u32(
4180 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4181 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4182 // CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
4183 // CHECK: ret <2 x i32> [[ADD]]
// Calls vmla_laneq_u32 with lane 3 of the 4-element v. The FileCheck
// expectations above require that lane narrowed into a 2-element broadcast,
// a mul with b, and an add to a.
4184 uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
4185 return vmla_laneq_u32(a, b, v, 3);
4188 // CHECK-LABEL: @test_vmlaq_laneq_u32(
4189 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4190 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4191 // CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
4192 // CHECK: ret <4 x i32> [[ADD]]
// Calls vmlaq_laneq_u32 with lane 3 of v. The FileCheck expectations above
// require a 4-element broadcast of that lane, a mul with b, and an add to a.
4193 uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
4194 return vmlaq_laneq_u32(a, b, v, 3);
4197 // CHECK-LABEL: @test_vqdmlal_laneq_s16(
4198 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4199 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4200 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4201 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4202 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
4203 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
4204 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
// Calls vqdmlal_laneq_s16 with lane 7 of v. The FileCheck expectations above
// require a <4 x i16> broadcast of that lane, an sqdmull.v4i32 call on b and
// the broadcast, and an sqadd.v4i32 call combining a with that product.
4205 int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
4206 return vqdmlal_laneq_s16(a, b, v, 7);
4209 // CHECK-LABEL: @test_vqdmlal_laneq_s32(
4210 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4211 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4212 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4213 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4214 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
4215 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
4216 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
// Calls vqdmlal_laneq_s32 with lane 3 of v. The FileCheck expectations above
// require a <2 x i32> broadcast of that lane, an sqdmull.v2i64 call on b and
// the broadcast, and an sqadd.v2i64 call combining a with that product.
4217 int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
4218 return vqdmlal_laneq_s32(a, b, v, 3);
4221 // CHECK-LABEL: @test_vqdmlal_high_laneq_s16(
4222 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
4223 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4224 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4225 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
4226 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4227 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
4228 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
4229 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
// Calls vqdmlal_high_laneq_s16 with lane 7 of v. The FileCheck expectations
// above require a shuffle selecting elements 4-7 of b, a <4 x i16> broadcast of
// lane 7 of v, an sqdmull.v4i32 call on those two, and an sqadd.v4i32 with a.
4230 int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
4231 return vqdmlal_high_laneq_s16(a, b, v, 7);
4234 // CHECK-LABEL: @test_vqdmlal_high_laneq_s32(
4235 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
4236 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4237 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4238 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
4239 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4240 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
4241 // CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
4242 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
// Calls vqdmlal_high_laneq_s32 with lane 3 of v. The FileCheck expectations
// above require a shuffle selecting elements 2-3 of b, a <2 x i32> broadcast of
// lane 3 of v, an sqdmull.v2i64 call on those two, and an sqadd.v2i64 with a.
4243 int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
4244 return vqdmlal_high_laneq_s32(a, b, v, 3);
4247 // CHECK-LABEL: @test_vmls_lane_u16(
4248 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4249 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4250 // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
4251 // CHECK: ret <4 x i16> [[SUB]]
// Calls vmls_lane_u16 with lane 3 of v. The FileCheck expectations above
// require a <4 x i16> broadcast of that lane, a mul with b, and a sub from a.
4252 uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
4253 return vmls_lane_u16(a, b, v, 3);
4256 // CHECK-LABEL: @test_vmlsq_lane_u16(
4257 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
4258 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4259 // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
4260 // CHECK: ret <8 x i16> [[SUB]]
// Calls vmlsq_lane_u16 with lane 3 of the 4-element v. The FileCheck
// expectations above require that lane widened into an 8-element broadcast,
// a mul with b, and a sub from a.
4261 uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
4262 return vmlsq_lane_u16(a, b, v, 3);
4265 // CHECK-LABEL: @test_vmls_lane_u32(
4266 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
4267 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4268 // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
4269 // CHECK: ret <2 x i32> [[SUB]]
// Calls vmls_lane_u32 with lane 1 of v. The FileCheck expectations above
// require a <2 x i32> broadcast of that lane, a mul with b, and a sub from a.
4270 uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
4271 return vmls_lane_u32(a, b, v, 1);
4274 // CHECK-LABEL: @test_vmlsq_lane_u32(
4275 // CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4276 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4277 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
4278 // CHECK: ret <4 x i32> [[SUB]]
// Calls vmlsq_lane_u32 with lane 1 of the 2-element v. The FileCheck
// expectations above require that lane widened into a 4-element broadcast,
// a mul with b, and a sub from a.
4279 uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
4280 return vmlsq_lane_u32(a, b, v, 1);
4283 // CHECK-LABEL: @test_vmls_laneq_u16(
4284 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4285 // CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4286 // CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
4287 // CHECK: ret <4 x i16> [[SUB]]
// Calls vmls_laneq_u16 with lane 7 of the 8-element v. The FileCheck
// expectations above require that lane narrowed into a 4-element broadcast,
// a mul with b, and a sub from a.
4288 uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
4289 return vmls_laneq_u16(a, b, v, 7);
4292 // CHECK-LABEL: @test_vmlsq_laneq_u16(
4293 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4294 // CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4295 // CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
4296 // CHECK: ret <8 x i16> [[SUB]]
// Calls vmlsq_laneq_u16 with lane 7 of v. The FileCheck expectations above
// require an 8-element broadcast of that lane, a mul with b, and a sub from a.
4297 uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
4298 return vmlsq_laneq_u16(a, b, v, 7);
4301 // CHECK-LABEL: @test_vmls_laneq_u32(
4302 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4303 // CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4304 // CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
4305 // CHECK: ret <2 x i32> [[SUB]]
// Calls vmls_laneq_u32 with lane 3 of the 4-element v. The FileCheck
// expectations above require that lane narrowed into a 2-element broadcast,
// a mul with b, and a sub from a.
4306 uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
4307 return vmls_laneq_u32(a, b, v, 3);
4310 // CHECK-LABEL: @test_vmlsq_laneq_u32(
4311 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4312 // CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4313 // CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
4314 // CHECK: ret <4 x i32> [[SUB]]
// Calls vmlsq_laneq_u32 with lane 3 of v. The FileCheck expectations above
// require a 4-element broadcast of that lane, a mul with b, and a sub from a.
4315 uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
4316 return vmlsq_laneq_u32(a, b, v, 3);
4319 // CHECK-LABEL: @test_vqdmlsl_laneq_s16(
4320 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4321 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4322 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4323 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4324 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
4325 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
4326 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_laneq_s16 with lane 7 of v. The FileCheck expectations above
// require a <4 x i16> broadcast of that lane, an sqdmull.v4i32 call on b and
// the broadcast, and an sqsub.v4i32 call taking a and that product.
4327 int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
4328 return vqdmlsl_laneq_s16(a, b, v, 7);
4331 // CHECK-LABEL: @test_vqdmlsl_laneq_s32(
4332 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4333 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4334 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4335 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4336 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
4337 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
4338 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_laneq_s32 with lane 3 of v. The FileCheck expectations above
// require a <2 x i32> broadcast of that lane, an sqdmull.v2i64 call on b and
// the broadcast, and an sqsub.v2i64 call taking a and that product.
4339 int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
4340 return vqdmlsl_laneq_s32(a, b, v, 3);
4343 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16(
4344 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
4345 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4346 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4347 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
4348 // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4349 // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
4350 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
4351 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_high_laneq_s16 with lane 7 of v. The FileCheck expectations
// above require a shuffle selecting elements 4-7 of b, a <4 x i16> broadcast of
// lane 7 of v, an sqdmull.v4i32 call on those two, and an sqsub.v4i32 with a.
4352 int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
4353 return vqdmlsl_high_laneq_s16(a, b, v, 7);
4356 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32(
4357 // CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
4358 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4359 // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4360 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
4361 // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4362 // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
4363 // CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
4364 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_high_laneq_s32 with lane 3 of v. The FileCheck expectations
// above require a shuffle selecting elements 2-3 of b, a <2 x i32> broadcast of
// lane 3 of v, an sqdmull.v2i64 call on those two, and an sqsub.v2i64 with a.
4365 int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
4366 return vqdmlsl_high_laneq_s32(a, b, v, 3);
4369 // CHECK-LABEL: @test_vqdmulh_laneq_s16(
4370 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4371 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4372 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4373 // CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
4374 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
4375 // CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
// Calls vqdmulh_laneq_s16 with lane 7 of v. The FileCheck expectations above
// require a <4 x i16> broadcast of that lane and an sqdmulh.v4i16 call on a and
// the broadcast, whose result is returned.
4376 int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
4377 return vqdmulh_laneq_s16(a, v, 7);
4380 // CHECK-LABEL: @test_vqdmulhq_laneq_s16(
4381 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4382 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4383 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4384 // CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
4385 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
4386 // CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
// Calls vqdmulhq_laneq_s16 with lane 7 of v. The FileCheck expectations above
// require an <8 x i16> broadcast of that lane and an sqdmulh.v8i16 call on a
// and the broadcast, whose result is returned.
4387 int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
4388 return vqdmulhq_laneq_s16(a, v, 7);
4391 // CHECK-LABEL: @test_vqdmulh_laneq_s32(
4392 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4393 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4394 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4395 // CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
4396 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
4397 // CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
// Calls vqdmulh_laneq_s32 with lane 3 of v. The FileCheck expectations above
// require a <2 x i32> broadcast of that lane and an sqdmulh.v2i32 call on a and
// the broadcast, whose result is returned.
4398 int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
4399 return vqdmulh_laneq_s32(a, v, 3);
4402 // CHECK-LABEL: @test_vqdmulhq_laneq_s32(
4403 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4404 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4405 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
4406 // CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
4407 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
4408 // CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
// Calls vqdmulhq_laneq_s32 with lane 3 of v. The FileCheck expectations above
// require a <4 x i32> broadcast of that lane and an sqdmulh.v4i32 call on a and
// the broadcast, whose result is returned.
4409 int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
4410 return vqdmulhq_laneq_s32(a, v, 3);
4413 // CHECK-LABEL: @test_vqrdmulh_laneq_s16(
4414 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4415 // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4416 // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4417 // CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
4418 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
4419 // CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
// Calls vqrdmulh_laneq_s16 with lane 7 of v. The FileCheck expectations above
// require a <4 x i16> broadcast of that lane and an sqrdmulh.v4i16 call on a
// and the broadcast, whose result is returned.
4420 int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
4421 return vqrdmulh_laneq_s16(a, v, 7);
4424 // CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
4425 // CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4426 // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4427 // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4428 // CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
4429 // CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
4430 // CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
// Calls vqrdmulhq_laneq_s16 with lane 7 of v. The FileCheck expectations above
// require an <8 x i16> broadcast of that lane and an sqrdmulh.v8i16 call on a
// and the broadcast, whose result is returned.
4431 int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
4432 return vqrdmulhq_laneq_s16(a, v, 7);
4435 // CHECK-LABEL: @test_vqrdmulh_laneq_s32(
4436 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4437 // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4438 // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4439 // CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
4440 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
4441 // CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
// Calls vqrdmulh_laneq_s32 with lane 3 of v. The FileCheck expectations above
// require a <2 x i32> broadcast of that lane and an sqrdmulh.v2i32 call on a
// and the broadcast, whose result is returned.
4442 int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
4443 return vqrdmulh_laneq_s32(a, v, 3);
4446 // CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
4447 // CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4448 // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4449 // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
4450 // CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
4451 // CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
4452 // CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
4453 int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
4454 return vqrdmulhq_laneq_s32(a, v, 3);