test/CodeGen/aarch64-neon-2velem.c

   1 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
   2
   3 // Test AArch64 NEON by-element (lane-indexed) intrinsics and types
   4
   5 #include <arm_neon.h>
   6
   7 // CHECK-LABEL: @test_vmla_lane_s16(
   8 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   9 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
  10 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
  11 // CHECK:   ret <4 x i16> [[ADD]]
  12 int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
  13   return vmla_lane_s16(a, b, v, 3);  // a + b * v[3]: lane 3 (last of v's 4 lanes) is splatted, multiplied by b, added to a
  14 }
  15
  16 // CHECK-LABEL: @test_vmlaq_lane_s16(
  17 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  18 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
  19 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
  20 // CHECK:   ret <8 x i16> [[ADD]]
  21 int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
  22   return vmlaq_lane_s16(a, b, v, 3);  // a + b * v[3]: lane 3 of the 4-lane v is splatted to 8 lanes, multiplied by b, added to a
  23 }
  24
  25 // CHECK-LABEL: @test_vmla_lane_s32(
  26 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
  27 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
  28 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
  29 // CHECK:   ret <2 x i32> [[ADD]]
  30 int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
  31   return vmla_lane_s32(a, b, v, 1);  // a + b * v[1]: lane 1 (last of v's 2 lanes) is splatted, multiplied by b, added to a
  32 }
  33
  34 // CHECK-LABEL: @test_vmlaq_lane_s32(
  35 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  36 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
  37 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
  38 // CHECK:   ret <4 x i32> [[ADD]]
  39 int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
  40   return vmlaq_lane_s32(a, b, v, 1);  // a + b * v[1]: lane 1 of the 2-lane v is splatted to 4 lanes, multiplied by b, added to a
  41 }
  42
  43 // CHECK-LABEL: @test_vmla_laneq_s16(
  44 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
  45 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
  46 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
  47 // CHECK:   ret <4 x i16> [[ADD]]
  48 int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
  49   return vmla_laneq_s16(a, b, v, 7);  // a + b * v[7]: lane 7 (last of v's 8 lanes) is splatted to 4 lanes, multiplied by b, added to a
  50 }
  51
  52 // CHECK-LABEL: @test_vmlaq_laneq_s16(
  53 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  54 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
  55 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
  56 // CHECK:   ret <8 x i16> [[ADD]]
  57 int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
  58   return vmlaq_laneq_s16(a, b, v, 7);  // a + b * v[7]: lane 7 (last of v's 8 lanes) is splatted, multiplied by b, added to a
  59 }
  60
  61 // CHECK-LABEL: @test_vmla_laneq_s32(
  62 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
  63 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
  64 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
  65 // CHECK:   ret <2 x i32> [[ADD]]
  66 int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
  67   return vmla_laneq_s32(a, b, v, 3);  // a + b * v[3]: lane 3 (last of v's 4 lanes) is splatted to 2 lanes, multiplied by b, added to a
  68 }
  69
  70 // CHECK-LABEL: @test_vmlaq_laneq_s32(
  71 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  72 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
  73 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
  74 // CHECK:   ret <4 x i32> [[ADD]]
  75 int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
  76   return vmlaq_laneq_s32(a, b, v, 3);  // a + b * v[3]: lane 3 (last of v's 4 lanes) is splatted, multiplied by b, added to a
  77 }
  78
  79 // CHECK-LABEL: @test_vmls_lane_s16(
  80 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  81 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
  82 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
  83 // CHECK:   ret <4 x i16> [[SUB]]
  84 int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
  85   return vmls_lane_s16(a, b, v, 3);  // a - b * v[3]: lane 3 (last of v's 4 lanes) is splatted, multiplied by b, subtracted from a
  86 }
  87
  88 // CHECK-LABEL: @test_vmlsq_lane_s16(
  89 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  90 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
  91 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
  92 // CHECK:   ret <8 x i16> [[SUB]]
  93 int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
  94   return vmlsq_lane_s16(a, b, v, 3);  // a - b * v[3]: lane 3 of the 4-lane v is splatted to 8 lanes, multiplied by b, subtracted from a
  95 }
  96
  97 // CHECK-LABEL: @test_vmls_lane_s32(
  98 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
  99 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
 100 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
 101 // CHECK:   ret <2 x i32> [[SUB]]
 102 int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
 103   return vmls_lane_s32(a, b, v, 1);  // a - b * v[1]: lane 1 (last of v's 2 lanes) is splatted, multiplied by b, subtracted from a
 104 }
 105
 106 // CHECK-LABEL: @test_vmlsq_lane_s32(
 107 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 108 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
 109 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
 110 // CHECK:   ret <4 x i32> [[SUB]]
 111 int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
 112   return vmlsq_lane_s32(a, b, v, 1);  // a - b * v[1]: lane 1 of the 2-lane v is splatted to 4 lanes, multiplied by b, subtracted from a
 113 }
 114
 115 // CHECK-LABEL: @test_vmls_laneq_s16(
 116 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 117 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
 118 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
 119 // CHECK:   ret <4 x i16> [[SUB]]
 120 int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
 121   return vmls_laneq_s16(a, b, v, 7);  // a - b * v[7]: lane 7 (last of v's 8 lanes) is splatted to 4 lanes, multiplied by b, subtracted from a
 122 }
 123
 124 // CHECK-LABEL: @test_vmlsq_laneq_s16(
 125 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 126 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
 127 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
 128 // CHECK:   ret <8 x i16> [[SUB]]
 129 int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
 130   return vmlsq_laneq_s16(a, b, v, 7);  // a - b * v[7]: lane 7 (last of v's 8 lanes) is splatted, multiplied by b, subtracted from a
 131 }
 132
 133 // CHECK-LABEL: @test_vmls_laneq_s32(
 134 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 135 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
 136 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
 137 // CHECK:   ret <2 x i32> [[SUB]]
 138 int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
 139   return vmls_laneq_s32(a, b, v, 3);  // a - b * v[3]: lane 3 (last of v's 4 lanes) is splatted to 2 lanes, multiplied by b, subtracted from a
 140 }
 141
 142 // CHECK-LABEL: @test_vmlsq_laneq_s32(
 143 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 144 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
 145 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
 146 // CHECK:   ret <4 x i32> [[SUB]]
 147 int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
 148   return vmlsq_laneq_s32(a, b, v, 3);  // a - b * v[3]: lane 3 (last of v's 4 lanes) is splatted, multiplied by b, subtracted from a
 149 }
 150
 151 // CHECK-LABEL: @test_vmul_lane_s16(
 152 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 153 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
 154 // CHECK:   ret <4 x i16> [[MUL]]
 155 int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
 156   return vmul_lane_s16(a, v, 3);  // a * v[3]: lane 3 (last of v's 4 lanes) is splatted and multiplied by a
 157 }
 158
 159 // CHECK-LABEL: @test_vmulq_lane_s16(
 160 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 161 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
 162 // CHECK:   ret <8 x i16> [[MUL]]
 163 int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
 164   return vmulq_lane_s16(a, v, 3);  // a * v[3]: lane 3 of the 4-lane v is splatted to 8 lanes and multiplied by a
 165 }
 166
 167 // CHECK-LABEL: @test_vmul_lane_s32(
 168 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 169 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
 170 // CHECK:   ret <2 x i32> [[MUL]]
 171 int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
 172   return vmul_lane_s32(a, v, 1);  // a * v[1]: lane 1 (last of v's 2 lanes) is splatted and multiplied by a
 173 }
 174
 175 // CHECK-LABEL: @test_vmulq_lane_s32(
 176 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 177 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
 178 // CHECK:   ret <4 x i32> [[MUL]]
 179 int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
 180   return vmulq_lane_s32(a, v, 1);  // a * v[1]: lane 1 of the 2-lane v is splatted to 4 lanes and multiplied by a
 181 }
 182
 183 // CHECK-LABEL: @test_vmul_lane_u16(
 184 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 185 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
 186 // CHECK:   ret <4 x i16> [[MUL]]
 187 uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
 188   return vmul_lane_u16(a, v, 3);  // unsigned a * v[3]: lane 3 (last of v's 4 lanes) is splatted and multiplied by a with a plain integer mul
 189 }
 190
 191 // CHECK-LABEL: @test_vmulq_lane_u16(
 192 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 193 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
 194 // CHECK:   ret <8 x i16> [[MUL]]
 195 uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
 196   return vmulq_lane_u16(a, v, 3);  // unsigned a * v[3]: lane 3 of the 4-lane v is splatted to 8 lanes and multiplied by a
 197 }
 198
 199 // CHECK-LABEL: @test_vmul_lane_u32(
 200 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 201 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
 202 // CHECK:   ret <2 x i32> [[MUL]]
 203 uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
 204   return vmul_lane_u32(a, v, 1);  // unsigned a * v[1]: lane 1 (last of v's 2 lanes) is splatted and multiplied by a
 205 }
 206
 207 // CHECK-LABEL: @test_vmulq_lane_u32(
 208 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 209 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
 210 // CHECK:   ret <4 x i32> [[MUL]]
 211 uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
 212   return vmulq_lane_u32(a, v, 1);  // unsigned a * v[1]: lane 1 of the 2-lane v is splatted to 4 lanes and multiplied by a
 213 }
 214
 215 // CHECK-LABEL: @test_vmul_laneq_s16(
 216 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 217 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
 218 // CHECK:   ret <4 x i16> [[MUL]]
 219 int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
 220   return vmul_laneq_s16(a, v, 7);  // a * v[7]: lane 7 (last of v's 8 lanes) is splatted to 4 lanes and multiplied by a
 221 }
 222
 223 // CHECK-LABEL: @test_vmulq_laneq_s16(
 224 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 225 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
 226 // CHECK:   ret <8 x i16> [[MUL]]
 227 int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
 228   return vmulq_laneq_s16(a, v, 7);  // a * v[7]: lane 7 (last of v's 8 lanes) is splatted and multiplied by a
 229 }
 230
 231 // CHECK-LABEL: @test_vmul_laneq_s32(
 232 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 233 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
 234 // CHECK:   ret <2 x i32> [[MUL]]
 235 int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
 236   return vmul_laneq_s32(a, v, 3);  // a * v[3]: lane 3 (last of v's 4 lanes) is splatted to 2 lanes and multiplied by a
 237 }
 238
 239 // CHECK-LABEL: @test_vmulq_laneq_s32(
 240 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 241 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
 242 // CHECK:   ret <4 x i32> [[MUL]]
 243 int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
 244   return vmulq_laneq_s32(a, v, 3);  // a * v[3]: lane 3 (last of v's 4 lanes) is splatted and multiplied by a
 245 }
 246
 247 // CHECK-LABEL: @test_vmul_laneq_u16(
 248 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 249 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
 250 // CHECK:   ret <4 x i16> [[MUL]]
 251 uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
 252   return vmul_laneq_u16(a, v, 7);  // unsigned a * v[7]: lane 7 (last of v's 8 lanes) is splatted to 4 lanes and multiplied by a
 253 }
 254
 255 // CHECK-LABEL: @test_vmulq_laneq_u16(
 256 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 257 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
 258 // CHECK:   ret <8 x i16> [[MUL]]
 259 uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
 260   return vmulq_laneq_u16(a, v, 7);  // unsigned a * v[7]: lane 7 (last of v's 8 lanes) is splatted and multiplied by a
 261 }
 262
 263 // CHECK-LABEL: @test_vmul_laneq_u32(
 264 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 265 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
 266 // CHECK:   ret <2 x i32> [[MUL]]
 267 uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
 268   return vmul_laneq_u32(a, v, 3);  // unsigned a * v[3]: lane 3 (last of v's 4 lanes) is splatted to 2 lanes and multiplied by a
 269 }
 270
 271 // CHECK-LABEL: @test_vmulq_laneq_u32(
 272 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 273 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
 274 // CHECK:   ret <4 x i32> [[MUL]]
 275 uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
 276   return vmulq_laneq_u32(a, v, 3);  // unsigned a * v[3]: lane 3 (last of v's 4 lanes) is splatted and multiplied by a
 277 }
 278
 279 // CHECK-LABEL: @test_vfma_lane_f32(
 280 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
 281 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
 282 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
 283 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
 284 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
 285 // CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
 286 // CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
 287 // CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
 288 // CHECK:   ret <2 x float> [[FMLA2]]
 289 float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
 290   return vfma_lane_f32(a, b, v, 1);  // fused a + b * v[1]; expected IR calls llvm.fma.v2f32(b, splat(v[1]), a)
 291 }
 292
 293 // CHECK-LABEL: @test_vfmaq_lane_f32(
 294 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
 295 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
 296 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
 297 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
 298 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 299 // CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
 300 // CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
 301 // CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
 302 // CHECK:   ret <4 x float> [[FMLA2]]
 303 float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
 304   return vfmaq_lane_f32(a, b, v, 1);  // fused a + b * v[1]; lane 1 of the 2-lane v is splatted to 4 lanes, then llvm.fma.v4f32(b, splat, a)
 305 }
 306
 307 // CHECK-LABEL: @test_vfma_laneq_f32(
 308 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
 309 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
 310 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
 311 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
 312 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
 313 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
 314 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
 315 // CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
 316 // CHECK:   ret <2 x float> [[TMP6]]
 317 float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
 318   return vfma_laneq_f32(a, b, v, 3);  // fused a + b * v[3]; expected IR puts the splatted lane first: llvm.fma.v2f32(splat(v[3]), b, a)
 319 }
 320
 321 // CHECK-LABEL: @test_vfmaq_laneq_f32(
 322 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
 323 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
 324 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
 325 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
 326 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
 327 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
 328 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 329 // CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
 330 // CHECK:   ret <4 x float> [[TMP6]]
 331 float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
 332   return vfmaq_laneq_f32(a, b, v, 3);  // fused a + b * v[3]; expected IR puts the splatted lane first: llvm.fma.v4f32(splat(v[3]), b, a)
 333 }
 334
 335 // CHECK-LABEL: @test_vfms_lane_f32(
 336 // CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
 337 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
 338 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
 339 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
 340 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
 341 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
 342 // CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
 343 // CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
 344 // CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
 345 // CHECK:   ret <2 x float> [[FMLA2]]
 346 float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
 347   return vfms_lane_f32(a, b, v, 1);  // fused a - b * v[1]; expected IR negates b (fsub from -0.0), then llvm.fma.v2f32(-b, splat(v[1]), a)
 348 }
 349
 350 // CHECK-LABEL: @test_vfmsq_lane_f32(
 351 // CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
 352 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
 353 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
 354 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
 355 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
 356 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 357 // CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
 358 // CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
 359 // CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
 360 // CHECK:   ret <4 x float> [[FMLA2]]
 361 float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
 362   return vfmsq_lane_f32(a, b, v, 1);  // fused a - b * v[1]; expected IR negates b (fsub from -0.0), then llvm.fma.v4f32(-b, splat(v[1]), a)
 363 }
 364
 365 // CHECK-LABEL: @test_vfms_laneq_f32(
 366 // CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
 367 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
 368 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
 369 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
 370 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
 371 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
 372 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
 373 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
 374 // CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
 375 // CHECK:   ret <2 x float> [[TMP6]]
 376 float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
 377   return vfms_laneq_f32(a, b, v, 3);  // fused a - b * v[3]; expected IR negates b (fsub from -0.0), then llvm.fma.v2f32(splat(v[3]), -b, a)
 378 }
 379
 380 // CHECK-LABEL: @test_vfmsq_laneq_f32(
 381 // CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
 382 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
 383 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
 384 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
 385 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
 386 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
 387 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
 388 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 389 // CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
 390 // CHECK:   ret <4 x float> [[TMP6]]
 391 float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
 392   return vfmsq_laneq_f32(a, b, v, 3);  // fused a - b * v[3]; expected IR negates b (fsub from -0.0), then llvm.fma.v4f32(splat(v[3]), -b, a)
 393 }
 394
 395 // CHECK-LABEL: @test_vfmaq_lane_f64(
 396 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
 397 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
 398 // CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
 399 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
 400 // CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
 401 // CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
 402 // CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
 403 // CHECK:   [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
 404 // CHECK:   ret <2 x double> [[FMLA2]]
 405 float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
 406   return vfmaq_lane_f64(a, b, v, 0);  // fused a + b * v[0]; v has a single lane, so the expected splat mask is zeroinitializer
 407 }
 408
 409 // CHECK-LABEL: @test_vfmaq_laneq_f64(
 410 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
 411 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
 412 // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
 413 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
 414 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
 415 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
 416 // CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
 417 // CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
 418 // CHECK:   ret <2 x double> [[TMP6]]
 419 float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
 420   return vfmaq_laneq_f64(a, b, v, 1);  // fused a + b * v[1]; expected IR puts the splatted lane first: llvm.fma.v2f64(splat(v[1]), b, a)
 421 }
 422
 423 // CHECK-LABEL: @test_vfmsq_lane_f64(
 424 // CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
 425 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
 426 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
 427 // CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
 428 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
 429 // CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
 430 // CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
 431 // CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
 432 // CHECK:   [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
 433 // CHECK:   ret <2 x double> [[FMLA2]]
 434 float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
 435   return vfmsq_lane_f64(a, b, v, 0);  // fused a - b * v[0]; b is negated (fsub from -0.0) and v's single lane is splatted with a zeroinitializer mask
 436 }
 437
 438 // CHECK-LABEL: @test_vfmsq_laneq_f64(
 439 // CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
 440 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
 441 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
 442 // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
 443 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
 444 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
 445 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
 446 // CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
 447 // CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
 448 // CHECK:   ret <2 x double> [[TMP6]]
 449 float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
 450   return vfmsq_laneq_f64(a, b, v, 1);  // fused a - b * v[1]; expected IR negates b (fsub from -0.0), then llvm.fma.v2f64(splat(v[1]), -b, a)
 451 }
 452
 453 // CHECK-LABEL: @test_vfmas_laneq_f32(
 454 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
 455 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
 456 // CHECK:   [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
 457 // CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
 458 // CHECK:   ret float [[TMP2]]
 459 float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
 460   return vfmas_laneq_f32(a, b, v, 3);  // scalar fused a + b * v[3]; expected IR extracts lane 3 and calls llvm.fma.f32(b, v[3], a)
 461 }
 462
 463 // CHECK-LABEL: @test_vfmsd_lane_f64(
 464 // CHECK:   [[SUB:%.*]] = fsub double -0.000000e+00, %b
 465 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %v to <8 x i8>
 466 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
 467 // CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
 468 // CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a)
 469 // CHECK:   ret double [[TMP2]]
 470 float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
 471   return vfmsd_lane_f64(a, b, v, 0);  // scalar fused a - b * v[0]; expected IR negates b (fsub from -0.0), extracts lane 0, calls llvm.fma.f64(-b, v[0], a)
 472 }
 473
 474 // CHECK-LABEL: @test_vfmss_laneq_f32(
 475 // CHECK:   [[SUB:%.*]] = fsub float -0.000000e+00, %b
 476 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
 477 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
 478 // CHECK:   [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
 479 // CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
 480 // CHECK:   ret float [[TMP2]]
 481 float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
 482   return vfmss_laneq_f32(a, b, v, 3);  // scalar fused a - b * v[3]; expected IR negates b (fsub from -0.0), extracts lane 3, calls llvm.fma.f32(-b, v[3], a)
 483 }
 484
 485 // CHECK-LABEL: @test_vfmsd_laneq_f64(
 486 // CHECK:   [[SUB:%.*]] = fsub double -0.000000e+00, %b
 487 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v to <16 x i8>
 488 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
 489 // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
 490 // CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a)
 491 // CHECK:   ret double [[TMP2]]
 492 float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
 493   return vfmsd_laneq_f64(a, b, v, 1);  // scalar fused a - b * v[1]; expected IR negates b (fsub from -0.0), extracts lane 1, calls llvm.fma.f64(-b, v[1], a)
 494 }
 495
 496 // CHECK-LABEL: @test_vmlal_lane_s16(
 497 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 498 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 499 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 500 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
 501 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 502 // CHECK:   ret <4 x i32> [[ADD]]
 503 int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
 504   return vmlal_lane_s16(a, b, v, 3);  // widening a + b * v[3]: smull of b with splat(v[3]) yields <4 x i32>, which is added to a
 505 }
 506
 507 // CHECK-LABEL: @test_vmlal_lane_s32(
 508 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 509 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 510 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 511 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
 512 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
 513 // CHECK:   ret <2 x i64> [[ADD]]
 514 int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
 515   return vmlal_lane_s32(a, b, v, 1);  // widening a + b * v[1]: smull of b with splat(v[1]) yields <2 x i64>, which is added to a
 516 }
 517
 518 // CHECK-LABEL: @test_vmlal_laneq_s16(
 519 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 520 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 521 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 522 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
 523 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 524 // CHECK:   ret <4 x i32> [[ADD]]
 525 int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
 526   return vmlal_laneq_s16(a, b, v, 7);  // widening a + b * v[7]: lane 7 of the 8-lane v is splatted to 4 lanes, smull'ed with b to <4 x i32>, added to a
 527 }
 528
 529 // CHECK-LABEL: @test_vmlal_laneq_s32(
 530 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 531 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 532 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 533 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
 534 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
 535 // CHECK:   ret <2 x i64> [[ADD]]
 536 int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
 537   return vmlal_laneq_s32(a, b, v, 3);  // widening a + b * v[3]: lane 3 of the 4-lane v is splatted to 2 lanes, smull'ed with b to <2 x i64>, added to a
 538 }
 539
 540 // CHECK-LABEL: @test_vmlal_high_lane_s16(
 541 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 542 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 543 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 544 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 545 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
 546 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 547 // CHECK:   ret <4 x i32> [[ADD]]
 548 int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
 549   return vmlal_high_lane_s16(a, b, v, 3);  // widening a + hi(b) * v[3]: the high half of b (lanes 4-7) is smull'ed with splat(v[3]), then added to a
 550 }
 551
 552 // CHECK-LABEL: @test_vmlal_high_lane_s32(
 553 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 554 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 555 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 556 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 557 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
 558 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
 559 // CHECK:   ret <2 x i64> [[ADD]]
 560 int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
 561   return vmlal_high_lane_s32(a, b, v, 1);  // widening a + hi(b) * v[1]: the high half of b (lanes 2-3) is smull'ed with splat(v[1]), then added to a
 562 }
 563
 564 // CHECK-LABEL: @test_vmlal_high_laneq_s16(
 565 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 566 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 567 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 568 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 569 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
 570 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 571 // CHECK:   ret <4 x i32> [[ADD]]
 572 int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
       // Calls vmlal_high_laneq_s16 with constant lane index 7 of the 8-element
       // v. Per the FileCheck directives above, the expected IR takes elements
       // 4..7 of b, splats lane 7 of v, multiplies via smull and adds to a.
 573   return vmlal_high_laneq_s16(a, b, v, 7);
 574 }
 575
 576 // CHECK-LABEL: @test_vmlal_high_laneq_s32(
 577 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 578 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 579 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 580 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 581 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
 582 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
 583 // CHECK:   ret <2 x i64> [[ADD]]
 584 int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
       // Calls vmlal_high_laneq_s32 with constant lane index 3 of the 4-element
       // v. Per the FileCheck directives above, the expected IR takes elements
       // 2..3 of b, splats lane 3 of v, multiplies via smull and adds to a.
 585   return vmlal_high_laneq_s32(a, b, v, 3);
 586 }
 587
 588 // CHECK-LABEL: @test_vmlsl_lane_s16(
 589 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 590 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 591 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 592 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
 593 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
 594 // CHECK:   ret <4 x i32> [[SUB]]
 595 int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
       // Calls vmlsl_lane_s16 with constant lane index 3. Per the FileCheck
       // directives above, the expected IR splats lane 3 of v, multiplies it
       // with b via the smull intrinsic and subtracts the product from a.
 596   return vmlsl_lane_s16(a, b, v, 3);
 597 }
 598
 599 // CHECK-LABEL: @test_vmlsl_lane_s32(
 600 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 601 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 602 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 603 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
 604 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
 605 // CHECK:   ret <2 x i64> [[SUB]]
 606 int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
       // Calls vmlsl_lane_s32 with constant lane index 1. Per the FileCheck
       // directives above, the expected IR splats lane 1 of v, multiplies it
       // with b via the smull intrinsic and subtracts the product from a.
 607   return vmlsl_lane_s32(a, b, v, 1);
 608 }
 609
 610 // CHECK-LABEL: @test_vmlsl_laneq_s16(
 611 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 612 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 613 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 614 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
 615 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
 616 // CHECK:   ret <4 x i32> [[SUB]]
 617 int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
       // Calls vmlsl_laneq_s16 with constant lane index 7 of the 8-element v.
       // Per the FileCheck directives above, the expected IR splats lane 7 of
       // v into 4 elements, multiplies with b via smull and subtracts from a.
 618   return vmlsl_laneq_s16(a, b, v, 7);
 619 }
 620
 621 // CHECK-LABEL: @test_vmlsl_laneq_s32(
 622 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 623 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 624 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 625 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
 626 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
 627 // CHECK:   ret <2 x i64> [[SUB]]
 628 int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
       // Calls vmlsl_laneq_s32 with constant lane index 3 of the 4-element v.
       // Per the FileCheck directives above, the expected IR splats lane 3 of
       // v into 2 elements, multiplies with b via smull and subtracts from a.
 629   return vmlsl_laneq_s32(a, b, v, 3);
 630 }
 631
 632 // CHECK-LABEL: @test_vmlsl_high_lane_s16(
 633 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 634 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 635 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 636 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 637 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
 638 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
 639 // CHECK:   ret <4 x i32> [[SUB]]
 640 int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
       // Calls vmlsl_high_lane_s16 with constant lane index 3. Per the FileCheck
       // directives above, the expected IR takes elements 4..7 of b, splats
       // lane 3 of v, multiplies via smull and subtracts the product from a.
 641   return vmlsl_high_lane_s16(a, b, v, 3);
 642 }
 643
 644 // CHECK-LABEL: @test_vmlsl_high_lane_s32(
 645 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 646 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 647 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 648 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 649 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
 650 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
 651 // CHECK:   ret <2 x i64> [[SUB]]
 652 int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
       // Calls vmlsl_high_lane_s32 with constant lane index 1. Per the FileCheck
       // directives above, the expected IR takes elements 2..3 of b, splats
       // lane 1 of v, multiplies via smull and subtracts the product from a.
 653   return vmlsl_high_lane_s32(a, b, v, 1);
 654 }
 655
 656 // CHECK-LABEL: @test_vmlsl_high_laneq_s16(
 657 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 658 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 659 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 660 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 661 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
 662 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
 663 // CHECK:   ret <4 x i32> [[SUB]]
 664 int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
       // Calls vmlsl_high_laneq_s16 with constant lane index 7 of the 8-element
       // v. Per the FileCheck directives above, the expected IR takes elements
       // 4..7 of b, splats lane 7 of v, multiplies via smull and subtracts
       // the product from a.
 665   return vmlsl_high_laneq_s16(a, b, v, 7);
 666 }
 667
 668 // CHECK-LABEL: @test_vmlsl_high_laneq_s32(
 669 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 670 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 671 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 672 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 673 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
 674 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
 675 // CHECK:   ret <2 x i64> [[SUB]]
 676 int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
       // Calls vmlsl_high_laneq_s32 with constant lane index 3 of the 4-element
       // v. Per the FileCheck directives above, the expected IR takes elements
       // 2..3 of b, splats lane 3 of v, multiplies via smull and subtracts
       // the product from a.
 677   return vmlsl_high_laneq_s32(a, b, v, 3);
 678 }
 679
 680 // CHECK-LABEL: @test_vmlal_lane_u16(
 681 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 682 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 683 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 684 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
 685 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 686 // CHECK:   ret <4 x i32> [[ADD]]
 687 int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
       // Calls vmlal_lane_u16 with constant lane index 3. Per the FileCheck
       // directives above, the expected IR splats lane 3 of v, multiplies it
       // with b via the umull intrinsic and adds the product to a.
       // NOTE(review): the signature uses signed vector types while an
       // unsigned intrinsic is called (the vmull_*_u* tests in this file use
       // unsigned types); presumably this relies on implicit vector
       // conversions - confirm.
 688   return vmlal_lane_u16(a, b, v, 3);
 689 }
 690
 691 // CHECK-LABEL: @test_vmlal_lane_u32(
 692 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 693 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 694 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 695 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
 696 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
 697 // CHECK:   ret <2 x i64> [[ADD]]
 698 int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
       // Calls vmlal_lane_u32 with constant lane index 1. Per the FileCheck
       // directives above, the expected IR splats lane 1 of v, multiplies it
       // with b via the umull intrinsic and adds the product to a.
       // NOTE(review): the signature uses signed vector types while an
       // unsigned intrinsic is called (the vmull_*_u* tests in this file use
       // unsigned types); presumably this relies on implicit vector
       // conversions - confirm.
 699   return vmlal_lane_u32(a, b, v, 1);
 700 }
 701
 702 // CHECK-LABEL: @test_vmlal_laneq_u16(
 703 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 704 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 705 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 706 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
 707 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 708 // CHECK:   ret <4 x i32> [[ADD]]
 709 int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
       // Calls vmlal_laneq_u16 with constant lane index 7 of the 8-element v.
       // Per the FileCheck directives above, the expected IR splats lane 7 of
       // v into 4 elements, multiplies with b via umull and adds to a.
       // NOTE(review): the signature uses signed vector types while an
       // unsigned intrinsic is called (the vmull_*_u* tests in this file use
       // unsigned types); presumably this relies on implicit vector
       // conversions - confirm.
 710   return vmlal_laneq_u16(a, b, v, 7);
 711 }
 712
 713 // CHECK-LABEL: @test_vmlal_laneq_u32(
 714 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 715 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 716 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 717 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
 718 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
 719 // CHECK:   ret <2 x i64> [[ADD]]
 720 int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
       // Calls vmlal_laneq_u32 with constant lane index 3 of the 4-element v.
       // Per the FileCheck directives above, the expected IR splats lane 3 of
       // v into 2 elements, multiplies with b via umull and adds to a.
       // NOTE(review): the signature uses signed vector types while an
       // unsigned intrinsic is called (the vmull_*_u* tests in this file use
       // unsigned types); presumably this relies on implicit vector
       // conversions - confirm.
 721   return vmlal_laneq_u32(a, b, v, 3);
 722 }
 723
 724 // CHECK-LABEL: @test_vmlal_high_lane_u16(
 725 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 726 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 727 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 728 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 729 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
 730 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 731 // CHECK:   ret <4 x i32> [[ADD]]
 732 int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
       // Calls vmlal_high_lane_u16 with constant lane index 3. Per the FileCheck
       // directives above, the expected IR takes elements 4..7 of b, splats
       // lane 3 of v, multiplies them via the umull intrinsic and adds to a.
       // NOTE(review): the signature uses signed vector types while an
       // unsigned intrinsic is called (the vmull_*_u* tests in this file use
       // unsigned types); presumably this relies on implicit vector
       // conversions - confirm.
 733   return vmlal_high_lane_u16(a, b, v, 3);
 734 }
 735
 736 // CHECK-LABEL: @test_vmlal_high_lane_u32(
 737 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 738 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 739 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 740 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 741 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
 742 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
 743 // CHECK:   ret <2 x i64> [[ADD]]
 744 int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
       // Calls vmlal_high_lane_u32 with constant lane index 1. Per the FileCheck
       // directives above, the expected IR takes elements 2..3 of b, splats
       // lane 1 of v, multiplies them via the umull intrinsic and adds to a.
       // NOTE(review): the signature uses signed vector types while an
       // unsigned intrinsic is called (the vmull_*_u* tests in this file use
       // unsigned types); presumably this relies on implicit vector
       // conversions - confirm.
 745   return vmlal_high_lane_u32(a, b, v, 1);
 746 }
 747
 748 // CHECK-LABEL: @test_vmlal_high_laneq_u16(
 749 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 750 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 751 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 752 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 753 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
 754 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 755 // CHECK:   ret <4 x i32> [[ADD]]
 756 int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
       // Calls vmlal_high_laneq_u16 with constant lane index 7 of the 8-element
       // v. Per the FileCheck directives above, the expected IR takes elements
       // 4..7 of b, splats lane 7 of v, multiplies via umull and adds to a.
       // NOTE(review): the signature uses signed vector types while an
       // unsigned intrinsic is called (the vmull_*_u* tests in this file use
       // unsigned types); presumably this relies on implicit vector
       // conversions - confirm.
 757   return vmlal_high_laneq_u16(a, b, v, 7);
 758 }
 759
 760 // CHECK-LABEL: @test_vmlal_high_laneq_u32(
 761 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 762 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 763 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 764 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 765 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
 766 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
 767 // CHECK:   ret <2 x i64> [[ADD]]
 768 int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
       // Calls vmlal_high_laneq_u32 with constant lane index 3 of the 4-element
       // v. Per the FileCheck directives above, the expected IR takes elements
       // 2..3 of b, splats lane 3 of v, multiplies via umull and adds to a.
       // NOTE(review): the signature uses signed vector types while an
       // unsigned intrinsic is called (the vmull_*_u* tests in this file use
       // unsigned types); presumably this relies on implicit vector
       // conversions - confirm.
 769   return vmlal_high_laneq_u32(a, b, v, 3);
 770 }
 771
 772 // CHECK-LABEL: @test_vmlsl_lane_u16(
 773 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 774 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 775 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 776 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
 777 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
 778 // CHECK:   ret <4 x i32> [[SUB]]
 779 int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
       // Calls vmlsl_lane_u16 with constant lane index 3. Per the FileCheck
       // directives above, the expected IR splats lane 3 of v, multiplies it
       // with b via the umull intrinsic and subtracts the product from a.
       // NOTE(review): the signature uses signed vector types while an
       // unsigned intrinsic is called (the vmull_*_u* tests in this file use
       // unsigned types); presumably this relies on implicit vector
       // conversions - confirm.
 780   return vmlsl_lane_u16(a, b, v, 3);
 781 }
 782
 783 // CHECK-LABEL: @test_vmlsl_lane_u32(
 784 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 785 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 786 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 787 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
 788 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
 789 // CHECK:   ret <2 x i64> [[SUB]]
 790 int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
       // Calls vmlsl_lane_u32 with constant lane index 1. Per the FileCheck
       // directives above, the expected IR splats lane 1 of v, multiplies it
       // with b via the umull intrinsic and subtracts the product from a.
       // NOTE(review): the signature uses signed vector types while an
       // unsigned intrinsic is called (the vmull_*_u* tests in this file use
       // unsigned types); presumably this relies on implicit vector
       // conversions - confirm.
 791   return vmlsl_lane_u32(a, b, v, 1);
 792 }
 793
 794 // CHECK-LABEL: @test_vmlsl_laneq_u16(
 795 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 796 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 797 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 798 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
 799 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
 800 // CHECK:   ret <4 x i32> [[SUB]]
 801 int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
       // Calls vmlsl_laneq_u16 with constant lane index 7 of the 8-element v.
       // Per the FileCheck directives above, the expected IR splats lane 7 of
       // v into 4 elements, multiplies with b via umull and subtracts from a.
       // NOTE(review): the signature uses signed vector types while an
       // unsigned intrinsic is called (the vmull_*_u* tests in this file use
       // unsigned types); presumably this relies on implicit vector
       // conversions - confirm.
 802   return vmlsl_laneq_u16(a, b, v, 7);
 803 }
 804
 805 // CHECK-LABEL: @test_vmlsl_laneq_u32(
 806 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 807 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 808 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 809 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
 810 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
 811 // CHECK:   ret <2 x i64> [[SUB]]
 812 int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
       // Calls vmlsl_laneq_u32 with constant lane index 3 of the 4-element v.
       // Per the FileCheck directives above, the expected IR splats lane 3 of
       // v into 2 elements, multiplies with b via umull and subtracts from a.
       // NOTE(review): the signature uses signed vector types while an
       // unsigned intrinsic is called (the vmull_*_u* tests in this file use
       // unsigned types); presumably this relies on implicit vector
       // conversions - confirm.
 813   return vmlsl_laneq_u32(a, b, v, 3);
 814 }
 815
 816 // CHECK-LABEL: @test_vmlsl_high_lane_u16(
 817 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 818 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 819 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 820 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 821 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
 822 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
 823 // CHECK:   ret <4 x i32> [[SUB]]
 824 int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
       // Calls vmlsl_high_lane_u16 with constant lane index 3. Per the FileCheck
       // directives above, the expected IR takes elements 4..7 of b, splats
       // lane 3 of v, multiplies via umull and subtracts the product from a.
       // NOTE(review): the signature uses signed vector types while an
       // unsigned intrinsic is called (the vmull_*_u* tests in this file use
       // unsigned types); presumably this relies on implicit vector
       // conversions - confirm.
 825   return vmlsl_high_lane_u16(a, b, v, 3);
 826 }
 827
 828 // CHECK-LABEL: @test_vmlsl_high_lane_u32(
 829 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 830 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 831 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 832 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 833 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
 834 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
 835 // CHECK:   ret <2 x i64> [[SUB]]
 836 int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
       // Calls vmlsl_high_lane_u32 with constant lane index 1. Per the FileCheck
       // directives above, the expected IR takes elements 2..3 of b, splats
       // lane 1 of v, multiplies via umull and subtracts the product from a.
       // NOTE(review): the signature uses signed vector types while an
       // unsigned intrinsic is called (the vmull_*_u* tests in this file use
       // unsigned types); presumably this relies on implicit vector
       // conversions - confirm.
 837   return vmlsl_high_lane_u32(a, b, v, 1);
 838 }
 839
 840 // CHECK-LABEL: @test_vmlsl_high_laneq_u16(
 841 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 842 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 843 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 844 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 845 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
 846 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
 847 // CHECK:   ret <4 x i32> [[SUB]]
 848 int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
       // Calls vmlsl_high_laneq_u16 with constant lane index 7 of the 8-element
       // v. Per the FileCheck directives above, the expected IR takes elements
       // 4..7 of b, splats lane 7 of v, multiplies via umull and subtracts
       // the product from a.
       // NOTE(review): the signature uses signed vector types while an
       // unsigned intrinsic is called (the vmull_*_u* tests in this file use
       // unsigned types); presumably this relies on implicit vector
       // conversions - confirm.
 849   return vmlsl_high_laneq_u16(a, b, v, 7);
 850 }
 851
 852 // CHECK-LABEL: @test_vmlsl_high_laneq_u32(
 853 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 854 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 855 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 856 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 857 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
 858 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
 859 // CHECK:   ret <2 x i64> [[SUB]]
 860 int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
       // Calls vmlsl_high_laneq_u32 with constant lane index 3 of the 4-element
       // v. Per the FileCheck directives above, the expected IR takes elements
       // 2..3 of b, splats lane 3 of v, multiplies via umull and subtracts
       // the product from a.
       // NOTE(review): the signature uses signed vector types while an
       // unsigned intrinsic is called (the vmull_*_u* tests in this file use
       // unsigned types); presumably this relies on implicit vector
       // conversions - confirm.
 861   return vmlsl_high_laneq_u32(a, b, v, 3);
 862 }
 863
 864 // CHECK-LABEL: @test_vmull_lane_s16(
 865 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 866 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 867 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 868 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
 869 // CHECK:   ret <4 x i32> [[VMULL2_I]]
 870 int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
       // Calls vmull_lane_s16 with constant lane index 3. Per the FileCheck
       // directives above, the expected IR splats lane 3 of v and returns the
       // result of the smull intrinsic applied to a and that splat.
 871   return vmull_lane_s16(a, v, 3);
 872 }
 873
 874 // CHECK-LABEL: @test_vmull_lane_s32(
 875 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 876 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 877 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 878 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
 879 // CHECK:   ret <2 x i64> [[VMULL2_I]]
 880 int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
       // Calls vmull_lane_s32 with constant lane index 1. Per the FileCheck
       // directives above, the expected IR splats lane 1 of v and returns the
       // result of the smull intrinsic applied to a and that splat.
 881   return vmull_lane_s32(a, v, 1);
 882 }
 883
 884 // CHECK-LABEL: @test_vmull_lane_u16(
 885 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 886 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 887 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 888 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
 889 // CHECK:   ret <4 x i32> [[VMULL2_I]]
 890 uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
       // Calls vmull_lane_u16 with constant lane index 3. Per the FileCheck
       // directives above, the expected IR splats lane 3 of v and returns the
       // result of the umull intrinsic applied to a and that splat.
 891   return vmull_lane_u16(a, v, 3);
 892 }
 893
 894 // CHECK-LABEL: @test_vmull_lane_u32(
 895 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 896 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 897 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 898 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
 899 // CHECK:   ret <2 x i64> [[VMULL2_I]]
 900 uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
       // Calls vmull_lane_u32 with constant lane index 1. Per the FileCheck
       // directives above, the expected IR splats lane 1 of v and returns the
       // result of the umull intrinsic applied to a and that splat.
 901   return vmull_lane_u32(a, v, 1);
 902 }
 903
 904 // CHECK-LABEL: @test_vmull_high_lane_s16(
 905 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 906 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 907 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 908 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 909 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
 910 // CHECK:   ret <4 x i32> [[VMULL2_I]]
 911 int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
       // Calls vmull_high_lane_s16 with constant lane index 3. Per the FileCheck
       // directives above, the expected IR takes elements 4..7 of a, splats
       // lane 3 of v and returns the smull intrinsic result of the two.
 912   return vmull_high_lane_s16(a, v, 3);
 913 }
 914
 915 // CHECK-LABEL: @test_vmull_high_lane_s32(
 916 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
 917 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 918 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 919 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 920 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
 921 // CHECK:   ret <2 x i64> [[VMULL2_I]]
 922 int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
       // Calls vmull_high_lane_s32 with constant lane index 1. Per the FileCheck
       // directives above, the expected IR takes elements 2..3 of a, splats
       // lane 1 of v and returns the smull intrinsic result of the two.
 923   return vmull_high_lane_s32(a, v, 1);
 924 }
 925
 926 // CHECK-LABEL: @test_vmull_high_lane_u16(
 927 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 928 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 929 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 930 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 931 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
 932 // CHECK:   ret <4 x i32> [[VMULL2_I]]
 933 uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
       // Calls vmull_high_lane_u16 with constant lane index 3. Per the FileCheck
       // directives above, the expected IR takes elements 4..7 of a, splats
       // lane 3 of v and returns the umull intrinsic result of the two.
 934   return vmull_high_lane_u16(a, v, 3);
 935 }
 936
 937 // CHECK-LABEL: @test_vmull_high_lane_u32(
 938 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
 939 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 940 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 941 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 942 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
 943 // CHECK:   ret <2 x i64> [[VMULL2_I]]
 944 uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
       // Calls vmull_high_lane_u32 with constant lane index 1. Per the FileCheck
       // directives above, the expected IR takes elements 2..3 of a, splats
       // lane 1 of v and returns the umull intrinsic result of the two.
 945   return vmull_high_lane_u32(a, v, 1);
 946 }
 947
 948 // CHECK-LABEL: @test_vmull_laneq_s16(
 949 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 950 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 951 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 952 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
 953 // CHECK:   ret <4 x i32> [[VMULL2_I]]
 954 int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
       // Calls vmull_laneq_s16 with constant lane index 7 of the 8-element v.
       // Per the FileCheck directives above, the expected IR splats lane 7 of
       // v into 4 elements and returns the smull intrinsic result with a.
 955   return vmull_laneq_s16(a, v, 7);
 956 }
 957
 958 // CHECK-LABEL: @test_vmull_laneq_s32(
 959 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 960 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 961 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 962 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
 963 // CHECK:   ret <2 x i64> [[VMULL2_I]]
 964 int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
       // Calls vmull_laneq_s32 with constant lane index 3 of the 4-element v.
       // Per the FileCheck directives above, the expected IR splats lane 3 of
       // v into 2 elements and returns the smull intrinsic result with a.
 965   return vmull_laneq_s32(a, v, 3);
 966 }
 967
 968 // CHECK-LABEL: @test_vmull_laneq_u16(
 969 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 970 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 971 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 972 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
 973 // CHECK:   ret <4 x i32> [[VMULL2_I]]
 974 uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
       // Calls vmull_laneq_u16 with constant lane index 7 of the 8-element v.
       // Per the FileCheck directives above, the expected IR splats lane 7 of
       // v into 4 elements and returns the umull intrinsic result with a.
 975   return vmull_laneq_u16(a, v, 7);
 976 }
 977
 978 // CHECK-LABEL: @test_vmull_laneq_u32(
 979 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 980 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 981 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 982 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
 983 // CHECK:   ret <2 x i64> [[VMULL2_I]]
 984 uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
       // Calls vmull_laneq_u32 with constant lane index 3 of the 4-element v.
       // Per the FileCheck directives above, the expected IR splats lane 3 of
       // v into 2 elements and returns the umull intrinsic result with a.
 985   return vmull_laneq_u32(a, v, 3);
 986 }
 987
 988 // CHECK-LABEL: @test_vmull_high_laneq_s16(
 989 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 990 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 991 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 992 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 993 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
 994 // CHECK:   ret <4 x i32> [[VMULL2_I]]
 995 int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
       // Calls vmull_high_laneq_s16 with constant lane index 7 of the 8-element
       // v. Per the FileCheck directives above, the expected IR takes elements
       // 4..7 of a, splats lane 7 of v and returns the smull intrinsic result.
 996   return vmull_high_laneq_s16(a, v, 7);
 997 }
 998
 999 // CHECK-LABEL: @test_vmull_high_laneq_s32(
1000 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1001 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1002 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1003 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1004 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
1005 // CHECK:   ret <2 x i64> [[VMULL2_I]]
1006 int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
       // Calls vmull_high_laneq_s32 with constant lane index 3 of the 4-element
       // v. Per the FileCheck directives above, the expected IR takes elements
       // 2..3 of a, splats lane 3 of v and returns the smull intrinsic result.
1007   return vmull_high_laneq_s32(a, v, 3);
1008 }
1009
1010 // CHECK-LABEL: @test_vmull_high_laneq_u16(
1011 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1012 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1013 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1014 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1015 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
1016 // CHECK:   ret <4 x i32> [[VMULL2_I]]
1017 uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
       // Calls vmull_high_laneq_u16 with constant lane index 7 of the 8-element
       // v. Per the FileCheck directives above, the expected IR takes elements
       // 4..7 of a, splats lane 7 of v and returns the umull intrinsic result.
1018   return vmull_high_laneq_u16(a, v, 7);
1019 }
1020
1021 // CHECK-LABEL: @test_vmull_high_laneq_u32(
1022 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1023 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1024 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1025 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1026 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
1027 // CHECK:   ret <2 x i64> [[VMULL2_I]]
1028 uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
       // Calls vmull_high_laneq_u32 with constant lane index 3 of the 4-element
       // v. Per the FileCheck directives above, the expected IR takes elements
       // 2..3 of a, splats lane 3 of v and returns the umull intrinsic result.
1029   return vmull_high_laneq_u32(a, v, 3);
1030 }
1031
1032 // CHECK-LABEL: @test_vqdmlal_lane_s16(
1033 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1034 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1035 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1036 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1037 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
1038 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
1039 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
1040 int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
       // Calls vqdmlal_lane_s16 with constant lane index 3. Per the FileCheck
       // directives above, the expected IR splats lane 3 of v, feeds b and the
       // splat to the sqdmull intrinsic, then combines a with that result
       // through the sqadd intrinsic.
1041   return vqdmlal_lane_s16(a, b, v, 3);
1042 }
1043
1044 // CHECK-LABEL: @test_vqdmlal_lane_s32(
1045 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1046 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1047 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1048 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1049 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
1050 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
1051 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
1052 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
       // Calls vqdmlal_lane_s32 with constant lane index 1. Per the FileCheck
       // directives above, the expected IR splats lane 1 of v, feeds b and the
       // splat to the sqdmull intrinsic, then combines a with that result
       // through the sqadd intrinsic.
1053   return vqdmlal_lane_s32(a, b, v, 1);
1054 }
1055
1056 // CHECK-LABEL: @test_vqdmlal_high_lane_s16(
1057 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1058 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1059 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1060 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1061 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1062 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
1063 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
1064 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
1065 int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
       // Calls vqdmlal_high_lane_s16 with constant lane index 3. Per the
       // FileCheck directives above, the expected IR takes elements 4..7 of b,
       // splats lane 3 of v, feeds both to the sqdmull intrinsic, then
       // combines a with that result through the sqadd intrinsic.
1066   return vqdmlal_high_lane_s16(a, b, v, 3);
1067 }
1068
1069 // CHECK-LABEL: @test_vqdmlal_high_lane_s32(
1070 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
1071 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1072 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1073 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1074 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1075 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
1076 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
1077 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
1078 int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
       // Calls vqdmlal_high_lane_s32 with constant lane index 1. Per the
       // FileCheck directives above, the expected IR takes elements 2..3 of b,
       // splats lane 1 of v, feeds both to the sqdmull intrinsic, then
       // combines a with that result through the sqadd intrinsic.
1079   return vqdmlal_high_lane_s32(a, b, v, 1);
1080 }
1081
1082 // CHECK-LABEL: @test_vqdmlsl_lane_s16(
1083 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1084 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1085 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1086 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1087 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
1088 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
1089 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
// Returns vqdmlsl_lane_s16(a, b, v, 3).
// Expected IR above: b and a broadcast of lane 3 of v go to sqdmull.v4i32;
// a and that product are then passed to sqsub.v4i32.
1090 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
1091   return vqdmlsl_lane_s16(a, b, v, 3);
1092 }
1093
1094 // CHECK-LABEL: @test_vqdmlsl_lane_s32(
1095 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1096 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1097 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1098 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1099 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
1100 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
1101 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
// Returns vqdmlsl_lane_s32(a, b, v, 1).
// Expected IR above: b and a broadcast of lane 1 of v go to sqdmull.v2i64;
// a and that product are then passed to sqsub.v2i64.
1102 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
1103   return vqdmlsl_lane_s32(a, b, v, 1);
1104 }
1105
1106 // CHECK-LABEL: @test_vqdmlsl_high_lane_s16(
1107 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1108 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1109 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1110 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1111 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1112 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
1113 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
1114 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
// Returns vqdmlsl_high_lane_s16(a, b, v, 3).
// Expected IR above: elements 4-7 of b and a broadcast of lane 3 of v go to
// sqdmull.v4i32; a and that product are then passed to sqsub.v4i32.
1115 int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
1116   return vqdmlsl_high_lane_s16(a, b, v, 3);
1117 }
1118
1119 // CHECK-LABEL: @test_vqdmlsl_high_lane_s32(
1120 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
1121 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1122 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1123 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1124 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1125 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
1126 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
1127 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
// Returns vqdmlsl_high_lane_s32(a, b, v, 1).
// Expected IR above: elements 2-3 of b and a broadcast of lane 1 of v go to
// sqdmull.v2i64; a and that product are then passed to sqsub.v2i64.
1128 int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
1129   return vqdmlsl_high_lane_s32(a, b, v, 1);
1130 }
1131
1132 // CHECK-LABEL: @test_vqdmull_lane_s16(
1133 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1134 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1135 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1136 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
1137 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1138 // CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
// Returns vqdmull_lane_s16(a, v, 3).
// Expected IR above: a and a broadcast of lane 3 of v are passed to
// sqdmull.v4i32, and the call result is what gets returned.
1139 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
1140   return vqdmull_lane_s16(a, v, 3);
1141 }
1142
1143 // CHECK-LABEL: @test_vqdmull_lane_s32(
1144 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1145 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1146 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1147 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
1148 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1149 // CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
// Returns vqdmull_lane_s32(a, v, 1).
// Expected IR above: a and a broadcast of lane 1 of v are passed to
// sqdmull.v2i64, and the call result is what gets returned.
1150 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
1151   return vqdmull_lane_s32(a, v, 1);
1152 }
1153
1154 // CHECK-LABEL: @test_vqdmull_laneq_s16(
1155 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1156 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1157 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1158 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
1159 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1160 // CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
// Returns vqdmull_laneq_s16(a, v, 3); here v is the 8-lane int16x8_t.
// Expected IR above: lane 3 of the <8 x i16> v is broadcast into 4 lanes and
// passed with a to sqdmull.v4i32, whose result is returned.
1161 int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
1162   return vqdmull_laneq_s16(a, v, 3);
1163 }
1164
1165 // CHECK-LABEL: @test_vqdmull_laneq_s32(
1166 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1167 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1168 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1169 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
1170 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1171 // CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
// Returns vqdmull_laneq_s32(a, v, 3); here v is the 4-lane int32x4_t.
// Expected IR above: lane 3 of the <4 x i32> v is broadcast into 2 lanes and
// passed with a to sqdmull.v2i64, whose result is returned.
1172 int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
1173   return vqdmull_laneq_s32(a, v, 3);
1174 }
1175
1176 // CHECK-LABEL: @test_vqdmull_high_lane_s16(
1177 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1178 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1179 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1180 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1181 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
1182 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1183 // CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
// Returns vqdmull_high_lane_s16(a, v, 3).
// Expected IR above: elements 4-7 of a and a broadcast of lane 3 of v are
// passed to sqdmull.v4i32, and the call result is what gets returned.
1184 int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
1185   return vqdmull_high_lane_s16(a, v, 3);
1186 }
1187
1188 // CHECK-LABEL: @test_vqdmull_high_lane_s32(
1189 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1190 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1191 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1192 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1193 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
1194 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1195 // CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
// Returns vqdmull_high_lane_s32(a, v, 1).
// Expected IR above: elements 2-3 of a and a broadcast of lane 1 of v are
// passed to sqdmull.v2i64, and the call result is what gets returned.
1196 int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
1197   return vqdmull_high_lane_s32(a, v, 1);
1198 }
1199
1200 // CHECK-LABEL: @test_vqdmull_high_laneq_s16(
1201 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1202 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1203 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1204 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1205 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
1206 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1207 // CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
// Returns vqdmull_high_laneq_s16(a, v, 7); both operands are 8-lane vectors.
// Expected IR above: elements 4-7 of a and a 4-lane broadcast of lane 7 of v
// are passed to sqdmull.v4i32, and the call result is what gets returned.
1208 int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
1209   return vqdmull_high_laneq_s16(a, v, 7);
1210 }
1211
1212 // CHECK-LABEL: @test_vqdmull_high_laneq_s32(
1213 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1214 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1215 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1216 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1217 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
1218 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1219 // CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
// Returns vqdmull_high_laneq_s32(a, v, 3); both operands are 4-lane vectors.
// Expected IR above: elements 2-3 of a and a 2-lane broadcast of lane 3 of v
// are passed to sqdmull.v2i64, and the call result is what gets returned.
1220 int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
1221   return vqdmull_high_laneq_s32(a, v, 3);
1222 }
1223
1224 // CHECK-LABEL: @test_vqdmulh_lane_s16(
1225 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1226 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1227 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1228 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
1229 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
1230 // CHECK:   ret <4 x i16> [[VQDMULH_V2_I]]
// Returns vqdmulh_lane_s16(a, v, 3).
// Expected IR above: a and a broadcast of lane 3 of v are passed to
// sqdmulh.v4i16, and the call result is what gets returned.
1231 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
1232   return vqdmulh_lane_s16(a, v, 3);
1233 }
1234
1235 // CHECK-LABEL: @test_vqdmulhq_lane_s16(
1236 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1237 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
1238 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
1239 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
1240 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
1241 // CHECK:   ret <8 x i16> [[VQDMULHQ_V2_I]]
// Returns vqdmulhq_lane_s16(a, v, 3).
// Expected IR above: lane 3 of the 4-lane v is broadcast across 8 lanes and
// passed with a to sqdmulh.v8i16, whose result is returned.
1242 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
1243   return vqdmulhq_lane_s16(a, v, 3);
1244 }
1245
1246 // CHECK-LABEL: @test_vqdmulh_lane_s32(
1247 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1248 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1249 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1250 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
1251 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
1252 // CHECK:   ret <2 x i32> [[VQDMULH_V2_I]]
// Returns vqdmulh_lane_s32(a, v, 1).
// Expected IR above: a and a broadcast of lane 1 of v are passed to
// sqdmulh.v2i32, and the call result is what gets returned.
1253 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
1254   return vqdmulh_lane_s32(a, v, 1);
1255 }
1256
1257 // CHECK-LABEL: @test_vqdmulhq_lane_s32(
1258 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1259 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1260 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
1261 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
1262 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
1263 // CHECK:   ret <4 x i32> [[VQDMULHQ_V2_I]]
// Returns vqdmulhq_lane_s32(a, v, 1).
// Expected IR above: lane 1 of the 2-lane v is broadcast across 4 lanes and
// passed with a to sqdmulh.v4i32, whose result is returned.
1264 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
1265   return vqdmulhq_lane_s32(a, v, 1);
1266 }
1267
1268 // CHECK-LABEL: @test_vqrdmulh_lane_s16(
1269 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1270 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1271 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1272 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
1273 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
1274 // CHECK:   ret <4 x i16> [[VQRDMULH_V2_I]]
// Returns vqrdmulh_lane_s16(a, v, 3).
// Expected IR above: a and a broadcast of lane 3 of v are passed to
// sqrdmulh.v4i16, and the call result is what gets returned.
1275 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
1276   return vqrdmulh_lane_s16(a, v, 3);
1277 }
1278
1279 // CHECK-LABEL: @test_vqrdmulhq_lane_s16(
1280 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1281 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
1282 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
1283 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
1284 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
1285 // CHECK:   ret <8 x i16> [[VQRDMULHQ_V2_I]]
// Returns vqrdmulhq_lane_s16(a, v, 3).
// Expected IR above: lane 3 of the 4-lane v is broadcast across 8 lanes and
// passed with a to sqrdmulh.v8i16, whose result is returned.
1286 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
1287   return vqrdmulhq_lane_s16(a, v, 3);
1288 }
1289
1290 // CHECK-LABEL: @test_vqrdmulh_lane_s32(
1291 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1292 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1293 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1294 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
1295 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
1296 // CHECK:   ret <2 x i32> [[VQRDMULH_V2_I]]
// Returns vqrdmulh_lane_s32(a, v, 1).
// Expected IR above: a and a broadcast of lane 1 of v are passed to
// sqrdmulh.v2i32, and the call result is what gets returned.
1297 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
1298   return vqrdmulh_lane_s32(a, v, 1);
1299 }
1300
1301 // CHECK-LABEL: @test_vqrdmulhq_lane_s32(
1302 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1303 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1304 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
1305 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
1306 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
1307 // CHECK:   ret <4 x i32> [[VQRDMULHQ_V2_I]]
// Returns vqrdmulhq_lane_s32(a, v, 1).
// Expected IR above: lane 1 of the 2-lane v is broadcast across 4 lanes and
// passed with a to sqrdmulh.v4i32, whose result is returned.
1308 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
1309   return vqrdmulhq_lane_s32(a, v, 1);
1310 }
1311
1312 // CHECK-LABEL: @test_vmul_lane_f32(
1313 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
1314 // CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
1315 // CHECK:   ret <2 x float> [[MUL]]
// Returns vmul_lane_f32(a, v, 1).
// Expected IR above: a plain fmul of a with a broadcast of lane 1 of v; no
// intrinsic call is matched for this case.
1316 float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
1317   return vmul_lane_f32(a, v, 1);
1318 }
1319
1320 // CHECK-LABEL: @test_vmul_lane_f64(
1321 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
1322 // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %v to <8 x i8>
1323 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
1324 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
1325 // CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
1326 // CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
1327 // CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
1328 // CHECK:   ret <1 x double> [[TMP5]]
1329
// Returns vmul_lane_f64(a, v, 0); both operands are single-lane vectors.
// Expected IR above: a is bitcast (via <8 x i8>) to a scalar double, element 0
// of v is extracted, the two are multiplied with a scalar fmul, and the
// product is bitcast back to <1 x double>.
1330 float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
1331   return vmul_lane_f64(a, v, 0);
1332 }
1333
1334 // CHECK-LABEL: @test_vmulq_lane_f32(
1335 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1336 // CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
1337 // CHECK:   ret <4 x float> [[MUL]]
1338
// Returns vmulq_lane_f32(a, v, 1).
// Expected IR above: lane 1 of the 2-lane v is broadcast across 4 lanes and
// multiplied with a by a plain fmul.
1339 float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
1340   return vmulq_lane_f32(a, v, 1);
1341 }
1342
1343 // CHECK-LABEL: @test_vmulq_lane_f64(
1344 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
1345 // CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
1346 // CHECK:   ret <2 x double> [[MUL]]
// Returns vmulq_lane_f64(a, v, 0).
// Expected IR above: the single lane of v is broadcast across 2 lanes (mask
// printed as zeroinitializer) and multiplied with a by a plain fmul.
1347 float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
1348   return vmulq_lane_f64(a, v, 0);
1349 }
1350
1351 // CHECK-LABEL: @test_vmul_laneq_f32(
1352 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
1353 // CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
1354 // CHECK:   ret <2 x float> [[MUL]]
// Returns vmul_laneq_f32(a, v, 3); here v is the 4-lane float32x4_t.
// Expected IR above: lane 3 of v is broadcast into 2 lanes and multiplied
// with a by a plain fmul.
1355 float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
1356   return vmul_laneq_f32(a, v, 3);
1357 }
1358
1359 // CHECK-LABEL: @test_vmul_laneq_f64(
1360 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
1361 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
1362 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
1363 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
1364 // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
1365 // CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
1366 // CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
1367 // CHECK:   ret <1 x double> [[TMP5]]
// Returns vmul_laneq_f64(a, v, 1); a has one lane, v has two.
// Expected IR above: a is bitcast (via <8 x i8>) to a scalar double, element 1
// of v is extracted, the two are multiplied with a scalar fmul, and the
// product is bitcast back to <1 x double>.
1368 float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
1369   return vmul_laneq_f64(a, v, 1);
1370 }
1371
1372 // CHECK-LABEL: @test_vmulq_laneq_f32(
1373 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1374 // CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
1375 // CHECK:   ret <4 x float> [[MUL]]
1376
// Returns vmulq_laneq_f32(a, v, 3).
// Expected IR above: a plain fmul of a with a 4-lane broadcast of lane 3 of v.
1377 float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
1378   return vmulq_laneq_f32(a, v, 3);
1379 }
1380
1381 // CHECK-LABEL: @test_vmulq_laneq_f64(
1382 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
1383 // CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
1384 // CHECK:   ret <2 x double> [[MUL]]
// Returns vmulq_laneq_f64(a, v, 1).
// Expected IR above: a plain fmul of a with a 2-lane broadcast of lane 1 of v.
1385 float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
1386   return vmulq_laneq_f64(a, v, 1);
1387 }
1388
1389 // CHECK-LABEL: @test_vmulx_lane_f32(
1390 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
1391 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1392 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
1393 // CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]]) #2
1394 // CHECK:   ret <2 x float> [[VMULX2_I]]
// Returns vmulx_lane_f32(a, v, 1).
// Expected IR above: a and a broadcast of lane 1 of v are passed to
// fmulx.v2f32, and the call result is what gets returned.
1395 float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
1396   return vmulx_lane_f32(a, v, 1);
1397 }
1398
1399 // CHECK-LABEL: @test_vmulxq_lane_f32(
1400 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1401 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1402 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
1403 // CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]]) #2
1404 // CHECK:   ret <4 x float> [[VMULX2_I]]
// Returns vmulxq_lane_f32(a, v, 1).
// Expected IR above: lane 1 of the 2-lane v is broadcast across 4 lanes and
// passed with a to fmulx.v4f32, whose result is returned.
1405 float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
1406   return vmulxq_lane_f32(a, v, 1);
1407 }
1408
1409 // CHECK-LABEL: @test_vmulxq_lane_f64(
1410 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
1411 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
1412 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
1413 // CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]]) #2
1414 // CHECK:   ret <2 x double> [[VMULX2_I]]
// Returns vmulxq_lane_f64(a, v, 0).
// Expected IR above: the single lane of v is broadcast across 2 lanes (mask
// printed as zeroinitializer) and passed with a to fmulx.v2f64.
1415 float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
1416   return vmulxq_lane_f64(a, v, 0);
1417 }
1418
1419 // CHECK-LABEL: @test_vmulx_laneq_f32(
1420 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
1421 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1422 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
1423 // CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]]) #2
1424 // CHECK:   ret <2 x float> [[VMULX2_I]]
// Returns vmulx_laneq_f32(a, v, 3); here v is the 4-lane float32x4_t.
// Expected IR above: lane 3 of v is broadcast into 2 lanes and passed with a
// to fmulx.v2f32, whose result is returned.
1425 float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
1426   return vmulx_laneq_f32(a, v, 3);
1427 }
1428
1429 // CHECK-LABEL: @test_vmulxq_laneq_f32(
1430 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1431 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1432 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
1433 // CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]]) #2
1434 // CHECK:   ret <4 x float> [[VMULX2_I]]
// Returns vmulxq_laneq_f32(a, v, 3).
// Expected IR above: a and a 4-lane broadcast of lane 3 of v are passed to
// fmulx.v4f32, and the call result is what gets returned.
1435 float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
1436   return vmulxq_laneq_f32(a, v, 3);
1437 }
1438
1439 // CHECK-LABEL: @test_vmulxq_laneq_f64(
1440 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
1441 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
1442 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
1443 // CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]]) #2
1444 // CHECK:   ret <2 x double> [[VMULX2_I]]
// Returns vmulxq_laneq_f64(a, v, 1).
// Expected IR above: a and a 2-lane broadcast of lane 1 of v are passed to
// fmulx.v2f64, and the call result is what gets returned.
1445 float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
1446   return vmulxq_laneq_f64(a, v, 1);
1447 }
1448
1449 // CHECK-LABEL: @test_vmla_lane_s16_0(
1450 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1451 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1452 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
1453 // CHECK:   ret <4 x i16> [[ADD]]
// Lane-0 case: returns vmla_lane_s16(a, b, v, 0).
// Expected IR above: lane 0 of v is broadcast (mask printed as
// zeroinitializer), multiplied with b, and the product is added to a.
1454 int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
1455   return vmla_lane_s16(a, b, v, 0);
1456 }
1457
1458 // CHECK-LABEL: @test_vmlaq_lane_s16_0(
1459 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1460 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1461 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
1462 // CHECK:   ret <8 x i16> [[ADD]]
// Lane-0 case: returns vmlaq_lane_s16(a, b, v, 0).
// Expected IR above: lane 0 of the 4-lane v is broadcast across 8 lanes (mask
// printed as zeroinitializer), multiplied with b, and added to a.
1463 int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
1464   return vmlaq_lane_s16(a, b, v, 0);
1465 }
1466
1467 // CHECK-LABEL: @test_vmla_lane_s32_0(
1468 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1469 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1470 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
1471 // CHECK:   ret <2 x i32> [[ADD]]
// Lane-0 case: returns vmla_lane_s32(a, b, v, 0).
// Expected IR above: lane 0 of v is broadcast (mask printed as
// zeroinitializer), multiplied with b, and the product is added to a.
1472 int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
1473   return vmla_lane_s32(a, b, v, 0);
1474 }
1475
1476 // CHECK-LABEL: @test_vmlaq_lane_s32_0(
1477 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1478 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1479 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
1480 // CHECK:   ret <4 x i32> [[ADD]]
// Lane-0 case: returns vmlaq_lane_s32(a, b, v, 0).
// Expected IR above: lane 0 of the 2-lane v is broadcast across 4 lanes (mask
// printed as zeroinitializer), multiplied with b, and added to a.
1481 int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
1482   return vmlaq_lane_s32(a, b, v, 0);
1483 }
1484
1485 // CHECK-LABEL: @test_vmla_laneq_s16_0(
1486 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1487 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1488 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
1489 // CHECK:   ret <4 x i16> [[ADD]]
// Lane-0 case: returns vmla_laneq_s16(a, b, v, 0); v is the 8-lane vector.
// Expected IR above: lane 0 of v is broadcast into 4 lanes (mask printed as
// zeroinitializer), multiplied with b, and the product is added to a.
1490 int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
1491   return vmla_laneq_s16(a, b, v, 0);
1492 }
1493
1494 // CHECK-LABEL: @test_vmlaq_laneq_s16_0(
1495 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1496 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1497 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
1498 // CHECK:   ret <8 x i16> [[ADD]]
// Lane-0 case: returns vmlaq_laneq_s16(a, b, v, 0).
// Expected IR above: lane 0 of v is broadcast across 8 lanes (mask printed as
// zeroinitializer), multiplied with b, and the product is added to a.
1499 int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
1500   return vmlaq_laneq_s16(a, b, v, 0);
1501 }
1502
1503 // CHECK-LABEL: @test_vmla_laneq_s32_0(
1504 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1505 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1506 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
1507 // CHECK:   ret <2 x i32> [[ADD]]
// Lane-0 case: returns vmla_laneq_s32(a, b, v, 0); v is the 4-lane vector.
// Expected IR above: lane 0 of v is broadcast into 2 lanes (mask printed as
// zeroinitializer), multiplied with b, and the product is added to a.
1508 int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
1509   return vmla_laneq_s32(a, b, v, 0);
1510 }
1511
1512 // CHECK-LABEL: @test_vmlaq_laneq_s32_0(
1513 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1514 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1515 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
1516 // CHECK:   ret <4 x i32> [[ADD]]
// Lane-0 case: returns vmlaq_laneq_s32(a, b, v, 0).
// Expected IR above: lane 0 of v is broadcast across 4 lanes (mask printed as
// zeroinitializer), multiplied with b, and the product is added to a.
1517 int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
1518   return vmlaq_laneq_s32(a, b, v, 0);
1519 }
1520
1521 // CHECK-LABEL: @test_vmls_lane_s16_0(
1522 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1523 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1524 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
1525 // CHECK:   ret <4 x i16> [[SUB]]
// Lane-0 case: returns vmls_lane_s16(a, b, v, 0).
// Expected IR above: lane 0 of v is broadcast (mask printed as
// zeroinitializer), multiplied with b, and the product is subtracted from a.
1526 int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
1527   return vmls_lane_s16(a, b, v, 0);
1528 }
1529
1530 // CHECK-LABEL: @test_vmlsq_lane_s16_0(
1531 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1532 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1533 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
1534 // CHECK:   ret <8 x i16> [[SUB]]
// Lane-0 case: returns vmlsq_lane_s16(a, b, v, 0).
// Expected IR above: lane 0 of the 4-lane v is broadcast across 8 lanes (mask
// printed as zeroinitializer), multiplied with b, and subtracted from a.
1535 int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
1536   return vmlsq_lane_s16(a, b, v, 0);
1537 }
1538
1539 // CHECK-LABEL: @test_vmls_lane_s32_0(
1540 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1541 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1542 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
1543 // CHECK:   ret <2 x i32> [[SUB]]
// Lane-0 case: returns vmls_lane_s32(a, b, v, 0).
// Expected IR above: lane 0 of v is broadcast (mask printed as
// zeroinitializer), multiplied with b, and the product is subtracted from a.
1544 int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
1545   return vmls_lane_s32(a, b, v, 0);
1546 }
1547
1548 // CHECK-LABEL: @test_vmlsq_lane_s32_0(
1549 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1550 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1551 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
1552 // CHECK:   ret <4 x i32> [[SUB]]
// Lane-0 case: returns vmlsq_lane_s32(a, b, v, 0).
// Expected IR above: lane 0 of the 2-lane v is broadcast across 4 lanes (mask
// printed as zeroinitializer), multiplied with b, and subtracted from a.
1553 int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
1554   return vmlsq_lane_s32(a, b, v, 0);
1555 }
1556
1557 // CHECK-LABEL: @test_vmls_laneq_s16_0(
1558 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1559 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1560 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
1561 // CHECK:   ret <4 x i16> [[SUB]]
// Lane-0 case: returns vmls_laneq_s16(a, b, v, 0); v is the 8-lane vector.
// Expected IR above: lane 0 of v is broadcast into 4 lanes (mask printed as
// zeroinitializer), multiplied with b, and the product is subtracted from a.
1562 int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
1563   return vmls_laneq_s16(a, b, v, 0);
1564 }
1565
1566 // CHECK-LABEL: @test_vmlsq_laneq_s16_0(
1567 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1568 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1569 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
1570 // CHECK:   ret <8 x i16> [[SUB]]
// Lane-0 case: returns vmlsq_laneq_s16(a, b, v, 0).
// Expected IR above: lane 0 of v is broadcast across 8 lanes (mask printed as
// zeroinitializer), multiplied with b, and the product is subtracted from a.
1571 int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
1572   return vmlsq_laneq_s16(a, b, v, 0);
1573 }
1574
1575 // CHECK-LABEL: @test_vmls_laneq_s32_0(
1576 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1577 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1578 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
1579 // CHECK:   ret <2 x i32> [[SUB]]
// Lane-0 case: returns vmls_laneq_s32(a, b, v, 0); v is the 4-lane vector.
// Expected IR above: lane 0 of v is broadcast into 2 lanes (mask printed as
// zeroinitializer), multiplied with b, and the product is subtracted from a.
1580 int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
1581   return vmls_laneq_s32(a, b, v, 0);
1582 }
1583
1584 // CHECK-LABEL: @test_vmlsq_laneq_s32_0(
1585 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1586 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1587 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
1588 // CHECK:   ret <4 x i32> [[SUB]]
// Lane-0 case: returns vmlsq_laneq_s32(a, b, v, 0).
// Expected IR above: lane 0 of v is broadcast across 4 lanes (mask printed as
// zeroinitializer), multiplied with b, and the product is subtracted from a.
1589 int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
1590   return vmlsq_laneq_s32(a, b, v, 0);
1591 }
1592
1593 // CHECK-LABEL: @test_vmul_lane_s16_0(
1594 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1595 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1596 // CHECK:   ret <4 x i16> [[MUL]]
// Lane-0 case: returns vmul_lane_s16(a, v, 0).
// Expected IR above: a plain mul of a with a broadcast of lane 0 of v (mask
// printed as zeroinitializer).
1597 int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
1598   return vmul_lane_s16(a, v, 0);
1599 }
1600
1601 // CHECK-LABEL: @test_vmulq_lane_s16_0(
1602 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1603 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1604 // CHECK:   ret <8 x i16> [[MUL]]
// Lane-0 case: returns vmulq_lane_s16(a, v, 0).
// Expected IR above: lane 0 of the 4-lane v is broadcast across 8 lanes (mask
// printed as zeroinitializer) and multiplied with a by a plain mul.
1605 int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
1606   return vmulq_lane_s16(a, v, 0);
1607 }
1608
1609 // CHECK-LABEL: @test_vmul_lane_s32_0(
1610 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1611 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1612 // CHECK:   ret <2 x i32> [[MUL]]
// Lane-0 case: returns vmul_lane_s32(a, v, 0).
// Expected IR above: a plain mul of a with a broadcast of lane 0 of v (mask
// printed as zeroinitializer).
1613 int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
1614   return vmul_lane_s32(a, v, 0);
1615 }
1616
1617 // CHECK-LABEL: @test_vmulq_lane_s32_0(
1618 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1619 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1620 // CHECK:   ret <4 x i32> [[MUL]]
// Lane-0 case: returns vmulq_lane_s32(a, v, 0).
// Expected IR above: lane 0 of the 2-lane v is broadcast across 4 lanes (mask
// printed as zeroinitializer) and multiplied with a by a plain mul.
1621 int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
1622   return vmulq_lane_s32(a, v, 0);
1623 }
1624
1625 // CHECK-LABEL: @test_vmul_lane_u16_0(
1626 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1627 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1628 // CHECK:   ret <4 x i16> [[MUL]]
// Lane-0 case, unsigned operands: returns vmul_lane_u16(a, v, 0).
// Expected IR above: a plain mul of a with a broadcast of lane 0 of v (mask
// printed as zeroinitializer); the IR vector type is <4 x i16>.
1629 uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
1630   return vmul_lane_u16(a, v, 0);
1631 }
1632
1633 // CHECK-LABEL: @test_vmulq_lane_u16_0(
1634 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1635 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1636 // CHECK:   ret <8 x i16> [[MUL]]
// Lane-0 case, unsigned operands: returns vmulq_lane_u16(a, v, 0).
// Expected IR above: lane 0 of the 4-lane v is broadcast across 8 lanes (mask
// printed as zeroinitializer) and multiplied with a by a plain mul.
1637 uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
1638   return vmulq_lane_u16(a, v, 0);
1639 }
1640
1641 // CHECK-LABEL: @test_vmul_lane_u32_0(
1642 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1643 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1644 // CHECK:   ret <2 x i32> [[MUL]]
// Lane-0 case, unsigned operands: returns vmul_lane_u32(a, v, 0).
// Expected IR above: a plain mul of a with a broadcast of lane 0 of v (mask
// printed as zeroinitializer); the IR vector type is <2 x i32>.
1645 uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
1646   return vmul_lane_u32(a, v, 0);
1647 }
1648
// Lane-0 case of vmulq_lane_u32: the 2-element v is widened by an all-zero
// shuffle mask to <4 x i32> and multiplied with a.
1649 // CHECK-LABEL: @test_vmulq_lane_u32_0(
1650 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1651 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1652 // CHECK:   ret <4 x i32> [[MUL]]
1653 uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
1654   return vmulq_lane_u32(a, v, 0);
1655 }
1656
// Lane-0 case of vmul_laneq_s16: the 8-element v is narrowed by an all-zero
// shuffle mask to <4 x i16> and multiplied with a.
1657 // CHECK-LABEL: @test_vmul_laneq_s16_0(
1658 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1659 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1660 // CHECK:   ret <4 x i16> [[MUL]]
1661 int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
1662   return vmul_laneq_s16(a, v, 0);
1663 }
1664
// Lane-0 case of vmulq_laneq_s16: expects an all-zero-mask shuffle of the
// 8-element v followed by a <8 x i16> mul with a.
1665 // CHECK-LABEL: @test_vmulq_laneq_s16_0(
1666 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1667 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1668 // CHECK:   ret <8 x i16> [[MUL]]
1669 int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
1670   return vmulq_laneq_s16(a, v, 0);
1671 }
1672
// Lane-0 case of vmul_laneq_s32: the 4-element v is narrowed by an all-zero
// shuffle mask to <2 x i32> and multiplied with a.
1673 // CHECK-LABEL: @test_vmul_laneq_s32_0(
1674 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1675 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1676 // CHECK:   ret <2 x i32> [[MUL]]
1677 int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
1678   return vmul_laneq_s32(a, v, 0);
1679 }
1680
// Lane-0 case of vmulq_laneq_s32: expects an all-zero-mask shuffle of the
// 4-element v followed by a <4 x i32> mul with a.
1681 // CHECK-LABEL: @test_vmulq_laneq_s32_0(
1682 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1683 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1684 // CHECK:   ret <4 x i32> [[MUL]]
1685 int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
1686   return vmulq_laneq_s32(a, v, 0);
1687 }
1688
// Lane-0 case of vmul_laneq_u16: the 8-element v is narrowed by an all-zero
// shuffle mask to <4 x i16> and multiplied with a.
1689 // CHECK-LABEL: @test_vmul_laneq_u16_0(
1690 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1691 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1692 // CHECK:   ret <4 x i16> [[MUL]]
1693 uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
1694   return vmul_laneq_u16(a, v, 0);
1695 }
1696
// Lane-0 case of vmulq_laneq_u16: expects an all-zero-mask shuffle of the
// 8-element v followed by a <8 x i16> mul with a.
1697 // CHECK-LABEL: @test_vmulq_laneq_u16_0(
1698 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1699 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1700 // CHECK:   ret <8 x i16> [[MUL]]
1701 uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
1702   return vmulq_laneq_u16(a, v, 0);
1703 }
1704
// Lane-0 case of vmul_laneq_u32: the 4-element v is narrowed by an all-zero
// shuffle mask to <2 x i32> and multiplied with a.
1705 // CHECK-LABEL: @test_vmul_laneq_u32_0(
1706 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1707 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1708 // CHECK:   ret <2 x i32> [[MUL]]
1709 uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
1710   return vmul_laneq_u32(a, v, 0);
1711 }
1712
// Lane-0 case of vmulq_laneq_u32: expects an all-zero-mask shuffle of the
// 4-element v followed by a <4 x i32> mul with a.
1713 // CHECK-LABEL: @test_vmulq_laneq_u32_0(
1714 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1715 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1716 // CHECK:   ret <4 x i32> [[MUL]]
1717 uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
1718   return vmulq_laneq_u32(a, v, 0);
1719 }
1720
// Lane-0 case of vfma_lane_f32. The expected IR bitcasts a, b and v to
// <8 x i8>, rebuilds v as <2 x float> for the all-zero-mask shuffle, and calls
// llvm.fma.v2f32 with operands in the order (b, lane, a).
1721 // CHECK-LABEL: @test_vfma_lane_f32_0(
1722 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1723 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
1724 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1725 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1726 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
1727 // CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1728 // CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1729 // CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
1730 // CHECK:   ret <2 x float> [[FMLA2]]
1731 float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
1732   return vfma_lane_f32(a, b, v, 0);
1733 }
1734
// Lane-0 case of vfmaq_lane_f32. The 2-element v is widened by an
// all-zero-mask shuffle to <4 x float>, and llvm.fma.v4f32 is expected with
// operands in the order (b, lane, a).
1735 // CHECK-LABEL: @test_vfmaq_lane_f32_0(
1736 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1737 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
1738 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1739 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1740 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
1741 // CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1742 // CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1743 // CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
1744 // CHECK:   ret <4 x float> [[FMLA2]]
1745 float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
1746   return vfmaq_lane_f32(a, b, v, 0);
1747 }
1748
// Lane-0 case of vfma_laneq_f32. All three arguments are round-tripped through
// i8-vector bitcasts before the all-zero-mask shuffle of the 4-element v; the
// llvm.fma.v2f32 call is expected with operands in the order (lane, b, a).
1749 // CHECK-LABEL: @test_vfma_laneq_f32_0(
1750 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1751 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
1752 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
1753 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1754 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1755 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
1756 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
1757 // CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
1758 // CHECK:   ret <2 x float> [[TMP6]]
1759 float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
1760   return vfma_laneq_f32(a, b, v, 0);
1761 }
1762
// Lane-0 case of vfmaq_laneq_f32. All three <4 x float> arguments are
// round-tripped through <16 x i8> bitcasts; llvm.fma.v4f32 is expected with
// operands in the order (lane, b, a).
1763 // CHECK-LABEL: @test_vfmaq_laneq_f32_0(
1764 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1765 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
1766 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
1767 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1768 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1769 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
1770 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
1771 // CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
1772 // CHECK:   ret <4 x float> [[TMP6]]
1773 float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
1774   return vfmaq_laneq_f32(a, b, v, 0);
1775 }
1776
// Lane-0 case of vfms_lane_f32. The expected IR first subtracts b from a
// vector of -0.0, then follows the same shape as the vfma_lane_f32 test:
// llvm.fma.v2f32 with operands (negated b, lane, a).
1777 // CHECK-LABEL: @test_vfms_lane_f32_0(
1778 // CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
1779 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1780 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
1781 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1782 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1783 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
1784 // CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1785 // CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1786 // CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
1787 // CHECK:   ret <2 x float> [[FMLA2]]
1788 float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
1789   return vfms_lane_f32(a, b, v, 0);
1790 }
1791
// Lane-0 case of vfmsq_lane_f32. b is subtracted from a vector of -0.0, the
// 2-element v is widened by an all-zero-mask shuffle, and llvm.fma.v4f32 is
// expected with operands (negated b, lane, a).
1792 // CHECK-LABEL: @test_vfmsq_lane_f32_0(
1793 // CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
1794 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1795 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
1796 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1797 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1798 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
1799 // CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1800 // CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1801 // CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
1802 // CHECK:   ret <4 x float> [[FMLA2]]
1803 float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
1804   return vfmsq_lane_f32(a, b, v, 0);
1805 }
1806
// Lane-0 case of vfms_laneq_f32. b is subtracted from a vector of -0.0, the
// 4-element v is narrowed by an all-zero-mask shuffle, and llvm.fma.v2f32 is
// expected with operands (lane, negated b, a).
1807 // CHECK-LABEL: @test_vfms_laneq_f32_0(
1808 // CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
1809 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1810 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
1811 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
1812 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1813 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1814 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
1815 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
1816 // CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
1817 // CHECK:   ret <2 x float> [[TMP6]]
1818 float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
1819   return vfms_laneq_f32(a, b, v, 0);
1820 }
1821
// Lane-0 case of vfmsq_laneq_f32. b is subtracted from a vector of -0.0 and
// llvm.fma.v4f32 is expected with operands (lane, negated b, a), where lane is
// the all-zero-mask shuffle of v.
1822 // CHECK-LABEL: @test_vfmsq_laneq_f32_0(
1823 // CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
1824 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1825 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
1826 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
1827 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1828 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1829 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
1830 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
1831 // CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
1832 // CHECK:   ret <4 x float> [[TMP6]]
1833 float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
1834   return vfmsq_laneq_f32(a, b, v, 0);
1835 }
1836
// Lane-0 case of vfmaq_laneq_f64. All three <2 x double> arguments are
// round-tripped through <16 x i8> bitcasts; llvm.fma.v2f64 is expected with
// operands in the order (lane, b, a).
1837 // CHECK-LABEL: @test_vfmaq_laneq_f64_0(
1838 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
1839 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
1840 // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
1841 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
1842 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
1843 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
1844 // CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
1845 // CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
1846 // CHECK:   ret <2 x double> [[TMP6]]
1847 float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
1848   return vfmaq_laneq_f64(a, b, v, 0);
1849 }
1850
// Lane-0 case of vfmsq_laneq_f64. b is subtracted from a vector of -0.0 and
// llvm.fma.v2f64 is expected with operands (lane, negated b, a), where lane is
// the all-zero-mask shuffle of v.
1851 // CHECK-LABEL: @test_vfmsq_laneq_f64_0(
1852 // CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
1853 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
1854 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
1855 // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
1856 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
1857 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
1858 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
1859 // CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
1860 // CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
1861 // CHECK:   ret <2 x double> [[TMP6]]
1862 float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
1863   return vfmsq_laneq_f64(a, b, v, 0);
1864 }
1865
// Lane-0 case of vmlal_lane_s16: expects an all-zero-mask shuffle of v, a call
// to the aarch64 smull.v4i32 intrinsic on (b, shuffle), and an add of that
// product to a.
1866 // CHECK-LABEL: @test_vmlal_lane_s16_0(
1867 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1868 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1869 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1870 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
1871 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
1872 // CHECK:   ret <4 x i32> [[ADD]]
1873 int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
1874   return vmlal_lane_s16(a, b, v, 0);
1875 }
1876
// Lane-0 case of vmlal_lane_s32: expects an all-zero-mask shuffle of v, a call
// to the aarch64 smull.v2i64 intrinsic on (b, shuffle), and an add of that
// product to a.
1877 // CHECK-LABEL: @test_vmlal_lane_s32_0(
1878 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1879 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1880 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1881 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
1882 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
1883 // CHECK:   ret <2 x i64> [[ADD]]
1884 int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
1885   return vmlal_lane_s32(a, b, v, 0);
1886 }
1887
// Lane-0 case of vmlal_laneq_s16: the 8-element v is narrowed by an
// all-zero-mask shuffle, multiplied with b via smull.v4i32, and the product is
// added to a.
1888 // CHECK-LABEL: @test_vmlal_laneq_s16_0(
1889 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1890 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1891 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1892 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
1893 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
1894 // CHECK:   ret <4 x i32> [[ADD]]
1895 int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
1896   return vmlal_laneq_s16(a, b, v, 0);
1897 }
1898
// Lane-0 case of vmlal_laneq_s32: the 4-element v is narrowed by an
// all-zero-mask shuffle, multiplied with b via smull.v2i64, and the product is
// added to a.
1899 // CHECK-LABEL: @test_vmlal_laneq_s32_0(
1900 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1901 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1902 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1903 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
1904 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
1905 // CHECK:   ret <2 x i64> [[ADD]]
1906 int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
1907   return vmlal_laneq_s32(a, b, v, 0);
1908 }
1909
// Lane-0 case of vmlal_high_lane_s16: elements 4..7 of b are extracted by the
// first shuffle, v is shuffled with an all-zero mask, the two are multiplied
// via smull.v4i32, and the product is added to a.
1910 // CHECK-LABEL: @test_vmlal_high_lane_s16_0(
1911 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1912 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1913 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1914 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1915 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
1916 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
1917 // CHECK:   ret <4 x i32> [[ADD]]
1918 int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
1919   return vmlal_high_lane_s16(a, b, v, 0);
1920 }
1921
// Lane-0 case of vmlal_high_lane_s32: elements 2..3 of b are extracted by the
// first shuffle, v is shuffled with an all-zero mask, the two are multiplied
// via smull.v2i64, and the product is added to a.
1922 // CHECK-LABEL: @test_vmlal_high_lane_s32_0(
1923 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
1924 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1925 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1926 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1927 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
1928 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
1929 // CHECK:   ret <2 x i64> [[ADD]]
1930 int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
1931   return vmlal_high_lane_s32(a, b, v, 0);
1932 }
1933
// Lane-0 case of vmlal_high_laneq_s16: elements 4..7 of b are extracted, the
// 8-element v is narrowed by an all-zero-mask shuffle, the two are multiplied
// via smull.v4i32, and the product is added to a.
1934 // CHECK-LABEL: @test_vmlal_high_laneq_s16_0(
1935 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1936 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1937 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1938 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1939 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
1940 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
1941 // CHECK:   ret <4 x i32> [[ADD]]
1942 int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
1943   return vmlal_high_laneq_s16(a, b, v, 0);
1944 }
1945
// Lane-0 case of vmlal_high_laneq_s32: elements 2..3 of b are extracted, the
// 4-element v is narrowed by an all-zero-mask shuffle, the two are multiplied
// via smull.v2i64, and the product is added to a.
1946 // CHECK-LABEL: @test_vmlal_high_laneq_s32_0(
1947 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
1948 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1949 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1950 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1951 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
1952 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
1953 // CHECK:   ret <2 x i64> [[ADD]]
1954 int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
1955   return vmlal_high_laneq_s32(a, b, v, 0);
1956 }
1957
// Lane-0 case of vmlsl_lane_s16: expects an all-zero-mask shuffle of v, a call
// to smull.v4i32 on (b, shuffle), and a sub of that product from a.
1958 // CHECK-LABEL: @test_vmlsl_lane_s16_0(
1959 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1960 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1961 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1962 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
1963 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
1964 // CHECK:   ret <4 x i32> [[SUB]]
1965 int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
1966   return vmlsl_lane_s16(a, b, v, 0);
1967 }
1968
// Lane-0 case of vmlsl_lane_s32: expects an all-zero-mask shuffle of v, a call
// to smull.v2i64 on (b, shuffle), and a sub of that product from a.
1969 // CHECK-LABEL: @test_vmlsl_lane_s32_0(
1970 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1971 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1972 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1973 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
1974 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
1975 // CHECK:   ret <2 x i64> [[SUB]]
1976 int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
1977   return vmlsl_lane_s32(a, b, v, 0);
1978 }
1979
// Lane-0 case of vmlsl_laneq_s16: the 8-element v is narrowed by an
// all-zero-mask shuffle, multiplied with b via smull.v4i32, and the product is
// subtracted from a.
1980 // CHECK-LABEL: @test_vmlsl_laneq_s16_0(
1981 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1982 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1983 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1984 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
1985 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
1986 // CHECK:   ret <4 x i32> [[SUB]]
1987 int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
1988   return vmlsl_laneq_s16(a, b, v, 0);
1989 }
1990
// Lane-0 case of vmlsl_laneq_s32: the 4-element v is narrowed by an
// all-zero-mask shuffle, multiplied with b via smull.v2i64, and the product is
// subtracted from a.
1991 // CHECK-LABEL: @test_vmlsl_laneq_s32_0(
1992 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1993 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1994 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1995 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
1996 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
1997 // CHECK:   ret <2 x i64> [[SUB]]
1998 int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
1999   return vmlsl_laneq_s32(a, b, v, 0);
2000 }
2001
// Lane-0 case of vmlsl_high_lane_s16: elements 4..7 of b are extracted by the
// first shuffle, v is shuffled with an all-zero mask, the two are multiplied
// via smull.v4i32, and the product is subtracted from a.
2002 // CHECK-LABEL: @test_vmlsl_high_lane_s16_0(
2003 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2004 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2005 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2006 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2007 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
2008 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2009 // CHECK:   ret <4 x i32> [[SUB]]
2010 int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2011   return vmlsl_high_lane_s16(a, b, v, 0);
2012 }
2013
// Lane-0 case of vmlsl_high_lane_s32: elements 2..3 of b are extracted by the
// first shuffle, v is shuffled with an all-zero mask, the two are multiplied
// via smull.v2i64, and the product is subtracted from a.
2014 // CHECK-LABEL: @test_vmlsl_high_lane_s32_0(
2015 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2016 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2017 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2018 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2019 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
2020 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2021 // CHECK:   ret <2 x i64> [[SUB]]
2022 int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2023   return vmlsl_high_lane_s32(a, b, v, 0);
2024 }
2025
// Lane-0 case of vmlsl_high_laneq_s16: elements 4..7 of b are extracted, the
// 8-element v is narrowed by an all-zero-mask shuffle, the two are multiplied
// via smull.v4i32, and the product is subtracted from a.
2026 // CHECK-LABEL: @test_vmlsl_high_laneq_s16_0(
2027 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2028 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2029 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2030 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2031 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
2032 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2033 // CHECK:   ret <4 x i32> [[SUB]]
2034 int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2035   return vmlsl_high_laneq_s16(a, b, v, 0);
2036 }
2037
// Lane-0 case of vmlsl_high_laneq_s32: elements 2..3 of b are extracted, the
// 4-element v is narrowed by an all-zero-mask shuffle, the two are multiplied
// via smull.v2i64, and the product is subtracted from a.
2038 // CHECK-LABEL: @test_vmlsl_high_laneq_s32_0(
2039 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2040 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2041 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2042 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2043 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
2044 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2045 // CHECK:   ret <2 x i64> [[SUB]]
2046 int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2047   return vmlsl_high_laneq_s32(a, b, v, 0);
2048 }
2049
// Lane-0 case of vmlal_lane_u16: expects an all-zero-mask shuffle of v, a call
// to the aarch64 umull.v4i32 intrinsic on (b, shuffle), and an add of that
// product to a.
// NOTE(review): the wrapper is declared with signed vector types although it
// calls the unsigned intrinsic; presumably this relies on implicit vector
// conversions -- confirm it is intentional.
2050 // CHECK-LABEL: @test_vmlal_lane_u16_0(
2051 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2052 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2053 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2054 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
2055 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2056 // CHECK:   ret <4 x i32> [[ADD]]
2057 int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2058   return vmlal_lane_u16(a, b, v, 0);
2059 }
2060
// Lane-0 case of vmlal_lane_u32: expects an all-zero-mask shuffle of v, a call
// to the aarch64 umull.v2i64 intrinsic on (b, shuffle), and an add of that
// product to a.
// NOTE(review): the wrapper is declared with signed vector types although it
// calls the unsigned intrinsic; presumably this relies on implicit vector
// conversions -- confirm it is intentional.
2061 // CHECK-LABEL: @test_vmlal_lane_u32_0(
2062 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2063 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2064 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2065 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
2066 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2067 // CHECK:   ret <2 x i64> [[ADD]]
2068 int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2069   return vmlal_lane_u32(a, b, v, 0);
2070 }
2071
// Lane-0 case of vmlal_laneq_u16: the 8-element v is narrowed by an
// all-zero-mask shuffle, multiplied with b via umull.v4i32, and the product is
// added to a.
// NOTE(review): the wrapper is declared with signed vector types although it
// calls the unsigned intrinsic; presumably this relies on implicit vector
// conversions -- confirm it is intentional.
2072 // CHECK-LABEL: @test_vmlal_laneq_u16_0(
2073 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2074 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2075 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2076 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
2077 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2078 // CHECK:   ret <4 x i32> [[ADD]]
2079 int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2080   return vmlal_laneq_u16(a, b, v, 0);
2081 }
2082
// Lane-0 case of vmlal_laneq_u32: the 4-element v is narrowed by an
// all-zero-mask shuffle, multiplied with b via umull.v2i64, and the product is
// added to a.
// NOTE(review): the wrapper is declared with signed vector types although it
// calls the unsigned intrinsic; presumably this relies on implicit vector
// conversions -- confirm it is intentional.
2083 // CHECK-LABEL: @test_vmlal_laneq_u32_0(
2084 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2085 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2086 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2087 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
2088 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2089 // CHECK:   ret <2 x i64> [[ADD]]
2090 int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2091   return vmlal_laneq_u32(a, b, v, 0);
2092 }
2093
// Lane-0 case of vmlal_high_lane_u16: elements 4..7 of b are extracted by the
// first shuffle, v is shuffled with an all-zero mask, the two are multiplied
// via umull.v4i32, and the product is added to a.
// NOTE(review): the wrapper is declared with signed vector types although it
// calls the unsigned intrinsic; presumably this relies on implicit vector
// conversions -- confirm it is intentional.
2094 // CHECK-LABEL: @test_vmlal_high_lane_u16_0(
2095 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2096 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2097 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2098 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2099 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
2100 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2101 // CHECK:   ret <4 x i32> [[ADD]]
2102 int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2103   return vmlal_high_lane_u16(a, b, v, 0);
2104 }
2105
// Lane-0 case of vmlal_high_lane_u32: elements 2..3 of b are extracted by the
// first shuffle, v is shuffled with an all-zero mask, the two are multiplied
// via umull.v2i64, and the product is added to a.
// NOTE(review): the wrapper is declared with signed vector types although it
// calls the unsigned intrinsic; presumably this relies on implicit vector
// conversions -- confirm it is intentional.
2106 // CHECK-LABEL: @test_vmlal_high_lane_u32_0(
2107 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2108 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2109 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2110 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2111 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
2112 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2113 // CHECK:   ret <2 x i64> [[ADD]]
2114 int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2115   return vmlal_high_lane_u32(a, b, v, 0);
2116 }
2117
// Lane-0 case of vmlal_high_laneq_u16: elements 4..7 of b are extracted, the
// 8-element v is narrowed by an all-zero-mask shuffle, the two are multiplied
// via umull.v4i32, and the product is added to a.
// NOTE(review): the wrapper is declared with signed vector types although it
// calls the unsigned intrinsic; presumably this relies on implicit vector
// conversions -- confirm it is intentional.
2118 // CHECK-LABEL: @test_vmlal_high_laneq_u16_0(
2119 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2120 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2121 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2122 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2123 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
2124 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2125 // CHECK:   ret <4 x i32> [[ADD]]
2126 int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2127   return vmlal_high_laneq_u16(a, b, v, 0);
2128 }
2129
// Lane-0 case of vmlal_high_laneq_u32: elements 2..3 of b are extracted, the
// 4-element v is narrowed by an all-zero-mask shuffle, the two are multiplied
// via umull.v2i64, and the product is added to a.
// NOTE(review): the wrapper is declared with signed vector types although it
// calls the unsigned intrinsic; presumably this relies on implicit vector
// conversions -- confirm it is intentional.
2130 // CHECK-LABEL: @test_vmlal_high_laneq_u32_0(
2131 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2132 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2133 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2134 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2135 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
2136 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2137 // CHECK:   ret <2 x i64> [[ADD]]
2138 int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2139   return vmlal_high_laneq_u32(a, b, v, 0);
2140 }
2141
// Lane-0 case of vmlsl_lane_u16: expects an all-zero-mask shuffle of v, a call
// to umull.v4i32 on (b, shuffle), and a sub of that product from a.
// NOTE(review): the wrapper is declared with signed vector types although it
// calls the unsigned intrinsic; presumably this relies on implicit vector
// conversions -- confirm it is intentional.
2142 // CHECK-LABEL: @test_vmlsl_lane_u16_0(
2143 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2144 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2145 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2146 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
2147 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2148 // CHECK:   ret <4 x i32> [[SUB]]
2149 int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2150   return vmlsl_lane_u16(a, b, v, 0);
2151 }
2152
// Lane-0 case of vmlsl_lane_u32: expects an all-zero-mask shuffle of v, a call
// to umull.v2i64 on (b, shuffle), and a sub of that product from a.
// NOTE(review): the wrapper is declared with signed vector types although it
// calls the unsigned intrinsic; presumably this relies on implicit vector
// conversions -- confirm it is intentional.
2153 // CHECK-LABEL: @test_vmlsl_lane_u32_0(
2154 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2155 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2156 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2157 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
2158 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2159 // CHECK:   ret <2 x i64> [[SUB]]
2160 int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2161   return vmlsl_lane_u32(a, b, v, 0);
2162 }
2163
// Lane-0 case of vmlsl_laneq_u16: the 8-element v is narrowed by an
// all-zero-mask shuffle, multiplied with b via umull.v4i32, and the product is
// subtracted from a.
// NOTE(review): the wrapper is declared with signed vector types although it
// calls the unsigned intrinsic; presumably this relies on implicit vector
// conversions -- confirm it is intentional.
2164 // CHECK-LABEL: @test_vmlsl_laneq_u16_0(
2165 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2166 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2167 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2168 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
2169 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2170 // CHECK:   ret <4 x i32> [[SUB]]
2171 int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2172   return vmlsl_laneq_u16(a, b, v, 0);
2173 }
2174
2175 // CHECK-LABEL: @test_vmlsl_laneq_u32_0(
2176 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2177 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2178 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2179 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
2180 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2181 // CHECK:   ret <2 x i64> [[SUB]]
// Calls vmlsl_laneq_u32 on (a, b, v) with lane index 0; v is the 4-element
// vector here. The FileCheck directives above expect lane 0 of v splatted to
// two elements, a umull.v2i64 call on b and the splat, and a sub from a.
// NOTE(review): the parameters are signed vector types although the intrinsic
// name carries the _u32 suffix; test_vmull_laneq_u32_0 uses unsigned types for
// a comparable call - confirm the signed types are intentional.
2182 int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2183   return vmlsl_laneq_u32(a, b, v, 0);
2184 }
2185
2186 // CHECK-LABEL: @test_vmlsl_high_lane_u16_0(
2187 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2188 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2189 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2190 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2191 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
2192 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2193 // CHECK:   ret <4 x i32> [[SUB]]
// Calls vmlsl_high_lane_u16 on (a, b, v) with lane index 0. The FileCheck
// directives above expect elements 4-7 of b to be extracted, lane 0 of v to be
// splatted, a umull.v4i32 call on those two values, and a sub from a.
// NOTE(review): the parameters are signed vector types although the intrinsic
// name carries the _u16 suffix; test_vmull_high_lane_u16_0 uses unsigned types
// for a comparable call - confirm the signed types are intentional.
2194 int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2195   return vmlsl_high_lane_u16(a, b, v, 0);
2196 }
2197
2198 // CHECK-LABEL: @test_vmlsl_high_lane_u32_0(
2199 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2200 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2201 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2202 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2203 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
2204 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2205 // CHECK:   ret <2 x i64> [[SUB]]
// Calls vmlsl_high_lane_u32 on (a, b, v) with lane index 0. The FileCheck
// directives above expect elements 2-3 of b to be extracted, lane 0 of v to be
// splatted, a umull.v2i64 call on those two values, and a sub from a.
// NOTE(review): the parameters are signed vector types although the intrinsic
// name carries the _u32 suffix; test_vmull_high_lane_u32_0 uses unsigned types
// for a comparable call - confirm the signed types are intentional.
2206 int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2207   return vmlsl_high_lane_u32(a, b, v, 0);
2208 }
2209
2210 // CHECK-LABEL: @test_vmlsl_high_laneq_u16_0(
2211 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2212 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2213 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2214 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2215 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
2216 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2217 // CHECK:   ret <4 x i32> [[SUB]]
// Calls vmlsl_high_laneq_u16 on (a, b, v) with lane index 0; both b and v are
// 8-element vectors. The FileCheck directives above expect elements 4-7 of b,
// a four-element splat of lane 0 of v, a umull.v4i32 call, and a sub from a.
// NOTE(review): the parameters are signed vector types although the intrinsic
// name carries the _u16 suffix; test_vmull_high_laneq_u16_0 uses unsigned types
// for a comparable call - confirm the signed types are intentional.
2218 int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2219   return vmlsl_high_laneq_u16(a, b, v, 0);
2220 }
2221
2222 // CHECK-LABEL: @test_vmlsl_high_laneq_u32_0(
2223 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2224 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2225 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2226 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2227 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
2228 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2229 // CHECK:   ret <2 x i64> [[SUB]]
// Calls vmlsl_high_laneq_u32 on (a, b, v) with lane index 0; both b and v are
// 4-element vectors. The FileCheck directives above expect elements 2-3 of b,
// a two-element splat of lane 0 of v, a umull.v2i64 call, and a sub from a.
// NOTE(review): the parameters are signed vector types although the intrinsic
// name carries the _u32 suffix; test_vmull_high_laneq_u32_0 uses unsigned types
// for a comparable call - confirm the signed types are intentional.
2230 int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2231   return vmlsl_high_laneq_u32(a, b, v, 0);
2232 }
2233
2234 // CHECK-LABEL: @test_vmull_lane_s16_0(
2235 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2236 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2237 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2238 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
2239 // CHECK:   ret <4 x i32> [[VMULL2_I]]
// Calls vmull_lane_s16 on (a, v) with lane index 0. The FileCheck directives
// above expect a lane-0 splat of v and an smull.v4i32 call on a and the splat,
// whose result is returned directly.
2240 int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
2241   return vmull_lane_s16(a, v, 0);
2242 }
2243
2244 // CHECK-LABEL: @test_vmull_lane_s32_0(
2245 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2246 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2247 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2248 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
2249 // CHECK:   ret <2 x i64> [[VMULL2_I]]
// Calls vmull_lane_s32 on (a, v) with lane index 0. The FileCheck directives
// above expect a lane-0 splat of v and an smull.v2i64 call on a and the splat,
// whose result is returned directly.
2250 int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
2251   return vmull_lane_s32(a, v, 0);
2252 }
2253
2254 // CHECK-LABEL: @test_vmull_lane_u16_0(
2255 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2256 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2257 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2258 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
2259 // CHECK:   ret <4 x i32> [[VMULL2_I]]
// Calls vmull_lane_u16 on (a, v) with lane index 0. The FileCheck directives
// above expect a lane-0 splat of v and a umull.v4i32 call on a and the splat,
// whose result is returned directly.
2260 uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
2261   return vmull_lane_u16(a, v, 0);
2262 }
2263
2264 // CHECK-LABEL: @test_vmull_lane_u32_0(
2265 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2266 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2267 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2268 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
2269 // CHECK:   ret <2 x i64> [[VMULL2_I]]
// Calls vmull_lane_u32 on (a, v) with lane index 0. The FileCheck directives
// above expect a lane-0 splat of v and a umull.v2i64 call on a and the splat,
// whose result is returned directly.
2270 uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
2271   return vmull_lane_u32(a, v, 0);
2272 }
2273
2274 // CHECK-LABEL: @test_vmull_high_lane_s16_0(
2275 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2276 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2277 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2278 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2279 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
2280 // CHECK:   ret <4 x i32> [[VMULL2_I]]
// Calls vmull_high_lane_s16 on (a, v) with lane index 0. The FileCheck
// directives above expect elements 4-7 of a, a lane-0 splat of v, and an
// smull.v4i32 call on those two values, whose result is returned directly.
2281 int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
2282   return vmull_high_lane_s16(a, v, 0);
2283 }
2284
2285 // CHECK-LABEL: @test_vmull_high_lane_s32_0(
2286 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2287 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2288 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2289 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2290 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
2291 // CHECK:   ret <2 x i64> [[VMULL2_I]]
// Calls vmull_high_lane_s32 on (a, v) with lane index 0. The FileCheck
// directives above expect elements 2-3 of a, a lane-0 splat of v, and an
// smull.v2i64 call on those two values, whose result is returned directly.
2292 int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
2293   return vmull_high_lane_s32(a, v, 0);
2294 }
2295
2296 // CHECK-LABEL: @test_vmull_high_lane_u16_0(
2297 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2298 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2299 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2300 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2301 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
2302 // CHECK:   ret <4 x i32> [[VMULL2_I]]
// Calls vmull_high_lane_u16 on (a, v) with lane index 0. The FileCheck
// directives above expect elements 4-7 of a, a lane-0 splat of v, and a
// umull.v4i32 call on those two values, whose result is returned directly.
2303 uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
2304   return vmull_high_lane_u16(a, v, 0);
2305 }
2306
2307 // CHECK-LABEL: @test_vmull_high_lane_u32_0(
2308 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2309 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2310 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2311 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2312 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
2313 // CHECK:   ret <2 x i64> [[VMULL2_I]]
// Calls vmull_high_lane_u32 on (a, v) with lane index 0. The FileCheck
// directives above expect elements 2-3 of a, a lane-0 splat of v, and a
// umull.v2i64 call on those two values, whose result is returned directly.
2314 uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
2315   return vmull_high_lane_u32(a, v, 0);
2316 }
2317
2318 // CHECK-LABEL: @test_vmull_laneq_s16_0(
2319 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2320 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2321 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2322 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
2323 // CHECK:   ret <4 x i32> [[VMULL2_I]]
// Calls vmull_laneq_s16 on (a, v) with lane index 0; v is the 8-element vector.
// The FileCheck directives above expect lane 0 of v splatted to four elements
// and an smull.v4i32 call on a and the splat, whose result is returned.
2324 int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
2325   return vmull_laneq_s16(a, v, 0);
2326 }
2327
2328 // CHECK-LABEL: @test_vmull_laneq_s32_0(
2329 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2330 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2331 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2332 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
2333 // CHECK:   ret <2 x i64> [[VMULL2_I]]
// Calls vmull_laneq_s32 on (a, v) with lane index 0; v is the 4-element vector.
// The FileCheck directives above expect lane 0 of v splatted to two elements
// and an smull.v2i64 call on a and the splat, whose result is returned.
2334 int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
2335   return vmull_laneq_s32(a, v, 0);
2336 }
2337
2338 // CHECK-LABEL: @test_vmull_laneq_u16_0(
2339 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2340 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2341 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2342 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
2343 // CHECK:   ret <4 x i32> [[VMULL2_I]]
// Calls vmull_laneq_u16 on (a, v) with lane index 0; v is the 8-element vector.
// The FileCheck directives above expect lane 0 of v splatted to four elements
// and a umull.v4i32 call on a and the splat, whose result is returned.
2344 uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
2345   return vmull_laneq_u16(a, v, 0);
2346 }
2347
2348 // CHECK-LABEL: @test_vmull_laneq_u32_0(
2349 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2350 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2351 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2352 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
2353 // CHECK:   ret <2 x i64> [[VMULL2_I]]
// Calls vmull_laneq_u32 on (a, v) with lane index 0; v is the 4-element vector.
// The FileCheck directives above expect lane 0 of v splatted to two elements
// and a umull.v2i64 call on a and the splat, whose result is returned.
2354 uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
2355   return vmull_laneq_u32(a, v, 0);
2356 }
2357
2358 // CHECK-LABEL: @test_vmull_high_laneq_s16_0(
2359 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2360 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2361 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2362 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2363 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
2364 // CHECK:   ret <4 x i32> [[VMULL2_I]]
// Calls vmull_high_laneq_s16 on (a, v) with lane index 0; both operands are
// 8-element vectors. The FileCheck directives above expect elements 4-7 of a,
// a four-element splat of lane 0 of v, and an smull.v4i32 call on the two.
2365 int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
2366   return vmull_high_laneq_s16(a, v, 0);
2367 }
2368
2369 // CHECK-LABEL: @test_vmull_high_laneq_s32_0(
2370 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2371 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2372 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2373 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2374 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
2375 // CHECK:   ret <2 x i64> [[VMULL2_I]]
// Calls vmull_high_laneq_s32 on (a, v) with lane index 0; both operands are
// 4-element vectors. The FileCheck directives above expect elements 2-3 of a,
// a two-element splat of lane 0 of v, and an smull.v2i64 call on the two.
2376 int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
2377   return vmull_high_laneq_s32(a, v, 0);
2378 }
2379
2380 // CHECK-LABEL: @test_vmull_high_laneq_u16_0(
2381 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2382 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2383 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2384 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2385 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
2386 // CHECK:   ret <4 x i32> [[VMULL2_I]]
// Calls vmull_high_laneq_u16 on (a, v) with lane index 0; both operands are
// 8-element vectors. The FileCheck directives above expect elements 4-7 of a,
// a four-element splat of lane 0 of v, and a umull.v4i32 call on the two.
2387 uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
2388   return vmull_high_laneq_u16(a, v, 0);
2389 }
2390
2391 // CHECK-LABEL: @test_vmull_high_laneq_u32_0(
2392 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2393 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2394 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2395 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2396 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
2397 // CHECK:   ret <2 x i64> [[VMULL2_I]]
// Calls vmull_high_laneq_u32 on (a, v) with lane index 0; both operands are
// 4-element vectors. The FileCheck directives above expect elements 2-3 of a,
// a two-element splat of lane 0 of v, and a umull.v2i64 call on the two.
2398 uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
2399   return vmull_high_laneq_u32(a, v, 0);
2400 }
2401
2402 // CHECK-LABEL: @test_vqdmlal_lane_s16_0(
2403 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2404 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2405 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2406 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2407 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
2408 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
2409 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
// Calls vqdmlal_lane_s16 on (a, b, v) with lane index 0. The FileCheck
// directives above expect a lane-0 splat of v, an sqdmull.v4i32 call on b and
// the splat, and an sqadd.v4i32 call combining a with that product.
2410 int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2411   return vqdmlal_lane_s16(a, b, v, 0);
2412 }
2413
2414 // CHECK-LABEL: @test_vqdmlal_lane_s32_0(
2415 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2416 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2417 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2418 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2419 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
2420 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
2421 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
// Calls vqdmlal_lane_s32 on (a, b, v) with lane index 0. The FileCheck
// directives above expect a lane-0 splat of v, an sqdmull.v2i64 call on b and
// the splat, and an sqadd.v2i64 call combining a with that product.
2422 int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2423   return vqdmlal_lane_s32(a, b, v, 0);
2424 }
2425
2426 // CHECK-LABEL: @test_vqdmlal_high_lane_s16_0(
2427 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2428 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2429 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2430 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2431 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2432 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
2433 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
2434 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
// Calls vqdmlal_high_lane_s16 on (a, b, v) with lane index 0. The FileCheck
// directives above expect elements 4-7 of b, a lane-0 splat of v, an
// sqdmull.v4i32 call on those two, and an sqadd.v4i32 call with a.
2435 int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2436   return vqdmlal_high_lane_s16(a, b, v, 0);
2437 }
2438
2439 // CHECK-LABEL: @test_vqdmlal_high_lane_s32_0(
2440 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2441 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2442 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2443 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2444 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2445 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
2446 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
2447 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
// Calls vqdmlal_high_lane_s32 on (a, b, v) with lane index 0. The FileCheck
// directives above expect elements 2-3 of b, a lane-0 splat of v, an
// sqdmull.v2i64 call on those two, and an sqadd.v2i64 call with a.
2448 int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2449   return vqdmlal_high_lane_s32(a, b, v, 0);
2450 }
2451
2452 // CHECK-LABEL: @test_vqdmlsl_lane_s16_0(
2453 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2454 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2455 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2456 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2457 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
2458 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
2459 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_lane_s16 on (a, b, v) with lane index 0. The FileCheck
// directives above expect a lane-0 splat of v, an sqdmull.v4i32 call on b and
// the splat, and an sqsub.v4i32 call taking a and that product.
2460 int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2461   return vqdmlsl_lane_s16(a, b, v, 0);
2462 }
2463
2464 // CHECK-LABEL: @test_vqdmlsl_lane_s32_0(
2465 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2466 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2467 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2468 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2469 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
2470 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
2471 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_lane_s32 on (a, b, v) with lane index 0. The FileCheck
// directives above expect a lane-0 splat of v, an sqdmull.v2i64 call on b and
// the splat, and an sqsub.v2i64 call taking a and that product.
2472 int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2473   return vqdmlsl_lane_s32(a, b, v, 0);
2474 }
2475
2476 // CHECK-LABEL: @test_vqdmlsl_high_lane_s16_0(
2477 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2478 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2479 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2480 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2481 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2482 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
2483 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
2484 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_high_lane_s16 on (a, b, v) with lane index 0. The FileCheck
// directives above expect elements 4-7 of b, a lane-0 splat of v, an
// sqdmull.v4i32 call on those two, and an sqsub.v4i32 call with a.
2485 int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2486   return vqdmlsl_high_lane_s16(a, b, v, 0);
2487 }
2488
2489 // CHECK-LABEL: @test_vqdmlsl_high_lane_s32_0(
2490 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2491 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2492 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2493 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2494 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2495 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
2496 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
2497 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_high_lane_s32 on (a, b, v) with lane index 0. The FileCheck
// directives above expect elements 2-3 of b, a lane-0 splat of v, an
// sqdmull.v2i64 call on those two, and an sqsub.v2i64 call with a.
2498 int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2499   return vqdmlsl_high_lane_s32(a, b, v, 0);
2500 }
2501
2502 // CHECK-LABEL: @test_vqdmull_lane_s16_0(
2503 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2504 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2505 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2506 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
2507 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2508 // CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
// Calls vqdmull_lane_s16 on (a, v) with lane index 0. The FileCheck directives
// above expect a lane-0 splat of v and an sqdmull.v4i32 call on a and the
// splat, whose result is returned.
2509 int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
2510   return vqdmull_lane_s16(a, v, 0);
2511 }
2512
2513 // CHECK-LABEL: @test_vqdmull_lane_s32_0(
2514 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2515 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2516 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2517 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
2518 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2519 // CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
// Calls vqdmull_lane_s32 on (a, v) with lane index 0. The FileCheck directives
// above expect a lane-0 splat of v and an sqdmull.v2i64 call on a and the
// splat, whose result is returned.
2520 int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
2521   return vqdmull_lane_s32(a, v, 0);
2522 }
2523
2524 // CHECK-LABEL: @test_vqdmull_laneq_s16_0(
2525 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2526 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2527 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2528 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
2529 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2530 // CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
// Calls vqdmull_laneq_s16 on (a, v) with lane index 0; v is the 8-element
// vector. The FileCheck directives above expect lane 0 of v splatted to four
// elements and an sqdmull.v4i32 call on a and the splat.
2531 int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
2532   return vqdmull_laneq_s16(a, v, 0);
2533 }
2534
2535 // CHECK-LABEL: @test_vqdmull_laneq_s32_0(
2536 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2537 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2538 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2539 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
2540 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2541 // CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
// Calls vqdmull_laneq_s32 on (a, v) with lane index 0; v is the 4-element
// vector. The FileCheck directives above expect lane 0 of v splatted to two
// elements and an sqdmull.v2i64 call on a and the splat.
2542 int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
2543   return vqdmull_laneq_s32(a, v, 0);
2544 }
2545
2546 // CHECK-LABEL: @test_vqdmull_high_lane_s16_0(
2547 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2548 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2549 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2550 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2551 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
2552 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2553 // CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
// Calls vqdmull_high_lane_s16 on (a, v) with lane index 0. The FileCheck
// directives above expect elements 4-7 of a, a lane-0 splat of v, and an
// sqdmull.v4i32 call on those two values.
2554 int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
2555   return vqdmull_high_lane_s16(a, v, 0);
2556 }
2557
2558 // CHECK-LABEL: @test_vqdmull_high_lane_s32_0(
2559 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2560 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2561 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2562 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2563 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
2564 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2565 // CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
// Calls vqdmull_high_lane_s32 on (a, v) with lane index 0. The FileCheck
// directives above expect elements 2-3 of a, a lane-0 splat of v, and an
// sqdmull.v2i64 call on those two values.
2566 int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
2567   return vqdmull_high_lane_s32(a, v, 0);
2568 }
2569
2570 // CHECK-LABEL: @test_vqdmull_high_laneq_s16_0(
2571 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2572 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2573 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2574 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2575 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
2576 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2577 // CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
// Calls vqdmull_high_laneq_s16 on (a, v) with lane index 0; both operands are
// 8-element vectors. The FileCheck directives above expect elements 4-7 of a,
// a four-element splat of lane 0 of v, and an sqdmull.v4i32 call on the two.
2578 int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
2579   return vqdmull_high_laneq_s16(a, v, 0);
2580 }
2581
2582 // CHECK-LABEL: @test_vqdmull_high_laneq_s32_0(
2583 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2584 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2585 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2586 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2587 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
2588 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2589 // CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
// Calls vqdmull_high_laneq_s32 on (a, v) with lane index 0; both operands are
// 4-element vectors. The FileCheck directives above expect elements 2-3 of a,
// a two-element splat of lane 0 of v, and an sqdmull.v2i64 call on the two.
2590 int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
2591   return vqdmull_high_laneq_s32(a, v, 0);
2592 }
2593
2594 // CHECK-LABEL: @test_vqdmulh_lane_s16_0(
2595 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2596 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2597 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2598 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
2599 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
2600 // CHECK:   ret <4 x i16> [[VQDMULH_V2_I]]
// Calls vqdmulh_lane_s16 on (a, v) with lane index 0. The FileCheck directives
// above expect a lane-0 splat of v and an sqdmulh.v4i16 call on a and the
// splat, whose result is returned.
2601 int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
2602   return vqdmulh_lane_s16(a, v, 0);
2603 }
2604
2605 // CHECK-LABEL: @test_vqdmulhq_lane_s16_0(
2606 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
2607 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
2608 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
2609 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
2610 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
2611 // CHECK:   ret <8 x i16> [[VQDMULHQ_V2_I]]
// Calls vqdmulhq_lane_s16 on (a, v) with lane index 0. The FileCheck directives
// above expect lane 0 of the 4-element v splatted to eight elements and an
// sqdmulh.v8i16 call on a and the splat, whose result is returned.
2612 int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
2613   return vqdmulhq_lane_s16(a, v, 0);
2614 }
2615
2616 // CHECK-LABEL: @test_vqdmulh_lane_s32_0(
2617 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2618 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2619 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2620 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
2621 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
2622 // CHECK:   ret <2 x i32> [[VQDMULH_V2_I]]
// Calls vqdmulh_lane_s32 on (a, v) with lane index 0. The FileCheck directives
// above expect a lane-0 splat of v and an sqdmulh.v2i32 call on a and the
// splat, whose result is returned.
2623 int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
2624   return vqdmulh_lane_s32(a, v, 0);
2625 }
2626
2627 // CHECK-LABEL: @test_vqdmulhq_lane_s32_0(
2628 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
2629 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2630 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
2631 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
2632 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
2633 // CHECK:   ret <4 x i32> [[VQDMULHQ_V2_I]]
// Calls vqdmulhq_lane_s32 on (a, v) with lane index 0. The FileCheck directives
// above expect lane 0 of the 2-element v splatted to four elements and an
// sqdmulh.v4i32 call on a and the splat, whose result is returned.
2634 int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
2635   return vqdmulhq_lane_s32(a, v, 0);
2636 }
2637
2638 // CHECK-LABEL: @test_vqrdmulh_lane_s16_0(
2639 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2640 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2641 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2642 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
2643 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
2644 // CHECK:   ret <4 x i16> [[VQRDMULH_V2_I]]
// Calls vqrdmulh_lane_s16 on (a, v) with lane index 0. The FileCheck directives
// above expect a lane-0 splat of v and an sqrdmulh.v4i16 call on a and the
// splat, whose result is returned.
2645 int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
2646   return vqrdmulh_lane_s16(a, v, 0);
2647 }
2648
2649 // CHECK-LABEL: @test_vqrdmulhq_lane_s16_0(
2650 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
2651 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
2652 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
2653 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
2654 // CHECK:   ret <8 x i16> [[VQRDMULHQ_V2_I]]
// Calls vqrdmulhq_lane_s16 on (a, v) with lane index 0. The FileCheck
// directives above expect lane 0 of the 4-element v splatted to eight elements
// and an sqrdmulh.v8i16 call on a and the splat, whose result is returned.
2655 int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
2656   return vqrdmulhq_lane_s16(a, v, 0);
2657 }
2658
2659 // CHECK-LABEL: @test_vqrdmulh_lane_s32_0(
2660 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2661 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2662 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2663 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
2664 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
2665 // CHECK:   ret <2 x i32> [[VQRDMULH_V2_I]]
// Calls vqrdmulh_lane_s32 on (a, v) with lane index 0. The FileCheck directives
// above expect a lane-0 splat of v and an sqrdmulh.v2i32 call on a and the
// splat, whose result is returned.
2666 int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
2667   return vqrdmulh_lane_s32(a, v, 0);
2668 }
2669
2670 // CHECK-LABEL: @test_vqrdmulhq_lane_s32_0(
2671 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
2672 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2673 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
2674 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
2675 // CHECK:   ret <4 x i32> [[VQRDMULHQ_V2_I]]
2676 int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
       // Exercises vqrdmulhq_lane_s32 with lane index 0. The directives above
       // expect %v to be widened to <4 x i32> by a zero-mask shufflevector and
       // passed with %a to @llvm.aarch64.neon.sqrdmulh.v4i32.
2677   return vqrdmulhq_lane_s32(a, v, 0);
2678 }
2679
2680 // CHECK-LABEL: @test_vmul_lane_f32_0(
2681 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
2682 // CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
2683 // CHECK:   ret <2 x float> [[MUL]]
2684 float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
       // Exercises vmul_lane_f32 with lane index 0. The directives above expect
       // a zero-mask shufflevector of %v followed by a plain <2 x float> fmul
       // with %a (no intrinsic call).
2685   return vmul_lane_f32(a, v, 0);
2686 }
2687
2688 // CHECK-LABEL: @test_vmulq_lane_f32_0(
2689 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
2690 // CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
2691 // CHECK:   ret <4 x float> [[MUL]]
2692 float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
       // Exercises vmulq_lane_f32 with lane index 0. The directives above expect
       // the <2 x float> %v to be shuffled into a <4 x float> with a zero mask,
       // then multiplied with %a by fmul.
2693   return vmulq_lane_f32(a, v, 0);
2694 }
2695
2696 // CHECK-LABEL: @test_vmul_laneq_f32_0(
2697 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
2698 // CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
2699 // CHECK:   ret <2 x float> [[MUL]]
2700 float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) {
       // Exercises vmul_laneq_f32 with lane index 0. The directives above expect
       // the <4 x float> %v to be shuffled down to a <2 x float> with a zero
       // mask, then multiplied with %a by fmul.
2701   return vmul_laneq_f32(a, v, 0);
2702 }
2703
2704 // CHECK-LABEL: @test_vmul_laneq_f64_0(
2705 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
2706 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
2707 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
2708 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
2709 // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
2710 // CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
2711 // CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
2712 // CHECK:   ret <1 x double> [[TMP5]]
2713 float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
       // Exercises vmul_laneq_f64 with lane index 0. Unlike the vector cases
       // nearby, the directives above expect scalar code: %a is bitcast to a
       // double, element 0 is extracted from %v, the two are multiplied with a
       // scalar fmul, and the product is bitcast back to <1 x double>.
2714   return vmul_laneq_f64(a, v, 0);
2715 }
2716
2717 // CHECK-LABEL: @test_vmulq_laneq_f32_0(
2718 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
2719 // CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
2720 // CHECK:   ret <4 x float> [[MUL]]
2721 float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
       // Exercises vmulq_laneq_f32 with lane index 0. The directives above
       // expect a zero-mask <4 x float> shufflevector of %v followed by an fmul
       // with %a.
2722   return vmulq_laneq_f32(a, v, 0);
2723 }
2724
2725 // CHECK-LABEL: @test_vmulq_laneq_f64_0(
2726 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
2727 // CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
2728 // CHECK:   ret <2 x double> [[MUL]]
2729 float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
       // Exercises vmulq_laneq_f64 with lane index 0. The directives above
       // expect a zero-mask <2 x double> shufflevector of %v followed by an fmul
       // with %a.
2730   return vmulq_laneq_f64(a, v, 0);
2731 }
2732
2733 // CHECK-LABEL: @test_vmulx_lane_f32_0(
2734 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
2735 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
2736 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
2737 // CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]]) #2
2738 // CHECK:   ret <2 x float> [[VMULX2_I]]
2739 float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
       // Exercises vmulx_lane_f32 with lane index 0. The directives above expect
       // a zero-mask shufflevector of %v and a call to
       // @llvm.aarch64.neon.fmulx.v2f32 on %a and the shuffle result.
2740   return vmulx_lane_f32(a, v, 0);
2741 }
2742
2743 // CHECK-LABEL: @test_vmulxq_lane_f32_0(
2744 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
2745 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
2746 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
2747 // CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]]) #2
2748 // CHECK:   ret <4 x float> [[VMULX2_I]]
2749 float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
       // Exercises vmulxq_lane_f32 with lane index 0. The directives above
       // expect the <2 x float> %v to be shuffled into a <4 x float> with a zero
       // mask and passed with %a to @llvm.aarch64.neon.fmulx.v4f32.
2750   return vmulxq_lane_f32(a, v, 0);
2751 }
2752
2753 // CHECK-LABEL: @test_vmulxq_lane_f64_0(
2754 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
2755 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
2756 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
2757 // CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]]) #2
2758 // CHECK:   ret <2 x double> [[VMULX2_I]]
2759 float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
       // Exercises vmulxq_lane_f64 with lane index 0. The directives above
       // expect the <1 x double> %v to be shuffled into a <2 x double> with a
       // zero mask and passed with %a to @llvm.aarch64.neon.fmulx.v2f64.
2760   return vmulxq_lane_f64(a, v, 0);
2761 }
2762
2763 // CHECK-LABEL: @test_vmulx_laneq_f32_0(
2764 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
2765 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
2766 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
2767 // CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]]) #2
2768 // CHECK:   ret <2 x float> [[VMULX2_I]]
2769 float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
       // Exercises vmulx_laneq_f32 with lane index 0. The directives above
       // expect the <4 x float> %v to be shuffled down to a <2 x float> with a
       // zero mask and passed with %a to @llvm.aarch64.neon.fmulx.v2f32.
2770   return vmulx_laneq_f32(a, v, 0);
2771 }
2772
2773 // CHECK-LABEL: @test_vmulxq_laneq_f32_0(
2774 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
2775 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
2776 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
2777 // CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]]) #2
2778 // CHECK:   ret <4 x float> [[VMULX2_I]]
2779 float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
       // Exercises vmulxq_laneq_f32 with lane index 0. The directives above
       // expect a zero-mask <4 x float> shufflevector of %v and a call to
       // @llvm.aarch64.neon.fmulx.v4f32 on %a and the shuffle result.
2780   return vmulxq_laneq_f32(a, v, 0);
2781 }
2782
2783 // CHECK-LABEL: @test_vmulxq_laneq_f64_0(
2784 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
2785 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
2786 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
2787 // CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]]) #2
2788 // CHECK:   ret <2 x double> [[VMULX2_I]]
2789 float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
       // Exercises vmulxq_laneq_f64 with lane index 0. The directives above
       // expect a zero-mask <2 x double> shufflevector of %v and a call to
       // @llvm.aarch64.neon.fmulx.v2f64 on %a and the shuffle result.
2790   return vmulxq_laneq_f64(a, v, 0);
2791 }
2792
2793 // CHECK-LABEL: @test_vmull_high_n_s16(
2794 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2795 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2796 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
2797 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
2798 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
2799 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
2800 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2801 // CHECK:   [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2
2802 // CHECK:   ret <4 x i32> [[VMULL5_I_I]]
2803 int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
       // Exercises vmull_high_n_s16. The directives above expect elements 4..7
       // of %a to be selected by a shufflevector, %b to be inserted into all
       // four lanes of a <4 x i16>, and both passed to
       // @llvm.aarch64.neon.smull.v4i32.
2804   return vmull_high_n_s16(a, b);
2805 }
2806
2807 // CHECK-LABEL: @test_vmull_high_n_s32(
2808 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2809 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2810 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
2811 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
2812 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2813 // CHECK:   [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2
2814 // CHECK:   ret <2 x i64> [[VMULL3_I_I]]
2815 int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
       // Exercises vmull_high_n_s32. The directives above expect elements 2..3
       // of %a to be selected by a shufflevector, %b to be inserted into both
       // lanes of a <2 x i32>, and both passed to
       // @llvm.aarch64.neon.smull.v2i64.
2816   return vmull_high_n_s32(a, b);
2817 }
2818
2819 // CHECK-LABEL: @test_vmull_high_n_u16(
2820 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2821 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2822 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
2823 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
2824 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
2825 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
2826 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2827 // CHECK:   [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2
2828 // CHECK:   ret <4 x i32> [[VMULL5_I_I]]
2829 uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
       // Exercises vmull_high_n_u16. Same expected shape as the signed s16
       // variant (elements 4..7 of %a, %b in all four lanes), but the directives
       // above expect the unsigned @llvm.aarch64.neon.umull.v4i32 call.
2830   return vmull_high_n_u16(a, b);
2831 }
2832
2833 // CHECK-LABEL: @test_vmull_high_n_u32(
2834 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2835 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2836 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
2837 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
2838 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2839 // CHECK:   [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2
2840 // CHECK:   ret <2 x i64> [[VMULL3_I_I]]
2841 uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
       // Exercises vmull_high_n_u32. The directives above expect elements 2..3
       // of %a, %b inserted into both lanes of a <2 x i32>, and a call to the
       // unsigned @llvm.aarch64.neon.umull.v2i64.
2842   return vmull_high_n_u32(a, b);
2843 }
2844
2845 // CHECK-LABEL: @test_vqdmull_high_n_s16(
2846 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2847 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2848 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
2849 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
2850 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
2851 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
2852 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2853 // CHECK:   [[VQDMULL_V5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2
2854 // CHECK:   [[VQDMULL_V6_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I_I]] to <16 x i8>
2855 // CHECK:   ret <4 x i32> [[VQDMULL_V5_I_I]]
2856 int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
       // Exercises vqdmull_high_n_s16. The directives above expect elements
       // 4..7 of %a, %b inserted into all four lanes of a <4 x i16>, and a call
       // to @llvm.aarch64.neon.sqdmull.v4i32 whose result is returned.
2857   return vqdmull_high_n_s16(a, b);
2858 }
2859
2860 // CHECK-LABEL: @test_vqdmull_high_n_s32(
2861 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2862 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2863 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
2864 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
2865 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2866 // CHECK:   [[VQDMULL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2
2867 // CHECK:   [[VQDMULL_V4_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I_I]] to <16 x i8>
2868 // CHECK:   ret <2 x i64> [[VQDMULL_V3_I_I]]
2869 int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
       // Exercises vqdmull_high_n_s32. The directives above expect elements
       // 2..3 of %a, %b inserted into both lanes of a <2 x i32>, and a call to
       // @llvm.aarch64.neon.sqdmull.v2i64 whose result is returned.
2870   return vqdmull_high_n_s32(a, b);
2871 }
2872
2873 // CHECK-LABEL: @test_vmlal_high_n_s16(
2874 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2875 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2876 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2877 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2878 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2879 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2880 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2881 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2
2882 // CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
2883 // CHECK:   ret <4 x i32> [[ADD_I_I]]
2884 int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
       // Exercises vmlal_high_n_s16. The directives above expect elements 4..7
       // of %b and a four-lane vector filled with %c to be multiplied by
       // @llvm.aarch64.neon.smull.v4i32, with the product added to %a by a
       // plain add.
2885   return vmlal_high_n_s16(a, b, c);
2886 }
2887
2888 // CHECK-LABEL: @test_vmlal_high_n_s32(
2889 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2890 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
2891 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
2892 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2893 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2894 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2
2895 // CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
2896 // CHECK:   ret <2 x i64> [[ADD_I_I]]
2897 int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
       // Exercises vmlal_high_n_s32. The directives above expect elements 2..3
       // of %b and a two-lane vector filled with %c to be multiplied by
       // @llvm.aarch64.neon.smull.v2i64, with the product added to %a by a
       // plain add.
2898   return vmlal_high_n_s32(a, b, c);
2899 }
2900
2901 // CHECK-LABEL: @test_vmlal_high_n_u16(
2902 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2903 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2904 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2905 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2906 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2907 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2908 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2909 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2
2910 // CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
2911 // CHECK:   ret <4 x i32> [[ADD_I_I]]
2912 uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
       // Exercises vmlal_high_n_u16. The directives above expect elements 4..7
       // of %b and a four-lane vector filled with %c to be multiplied by the
       // unsigned @llvm.aarch64.neon.umull.v4i32, with the product added to %a.
2913   return vmlal_high_n_u16(a, b, c);
2914 }
2915
2916 // CHECK-LABEL: @test_vmlal_high_n_u32(
2917 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2918 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
2919 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
2920 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2921 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2922 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2
2923 // CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
2924 // CHECK:   ret <2 x i64> [[ADD_I_I]]
2925 uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
       // Exercises vmlal_high_n_u32. The directives above expect elements 2..3
       // of %b and a two-lane vector filled with %c to be multiplied by the
       // unsigned @llvm.aarch64.neon.umull.v2i64, with the product added to %a.
2926   return vmlal_high_n_u32(a, b, c);
2927 }
2928
2929 // CHECK-LABEL: @test_vqdmlal_high_n_s16(
2930 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2931 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2932 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2933 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2934 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2935 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2936 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2937 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2938 // CHECK:   [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2
2939 // CHECK:   [[VQDMLAL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I_I]]) #2
2940 // CHECK:   ret <4 x i32> [[VQDMLAL_V6_I_I]]
2941 int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
       // Exercises vqdmlal_high_n_s16. The directives above expect two
       // intrinsic calls: @llvm.aarch64.neon.sqdmull.v4i32 on elements 4..7 of
       // %b and a four-lane vector filled with %c, then
       // @llvm.aarch64.neon.sqadd.v4i32 combining %a with that product.
2942   return vqdmlal_high_n_s16(a, b, c);
2943 }
2944
2945 // CHECK-LABEL: @test_vqdmlal_high_n_s32(
2946 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2947 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2948 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2949 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
2950 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
2951 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2952 // CHECK:   [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2
2953 // CHECK:   [[VQDMLAL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I_I]]) #2
2954 // CHECK:   ret <2 x i64> [[VQDMLAL_V4_I_I]]
2955 int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
       // Exercises vqdmlal_high_n_s32. The directives above expect two
       // intrinsic calls: @llvm.aarch64.neon.sqdmull.v2i64 on elements 2..3 of
       // %b and a two-lane vector filled with %c, then
       // @llvm.aarch64.neon.sqadd.v2i64 combining %a with that product.
2956   return vqdmlal_high_n_s32(a, b, c);
2957 }
2958
2959 // CHECK-LABEL: @test_vmlsl_high_n_s16(
2960 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2961 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2962 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2963 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2964 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2965 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2966 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2967 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2
2968 // CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
2969 // CHECK:   ret <4 x i32> [[SUB_I_I]]
2970 int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
       // Exercises vmlsl_high_n_s16. The directives above expect elements 4..7
       // of %b and a four-lane vector filled with %c to be multiplied by
       // @llvm.aarch64.neon.smull.v4i32, with the product subtracted from %a by
       // a plain sub.
2971   return vmlsl_high_n_s16(a, b, c);
2972 }
2973
2974 // CHECK-LABEL: @test_vmlsl_high_n_s32(
2975 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2976 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
2977 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
2978 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2979 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2980 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2
2981 // CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
2982 // CHECK:   ret <2 x i64> [[SUB_I_I]]
2983 int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
       // Exercises vmlsl_high_n_s32. The directives above expect elements 2..3
       // of %b and a two-lane vector filled with %c to be multiplied by
       // @llvm.aarch64.neon.smull.v2i64, with the product subtracted from %a by
       // a plain sub.
2984   return vmlsl_high_n_s32(a, b, c);
2985 }
2986
2987 // CHECK-LABEL: @test_vmlsl_high_n_u16(
2988 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2989 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2990 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2991 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2992 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2993 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2994 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2995 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2
2996 // CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
2997 // CHECK:   ret <4 x i32> [[SUB_I_I]]
2998 uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
       // Exercises vmlsl_high_n_u16. The directives above expect elements 4..7
       // of %b and a four-lane vector filled with %c to be multiplied by the
       // unsigned @llvm.aarch64.neon.umull.v4i32, with the product subtracted
       // from %a.
2999   return vmlsl_high_n_u16(a, b, c);
3000 }
3001
3002 // CHECK-LABEL: @test_vmlsl_high_n_u32(
3003 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
3004 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3005 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
3006 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3007 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
3008 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2
3009 // CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
3010 // CHECK:   ret <2 x i64> [[SUB_I_I]]
3011 uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
       // Exercises vmlsl_high_n_u32. The directives above expect elements 2..3
       // of %b and a two-lane vector filled with %c to be multiplied by the
       // unsigned @llvm.aarch64.neon.umull.v2i64, with the product subtracted
       // from %a.
3012   return vmlsl_high_n_u32(a, b, c);
3013 }
3014
3015 // CHECK-LABEL: @test_vqdmlsl_high_n_s16(
3016 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3017 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3018 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3019 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3020 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
3021 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
3022 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
3023 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
3024 // CHECK:   [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]]) #2
3025 // CHECK:   [[VQDMLSL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I_I]]) #2
3026 // CHECK:   ret <4 x i32> [[VQDMLSL_V6_I_I]]
3027 int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
       // Exercises vqdmlsl_high_n_s16. The directives above expect two
       // intrinsic calls: @llvm.aarch64.neon.sqdmull.v4i32 on elements 4..7 of
       // %b and a four-lane vector filled with %c, then
       // @llvm.aarch64.neon.sqsub.v4i32 taking %a and that product.
3028   return vqdmlsl_high_n_s16(a, b, c);
3029 }
3030
3031 // CHECK-LABEL: @test_vqdmlsl_high_n_s32(
3032 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
3033 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3034 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3035 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3036 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
3037 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
3038 // CHECK:   [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]]) #2
3039 // CHECK:   [[VQDMLSL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I_I]]) #2
3040 // CHECK:   ret <2 x i64> [[VQDMLSL_V4_I_I]]
3041 int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
       // Exercises vqdmlsl_high_n_s32. The directives above expect two
       // intrinsic calls: @llvm.aarch64.neon.sqdmull.v2i64 on elements 2..3 of
       // %b and a two-lane vector filled with %c, then
       // @llvm.aarch64.neon.sqsub.v2i64 taking %a and that product.
3042   return vqdmlsl_high_n_s32(a, b, c);
3043 }
3044
3045 // CHECK-LABEL: @test_vmul_n_f32(
3046 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0
3047 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
3048 // CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
3049 // CHECK:   ret <2 x float> [[MUL_I]]
3050 float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
       // Exercises vmul_n_f32. The directives above expect %b to be inserted
       // into both lanes of a <2 x float>, which is then multiplied with %a by a
       // plain fmul.
3051   return vmul_n_f32(a, b);
3052 }
3053
3054 // CHECK-LABEL: @test_vmulq_n_f32(
3055 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0
3056 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
3057 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
3058 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
3059 // CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
3060 // CHECK:   ret <4 x float> [[MUL_I]]
3061 float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
       // Exercises vmulq_n_f32. The directives above expect %b to be inserted
       // into all four lanes of a <4 x float>, which is then multiplied with %a
       // by a plain fmul.
3062   return vmulq_n_f32(a, b);
3063 }
3064
3065 // CHECK-LABEL: @test_vmulq_n_f64(
3066 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %b, i32 0
3067 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %b, i32 1
3068 // CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %a, [[VECINIT1_I]]
3069 // CHECK:   ret <2 x double> [[MUL_I]]
3070 float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {
       // Exercises vmulq_n_f64. The directives above expect %b to be inserted
       // into both lanes of a <2 x double>, which is then multiplied with %a by
       // a plain fmul.
3071   return vmulq_n_f64(a, b);
3072 }
3073
3074 // CHECK-LABEL: @test_vfma_n_f32(
3075 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
3076 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
3077 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3078 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
3079 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
3080 // CHECK:   [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> [[VECINIT1_I]], <2 x float> %a) #2
3081 // CHECK:   ret <2 x float> [[TMP3]]
3082 float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
       // Exercises vfma_n_f32. The directives above expect %n to be inserted
       // into both lanes of a <2 x float> and a call to @llvm.fma.v2f32 with
       // operands (%b, that vector, %a) -- note %a is the last operand.
3083   return vfma_n_f32(a, b, n);
3084 }
3085
3086 // CHECK-LABEL: @test_vfmaq_n_f32(
3087 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
3088 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
3089 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
3090 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
3091 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3092 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
3093 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
3094 // CHECK:   [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> [[VECINIT3_I]], <4 x float> %a) #2
3095 // CHECK:   ret <4 x float> [[TMP3]]
3096 float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
       // Exercises vfmaq_n_f32. The directives above expect %n to be inserted
       // into all four lanes of a <4 x float> and a call to @llvm.fma.v4f32 with
       // operands (%b, that vector, %a) -- note %a is the last operand.
3097   return vfmaq_n_f32(a, b, n);
3098 }
3099
3100 // CHECK-LABEL: @test_vfms_n_f32(
3101 // CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
3102 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
3103 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
3104 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3105 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
3106 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
3107 // CHECK:   [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> [[VECINIT1_I]], <2 x float> %a) #2
3108 // CHECK:   ret <2 x float> [[TMP3]]
3109 float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
       // Exercises vfms_n_f32. The directives above expect %b to be subtracted
       // from a vector of -0.0 first, then %n inserted into both lanes of a
       // <2 x float>, and @llvm.fma.v2f32 called with operands
       // (fsub result, that vector, %a).
3110   return vfms_n_f32(a, b, n);
3111 }
3112
3113 // CHECK-LABEL: @test_vfmsq_n_f32(
3114 // CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
3115 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
3116 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
3117 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
3118 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
3119 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3120 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
3121 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
3122 // CHECK:   [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> [[VECINIT3_I]], <4 x float> %a) #2
3123 // CHECK:   ret <4 x float> [[TMP3]]
3124 float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
       // Exercises vfmsq_n_f32. The directives above expect %b to be subtracted
       // from a vector of -0.0 first, then %n inserted into all four lanes of a
       // <4 x float>, and @llvm.fma.v4f32 called with operands
       // (fsub result, that vector, %a).
3125   return vfmsq_n_f32(a, b, n);
3126 }
3127
3128 // CHECK-LABEL: @test_vmul_n_s16(
3129 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3130 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3131 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3132 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3133 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
3134 // CHECK:   ret <4 x i16> [[MUL_I]]
3135 int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
       // Exercises vmul_n_s16. The directives above expect %b to be inserted
       // into all four lanes of a <4 x i16>, which is then multiplied with %a by
       // a plain mul.
3136   return vmul_n_s16(a, b);
3137 }
3138
3139 // CHECK-LABEL: @test_vmulq_n_s16(
3140 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3141 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3142 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3143 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3144 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3145 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3146 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3147 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3148 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
3149 // CHECK:   ret <8 x i16> [[MUL_I]]
3150 int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
       // Exercises vmulq_n_s16. The directives above expect %b to be inserted
       // into all eight lanes of an <8 x i16>, which is then multiplied with %a
       // by a plain mul.
3151   return vmulq_n_s16(a, b);
3152 }
3153
3154 // CHECK-LABEL: @test_vmul_n_s32(
3155 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3156 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3157 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
3158 // CHECK:   ret <2 x i32> [[MUL_I]]
3159 int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
       // Exercises vmul_n_s32. The directives above expect %b to be inserted
       // into both lanes of a <2 x i32>, which is then multiplied with %a by a
       // plain mul.
3160   return vmul_n_s32(a, b);
3161 }
3162
3163 // CHECK-LABEL: @test_vmulq_n_s32(
3164 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3165 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3166 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3167 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3168 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
3169 // CHECK:   ret <4 x i32> [[MUL_I]]
3170 int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
       // Exercises vmulq_n_s32. The directives above expect %b to be inserted
       // into all four lanes of a <4 x i32>, which is then multiplied with %a by
       // a plain mul.
3171   return vmulq_n_s32(a, b);
3172 }
3173
3174 // CHECK-LABEL: @test_vmul_n_u16(
3175 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3176 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3177 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3178 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3179 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
3180 // CHECK:   ret <4 x i16> [[MUL_I]]
// Wraps vmul_n_u16(a, b). Expected IR (directives above): %b is inserted into
// all four lanes of a <4 x i16> vector, which is multiplied with %a and returned.
3181 uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
3182   return vmul_n_u16(a, b);
3183 }
3184
3185 // CHECK-LABEL: @test_vmulq_n_u16(
3186 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3187 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3188 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3189 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3190 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3191 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3192 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3193 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3194 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
3195 // CHECK:   ret <8 x i16> [[MUL_I]]
// Wraps vmulq_n_u16(a, b). Expected IR (directives above): %b is inserted into
// all eight lanes of an <8 x i16> vector, which is multiplied with %a and returned.
3196 uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
3197   return vmulq_n_u16(a, b);
3198 }
3199
3200 // CHECK-LABEL: @test_vmul_n_u32(
3201 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3202 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3203 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
3204 // CHECK:   ret <2 x i32> [[MUL_I]]
// Wraps vmul_n_u32(a, b). Expected IR (directives above): %b is inserted into
// both lanes of a <2 x i32> vector, which is multiplied with %a and returned.
3205 uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
3206   return vmul_n_u32(a, b);
3207 }
3208
3209 // CHECK-LABEL: @test_vmulq_n_u32(
3210 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3211 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3212 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3213 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3214 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
3215 // CHECK:   ret <4 x i32> [[MUL_I]]
// Wraps vmulq_n_u32(a, b). Expected IR (directives above): %b is inserted into
// all four lanes of a <4 x i32> vector, which is multiplied with %a and returned.
3216 uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
3217   return vmulq_n_u32(a, b);
3218 }
3219
3220 // CHECK-LABEL: @test_vmull_n_s16(
3221 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3222 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3223 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3224 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3225 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3226 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3227 // CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #2
3228 // CHECK:   ret <4 x i32> [[VMULL5_I]]
// Wraps vmull_n_s16(a, b). Expected IR (directives above): %b is inserted into
// all four lanes of a <4 x i16> vector; %a and that vector are passed to
// @llvm.aarch64.neon.smull.v4i32 and the <4 x i32> call result is returned.
// The two <8 x i8> bitcasts are matched but not referenced by later directives.
3229 int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
3230   return vmull_n_s16(a, b);
3231 }
3232
3233 // CHECK-LABEL: @test_vmull_n_s32(
3234 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3235 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3236 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3237 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3238 // CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #2
3239 // CHECK:   ret <2 x i64> [[VMULL3_I]]
// Wraps vmull_n_s32(a, b). Expected IR (directives above): %b is inserted into
// both lanes of a <2 x i32> vector; %a and that vector are passed to
// @llvm.aarch64.neon.smull.v2i64 and the <2 x i64> call result is returned.
// The two <8 x i8> bitcasts are matched but not referenced by later directives.
3240 int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
3241   return vmull_n_s32(a, b);
3242 }
3243
3244 // CHECK-LABEL: @test_vmull_n_u16(
3245 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3246 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3247 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3248 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3249 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3250 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3251 // CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #2
3252 // CHECK:   ret <4 x i32> [[VMULL5_I]]
// Wraps vmull_n_u16(a, b). Expected IR (directives above): %b is inserted into
// all four lanes of a <4 x i16> vector; %a and that vector are passed to
// @llvm.aarch64.neon.umull.v4i32 and the <4 x i32> call result is returned.
// The two <8 x i8> bitcasts are matched but not referenced by later directives.
3253 uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
3254   return vmull_n_u16(a, b);
3255 }
3256
3257 // CHECK-LABEL: @test_vmull_n_u32(
3258 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3259 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3260 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3261 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3262 // CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #2
3263 // CHECK:   ret <2 x i64> [[VMULL3_I]]
// Wraps vmull_n_u32(a, b). Expected IR (directives above): %b is inserted into
// both lanes of a <2 x i32> vector; %a and that vector are passed to
// @llvm.aarch64.neon.umull.v2i64 and the <2 x i64> call result is returned.
// The two <8 x i8> bitcasts are matched but not referenced by later directives.
3264 uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
3265   return vmull_n_u32(a, b);
3266 }
3267
3268 // CHECK-LABEL: @test_vqdmull_n_s16(
3269 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3270 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3271 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3272 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3273 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3274 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3275 // CHECK:   [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #2
3276 // CHECK:   [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
3277 // CHECK:   ret <4 x i32> [[VQDMULL_V5_I]]
// Wraps vqdmull_n_s16(a, b). Expected IR (directives above): %b is inserted
// into all four lanes of a <4 x i16> vector; %a and that vector are passed to
// @llvm.aarch64.neon.sqdmull.v4i32. The call result itself is what is returned;
// its bitcast to <16 x i8> is matched but is not the returned value.
3278 int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
3279   return vqdmull_n_s16(a, b);
3280 }
3281
3282 // CHECK-LABEL: @test_vqdmull_n_s32(
3283 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3284 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3285 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3286 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3287 // CHECK:   [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #2
3288 // CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
3289 // CHECK:   ret <2 x i64> [[VQDMULL_V3_I]]
// Wraps vqdmull_n_s32(a, b). Expected IR (directives above): %b is inserted
// into both lanes of a <2 x i32> vector; %a and that vector are passed to
// @llvm.aarch64.neon.sqdmull.v2i64. The call result itself is what is returned;
// its bitcast to <16 x i8> is matched but is not the returned value.
3290 int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
3291   return vqdmull_n_s32(a, b);
3292 }
3293
3294 // CHECK-LABEL: @test_vqdmulh_n_s16(
3295 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3296 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3297 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3298 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3299 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3300 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3301 // CHECK:   [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #2
3302 // CHECK:   [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
3303 // CHECK:   ret <4 x i16> [[VQDMULH_V5_I]]
// Wraps vqdmulh_n_s16(a, b). Expected IR (directives above): %b is inserted
// into all four lanes of a <4 x i16> vector; %a and that vector are passed to
// @llvm.aarch64.neon.sqdmulh.v4i16. The call result itself is what is returned;
// its bitcast to <8 x i8> is matched but is not the returned value.
3304 int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
3305   return vqdmulh_n_s16(a, b);
3306 }
3307
3308 // CHECK-LABEL: @test_vqdmulhq_n_s16(
3309 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3310 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3311 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3312 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3313 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3314 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3315 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3316 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3317 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3318 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
3319 // CHECK:   [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]]) #2
3320 // CHECK:   [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
3321 // CHECK:   ret <8 x i16> [[VQDMULHQ_V9_I]]
// Wraps vqdmulhq_n_s16(a, b). Expected IR (directives above): %b is inserted
// into all eight lanes of an <8 x i16> vector; %a and that vector are passed to
// @llvm.aarch64.neon.sqdmulh.v8i16. The call result itself is what is returned;
// its bitcast to <16 x i8> is matched but is not the returned value.
3322 int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
3323   return vqdmulhq_n_s16(a, b);
3324 }
3325
3326 // CHECK-LABEL: @test_vqdmulh_n_s32(
3327 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3328 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3329 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3330 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3331 // CHECK:   [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #2
3332 // CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
3333 // CHECK:   ret <2 x i32> [[VQDMULH_V3_I]]
// Wraps vqdmulh_n_s32(a, b). Expected IR (directives above): %b is inserted
// into both lanes of a <2 x i32> vector; %a and that vector are passed to
// @llvm.aarch64.neon.sqdmulh.v2i32. The call result itself is what is returned;
// its bitcast to <8 x i8> is matched but is not the returned value.
3334 int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
3335   return vqdmulh_n_s32(a, b);
3336 }
3337
3338 // CHECK-LABEL: @test_vqdmulhq_n_s32(
3339 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3340 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3341 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3342 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3343 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3344 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
3345 // CHECK:   [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]]) #2
3346 // CHECK:   [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
3347 // CHECK:   ret <4 x i32> [[VQDMULHQ_V5_I]]
// Wraps vqdmulhq_n_s32(a, b). Expected IR (directives above): %b is inserted
// into all four lanes of a <4 x i32> vector; %a and that vector are passed to
// @llvm.aarch64.neon.sqdmulh.v4i32. The call result itself is what is returned;
// its bitcast to <16 x i8> is matched but is not the returned value.
3348 int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
3349   return vqdmulhq_n_s32(a, b);
3350 }
3351
3352 // CHECK-LABEL: @test_vqrdmulh_n_s16(
3353 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3354 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3355 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3356 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3357 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3358 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3359 // CHECK:   [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]]) #2
3360 // CHECK:   [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
3361 // CHECK:   ret <4 x i16> [[VQRDMULH_V5_I]]
// Wraps vqrdmulh_n_s16(a, b). Expected IR (directives above): %b is inserted
// into all four lanes of a <4 x i16> vector; %a and that vector are passed to
// @llvm.aarch64.neon.sqrdmulh.v4i16. The call result itself is what is returned;
// its bitcast to <8 x i8> is matched but is not the returned value.
3362 int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
3363   return vqrdmulh_n_s16(a, b);
3364 }
3365
3366 // CHECK-LABEL: @test_vqrdmulhq_n_s16(
3367 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3368 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3369 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3370 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3371 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3372 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3373 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3374 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3375 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3376 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
3377 // CHECK:   [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]]) #2
3378 // CHECK:   [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
3379 // CHECK:   ret <8 x i16> [[VQRDMULHQ_V9_I]]
// Wraps vqrdmulhq_n_s16(a, b). Expected IR (directives above): %b is inserted
// into all eight lanes of an <8 x i16> vector; %a and that vector are passed to
// @llvm.aarch64.neon.sqrdmulh.v8i16. The call result itself is what is returned;
// its bitcast to <16 x i8> is matched but is not the returned value.
3380 int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
3381   return vqrdmulhq_n_s16(a, b);
3382 }
3383
3384 // CHECK-LABEL: @test_vqrdmulh_n_s32(
3385 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3386 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3387 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3388 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3389 // CHECK:   [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) #2
3390 // CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
3391 // CHECK:   ret <2 x i32> [[VQRDMULH_V3_I]]
// Wraps vqrdmulh_n_s32(a, b). Expected IR (directives above): %b is inserted
// into both lanes of a <2 x i32> vector; %a and that vector are passed to
// @llvm.aarch64.neon.sqrdmulh.v2i32. The call result itself is what is returned;
// its bitcast to <8 x i8> is matched but is not the returned value.
3392 int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
3393   return vqrdmulh_n_s32(a, b);
3394 }
3395
3396 // CHECK-LABEL: @test_vqrdmulhq_n_s32(
3397 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3398 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3399 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3400 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3401 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3402 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
3403 // CHECK:   [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]]) #2
3404 // CHECK:   [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
3405 // CHECK:   ret <4 x i32> [[VQRDMULHQ_V5_I]]
// Wraps vqrdmulhq_n_s32(a, b). Expected IR (directives above): %b is inserted
// into all four lanes of a <4 x i32> vector; %a and that vector are passed to
// @llvm.aarch64.neon.sqrdmulh.v4i32. The call result itself is what is returned;
// its bitcast to <16 x i8> is matched but is not the returned value.
3406 int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
3407   return vqrdmulhq_n_s32(a, b);
3408 }
3409
3410 // CHECK-LABEL: @test_vmla_n_s16(
3411 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3412 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3413 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3414 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3415 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3416 // CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
3417 // CHECK:   ret <4 x i16> [[ADD_I]]
// Wraps vmla_n_s16(a, b, c). Expected IR (directives above): %c is inserted
// into all four lanes of a <4 x i16> vector, %b is multiplied by that vector,
// and the product is added to %a; the sum is returned.
3418 int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
3419   return vmla_n_s16(a, b, c);
3420 }
3421
3422 // CHECK-LABEL: @test_vmlaq_n_s16(
3423 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3424 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3425 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3426 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3427 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3428 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3429 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3430 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3431 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3432 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
3433 // CHECK:   ret <8 x i16> [[ADD_I]]
// Wraps vmlaq_n_s16(a, b, c). Expected IR (directives above): %c is inserted
// into all eight lanes of an <8 x i16> vector, %b is multiplied by that vector,
// and the product is added to %a; the sum is returned.
3434 int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
3435   return vmlaq_n_s16(a, b, c);
3436 }
3437
3438 // CHECK-LABEL: @test_vmla_n_s32(
3439 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3440 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3441 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3442 // CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
3443 // CHECK:   ret <2 x i32> [[ADD_I]]
// Wraps vmla_n_s32(a, b, c). Expected IR (directives above): %c is inserted
// into both lanes of a <2 x i32> vector, %b is multiplied by that vector,
// and the product is added to %a; the sum is returned.
3444 int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
3445   return vmla_n_s32(a, b, c);
3446 }
3447
3448 // CHECK-LABEL: @test_vmlaq_n_s32(
3449 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3450 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3451 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3452 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3453 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3454 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
3455 // CHECK:   ret <4 x i32> [[ADD_I]]
// Wraps vmlaq_n_s32(a, b, c). Expected IR (directives above): %c is inserted
// into all four lanes of a <4 x i32> vector, %b is multiplied by that vector,
// and the product is added to %a; the sum is returned.
3456 int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
3457   return vmlaq_n_s32(a, b, c);
3458 }
3459
3460 // CHECK-LABEL: @test_vmla_n_u16(
3461 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3462 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3463 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3464 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3465 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3466 // CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
3467 // CHECK:   ret <4 x i16> [[ADD_I]]
// Wraps vmla_n_u16(a, b, c). Expected IR (directives above): %c is inserted
// into all four lanes of a <4 x i16> vector, %b is multiplied by that vector,
// and the product is added to %a; the sum is returned.
3468 uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
3469   return vmla_n_u16(a, b, c);
3470 }
3471
3472 // CHECK-LABEL: @test_vmlaq_n_u16(
3473 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3474 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3475 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3476 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3477 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3478 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3479 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3480 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3481 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3482 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
3483 // CHECK:   ret <8 x i16> [[ADD_I]]
// Wraps vmlaq_n_u16(a, b, c). Expected IR (directives above): %c is inserted
// into all eight lanes of an <8 x i16> vector, %b is multiplied by that vector,
// and the product is added to %a; the sum is returned.
3484 uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
3485   return vmlaq_n_u16(a, b, c);
3486 }
3487
3488 // CHECK-LABEL: @test_vmla_n_u32(
3489 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3490 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3491 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3492 // CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
3493 // CHECK:   ret <2 x i32> [[ADD_I]]
// Wraps vmla_n_u32(a, b, c). Expected IR (directives above): %c is inserted
// into both lanes of a <2 x i32> vector, %b is multiplied by that vector,
// and the product is added to %a; the sum is returned.
3494 uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
3495   return vmla_n_u32(a, b, c);
3496 }
3497
3498 // CHECK-LABEL: @test_vmlaq_n_u32(
3499 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3500 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3501 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3502 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3503 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3504 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
3505 // CHECK:   ret <4 x i32> [[ADD_I]]
// Wraps vmlaq_n_u32(a, b, c). Expected IR (directives above): %c is inserted
// into all four lanes of a <4 x i32> vector, %b is multiplied by that vector,
// and the product is added to %a; the sum is returned.
3506 uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
3507   return vmlaq_n_u32(a, b, c);
3508 }
3509
3510 // CHECK-LABEL: @test_vmlal_n_s16(
3511 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3512 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3513 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3514 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3515 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3516 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3517 // CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #2
3518 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
3519 // CHECK:   ret <4 x i32> [[ADD_I]]
// Wraps vmlal_n_s16(a, b, c). Expected IR (directives above): %c is inserted
// into all four lanes of a <4 x i16> vector; %b and that vector are passed to
// @llvm.aarch64.neon.smull.v4i32, and the <4 x i32> product is added to %a.
// The two <8 x i8> bitcasts are matched but not referenced by later directives.
3520 int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
3521   return vmlal_n_s16(a, b, c);
3522 }
3523
3524 // CHECK-LABEL: @test_vmlal_n_s32(
3525 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3526 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3527 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3528 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3529 // CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #2
3530 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
3531 // CHECK:   ret <2 x i64> [[ADD_I]]
// Wraps vmlal_n_s32(a, b, c). Expected IR (directives above): %c is inserted
// into both lanes of a <2 x i32> vector; %b and that vector are passed to
// @llvm.aarch64.neon.smull.v2i64, and the <2 x i64> product is added to %a.
// The two <8 x i8> bitcasts are matched but not referenced by later directives.
3532 int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
3533   return vmlal_n_s32(a, b, c);
3534 }
3535
3536 // CHECK-LABEL: @test_vmlal_n_u16(
3537 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3538 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3539 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3540 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3541 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3542 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3543 // CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #2
3544 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
3545 // CHECK:   ret <4 x i32> [[ADD_I]]
// Wraps vmlal_n_u16(a, b, c). Expected IR (directives above): %c is inserted
// into all four lanes of a <4 x i16> vector; %b and that vector are passed to
// @llvm.aarch64.neon.umull.v4i32, and the <4 x i32> product is added to %a.
// The two <8 x i8> bitcasts are matched but not referenced by later directives.
3546 uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
3547   return vmlal_n_u16(a, b, c);
3548 }
3549
3550 // CHECK-LABEL: @test_vmlal_n_u32(
3551 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3552 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3553 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3554 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3555 // CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #2
3556 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
3557 // CHECK:   ret <2 x i64> [[ADD_I]]
// Wraps vmlal_n_u32(a, b, c). Expected IR (directives above): %c is inserted
// into both lanes of a <2 x i32> vector; %b and that vector are passed to
// @llvm.aarch64.neon.umull.v2i64, and the <2 x i64> product is added to %a.
// The two <8 x i8> bitcasts are matched but not referenced by later directives.
3558 uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
3559   return vmlal_n_u32(a, b, c);
3560 }
3561
3562 // CHECK-LABEL: @test_vqdmlal_n_s16(
3563 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3564 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3565 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3566 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3567 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3568 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3569 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3570 // CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #2
3571 // CHECK:   [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]]) #2
3572 // CHECK:   ret <4 x i32> [[VQDMLAL_V6_I]]
// Wraps vqdmlal_n_s16(a, b, c). Expected IR (directives above): %c is inserted
// into all four lanes of a <4 x i16> vector; %b and that vector are passed to
// @llvm.aarch64.neon.sqdmull.v4i32, then %a and that product are passed to
// @llvm.aarch64.neon.sqadd.v4i32, whose result is returned. The three leading
// bitcasts are matched but not referenced by later directives.
3573 int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
3574   return vqdmlal_n_s16(a, b, c);
3575 }
3576
3577 // CHECK-LABEL: @test_vqdmlal_n_s32(
3578 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3579 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3580 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3581 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3582 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3583 // CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #2
3584 // CHECK:   [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]]) #2
3585 // CHECK:   ret <2 x i64> [[VQDMLAL_V4_I]]
// Wraps vqdmlal_n_s32(a, b, c). Expected IR (directives above): %c is inserted
// into both lanes of a <2 x i32> vector; %b and that vector are passed to
// @llvm.aarch64.neon.sqdmull.v2i64, then %a and that product are passed to
// @llvm.aarch64.neon.sqadd.v2i64, whose result is returned. The three leading
// bitcasts are matched but not referenced by later directives.
3586 int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
3587   return vqdmlal_n_s32(a, b, c);
3588 }
3589
3590 // CHECK-LABEL: @test_vmls_n_s16(
3591 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3592 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3593 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3594 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3595 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3596 // CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
3597 // CHECK:   ret <4 x i16> [[SUB_I]]
// Wraps vmls_n_s16(a, b, c). Expected IR (directives above): %c is inserted
// into all four lanes of a <4 x i16> vector, %b is multiplied by that vector,
// and the product is subtracted from %a; the difference is returned.
3598 int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
3599   return vmls_n_s16(a, b, c);
3600 }
3601
3602 // CHECK-LABEL: @test_vmlsq_n_s16(
3603 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3604 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3605 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3606 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3607 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3608 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3609 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3610 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3611 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3612 // CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
3613 // CHECK:   ret <8 x i16> [[SUB_I]]
// Wraps vmlsq_n_s16(a, b, c). Expected IR (directives above): %c is inserted
// into all eight lanes of an <8 x i16> vector, %b is multiplied by that vector,
// and the product is subtracted from %a; the difference is returned.
3614 int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
3615   return vmlsq_n_s16(a, b, c);
3616 }
3617
3618 // CHECK-LABEL: @test_vmls_n_s32(
3619 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3620 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3621 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3622 // CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
3623 // CHECK:   ret <2 x i32> [[SUB_I]]
// Wraps vmls_n_s32(a, b, c). Expected IR (directives above): %c is inserted
// into both lanes of a <2 x i32> vector, %b is multiplied by that vector,
// and the product is subtracted from %a; the difference is returned.
3624 int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
3625   return vmls_n_s32(a, b, c);
3626 }
3627
3628 // CHECK-LABEL: @test_vmlsq_n_s32(
3629 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3630 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3631 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3632 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3633 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3634 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
3635 // CHECK:   ret <4 x i32> [[SUB_I]]
// Wraps vmlsq_n_s32(a, b, c). Expected IR (directives above): %c is inserted
// into all four lanes of a <4 x i32> vector, %b is multiplied by that vector,
// and the product is subtracted from %a; the difference is returned.
3636 int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
3637   return vmlsq_n_s32(a, b, c);
3638 }
3639
3640 // CHECK-LABEL: @test_vmls_n_u16(
3641 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3642 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3643 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3644 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3645 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3646 // CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
3647 // CHECK:   ret <4 x i16> [[SUB_I]]
// Wraps vmls_n_u16(a, b, c). Expected IR (directives above): %c is inserted
// into all four lanes of a <4 x i16> vector, %b is multiplied by that vector,
// and the product is subtracted from %a; the difference is returned.
3648 uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
3649   return vmls_n_u16(a, b, c);
3650 }
3651
3652 // CHECK-LABEL: @test_vmlsq_n_u16(
3653 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3654 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3655 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3656 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3657 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3658 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3659 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3660 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3661 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3662 // CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
3663 // CHECK:   ret <8 x i16> [[SUB_I]]
// Wraps vmlsq_n_u16(a, b, c). Expected IR (directives above): %c is inserted
// into all eight lanes of an <8 x i16> vector, %b is multiplied by that vector,
// and the product is subtracted from %a; the difference is returned.
3664 uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
3665   return vmlsq_n_u16(a, b, c);
3666 }
3667
3668 // CHECK-LABEL: @test_vmls_n_u32(
3669 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3670 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3671 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3672 // CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
3673 // CHECK:   ret <2 x i32> [[SUB_I]]
// Wraps vmls_n_u32(a, b, c). Expected IR (directives above): %c is inserted
// into both lanes of a <2 x i32> vector, %b is multiplied by that vector,
// and the product is subtracted from %a; the difference is returned.
3674 uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
3675   return vmls_n_u32(a, b, c);
3676 }
3677
3678 // CHECK-LABEL: @test_vmlsq_n_u32(
3679 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3680 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3681 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3682 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3683 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3684 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
3685 // CHECK:   ret <4 x i32> [[SUB_I]]
// Wraps vmlsq_n_u32(a, b, c). Expected IR (directives above): %c is inserted
// into all four lanes of a <4 x i32> vector, %b is multiplied by that vector,
// and the product is subtracted from %a; the difference is returned.
3686 uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
3687   return vmlsq_n_u32(a, b, c);
3688 }
3689
3690 // CHECK-LABEL: @test_vmlsl_n_s16(
3691 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3692 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3693 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3694 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3695 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3696 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3697 // CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #2
3698 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
3699 // CHECK:   ret <4 x i32> [[SUB_I]]
// Exercises vmlsl_n_s16(a, b, c): c is inserted into every lane of a
// <4 x i16>, passed with b to the smull.v4i32 intrinsic, and the <4 x i32>
// result is subtracted from a.
3700 int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
3701   return vmlsl_n_s16(a, b, c);
3702 }
3703
3704 // CHECK-LABEL: @test_vmlsl_n_s32(
3705 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3706 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3707 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3708 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3709 // CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #2
3710 // CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
3711 // CHECK:   ret <2 x i64> [[SUB_I]]
// Exercises vmlsl_n_s32(a, b, c): c is inserted into both lanes of a
// <2 x i32>, passed with b to the smull.v2i64 intrinsic, and the <2 x i64>
// result is subtracted from a.
3712 int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
3713   return vmlsl_n_s32(a, b, c);
3714 }
3715
3716 // CHECK-LABEL: @test_vmlsl_n_u16(
3717 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3718 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3719 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3720 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3721 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3722 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3723 // CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #2
3724 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
3725 // CHECK:   ret <4 x i32> [[SUB_I]]
// Exercises vmlsl_n_u16(a, b, c): same shape as the signed 16-bit case, but
// the expected IR calls the umull.v4i32 intrinsic before subtracting from a.
3726 uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
3727   return vmlsl_n_u16(a, b, c);
3728 }
3729
3730 // CHECK-LABEL: @test_vmlsl_n_u32(
3731 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3732 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3733 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3734 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3735 // CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #2
3736 // CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
3737 // CHECK:   ret <2 x i64> [[SUB_I]]
// Exercises vmlsl_n_u32(a, b, c): c is inserted into both lanes of a
// <2 x i32>, passed with b to the umull.v2i64 intrinsic, and the <2 x i64>
// result is subtracted from a.
3738 uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
3739   return vmlsl_n_u32(a, b, c);
3740 }
3741
3742 // CHECK-LABEL: @test_vqdmlsl_n_s16(
3743 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3744 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3745 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3746 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3747 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3748 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3749 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3750 // CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) #2
3751 // CHECK:   [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]]) #2
3752 // CHECK:   ret <4 x i32> [[VQDMLSL_V6_I]]
// Exercises vqdmlsl_n_s16(a, b, c): c is inserted into every lane of a
// <4 x i16>; the expected IR calls sqdmull.v4i32 on b and that vector, then
// sqsub.v4i32 with a as the first operand.
3753 int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
3754   return vqdmlsl_n_s16(a, b, c);
3755 }
3756
3757 // CHECK-LABEL: @test_vqdmlsl_n_s32(
3758 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3759 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3760 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3761 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3762 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3763 // CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) #2
3764 // CHECK:   [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]]) #2
3765 // CHECK:   ret <2 x i64> [[VQDMLSL_V4_I]]
// Exercises vqdmlsl_n_s32(a, b, c): c is inserted into both lanes of a
// <2 x i32>; the expected IR calls sqdmull.v2i64 on b and that vector, then
// sqsub.v2i64 with a as the first operand.
3766 int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
3767   return vqdmlsl_n_s32(a, b, c);
3768 }
3769
3770 // CHECK-LABEL: @test_vmla_lane_u16_0(
3771 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
3772 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
3773 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
3774 // CHECK:   ret <4 x i16> [[ADD]]
// Exercises vmla_lane_u16 with lane index 0: the shuffle mask is
// zeroinitializer, the shuffled v is multiplied by b and added to a.
3775 uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
3776   return vmla_lane_u16(a, b, v, 0);
3777 }
3778
3779 // CHECK-LABEL: @test_vmlaq_lane_u16_0(
3780 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
3781 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
3782 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
3783 // CHECK:   ret <8 x i16> [[ADD]]
// Exercises vmlaq_lane_u16 with lane index 0: the 4-lane v is shuffled to
// <8 x i16> with a zeroinitializer mask, multiplied by b and added to a.
3784 uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
3785   return vmlaq_lane_u16(a, b, v, 0);
3786 }
3787
3788 // CHECK-LABEL: @test_vmla_lane_u32_0(
3789 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
3790 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
3791 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
3792 // CHECK:   ret <2 x i32> [[ADD]]
// Exercises vmla_lane_u32 with lane index 0: v is shuffled with a
// zeroinitializer mask, multiplied by b and added to a (<2 x i32>).
3793 uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
3794   return vmla_lane_u32(a, b, v, 0);
3795 }
3796
3797 // CHECK-LABEL: @test_vmlaq_lane_u32_0(
3798 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
3799 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
3800 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
3801 // CHECK:   ret <4 x i32> [[ADD]]
// Exercises vmlaq_lane_u32 with lane index 0: the 2-lane v is shuffled to
// <4 x i32> with a zeroinitializer mask, multiplied by b and added to a.
3802 uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
3803   return vmlaq_lane_u32(a, b, v, 0);
3804 }
3805
3806 // CHECK-LABEL: @test_vmla_laneq_u16_0(
3807 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3808 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
3809 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
3810 // CHECK:   ret <4 x i16> [[ADD]]
// Exercises vmla_laneq_u16 with lane index 0: the 8-lane v is shuffled down
// to <4 x i16> with a zeroinitializer mask, multiplied by b and added to a.
3811 uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
3812   return vmla_laneq_u16(a, b, v, 0);
3813 }
3814
3815 // CHECK-LABEL: @test_vmlaq_laneq_u16_0(
3816 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
3817 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
3818 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
3819 // CHECK:   ret <8 x i16> [[ADD]]
// Exercises vmlaq_laneq_u16 with lane index 0: v is shuffled with a
// zeroinitializer mask, multiplied by b and added to a (<8 x i16>).
3820 uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
3821   return vmlaq_laneq_u16(a, b, v, 0);
3822 }
3823
3824 // CHECK-LABEL: @test_vmla_laneq_u32_0(
3825 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
3826 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
3827 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
3828 // CHECK:   ret <2 x i32> [[ADD]]
// Exercises vmla_laneq_u32 with lane index 0: the 4-lane v is shuffled down
// to <2 x i32> with a zeroinitializer mask, multiplied by b and added to a.
3829 uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
3830   return vmla_laneq_u32(a, b, v, 0);
3831 }
3832
3833 // CHECK-LABEL: @test_vmlaq_laneq_u32_0(
3834 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
3835 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
3836 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
3837 // CHECK:   ret <4 x i32> [[ADD]]
// Exercises vmlaq_laneq_u32 with lane index 0: v is shuffled with a
// zeroinitializer mask, multiplied by b and added to a (<4 x i32>).
3838 uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
3839   return vmlaq_laneq_u32(a, b, v, 0);
3840 }
3841
3842 // CHECK-LABEL: @test_vqdmlal_laneq_s16_0(
3843 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3844 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3845 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3846 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
3847 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
3848 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
3849 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
// Exercises vqdmlal_laneq_s16 with lane index 0: v is shuffled to <4 x i16>
// with a zeroinitializer mask; the expected IR calls sqdmull.v4i32 on b and
// the shuffle, then sqadd.v4i32 with a.
3850 int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
3851   return vqdmlal_laneq_s16(a, b, v, 0);
3852 }
3853
3854 // CHECK-LABEL: @test_vqdmlal_laneq_s32_0(
3855 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
3856 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3857 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3858 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
3859 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
3860 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
3861 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
// Exercises vqdmlal_laneq_s32 with lane index 0: v is shuffled to <2 x i32>
// with a zeroinitializer mask; the expected IR calls sqdmull.v2i64 on b and
// the shuffle, then sqadd.v2i64 with a.
3862 int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
3863   return vqdmlal_laneq_s32(a, b, v, 0);
3864 }
3865
3866 // CHECK-LABEL: @test_vqdmlal_high_laneq_s16_0(
3867 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3868 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3869 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3870 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3871 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
3872 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
3873 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
3874 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
// Exercises vqdmlal_high_laneq_s16 with lane index 0: elements 4..7 of b are
// extracted by one shuffle, v is shuffled with a zeroinitializer mask, and
// the two feed sqdmull.v4i32 followed by sqadd.v4i32 with a.
3875 int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
3876   return vqdmlal_high_laneq_s16(a, b, v, 0);
3877 }
3878
3879 // CHECK-LABEL: @test_vqdmlal_high_laneq_s32_0(
3880 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
3881 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
3882 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3883 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3884 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
3885 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
3886 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
3887 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
// Exercises vqdmlal_high_laneq_s32 with lane index 0: elements 2..3 of b are
// extracted by one shuffle, v is shuffled with a zeroinitializer mask, and
// the two feed sqdmull.v2i64 followed by sqadd.v2i64 with a.
3888 int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
3889   return vqdmlal_high_laneq_s32(a, b, v, 0);
3890 }
3891
3892 // CHECK-LABEL: @test_vmls_lane_u16_0(
3893 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
3894 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
3895 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
3896 // CHECK:   ret <4 x i16> [[SUB]]
// Exercises vmls_lane_u16 with lane index 0: v is shuffled with a
// zeroinitializer mask, multiplied by b, and the product subtracted from a.
3897 uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
3898   return vmls_lane_u16(a, b, v, 0);
3899 }
3900
3901 // CHECK-LABEL: @test_vmlsq_lane_u16_0(
3902 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
3903 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
3904 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
3905 // CHECK:   ret <8 x i16> [[SUB]]
// Exercises vmlsq_lane_u16 with lane index 0: the 4-lane v is shuffled to
// <8 x i16> with a zeroinitializer mask, multiplied by b, and the product
// subtracted from a.
3906 uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
3907   return vmlsq_lane_u16(a, b, v, 0);
3908 }
3909
3910 // CHECK-LABEL: @test_vmls_lane_u32_0(
3911 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
3912 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
3913 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
3914 // CHECK:   ret <2 x i32> [[SUB]]
// Exercises vmls_lane_u32 with lane index 0: v is shuffled with a
// zeroinitializer mask, multiplied by b, and the product subtracted from a.
3915 uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
3916   return vmls_lane_u32(a, b, v, 0);
3917 }
3918
3919 // CHECK-LABEL: @test_vmlsq_lane_u32_0(
3920 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
3921 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
3922 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
3923 // CHECK:   ret <4 x i32> [[SUB]]
// Exercises vmlsq_lane_u32 with lane index 0: the 2-lane v is shuffled to
// <4 x i32> with a zeroinitializer mask, multiplied by b, and the product
// subtracted from a.
3924 uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
3925   return vmlsq_lane_u32(a, b, v, 0);
3926 }
3927
3928 // CHECK-LABEL: @test_vmls_laneq_u16_0(
3929 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3930 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
3931 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
3932 // CHECK:   ret <4 x i16> [[SUB]]
// Exercises vmls_laneq_u16 with lane index 0: the 8-lane v is shuffled down
// to <4 x i16> with a zeroinitializer mask, multiplied by b, and the product
// subtracted from a.
3933 uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
3934   return vmls_laneq_u16(a, b, v, 0);
3935 }
3936
3937 // CHECK-LABEL: @test_vmlsq_laneq_u16_0(
3938 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
3939 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
3940 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
3941 // CHECK:   ret <8 x i16> [[SUB]]
// Exercises vmlsq_laneq_u16 with lane index 0: v is shuffled with a
// zeroinitializer mask, multiplied by b, and the product subtracted from a.
3942 uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
3943   return vmlsq_laneq_u16(a, b, v, 0);
3944 }
3945
3946 // CHECK-LABEL: @test_vmls_laneq_u32_0(
3947 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
3948 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
3949 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
3950 // CHECK:   ret <2 x i32> [[SUB]]
// Exercises vmls_laneq_u32 with lane index 0: the 4-lane v is shuffled down
// to <2 x i32> with a zeroinitializer mask, multiplied by b, and the product
// subtracted from a.
3951 uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
3952   return vmls_laneq_u32(a, b, v, 0);
3953 }
3954
3955 // CHECK-LABEL: @test_vmlsq_laneq_u32_0(
3956 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
3957 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
3958 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
3959 // CHECK:   ret <4 x i32> [[SUB]]
// Exercises vmlsq_laneq_u32 with lane index 0: v is shuffled with a
// zeroinitializer mask, multiplied by b, and the product subtracted from a.
3960 uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
3961   return vmlsq_laneq_u32(a, b, v, 0);
3962 }
3963
3964 // CHECK-LABEL: @test_vqdmlsl_laneq_s16_0(
3965 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3966 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3967 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3968 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
3969 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
3970 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
3971 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
// Exercises vqdmlsl_laneq_s16 with lane index 0: v is shuffled to <4 x i16>
// with a zeroinitializer mask; the expected IR calls sqdmull.v4i32 on b and
// the shuffle, then sqsub.v4i32 with a as the first operand.
3972 int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
3973   return vqdmlsl_laneq_s16(a, b, v, 0);
3974 }
3975
3976 // CHECK-LABEL: @test_vqdmlsl_laneq_s32_0(
3977 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
3978 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3979 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3980 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
3981 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
3982 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
3983 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
// Exercises vqdmlsl_laneq_s32 with lane index 0: v is shuffled to <2 x i32>
// with a zeroinitializer mask; the expected IR calls sqdmull.v2i64 on b and
// the shuffle, then sqsub.v2i64 with a as the first operand.
3984 int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
3985   return vqdmlsl_laneq_s32(a, b, v, 0);
3986 }
3987
3988 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16_0(
3989 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3990 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3991 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3992 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3993 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
3994 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
3995 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
3996 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
// Exercises vqdmlsl_high_laneq_s16 with lane index 0: elements 4..7 of b are
// extracted by one shuffle, v is shuffled with a zeroinitializer mask, and
// the two feed sqdmull.v4i32 followed by sqsub.v4i32 with a.
3997 int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
3998   return vqdmlsl_high_laneq_s16(a, b, v, 0);
3999 }
4000
4001 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32_0(
4002 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
4003 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4004 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4005 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
4006 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4007 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
4008 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
4009 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
// Exercises vqdmlsl_high_laneq_s32 with lane index 0: elements 2..3 of b are
// extracted by one shuffle, v is shuffled with a zeroinitializer mask, and
// the two feed sqdmull.v2i64 followed by sqsub.v2i64 with a.
4010 int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
4011   return vqdmlsl_high_laneq_s32(a, b, v, 0);
4012 }
4013
4014 // CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
4015 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
4016 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4017 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4018 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
4019 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
4020 // CHECK:   ret <4 x i16> [[VQDMULH_V2_I]]
// Exercises vqdmulh_laneq_s16 with lane index 0: v is shuffled to <4 x i16>
// with a zeroinitializer mask and passed with a to sqdmulh.v4i16, whose
// result is returned.
4021 int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
4022   return vqdmulh_laneq_s16(a, v, 0);
4023 }
4024
4025 // CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
4026 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
4027 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4028 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4029 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
4030 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
4031 // CHECK:   ret <8 x i16> [[VQDMULHQ_V2_I]]
// Exercises vqdmulhq_laneq_s16 with lane index 0: v is shuffled with a
// zeroinitializer mask and passed with a to sqdmulh.v8i16, whose result is
// returned.
4032 int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
4033   return vqdmulhq_laneq_s16(a, v, 0);
4034 }
4035
4036 // CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
4037 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4038 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4039 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4040 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
4041 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
4042 // CHECK:   ret <2 x i32> [[VQDMULH_V2_I]]
// Exercises vqdmulh_laneq_s32 with lane index 0: v is shuffled to <2 x i32>
// with a zeroinitializer mask and passed with a to sqdmulh.v2i32, whose
// result is returned.
4043 int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
4044   return vqdmulh_laneq_s32(a, v, 0);
4045 }
4046
4047 // CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
4048 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
4049 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4050 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
4051 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
4052 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
4053 // CHECK:   ret <4 x i32> [[VQDMULHQ_V2_I]]
// Exercises vqdmulhq_laneq_s32 with lane index 0: v is shuffled with a
// zeroinitializer mask and passed with a to sqdmulh.v4i32, whose result is
// returned.
4054 int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
4055   return vqdmulhq_laneq_s32(a, v, 0);
4056 }
4057
4058 // CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
4059 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
4060 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4061 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4062 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
4063 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
4064 // CHECK:   ret <4 x i16> [[VQRDMULH_V2_I]]
// Exercises vqrdmulh_laneq_s16 with lane index 0: v is shuffled to <4 x i16>
// with a zeroinitializer mask and passed with a to sqrdmulh.v4i16, whose
// result is returned.
4065 int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
4066   return vqrdmulh_laneq_s16(a, v, 0);
4067 }
4068
4069 // CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
4070 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
4071 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4072 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4073 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
4074 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
4075 // CHECK:   ret <8 x i16> [[VQRDMULHQ_V2_I]]
// Exercises vqrdmulhq_laneq_s16 with lane index 0: v is shuffled with a
// zeroinitializer mask and passed with a to sqrdmulh.v8i16, whose result is
// returned.
4076 int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
4077   return vqrdmulhq_laneq_s16(a, v, 0);
4078 }
4079
4080 // CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
4081 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4082 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4083 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4084 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
4085 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
4086 // CHECK:   ret <2 x i32> [[VQRDMULH_V2_I]]
// Exercises vqrdmulh_laneq_s32 with lane index 0: v is shuffled to <2 x i32>
// with a zeroinitializer mask and passed with a to sqrdmulh.v2i32, whose
// result is returned.
4087 int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
4088   return vqrdmulh_laneq_s32(a, v, 0);
4089 }
4090
4091 // CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
4092 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
4093 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4094 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
4095 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
4096 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
4097 // CHECK:   ret <4 x i32> [[VQRDMULHQ_V2_I]]
// Exercises vqrdmulhq_laneq_s32 with lane index 0: v is shuffled with a
// zeroinitializer mask and passed with a to sqrdmulh.v4i32, whose result is
// returned.
4098 int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
4099   return vqrdmulhq_laneq_s32(a, v, 0);
4100 }
4101
4102 // CHECK-LABEL: @test_vmla_lane_u16(
4103 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4104 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4105 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
4106 // CHECK:   ret <4 x i16> [[ADD]]
// Exercises vmla_lane_u16 with lane index 3: every shuffle mask element is 3;
// the shuffled v is multiplied by b and added to a (<4 x i16>).
4107 uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
4108   return vmla_lane_u16(a, b, v, 3);
4109 }
4110
4111 // CHECK-LABEL: @test_vmlaq_lane_u16(
4112 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
4113 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4114 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
4115 // CHECK:   ret <8 x i16> [[ADD]]
// Exercises vmlaq_lane_u16 with lane index 3: the 4-lane v is shuffled to
// <8 x i16> with every mask element 3, multiplied by b and added to a.
4116 uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
4117   return vmlaq_lane_u16(a, b, v, 3);
4118 }
4119
4120 // CHECK-LABEL: @test_vmla_lane_u32(
4121 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
4122 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4123 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
4124 // CHECK:   ret <2 x i32> [[ADD]]
// Exercises vmla_lane_u32 with lane index 1: every shuffle mask element is 1;
// the shuffled v is multiplied by b and added to a (<2 x i32>).
4125 uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
4126   return vmla_lane_u32(a, b, v, 1);
4127 }
4128
4129 // CHECK-LABEL: @test_vmlaq_lane_u32(
4130 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4131 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4132 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
4133 // CHECK:   ret <4 x i32> [[ADD]]
// Exercises vmlaq_lane_u32 with lane index 1: the 2-lane v is shuffled to
// <4 x i32> with every mask element 1, multiplied by b and added to a.
4134 uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
4135   return vmlaq_lane_u32(a, b, v, 1);
4136 }
4137
4138 // CHECK-LABEL: @test_vmla_laneq_u16(
4139 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4140 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4141 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
4142 // CHECK:   ret <4 x i16> [[ADD]]
// Exercises vmla_laneq_u16 with lane index 7: the 8-lane v is shuffled down
// to <4 x i16> with every mask element 7, multiplied by b and added to a.
4143 uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
4144   return vmla_laneq_u16(a, b, v, 7);
4145 }
4146
4147 // CHECK-LABEL: @test_vmlaq_laneq_u16(
4148 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4149 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4150 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
4151 // CHECK:   ret <8 x i16> [[ADD]]
// Exercises vmlaq_laneq_u16 with lane index 7: every shuffle mask element is
// 7; the shuffled v is multiplied by b and added to a (<8 x i16>).
4152 uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
4153   return vmlaq_laneq_u16(a, b, v, 7);
4154 }
4155
4156 // CHECK-LABEL: @test_vmla_laneq_u32(
4157 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4158 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4159 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
4160 // CHECK:   ret <2 x i32> [[ADD]]
// Exercises vmla_laneq_u32 with lane index 3: the 4-lane v is shuffled down
// to <2 x i32> with every mask element 3, multiplied by b and added to a.
4161 uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
4162   return vmla_laneq_u32(a, b, v, 3);
4163 }
4164
4165 // CHECK-LABEL: @test_vmlaq_laneq_u32(
4166 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4167 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4168 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
4169 // CHECK:   ret <4 x i32> [[ADD]]
// Exercises vmlaq_laneq_u32 with lane index 3: every shuffle mask element is
// 3; the shuffled v is multiplied by b and added to a (<4 x i32>).
4170 uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
4171   return vmlaq_laneq_u32(a, b, v, 3);
4172 }
4173
4174 // CHECK-LABEL: @test_vqdmlal_laneq_s16(
4175 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4176 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4177 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4178 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4179 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
4180 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
4181 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
// Exercises vqdmlal_laneq_s16 with lane index 7: v is shuffled to <4 x i16>
// with every mask element 7; the expected IR calls sqdmull.v4i32 on b and
// the shuffle, then sqadd.v4i32 with a.
4182 int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
4183   return vqdmlal_laneq_s16(a, b, v, 7);
4184 }
4185
4186 // CHECK-LABEL: @test_vqdmlal_laneq_s32(
4187 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4188 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4189 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4190 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4191 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
4192 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
4193 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
// Exercises vqdmlal_laneq_s32 with lane index 3: v is shuffled to <2 x i32>
// with every mask element 3; the expected IR calls sqdmull.v2i64 on b and
// the shuffle, then sqadd.v2i64 with a.
4194 int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
4195   return vqdmlal_laneq_s32(a, b, v, 3);
4196 }
4197
4198 // CHECK-LABEL: @test_vqdmlal_high_laneq_s16(
4199 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
4200 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4201 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4202 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
4203 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4204 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
4205 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
4206 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
// Exercises vqdmlal_high_laneq_s16 with lane index 7: elements 4..7 of b are
// extracted by one shuffle, v is shuffled with every mask element 7, and the
// two feed sqdmull.v4i32 followed by sqadd.v4i32 with a.
4207 int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
4208   return vqdmlal_high_laneq_s16(a, b, v, 7);
4209 }
4210
4211 // CHECK-LABEL: @test_vqdmlal_high_laneq_s32(
4212 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
4213 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4214 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4215 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
4216 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4217 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
4218 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
4219 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
// Exercises vqdmlal_high_laneq_s32 with lane index 3: elements 2..3 of b are
// extracted by one shuffle, v is shuffled with every mask element 3, and the
// two feed sqdmull.v2i64 followed by sqadd.v2i64 with a.
4220 int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
4221   return vqdmlal_high_laneq_s32(a, b, v, 3);
4222 }
4223
4224 // CHECK-LABEL: @test_vmls_lane_u16(
4225 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4226 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4227 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
4228 // CHECK:   ret <4 x i16> [[SUB]]
// Exercises vmls_lane_u16 with lane index 3: every shuffle mask element is 3;
// the shuffled v is multiplied by b and the product subtracted from a.
4229 uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
4230   return vmls_lane_u16(a, b, v, 3);
4231 }
4232
// Test case for vmlsq_lane_u16 with constant lane index 3.
// Expected IR: element 3 of the 4-element %v is splatted across an <8 x i16>
// (widening the vector), multiplied with %b, and the product is subtracted
// from %a.
4233 // CHECK-LABEL: @test_vmlsq_lane_u16(
4234 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
4235 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4236 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
4237 // CHECK:   ret <8 x i16> [[SUB]]
4238 uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
4239   return vmlsq_lane_u16(a, b, v, 3);
4240 }
4241
// Test case for vmls_lane_u32 with constant lane index 1.
// Expected IR: element 1 of the 2-element %v is splatted across a <2 x i32>,
// multiplied with %b, and the product is subtracted from %a.
4242 // CHECK-LABEL: @test_vmls_lane_u32(
4243 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
4244 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4245 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
4246 // CHECK:   ret <2 x i32> [[SUB]]
4247 uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
4248   return vmls_lane_u32(a, b, v, 1);
4249 }
4250
// Test case for vmlsq_lane_u32 with constant lane index 1.
// Expected IR: element 1 of the 2-element %v is splatted across a <4 x i32>
// (widening the vector), multiplied with %b, and the product is subtracted
// from %a.
4251 // CHECK-LABEL: @test_vmlsq_lane_u32(
4252 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4253 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4254 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
4255 // CHECK:   ret <4 x i32> [[SUB]]
4256 uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
4257   return vmlsq_lane_u32(a, b, v, 1);
4258 }
4259
// Test case for vmls_laneq_u16 with constant lane index 7.
// Expected IR: element 7 of the 8-element %v is splatted across a <4 x i16>
// (narrowing the vector), multiplied with %b, and the product is subtracted
// from %a.
4260 // CHECK-LABEL: @test_vmls_laneq_u16(
4261 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4262 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4263 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
4264 // CHECK:   ret <4 x i16> [[SUB]]
4265 uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
4266   return vmls_laneq_u16(a, b, v, 7);
4267 }
4268
// Test case for vmlsq_laneq_u16 with constant lane index 7.
// Expected IR: element 7 of the 8-element %v is splatted across an
// <8 x i16>, multiplied with %b, and the product is subtracted from %a.
4269 // CHECK-LABEL: @test_vmlsq_laneq_u16(
4270 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4271 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4272 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
4273 // CHECK:   ret <8 x i16> [[SUB]]
4274 uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
4275   return vmlsq_laneq_u16(a, b, v, 7);
4276 }
4277
// Test case for vmls_laneq_u32 with constant lane index 3.
// Expected IR: element 3 of the 4-element %v is splatted across a <2 x i32>
// (narrowing the vector), multiplied with %b, and the product is subtracted
// from %a.
4278 // CHECK-LABEL: @test_vmls_laneq_u32(
4279 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4280 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4281 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
4282 // CHECK:   ret <2 x i32> [[SUB]]
4283 uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
4284   return vmls_laneq_u32(a, b, v, 3);
4285 }
4286
// Test case for vmlsq_laneq_u32 with constant lane index 3.
// Expected IR: element 3 of the 4-element %v is splatted across a <4 x i32>,
// multiplied with %b, and the product is subtracted from %a.
4287 // CHECK-LABEL: @test_vmlsq_laneq_u32(
4288 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4289 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4290 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
4291 // CHECK:   ret <4 x i32> [[SUB]]
4292 uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
4293   return vmlsq_laneq_u32(a, b, v, 3);
4294 }
4295
// Test case for vqdmlsl_laneq_s16 with constant lane index 7.
// Expected IR: element 7 of the 8-element %v is splatted across a <4 x i16>,
// %b and the splat feed @llvm.aarch64.neon.sqdmull.v4i32, and that product is
// combined with %a through @llvm.aarch64.neon.sqsub.v4i32.
// The TMP0-TMP2 bitcasts are matched but not referenced by later directives.
4296 // CHECK-LABEL: @test_vqdmlsl_laneq_s16(
4297 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4298 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4299 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4300 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4301 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) #2
4302 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
4303 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
4304 int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
4305   return vqdmlsl_laneq_s16(a, b, v, 7);
4306 }
4307
// Test case for vqdmlsl_laneq_s32 with constant lane index 3.
// Expected IR: element 3 of the 4-element %v is splatted across a <2 x i32>,
// %b and the splat feed @llvm.aarch64.neon.sqdmull.v2i64, and that product is
// combined with %a through @llvm.aarch64.neon.sqsub.v2i64.
// The TMP0-TMP2 bitcasts are matched but not referenced by later directives.
4308 // CHECK-LABEL: @test_vqdmlsl_laneq_s32(
4309 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4310 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4311 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4312 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4313 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) #2
4314 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
4315 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
4316 int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
4317   return vqdmlsl_laneq_s32(a, b, v, 3);
4318 }
4319
// Test case for vqdmlsl_high_laneq_s16 with constant lane index 7.
// Expected IR: elements 4-7 of %b are extracted, element 7 of %v is splatted
// across a <4 x i16>, both feed @llvm.aarch64.neon.sqdmull.v4i32, and that
// product is combined with %a through @llvm.aarch64.neon.sqsub.v4i32.
// The TMP0-TMP2 bitcasts are matched but not referenced by later directives.
4320 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16(
4321 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
4322 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4323 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4324 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
4325 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4326 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #2
4327 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) #2
4328 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
4329 int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
4330   return vqdmlsl_high_laneq_s16(a, b, v, 7);
4331 }
4332
// Test case for vqdmlsl_high_laneq_s32 with constant lane index 3.
// Expected IR: elements 2-3 of %b are extracted, element 3 of %v is splatted
// across a <2 x i32>, both feed @llvm.aarch64.neon.sqdmull.v2i64, and that
// product is combined with %a through @llvm.aarch64.neon.sqsub.v2i64.
// The TMP0-TMP2 bitcasts are matched but not referenced by later directives.
4333 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32(
4334 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
4335 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4336 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4337 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
4338 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4339 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #2
4340 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) #2
4341 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
4342 int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
4343   return vqdmlsl_high_laneq_s32(a, b, v, 3);
4344 }
4345
// Test case for vqdmulh_laneq_s16 with constant lane index 7.
// Expected IR: element 7 of the 8-element %v is splatted across a <4 x i16>
// and passed with %a to @llvm.aarch64.neon.sqdmulh.v4i16. The call result
// (VQDMULH_V2_I) is what the ret directive expects; the trailing bitcast
// (VQDMULH_V3_I) and the TMP0/TMP1 bitcasts are matched but otherwise unused.
4346 // CHECK-LABEL: @test_vqdmulh_laneq_s16(
4347 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4348 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4349 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4350 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
4351 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
4352 // CHECK:   ret <4 x i16> [[VQDMULH_V2_I]]
4353 int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
4354   return vqdmulh_laneq_s16(a, v, 7);
4355 }
4356
// Test case for vqdmulhq_laneq_s16 with constant lane index 7.
// Expected IR: element 7 of the 8-element %v is splatted across an <8 x i16>
// and passed with %a to @llvm.aarch64.neon.sqdmulh.v8i16. The call result
// (VQDMULHQ_V2_I) is what the ret directive expects; the trailing bitcast
// (VQDMULHQ_V3_I) and the TMP0/TMP1 bitcasts are matched but otherwise unused.
4357 // CHECK-LABEL: @test_vqdmulhq_laneq_s16(
4358 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4359 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4360 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4361 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
4362 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
4363 // CHECK:   ret <8 x i16> [[VQDMULHQ_V2_I]]
4364 int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
4365   return vqdmulhq_laneq_s16(a, v, 7);
4366 }
4367
// Test case for vqdmulh_laneq_s32 with constant lane index 3.
// Expected IR: element 3 of the 4-element %v is splatted across a <2 x i32>
// and passed with %a to @llvm.aarch64.neon.sqdmulh.v2i32. The call result
// (VQDMULH_V2_I) is what the ret directive expects; the trailing bitcast
// (VQDMULH_V3_I) and the TMP0/TMP1 bitcasts are matched but otherwise unused.
4368 // CHECK-LABEL: @test_vqdmulh_laneq_s32(
4369 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4370 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4371 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4372 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
4373 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
4374 // CHECK:   ret <2 x i32> [[VQDMULH_V2_I]]
4375 int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
4376   return vqdmulh_laneq_s32(a, v, 3);
4377 }
4378
// Test case for vqdmulhq_laneq_s32 with constant lane index 3.
// Expected IR: element 3 of the 4-element %v is splatted across a <4 x i32>
// and passed with %a to @llvm.aarch64.neon.sqdmulh.v4i32. The call result
// (VQDMULHQ_V2_I) is what the ret directive expects; the trailing bitcast
// (VQDMULHQ_V3_I) and the TMP0/TMP1 bitcasts are matched but otherwise unused.
4379 // CHECK-LABEL: @test_vqdmulhq_laneq_s32(
4380 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4381 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4382 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
4383 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
4384 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
4385 // CHECK:   ret <4 x i32> [[VQDMULHQ_V2_I]]
4386 int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
4387   return vqdmulhq_laneq_s32(a, v, 3);
4388 }
4389
// Test case for vqrdmulh_laneq_s16 with constant lane index 7.
// Expected IR: element 7 of the 8-element %v is splatted across a <4 x i16>
// and passed with %a to @llvm.aarch64.neon.sqrdmulh.v4i16. The call result
// (VQRDMULH_V2_I) is what the ret directive expects; the trailing bitcast
// (VQRDMULH_V3_I) and the TMP0/TMP1 bitcasts are matched but otherwise unused.
4390 // CHECK-LABEL: @test_vqrdmulh_laneq_s16(
4391 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4392 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4393 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4394 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) #2
4395 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
4396 // CHECK:   ret <4 x i16> [[VQRDMULH_V2_I]]
4397 int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
4398   return vqrdmulh_laneq_s16(a, v, 7);
4399 }
4400
// Test case for vqrdmulhq_laneq_s16 with constant lane index 7.
// Expected IR: element 7 of the 8-element %v is splatted across an <8 x i16>
// and passed with %a to @llvm.aarch64.neon.sqrdmulh.v8i16. The call result
// (VQRDMULHQ_V2_I) is what the ret directive expects; the trailing bitcast
// (VQRDMULHQ_V3_I) and the TMP0/TMP1 bitcasts are matched but otherwise
// unused.
4401 // CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
4402 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4403 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4404 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4405 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) #2
4406 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
4407 // CHECK:   ret <8 x i16> [[VQRDMULHQ_V2_I]]
4408 int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
4409   return vqrdmulhq_laneq_s16(a, v, 7);
4410 }
4411
// Test case for vqrdmulh_laneq_s32 with constant lane index 3.
// Expected IR: element 3 of the 4-element %v is splatted across a <2 x i32>
// and passed with %a to @llvm.aarch64.neon.sqrdmulh.v2i32. The call result
// (VQRDMULH_V2_I) is what the ret directive expects; the trailing bitcast
// (VQRDMULH_V3_I) and the TMP0/TMP1 bitcasts are matched but otherwise unused.
4412 // CHECK-LABEL: @test_vqrdmulh_laneq_s32(
4413 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4414 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4415 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4416 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) #2
4417 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
4418 // CHECK:   ret <2 x i32> [[VQRDMULH_V2_I]]
4419 int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
4420   return vqrdmulh_laneq_s32(a, v, 3);
4421 }
4422
// Test case for vqrdmulhq_laneq_s32 with constant lane index 3.
// Expected IR: element 3 of the 4-element %v is splatted across a <4 x i32>
// and passed with %a to @llvm.aarch64.neon.sqrdmulh.v4i32. The call result
// (VQRDMULHQ_V2_I) is what the ret directive expects; the trailing bitcast
// (VQRDMULHQ_V3_I) and the TMP0/TMP1 bitcasts are matched but otherwise
// unused.
4423 // CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
4424 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4425 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4426 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
4427 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) #2
4428 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
4429 // CHECK:   ret <4 x i32> [[VQRDMULHQ_V2_I]]
4430 int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
4431   return vqrdmulhq_laneq_s32(a, v, 3);
4432 }