test/CodeGen/aarch64-neon-2velem.c

   1 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
   2
   3 // Test new aarch64 intrinsics and types
   4
   5 #include <arm_neon.h>
   6
   7 // CHECK-LABEL: @test_vmla_lane_s16(
   8 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   9 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
  10 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
  11 // CHECK:   ret <4 x i16> [[ADD]]
  12 int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {  // Covers vmla_lane_s16; expected IR above: add(a, mul(b, splat of v lane 3)).
  13   return vmla_lane_s16(a, b, v, 3);  // lane 3 is the last element of the 4-element v
  14 }
  15
  16 // CHECK-LABEL: @test_vmlaq_lane_s16(
  17 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  18 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
  19 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
  20 // CHECK:   ret <8 x i16> [[ADD]]
  21 int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {  // Covers vmlaq_lane_s16; expected IR above: add(a, mul(b, v lane 3 splatted to 8 lanes)).
  22   return vmlaq_lane_s16(a, b, v, 3);  // lane 3 is the last element of the 4-element v
  23 }
  24
  25 // CHECK-LABEL: @test_vmla_lane_s32(
  26 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
  27 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
  28 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
  29 // CHECK:   ret <2 x i32> [[ADD]]
  30 int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {  // Covers vmla_lane_s32; expected IR above: add(a, mul(b, splat of v lane 1)).
  31   return vmla_lane_s32(a, b, v, 1);  // lane 1 is the last element of the 2-element v
  32 }
  33
  34 // CHECK-LABEL: @test_vmlaq_lane_s32(
  35 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  36 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
  37 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
  38 // CHECK:   ret <4 x i32> [[ADD]]
  39 int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {  // Covers vmlaq_lane_s32; expected IR above: add(a, mul(b, v lane 1 splatted to 4 lanes)).
  40   return vmlaq_lane_s32(a, b, v, 1);  // lane 1 is the last element of the 2-element v
  41 }
  42
  43 // CHECK-LABEL: @test_vmla_laneq_s16(
  44 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
  45 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
  46 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
  47 // CHECK:   ret <4 x i16> [[ADD]]
  48 int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {  // Covers vmla_laneq_s16; expected IR above: add(a, mul(b, v lane 7 splatted to 4 lanes)).
  49   return vmla_laneq_s16(a, b, v, 7);  // lane 7 is the last element of the 8-element v
  50 }
  51
  52 // CHECK-LABEL: @test_vmlaq_laneq_s16(
  53 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  54 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
  55 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
  56 // CHECK:   ret <8 x i16> [[ADD]]
  57 int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {  // Covers vmlaq_laneq_s16; expected IR above: add(a, mul(b, splat of v lane 7)).
  58   return vmlaq_laneq_s16(a, b, v, 7);  // lane 7 is the last element of the 8-element v
  59 }
  60
  61 // CHECK-LABEL: @test_vmla_laneq_s32(
  62 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
  63 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
  64 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
  65 // CHECK:   ret <2 x i32> [[ADD]]
  66 int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {  // Covers vmla_laneq_s32; expected IR above: add(a, mul(b, v lane 3 splatted to 2 lanes)).
  67   return vmla_laneq_s32(a, b, v, 3);  // lane 3 is the last element of the 4-element v
  68 }
  69
  70 // CHECK-LABEL: @test_vmlaq_laneq_s32(
  71 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  72 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
  73 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
  74 // CHECK:   ret <4 x i32> [[ADD]]
  75 int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {  // Covers vmlaq_laneq_s32; expected IR above: add(a, mul(b, splat of v lane 3)).
  76   return vmlaq_laneq_s32(a, b, v, 3);  // lane 3 is the last element of the 4-element v
  77 }
  78
  79 // CHECK-LABEL: @test_vmls_lane_s16(
  80 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  81 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
  82 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
  83 // CHECK:   ret <4 x i16> [[SUB]]
  84 int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {  // Covers vmls_lane_s16; expected IR above: sub(a, mul(b, splat of v lane 3)).
  85   return vmls_lane_s16(a, b, v, 3);  // lane 3 is the last element of the 4-element v
  86 }
  87
  88 // CHECK-LABEL: @test_vmlsq_lane_s16(
  89 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  90 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
  91 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
  92 // CHECK:   ret <8 x i16> [[SUB]]
  93 int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {  // Covers vmlsq_lane_s16; expected IR above: sub(a, mul(b, v lane 3 splatted to 8 lanes)).
  94   return vmlsq_lane_s16(a, b, v, 3);  // lane 3 is the last element of the 4-element v
  95 }
  96
  97 // CHECK-LABEL: @test_vmls_lane_s32(
  98 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
  99 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
 100 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
 101 // CHECK:   ret <2 x i32> [[SUB]]
 102 int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {  // Covers vmls_lane_s32; expected IR above: sub(a, mul(b, splat of v lane 1)).
 103   return vmls_lane_s32(a, b, v, 1);  // lane 1 is the last element of the 2-element v
 104 }
 105
 106 // CHECK-LABEL: @test_vmlsq_lane_s32(
 107 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 108 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
 109 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
 110 // CHECK:   ret <4 x i32> [[SUB]]
 111 int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {  // Covers vmlsq_lane_s32; expected IR above: sub(a, mul(b, v lane 1 splatted to 4 lanes)).
 112   return vmlsq_lane_s32(a, b, v, 1);  // lane 1 is the last element of the 2-element v
 113 }
 114
 115 // CHECK-LABEL: @test_vmls_laneq_s16(
 116 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 117 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
 118 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
 119 // CHECK:   ret <4 x i16> [[SUB]]
 120 int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {  // Covers vmls_laneq_s16; expected IR above: sub(a, mul(b, v lane 7 splatted to 4 lanes)).
 121   return vmls_laneq_s16(a, b, v, 7);  // lane 7 is the last element of the 8-element v
 122 }
 123
 124 // CHECK-LABEL: @test_vmlsq_laneq_s16(
 125 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 126 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
 127 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
 128 // CHECK:   ret <8 x i16> [[SUB]]
 129 int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {  // Covers vmlsq_laneq_s16; expected IR above: sub(a, mul(b, splat of v lane 7)).
 130   return vmlsq_laneq_s16(a, b, v, 7);  // lane 7 is the last element of the 8-element v
 131 }
 132
 133 // CHECK-LABEL: @test_vmls_laneq_s32(
 134 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 135 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
 136 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
 137 // CHECK:   ret <2 x i32> [[SUB]]
 138 int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {  // Covers vmls_laneq_s32; expected IR above: sub(a, mul(b, v lane 3 splatted to 2 lanes)).
 139   return vmls_laneq_s32(a, b, v, 3);  // lane 3 is the last element of the 4-element v
 140 }
 141
 142 // CHECK-LABEL: @test_vmlsq_laneq_s32(
 143 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 144 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
 145 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
 146 // CHECK:   ret <4 x i32> [[SUB]]
 147 int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {  // Covers vmlsq_laneq_s32; expected IR above: sub(a, mul(b, splat of v lane 3)).
 148   return vmlsq_laneq_s32(a, b, v, 3);  // lane 3 is the last element of the 4-element v
 149 }
 150
 151 // CHECK-LABEL: @test_vmul_lane_s16(
 152 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 153 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
 154 // CHECK:   ret <4 x i16> [[MUL]]
 155 int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {  // Covers vmul_lane_s16; expected IR above: mul(a, splat of v lane 3).
 156   return vmul_lane_s16(a, v, 3);  // lane 3 is the last element of the 4-element v
 157 }
 158
 159 // CHECK-LABEL: @test_vmulq_lane_s16(
 160 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 161 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
 162 // CHECK:   ret <8 x i16> [[MUL]]
 163 int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {  // Covers vmulq_lane_s16; expected IR above: mul(a, v lane 3 splatted to 8 lanes).
 164   return vmulq_lane_s16(a, v, 3);  // lane 3 is the last element of the 4-element v
 165 }
 166
 167 // CHECK-LABEL: @test_vmul_lane_s32(
 168 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 169 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
 170 // CHECK:   ret <2 x i32> [[MUL]]
 171 int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {  // Covers vmul_lane_s32; expected IR above: mul(a, splat of v lane 1).
 172   return vmul_lane_s32(a, v, 1);  // lane 1 is the last element of the 2-element v
 173 }
 174
 175 // CHECK-LABEL: @test_vmulq_lane_s32(
 176 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 177 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
 178 // CHECK:   ret <4 x i32> [[MUL]]
 179 int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {  // Covers vmulq_lane_s32; expected IR above: mul(a, v lane 1 splatted to 4 lanes).
 180   return vmulq_lane_s32(a, v, 1);  // lane 1 is the last element of the 2-element v
 181 }
 182
 183 // CHECK-LABEL: @test_vmul_lane_u16(
 184 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 185 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
 186 // CHECK:   ret <4 x i16> [[MUL]]
 187 uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {  // Covers vmul_lane_u16; expected IR above is the same shufflevector + mul shape as the s16 case.
 188   return vmul_lane_u16(a, v, 3);  // lane 3 is the last element of the 4-element v
 189 }
 190
 191 // CHECK-LABEL: @test_vmulq_lane_u16(
 192 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 193 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
 194 // CHECK:   ret <8 x i16> [[MUL]]
 195 uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {  // Covers vmulq_lane_u16; expected IR above: mul(a, v lane 3 splatted to 8 lanes).
 196   return vmulq_lane_u16(a, v, 3);  // lane 3 is the last element of the 4-element v
 197 }
 198
 199 // CHECK-LABEL: @test_vmul_lane_u32(
 200 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 201 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
 202 // CHECK:   ret <2 x i32> [[MUL]]
 203 uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {  // Covers vmul_lane_u32; expected IR above: mul(a, splat of v lane 1).
 204   return vmul_lane_u32(a, v, 1);  // lane 1 is the last element of the 2-element v
 205 }
 206
 207 // CHECK-LABEL: @test_vmulq_lane_u32(
 208 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 209 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
 210 // CHECK:   ret <4 x i32> [[MUL]]
 211 uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {  // Covers vmulq_lane_u32; expected IR above: mul(a, v lane 1 splatted to 4 lanes).
 212   return vmulq_lane_u32(a, v, 1);  // lane 1 is the last element of the 2-element v
 213 }
 214
 215 // CHECK-LABEL: @test_vmul_laneq_s16(
 216 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 217 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
 218 // CHECK:   ret <4 x i16> [[MUL]]
 219 int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {  // Covers vmul_laneq_s16; expected IR above: mul(a, v lane 7 splatted to 4 lanes).
 220   return vmul_laneq_s16(a, v, 7);  // lane 7 is the last element of the 8-element v
 221 }
 222
 223 // CHECK-LABEL: @test_vmulq_laneq_s16(
 224 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 225 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
 226 // CHECK:   ret <8 x i16> [[MUL]]
 227 int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {  // Covers vmulq_laneq_s16; expected IR above: mul(a, splat of v lane 7).
 228   return vmulq_laneq_s16(a, v, 7);  // lane 7 is the last element of the 8-element v
 229 }
 230
 231 // CHECK-LABEL: @test_vmul_laneq_s32(
 232 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 233 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
 234 // CHECK:   ret <2 x i32> [[MUL]]
 235 int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {  // Covers vmul_laneq_s32; expected IR above: mul(a, v lane 3 splatted to 2 lanes).
 236   return vmul_laneq_s32(a, v, 3);  // lane 3 is the last element of the 4-element v
 237 }
 238
 239 // CHECK-LABEL: @test_vmulq_laneq_s32(
 240 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 241 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
 242 // CHECK:   ret <4 x i32> [[MUL]]
 243 int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {  // Covers vmulq_laneq_s32; expected IR above: mul(a, splat of v lane 3).
 244   return vmulq_laneq_s32(a, v, 3);  // lane 3 is the last element of the 4-element v
 245 }
 246
 247 // CHECK-LABEL: @test_vmul_laneq_u16(
 248 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 249 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
 250 // CHECK:   ret <4 x i16> [[MUL]]
 251 uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {  // Covers vmul_laneq_u16; expected IR above: mul(a, v lane 7 splatted to 4 lanes).
 252   return vmul_laneq_u16(a, v, 7);  // lane 7 is the last element of the 8-element v
 253 }
 254
 255 // CHECK-LABEL: @test_vmulq_laneq_u16(
 256 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 257 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
 258 // CHECK:   ret <8 x i16> [[MUL]]
 259 uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {  // Covers vmulq_laneq_u16; expected IR above: mul(a, splat of v lane 7).
 260   return vmulq_laneq_u16(a, v, 7);  // lane 7 is the last element of the 8-element v
 261 }
 262
 263 // CHECK-LABEL: @test_vmul_laneq_u32(
 264 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 265 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
 266 // CHECK:   ret <2 x i32> [[MUL]]
 267 uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {  // Covers vmul_laneq_u32; expected IR above: mul(a, v lane 3 splatted to 2 lanes).
 268   return vmul_laneq_u32(a, v, 3);  // lane 3 is the last element of the 4-element v
 269 }
 270
 271 // CHECK-LABEL: @test_vmulq_laneq_u32(
 272 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 273 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
 274 // CHECK:   ret <4 x i32> [[MUL]]
 275 uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {  // Covers vmulq_laneq_u32; expected IR above: mul(a, splat of v lane 3).
 276   return vmulq_laneq_u32(a, v, 3);  // lane 3 is the last element of the 4-element v
 277 }
 278
 279 // CHECK-LABEL: @test_vfma_lane_f32(
 280 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
 281 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
 282 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
 283 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
 284 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
 285 // CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
 286 // CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
 287 // CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
 288 // CHECK:   ret <2 x float> [[FMLA2]]
 289 float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {  // Covers vfma_lane_f32; expected IR above: llvm.fma.v2f32(b, splat of v lane 1, a), operands routed through <8 x i8> bitcasts.
 290   return vfma_lane_f32(a, b, v, 1);  // lane 1 is the last element of the 2-element v
 291 }
 292
 293 // CHECK-LABEL: @test_vfmaq_lane_f32(
 294 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
 295 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
 296 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
 297 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
 298 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 299 // CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
 300 // CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
 301 // CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
 302 // CHECK:   ret <4 x float> [[FMLA2]]
 303 float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {  // Covers vfmaq_lane_f32; expected IR above: llvm.fma.v4f32(b, v lane 1 splatted to 4 lanes, a).
 304   return vfmaq_lane_f32(a, b, v, 1);  // lane 1 is the last element of the 2-element v
 305 }
 306
 307 // CHECK-LABEL: @test_vfma_laneq_f32(
 308 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
 309 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
 310 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
 311 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
 312 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
 313 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
 314 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
 315 // CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
 316 // CHECK:   ret <2 x float> [[TMP6]]
 317 float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {  // Covers vfma_laneq_f32; expected IR above: llvm.fma.v2f32(v lane 3 splatted to 2 lanes, b, a) -- the lane operand comes first here.
 318   return vfma_laneq_f32(a, b, v, 3);  // lane 3 is the last element of the 4-element v
 319 }
 320
 321 // CHECK-LABEL: @test_vfmaq_laneq_f32(
 322 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
 323 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
 324 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
 325 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
 326 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
 327 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
 328 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 329 // CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
 330 // CHECK:   ret <4 x float> [[TMP6]]
 331 float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {  // Covers vfmaq_laneq_f32; expected IR above: llvm.fma.v4f32(splat of v lane 3, b, a) -- the lane operand comes first here.
 332   return vfmaq_laneq_f32(a, b, v, 3);  // lane 3 is the last element of the 4-element v
 333 }
 334
 335 // CHECK-LABEL: @test_vfms_lane_f32(
 336 // CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
 337 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
 338 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
 339 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
 340 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
 341 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
 342 // CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
 343 // CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
 344 // CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
 345 // CHECK:   ret <2 x float> [[FMLA2]]
 346 float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {  // Covers vfms_lane_f32; expected IR above negates b (fsub from -0.0) and then calls llvm.fma.v2f32(negated b, splat of v lane 1, a).
 347   return vfms_lane_f32(a, b, v, 1);  // lane 1 is the last element of the 2-element v
 348 }
 349
 350 // CHECK-LABEL: @test_vfmsq_lane_f32(
 351 // CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
 352 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
 353 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
 354 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
 355 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
 356 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 357 // CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
 358 // CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
 359 // CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
 360 // CHECK:   ret <4 x float> [[FMLA2]]
 361 float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {  // Covers vfmsq_lane_f32; expected IR above negates b (fsub from -0.0) and then calls llvm.fma.v4f32(negated b, v lane 1 splatted to 4 lanes, a).
 362   return vfmsq_lane_f32(a, b, v, 1);  // lane 1 is the last element of the 2-element v
 363 }
 364
 365 // CHECK-LABEL: @test_vfms_laneq_f32(
 366 // CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
 367 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
 368 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
 369 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
 370 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
 371 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
 372 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
 373 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
 374 // CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
 375 // CHECK:   ret <2 x float> [[TMP6]]
 376 float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {  // Covers vfms_laneq_f32; expected IR above negates b (fsub from -0.0) and then calls llvm.fma.v2f32(v lane 3 splatted to 2 lanes, negated b, a).
 377   return vfms_laneq_f32(a, b, v, 3);  // lane 3 is the last element of the 4-element v
 378 }
 379
 380 // CHECK-LABEL: @test_vfmsq_laneq_f32(
 381 // CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
 382 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
 383 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
 384 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
 385 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
 386 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
 387 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
 388 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 389 // CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
 390 // CHECK:   ret <4 x float> [[TMP6]]
 391 float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {  // Covers vfmsq_laneq_f32; expected IR above negates b (fsub from -0.0) and then calls llvm.fma.v4f32(splat of v lane 3, negated b, a).
 392   return vfmsq_laneq_f32(a, b, v, 3);  // lane 3 is the last element of the 4-element v
 393 }
 394
 395 // CHECK-LABEL: @test_vfmaq_lane_f64(
 396 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
 397 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
 398 // CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
 399 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
 400 // CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
 401 // CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
 402 // CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
 403 // CHECK:   [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
 404 // CHECK:   ret <2 x double> [[FMLA2]]
 405 float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {  // Covers vfmaq_lane_f64; expected IR above: llvm.fma.v2f64(b, v lane 0 broadcast to 2 lanes, a).
 406   return vfmaq_lane_f64(a, b, v, 0);  // v has a single element, so lane 0 is the only lane; the expected shuffle mask is zeroinitializer
 407 }
 408
 409 // CHECK-LABEL: @test_vfmaq_laneq_f64(
 410 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
 411 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
 412 // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
 413 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
 414 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
 415 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
 416 // CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
 417 // CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
 418 // CHECK:   ret <2 x double> [[TMP6]]
 419 float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {  // Covers vfmaq_laneq_f64; expected IR above: llvm.fma.v2f64(splat of v lane 1, b, a) -- the lane operand comes first here.
 420   return vfmaq_laneq_f64(a, b, v, 1);  // lane 1 is the last element of the 2-element v
 421 }
 422
 423 // CHECK-LABEL: @test_vfmsq_lane_f64(
 424 // CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
 425 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
 426 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
 427 // CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
 428 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
 429 // CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
 430 // CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
 431 // CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
 432 // CHECK:   [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
 433 // CHECK:   ret <2 x double> [[FMLA2]]
 434 float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {  // Covers vfmsq_lane_f64; expected IR above negates b (fsub from -0.0) and then calls llvm.fma.v2f64(negated b, v lane 0 broadcast to 2 lanes, a).
 435   return vfmsq_lane_f64(a, b, v, 0);  // v has a single element, so lane 0 is the only lane; the expected shuffle mask is zeroinitializer
 436 }
 437
 438 // CHECK-LABEL: @test_vfmsq_laneq_f64(
 439 // CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
 440 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
 441 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
 442 // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
 443 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
 444 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
 445 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
 446 // CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
 447 // CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
 448 // CHECK:   ret <2 x double> [[TMP6]]
 449 float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {  // Covers vfmsq_laneq_f64; expected IR above negates b (fsub from -0.0) and then calls llvm.fma.v2f64(splat of v lane 1, negated b, a).
 450   return vfmsq_laneq_f64(a, b, v, 1);  // lane 1 is the last element of the 2-element v
 451 }
 452
 453 // CHECK-LABEL: @test_vfmas_laneq_f32(
 454 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
 455 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
 456 // CHECK:   [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
 457 // CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
 458 // CHECK:   ret float [[TMP2]]
 459 float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {  // Covers the scalar vfmas_laneq_f32; expected IR above: extractelement of v lane 3, then llvm.fma.f32(b, extracted lane, a).
 460   return vfmas_laneq_f32(a, b, v, 3);  // lane 3 is the last element of the 4-element v
 461 }
 462
 463 // CHECK-LABEL: @test_vfmsd_lane_f64(
 464 // CHECK:   [[SUB:%.*]] = fsub double -0.000000e+00, %b
 465 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %v to <8 x i8>
 466 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
 467 // CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
 468 // CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a)
 469 // CHECK:   ret double [[TMP2]]
 470 float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {  // Covers the scalar vfmsd_lane_f64; expected IR above negates b (fsub from -0.0), extracts v lane 0, then calls llvm.fma.f64(negated b, extracted lane, a).
 471   return vfmsd_lane_f64(a, b, v, 0);  // v has a single element, so lane 0 is the only lane
 472 }
 473
 474 // CHECK-LABEL: @test_vfmss_laneq_f32(
 475 // CHECK:   [[SUB:%.*]] = fsub float -0.000000e+00, %b
 476 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
 477 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
 478 // CHECK:   [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
 479 // CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
 480 // CHECK:   ret float [[TMP2]]
 481 float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {  // Covers the scalar vfmss_laneq_f32; expected IR above negates b (fsub from -0.0), extracts v lane 3, then calls llvm.fma.f32(negated b, extracted lane, a).
 482   return vfmss_laneq_f32(a, b, v, 3);  // lane 3 is the last element of the 4-element v
 483 }
 484
 485 // CHECK-LABEL: @test_vfmsd_laneq_f64(
 486 // CHECK:   [[SUB:%.*]] = fsub double -0.000000e+00, %b
 487 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v to <16 x i8>
 488 // CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
 489 // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
 490 // CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a)
 491 // CHECK:   ret double [[TMP2]]
 492 float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {  // Covers the scalar vfmsd_laneq_f64; expected IR above negates b (fsub from -0.0), extracts v lane 1, then calls llvm.fma.f64(negated b, extracted lane, a).
 493   return vfmsd_laneq_f64(a, b, v, 1);  // lane 1 is the last element of the 2-element v
 494 }
 495
 496 // CHECK-LABEL: @test_vmlal_lane_s16(
 497 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 498 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 499 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 500 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
 501 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 502 // CHECK:   ret <4 x i32> [[ADD]]
 503 int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {  // Covers vmlal_lane_s16; expected IR above: add(a, llvm.aarch64.neon.smull.v4i32(b, splat of v lane 3)).
 504   return vmlal_lane_s16(a, b, v, 3);  // lane 3 is the last element of the 4-element v
 505 }
 506
 507 // CHECK-LABEL: @test_vmlal_lane_s32(
 508 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 509 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 510 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 511 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
 512 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
 513 // CHECK:   ret <2 x i64> [[ADD]]
 514 int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {  // Covers vmlal_lane_s32; expected IR above: add(a, llvm.aarch64.neon.smull.v2i64(b, splat of v lane 1)).
 515   return vmlal_lane_s32(a, b, v, 1);  // lane 1 is the last element of the 2-element v
 516 }
 517
 518 // CHECK-LABEL: @test_vmlal_laneq_s16(
 519 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 520 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 521 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 522 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
 523 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 524 // CHECK:   ret <4 x i32> [[ADD]]
 525 int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {  // Covers vmlal_laneq_s16; expected IR above: add(a, llvm.aarch64.neon.smull.v4i32(b, v lane 7 splatted to 4 lanes)).
 526   return vmlal_laneq_s16(a, b, v, 7);  // lane 7 is the last element of the 8-element v
 527 }
 528
 529 // CHECK-LABEL: @test_vmlal_laneq_s32(
 530 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 531 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 532 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 533 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
 534 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
 535 // CHECK:   ret <2 x i64> [[ADD]]
 536 int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {  // Covers vmlal_laneq_s32; expected IR above: add(a, llvm.aarch64.neon.smull.v2i64(b, v lane 3 splatted to 2 lanes)).
 537   return vmlal_laneq_s32(a, b, v, 3);  // lane 3 is the last element of the 4-element v
 538 }
 539
 540 // CHECK-LABEL: @test_vmlal_high_lane_s16(
 541 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 542 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 543 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 544 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 545 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
 546 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 547 // CHECK:   ret <4 x i32> [[ADD]]
 548 int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {  // Covers vmlal_high_lane_s16; expected IR above takes b lanes 4-7, smull's them with the splat of v lane 3, then adds the product to a.
 549   return vmlal_high_lane_s16(a, b, v, 3);  // lane 3 is the last element of the 4-element v
 550 }
 551
 552 // CHECK-LABEL: @test_vmlal_high_lane_s32(
 553 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 554 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 555 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 556 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 557 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
 558 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
 559 // CHECK:   ret <2 x i64> [[ADD]]
 560 int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {  // Covers vmlal_high_lane_s32; expected IR above takes b lanes 2-3, smull's them with the splat of v lane 1, then adds the product to a.
 561   return vmlal_high_lane_s32(a, b, v, 1);  // lane 1 is the last element of the 2-element v
 562 }
 563
 564 // CHECK-LABEL: @test_vmlal_high_laneq_s16(
 565 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 566 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 567 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 568 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 569 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
 570 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 571 // CHECK:   ret <4 x i32> [[ADD]]
 572 int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {  // Expected IR above: smull of b's elements 4-7 with lane 7 of v splatted, then add to a.
 573   return vmlal_high_laneq_s16(a, b, v, 7);  // 7 is the constant lane index into the 8-element v.
 574 }
 575
 576 // CHECK-LABEL: @test_vmlal_high_laneq_s32(
 577 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 578 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 579 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 580 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 581 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
 582 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
 583 // CHECK:   ret <2 x i64> [[ADD]]
 584 int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {  // Expected IR above: smull of b's elements 2-3 with lane 3 of v splatted, then add to a.
 585   return vmlal_high_laneq_s32(a, b, v, 3);  // 3 is the constant lane index into the 4-element v.
 586 }
 587
 588 // CHECK-LABEL: @test_vmlsl_lane_s16(
 589 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 590 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 591 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 592 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
 593 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
 594 // CHECK:   ret <4 x i32> [[SUB]]
 595 int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {  // Expected IR above: smull of b with lane 3 of v splatted, then sub from a.
 596   return vmlsl_lane_s16(a, b, v, 3);  // 3 is the constant lane index into the 4-element v.
 597 }
 598
 599 // CHECK-LABEL: @test_vmlsl_lane_s32(
 600 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 601 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 602 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 603 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
 604 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
 605 // CHECK:   ret <2 x i64> [[SUB]]
 606 int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {  // Expected IR above: smull of b with lane 1 of v splatted, then sub from a.
 607   return vmlsl_lane_s32(a, b, v, 1);  // 1 is the constant lane index into the 2-element v.
 608 }
 609
 610 // CHECK-LABEL: @test_vmlsl_laneq_s16(
 611 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 612 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 613 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 614 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
 615 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
 616 // CHECK:   ret <4 x i32> [[SUB]]
 617 int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {  // Expected IR above: smull of b with lane 7 of v splatted, then sub from a.
 618   return vmlsl_laneq_s16(a, b, v, 7);  // 7 is the constant lane index into the 8-element v.
 619 }
 620
 621 // CHECK-LABEL: @test_vmlsl_laneq_s32(
 622 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 623 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 624 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 625 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
 626 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
 627 // CHECK:   ret <2 x i64> [[SUB]]
 628 int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {  // Expected IR above: smull of b with lane 3 of v splatted, then sub from a.
 629   return vmlsl_laneq_s32(a, b, v, 3);  // 3 is the constant lane index into the 4-element v.
 630 }
 631
 632 // CHECK-LABEL: @test_vmlsl_high_lane_s16(
 633 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 634 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 635 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 636 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 637 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
 638 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
 639 // CHECK:   ret <4 x i32> [[SUB]]
 640 int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {  // Expected IR above: smull of b's elements 4-7 with lane 3 of v splatted, then sub from a.
 641   return vmlsl_high_lane_s16(a, b, v, 3);  // 3 is the constant lane index into the 4-element v.
 642 }
 643
 644 // CHECK-LABEL: @test_vmlsl_high_lane_s32(
 645 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 646 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 647 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 648 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 649 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
 650 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
 651 // CHECK:   ret <2 x i64> [[SUB]]
 652 int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {  // Expected IR above: smull of b's elements 2-3 with lane 1 of v splatted, then sub from a.
 653   return vmlsl_high_lane_s32(a, b, v, 1);  // 1 is the constant lane index into the 2-element v.
 654 }
 655
 656 // CHECK-LABEL: @test_vmlsl_high_laneq_s16(
 657 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 658 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 659 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 660 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 661 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
 662 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
 663 // CHECK:   ret <4 x i32> [[SUB]]
 664 int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {  // Expected IR above: smull of b's elements 4-7 with lane 7 of v splatted, then sub from a.
 665   return vmlsl_high_laneq_s16(a, b, v, 7);  // 7 is the constant lane index into the 8-element v.
 666 }
 667
 668 // CHECK-LABEL: @test_vmlsl_high_laneq_s32(
 669 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 670 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 671 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 672 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 673 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
 674 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
 675 // CHECK:   ret <2 x i64> [[SUB]]
 676 int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {  // Expected IR above: smull of b's elements 2-3 with lane 3 of v splatted, then sub from a.
 677   return vmlsl_high_laneq_s32(a, b, v, 3);  // 3 is the constant lane index into the 4-element v.
 678 }
 679
 680 // CHECK-LABEL: @test_vmlal_lane_u16(
 681 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 682 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 683 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 684 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
 685 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 686 // CHECK:   ret <4 x i32> [[ADD]]
 687 int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {  // NOTE(review): signed vector types with a _u16 intrinsic (the vmull_*_u tests use uintNxM_t); confirm this is intentional.
 688   return vmlal_lane_u16(a, b, v, 3);  // Lane 3 of the 4-element v; expected IR above: umull of b with that lane splatted, then add to a.
 689 }
 690
 691 // CHECK-LABEL: @test_vmlal_lane_u32(
 692 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 693 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 694 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 695 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
 696 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
 697 // CHECK:   ret <2 x i64> [[ADD]]
 698 int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {  // NOTE(review): signed vector types with a _u32 intrinsic (the vmull_*_u tests use uintNxM_t); confirm this is intentional.
 699   return vmlal_lane_u32(a, b, v, 1);  // Lane 1 of the 2-element v; expected IR above: umull of b with that lane splatted, then add to a.
 700 }
 701
 702 // CHECK-LABEL: @test_vmlal_laneq_u16(
 703 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 704 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 705 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 706 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
 707 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 708 // CHECK:   ret <4 x i32> [[ADD]]
 709 int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {  // NOTE(review): signed vector types with a _u16 intrinsic (the vmull_*_u tests use uintNxM_t); confirm this is intentional.
 710   return vmlal_laneq_u16(a, b, v, 7);  // Lane 7 of the 8-element v; expected IR above: umull of b with that lane splatted, then add to a.
 711 }
 712
 713 // CHECK-LABEL: @test_vmlal_laneq_u32(
 714 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 715 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 716 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 717 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
 718 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
 719 // CHECK:   ret <2 x i64> [[ADD]]
 720 int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {  // NOTE(review): signed vector types with a _u32 intrinsic (the vmull_*_u tests use uintNxM_t); confirm this is intentional.
 721   return vmlal_laneq_u32(a, b, v, 3);  // Lane 3 of the 4-element v; expected IR above: umull of b with that lane splatted, then add to a.
 722 }
 723
 724 // CHECK-LABEL: @test_vmlal_high_lane_u16(
 725 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 726 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 727 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 728 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 729 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
 730 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 731 // CHECK:   ret <4 x i32> [[ADD]]
 732 int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {  // NOTE(review): signed vector types with a _u16 intrinsic (the vmull_*_u tests use uintNxM_t); confirm this is intentional.
 733   return vmlal_high_lane_u16(a, b, v, 3);  // Lane 3 of the 4-element v; expected IR above: umull of b's elements 4-7 with that lane splatted, then add to a.
 734 }
 735
 736 // CHECK-LABEL: @test_vmlal_high_lane_u32(
 737 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 738 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 739 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 740 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 741 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
 742 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
 743 // CHECK:   ret <2 x i64> [[ADD]]
 744 int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {  // NOTE(review): signed vector types with a _u32 intrinsic (the vmull_*_u tests use uintNxM_t); confirm this is intentional.
 745   return vmlal_high_lane_u32(a, b, v, 1);  // Lane 1 of the 2-element v; expected IR above: umull of b's elements 2-3 with that lane splatted, then add to a.
 746 }
 747
 748 // CHECK-LABEL: @test_vmlal_high_laneq_u16(
 749 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 750 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 751 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 752 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 753 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
 754 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
 755 // CHECK:   ret <4 x i32> [[ADD]]
 756 int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {  // NOTE(review): signed vector types with a _u16 intrinsic (the vmull_*_u tests use uintNxM_t); confirm this is intentional.
 757   return vmlal_high_laneq_u16(a, b, v, 7);  // Lane 7 of the 8-element v; expected IR above: umull of b's elements 4-7 with that lane splatted, then add to a.
 758 }
 759
 760 // CHECK-LABEL: @test_vmlal_high_laneq_u32(
 761 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 762 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 763 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 764 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 765 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
 766 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
 767 // CHECK:   ret <2 x i64> [[ADD]]
 768 int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {  // NOTE(review): signed vector types with a _u32 intrinsic (the vmull_*_u tests use uintNxM_t); confirm this is intentional.
 769   return vmlal_high_laneq_u32(a, b, v, 3);  // Lane 3 of the 4-element v; expected IR above: umull of b's elements 2-3 with that lane splatted, then add to a.
 770 }
 771
 772 // CHECK-LABEL: @test_vmlsl_lane_u16(
 773 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 774 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 775 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 776 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
 777 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
 778 // CHECK:   ret <4 x i32> [[SUB]]
 779 int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {  // NOTE(review): signed vector types with a _u16 intrinsic (the vmull_*_u tests use uintNxM_t); confirm this is intentional.
 780   return vmlsl_lane_u16(a, b, v, 3);  // Lane 3 of the 4-element v; expected IR above: umull of b with that lane splatted, then sub from a.
 781 }
 782
 783 // CHECK-LABEL: @test_vmlsl_lane_u32(
 784 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 785 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 786 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 787 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
 788 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
 789 // CHECK:   ret <2 x i64> [[SUB]]
 790 int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {  // NOTE(review): signed vector types with a _u32 intrinsic (the vmull_*_u tests use uintNxM_t); confirm this is intentional.
 791   return vmlsl_lane_u32(a, b, v, 1);  // Lane 1 of the 2-element v; expected IR above: umull of b with that lane splatted, then sub from a.
 792 }
 793
 794 // CHECK-LABEL: @test_vmlsl_laneq_u16(
 795 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 796 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
 797 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 798 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
 799 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
 800 // CHECK:   ret <4 x i32> [[SUB]]
 801 int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {  // NOTE(review): signed vector types with a _u16 intrinsic (the vmull_*_u tests use uintNxM_t); confirm this is intentional.
 802   return vmlsl_laneq_u16(a, b, v, 7);  // Lane 7 of the 8-element v; expected IR above: umull of b with that lane splatted, then sub from a.
 803 }
 804
 805 // CHECK-LABEL: @test_vmlsl_laneq_u32(
 806 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 807 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
 808 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 809 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
 810 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
 811 // CHECK:   ret <2 x i64> [[SUB]]
 812 int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {  // NOTE(review): signed vector types with a _u32 intrinsic (the vmull_*_u tests use uintNxM_t); confirm this is intentional.
 813   return vmlsl_laneq_u32(a, b, v, 3);  // Lane 3 of the 4-element v; expected IR above: umull of b with that lane splatted, then sub from a.
 814 }
 815
 816 // CHECK-LABEL: @test_vmlsl_high_lane_u16(
 817 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 818 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 819 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 820 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 821 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
 822 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
 823 // CHECK:   ret <4 x i32> [[SUB]]
 824 int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {  // NOTE(review): signed vector types with a _u16 intrinsic (the vmull_*_u tests use uintNxM_t); confirm this is intentional.
 825   return vmlsl_high_lane_u16(a, b, v, 3);  // Lane 3 of the 4-element v; expected IR above: umull of b's elements 4-7 with that lane splatted, then sub from a.
 826 }
 827
 828 // CHECK-LABEL: @test_vmlsl_high_lane_u32(
 829 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 830 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 831 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 832 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 833 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
 834 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
 835 // CHECK:   ret <2 x i64> [[SUB]]
 836 int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {  // NOTE(review): signed vector types with a _u32 intrinsic (the vmull_*_u tests use uintNxM_t); confirm this is intentional.
 837   return vmlsl_high_lane_u32(a, b, v, 1);  // Lane 1 of the 2-element v; expected IR above: umull of b's elements 2-3 with that lane splatted, then sub from a.
 838 }
 839
 840 // CHECK-LABEL: @test_vmlsl_high_laneq_u16(
 841 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 842 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 843 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 844 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 845 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
 846 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
 847 // CHECK:   ret <4 x i32> [[SUB]]
 848 int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {  // NOTE(review): signed vector types with a _u16 intrinsic (the vmull_*_u tests use uintNxM_t); confirm this is intentional.
 849   return vmlsl_high_laneq_u16(a, b, v, 7);  // Lane 7 of the 8-element v; expected IR above: umull of b's elements 4-7 with that lane splatted, then sub from a.
 850 }
 851
 852 // CHECK-LABEL: @test_vmlsl_high_laneq_u32(
 853 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
 854 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 855 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 856 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 857 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
 858 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
 859 // CHECK:   ret <2 x i64> [[SUB]]
 860 int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {  // NOTE(review): signed vector types with a _u32 intrinsic (the vmull_*_u tests use uintNxM_t); confirm this is intentional.
 861   return vmlsl_high_laneq_u32(a, b, v, 3);  // Lane 3 of the 4-element v; expected IR above: umull of b's elements 2-3 with that lane splatted, then sub from a.
 862 }
 863
 864 // CHECK-LABEL: @test_vmull_lane_s16(
 865 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 866 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 867 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 868 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
 869 // CHECK:   ret <4 x i32> [[VMULL2_I]]
 870 int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {  // Expected IR above: smull of a with lane 3 of v splatted, returned directly.
 871   return vmull_lane_s16(a, v, 3);  // 3 is the constant lane index into the 4-element v.
 872 }
 873
 874 // CHECK-LABEL: @test_vmull_lane_s32(
 875 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 876 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 877 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 878 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
 879 // CHECK:   ret <2 x i64> [[VMULL2_I]]
 880 int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {  // Expected IR above: smull of a with lane 1 of v splatted, returned directly.
 881   return vmull_lane_s32(a, v, 1);  // 1 is the constant lane index into the 2-element v.
 882 }
 883
 884 // CHECK-LABEL: @test_vmull_lane_u16(
 885 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 886 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 887 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 888 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
 889 // CHECK:   ret <4 x i32> [[VMULL2_I]]
 890 uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {  // Expected IR above: umull of a with lane 3 of v splatted, returned directly.
 891   return vmull_lane_u16(a, v, 3);  // 3 is the constant lane index into the 4-element v.
 892 }
 893
 894 // CHECK-LABEL: @test_vmull_lane_u32(
 895 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 896 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 897 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 898 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
 899 // CHECK:   ret <2 x i64> [[VMULL2_I]]
 900 uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {  // Expected IR above: umull of a with lane 1 of v splatted, returned directly.
 901   return vmull_lane_u32(a, v, 1);  // 1 is the constant lane index into the 2-element v.
 902 }
 903
 904 // CHECK-LABEL: @test_vmull_high_lane_s16(
 905 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 906 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 907 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 908 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 909 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
 910 // CHECK:   ret <4 x i32> [[VMULL2_I]]
 911 int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {  // Expected IR above: smull of a's elements 4-7 with lane 3 of v splatted, returned directly.
 912   return vmull_high_lane_s16(a, v, 3);  // 3 is the constant lane index into the 4-element v.
 913 }
 914
 915 // CHECK-LABEL: @test_vmull_high_lane_s32(
 916 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
 917 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 918 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 919 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 920 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
 921 // CHECK:   ret <2 x i64> [[VMULL2_I]]
 922 int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {  // Expected IR above: smull of a's elements 2-3 with lane 1 of v splatted, returned directly.
 923   return vmull_high_lane_s32(a, v, 1);  // 1 is the constant lane index into the 2-element v.
 924 }
 925
 926 // CHECK-LABEL: @test_vmull_high_lane_u16(
 927 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 928 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 929 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 930 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 931 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
 932 // CHECK:   ret <4 x i32> [[VMULL2_I]]
 933 uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {  // Expected IR above: umull of a's elements 4-7 with lane 3 of v splatted, returned directly.
 934   return vmull_high_lane_u16(a, v, 3);  // 3 is the constant lane index into the 4-element v.
 935 }
 936
 937 // CHECK-LABEL: @test_vmull_high_lane_u32(
 938 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
 939 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
 940 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 941 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 942 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
 943 // CHECK:   ret <2 x i64> [[VMULL2_I]]
 944 uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {  // Expected IR above: umull of a's elements 2-3 with lane 1 of v splatted, returned directly.
 945   return vmull_high_lane_u32(a, v, 1);  // 1 is the constant lane index into the 2-element v.
 946 }
 947
 948 // CHECK-LABEL: @test_vmull_laneq_s16(
 949 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 950 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 951 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 952 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
 953 // CHECK:   ret <4 x i32> [[VMULL2_I]]
 954 int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {  // Expected IR above: smull of a with lane 7 of v splatted, returned directly.
 955   return vmull_laneq_s16(a, v, 7);  // 7 is the constant lane index into the 8-element v.
 956 }
 957
 958 // CHECK-LABEL: @test_vmull_laneq_s32(
 959 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 960 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 961 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 962 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
 963 // CHECK:   ret <2 x i64> [[VMULL2_I]]
 964 int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {  // Expected IR above: smull of a with lane 3 of v splatted, returned directly.
 965   return vmull_laneq_s32(a, v, 3);  // 3 is the constant lane index into the 4-element v.
 966 }
 967
 968 // CHECK-LABEL: @test_vmull_laneq_u16(
 969 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 970 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
 971 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 972 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
 973 // CHECK:   ret <4 x i32> [[VMULL2_I]]
 974 uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {  // Expected IR above: umull of a with lane 7 of v splatted, returned directly.
 975   return vmull_laneq_u16(a, v, 7);  // 7 is the constant lane index into the 8-element v.
 976 }
 977
 978 // CHECK-LABEL: @test_vmull_laneq_u32(
 979 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
 980 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
 981 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
 982 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
 983 // CHECK:   ret <2 x i64> [[VMULL2_I]]
 984 uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {  // Expected IR above: umull of a with lane 3 of v splatted, returned directly.
 985   return vmull_laneq_u32(a, v, 3);  // 3 is the constant lane index into the 4-element v.
 986 }
 987
 988 // CHECK-LABEL: @test_vmull_high_laneq_s16(
 989 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 990 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 991 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 992 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
 993 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
 994 // CHECK:   ret <4 x i32> [[VMULL2_I]]
 995 int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {  // Expected IR above: smull of a's elements 4-7 with lane 7 of v splatted, returned directly.
 996   return vmull_high_laneq_s16(a, v, 7);  // 7 is the constant lane index into the 8-element v.
 997 }
 998
 999 // CHECK-LABEL: @test_vmull_high_laneq_s32(
1000 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1001 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1002 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1003 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1004 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1005 // CHECK:   ret <2 x i64> [[VMULL2_I]]
1006 int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {  // Expected IR above: smull of a's elements 2-3 with lane 3 of v splatted, returned directly.
1007   return vmull_high_laneq_s32(a, v, 3);  // 3 is the constant lane index into the 4-element v.
1008 }
1009
1010 // CHECK-LABEL: @test_vmull_high_laneq_u16(
1011 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1012 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1013 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1014 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1015 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1016 // CHECK:   ret <4 x i32> [[VMULL2_I]]
1017 uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {  // Expected IR above: umull of a's elements 4-7 with lane 7 of v splatted, returned directly.
1018   return vmull_high_laneq_u16(a, v, 7);  // 7 is the constant lane index into the 8-element v.
1019 }
1020
1021 // CHECK-LABEL: @test_vmull_high_laneq_u32(
1022 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1023 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1024 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1025 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1026 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1027 // CHECK:   ret <2 x i64> [[VMULL2_I]]
1028 uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {  // Expected IR above: umull of a's elements 2-3 with lane 3 of v splatted, returned directly.
1029   return vmull_high_laneq_u32(a, v, 3);  // 3 is the constant lane index into the 4-element v.
1030 }
1031
1032 // CHECK-LABEL: @test_vqdmlal_lane_s16(
1033 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1034 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1035 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1036 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1037 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
1038 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
1039 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
1040 int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {  // Expected IR above: sqdmull of b with lane 3 of v splatted, then sqadd of a and that product.
1041   return vqdmlal_lane_s16(a, b, v, 3);  // 3 is the constant lane index into the 4-element v.
1042 }
1043
1044 // CHECK-LABEL: @test_vqdmlal_lane_s32(
1045 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1046 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1047 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1048 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1049 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
1050 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
1051 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
1052 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {  // Expected IR above: sqdmull of b with lane 1 of v splatted, then sqadd of a and that product.
1053   return vqdmlal_lane_s32(a, b, v, 1);  // 1 is the constant lane index into the 2-element v.
1054 }
1055
1056 // CHECK-LABEL: @test_vqdmlal_high_lane_s16(
1057 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1058 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1059 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1060 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1061 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1062 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1063 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
1064 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
1065 int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {  // Expected IR above: sqdmull of b's elements 4-7 with lane 3 of v splatted, then sqadd of a and that product.
1066   return vqdmlal_high_lane_s16(a, b, v, 3);  // 3 is the constant lane index into the 4-element v.
1067 }
1068
1069 // CHECK-LABEL: @test_vqdmlal_high_lane_s32(
1070 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
1071 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1072 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1073 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1074 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1075 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1076 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
1077 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
1078 int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {  // Expected IR above: sqdmull of b's elements 2-3 with lane 1 of v splatted, then sqadd of a and that product.
1079   return vqdmlal_high_lane_s32(a, b, v, 1);  // 1 is the constant lane index into the 2-element v.
1080 }
1081
1082 // CHECK-LABEL: @test_vqdmlsl_lane_s16(
1083 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1084 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1085 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1086 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1087 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
1088 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
1089 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
1090 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {  // Codegen test: sqdmull(b, splat of v[3]), then sqsub from a.
1091   return vqdmlsl_lane_s16(a, b, v, 3);  // selects lane 3 of v
1092 }
1093
1094 // CHECK-LABEL: @test_vqdmlsl_lane_s32(
1095 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1096 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1097 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1098 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1099 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
1100 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
1101 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
1102 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {  // Codegen test: sqdmull(b, splat of v[1]), then sqsub from a.
1103   return vqdmlsl_lane_s32(a, b, v, 1);  // selects lane 1 of v
1104 }
1105
1106 // CHECK-LABEL: @test_vqdmlsl_high_lane_s16(
1107 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1108 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1109 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1110 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1111 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1112 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1113 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
1114 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
1115 int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {  // Codegen test: sqdmull(upper half of b, splat of v[3]), then sqsub from a.
1116   return vqdmlsl_high_lane_s16(a, b, v, 3);  // selects lane 3 of v
1117 }
1118
1119 // CHECK-LABEL: @test_vqdmlsl_high_lane_s32(
1120 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
1121 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1122 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1123 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1124 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1125 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1126 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
1127 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
1128 int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {  // Codegen test: sqdmull(upper half of b, splat of v[1]), then sqsub from a.
1129   return vqdmlsl_high_lane_s32(a, b, v, 1);  // selects lane 1 of v
1130 }
1131
1132 // CHECK-LABEL: @test_vqdmull_lane_s16(
1133 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1134 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1135 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1136 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
1137 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1138 // CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
1139 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {  // Codegen test: sqdmull.v4i32(a, splat of v[3]).
1140   return vqdmull_lane_s16(a, v, 3);  // selects lane 3 of v
1141 }
1142
1143 // CHECK-LABEL: @test_vqdmull_lane_s32(
1144 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1145 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1146 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1147 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
1148 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1149 // CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
1150 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {  // Codegen test: sqdmull.v2i64(a, splat of v[1]).
1151   return vqdmull_lane_s32(a, v, 1);  // selects lane 1 of v
1152 }
1153
1154 // CHECK-LABEL: @test_vqdmull_laneq_s16(
1155 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1156 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1157 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1158 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
1159 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1160 // CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
1161 int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {  // Codegen test: sqdmull.v4i32(a, 4-lane splat of v[3]) with an 8-lane v.
1162   return vqdmull_laneq_s16(a, v, 3);  // selects lane 3 of v
1163 }
1164
1165 // CHECK-LABEL: @test_vqdmull_laneq_s32(
1166 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1167 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1168 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1169 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
1170 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1171 // CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
1172 int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {  // Codegen test: sqdmull.v2i64(a, 2-lane splat of v[3]) with a 4-lane v.
1173   return vqdmull_laneq_s32(a, v, 3);  // selects lane 3 of v
1174 }
1175
1176 // CHECK-LABEL: @test_vqdmull_high_lane_s16(
1177 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1178 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1179 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1180 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1181 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1182 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1183 // CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
1184 int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {  // Codegen test: sqdmull.v4i32(upper half of a, splat of v[3]).
1185   return vqdmull_high_lane_s16(a, v, 3);  // selects lane 3 of v
1186 }
1187
1188 // CHECK-LABEL: @test_vqdmull_high_lane_s32(
1189 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1190 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1191 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1192 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1193 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1194 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1195 // CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
1196 int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {  // Codegen test: sqdmull.v2i64(upper half of a, splat of v[1]).
1197   return vqdmull_high_lane_s32(a, v, 1);  // selects lane 1 of v
1198 }
1199
1200 // CHECK-LABEL: @test_vqdmull_high_laneq_s16(
1201 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1202 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1203 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1204 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1205 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1206 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1207 // CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
1208 int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {  // Codegen test: sqdmull.v4i32(upper half of a, 4-lane splat of v[7]).
1209   return vqdmull_high_laneq_s16(a, v, 7);  // selects lane 7 of v
1210 }
1211
1212 // CHECK-LABEL: @test_vqdmull_high_laneq_s32(
1213 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1214 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1215 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1216 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1217 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1218 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1219 // CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
1220 int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {  // Codegen test: sqdmull.v2i64(upper half of a, 2-lane splat of v[3]).
1221   return vqdmull_high_laneq_s32(a, v, 3);  // selects lane 3 of v
1222 }
1223
1224 // CHECK-LABEL: @test_vqdmulh_lane_s16(
1225 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1226 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1227 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1228 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
1229 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
1230 // CHECK:   ret <4 x i16> [[VQDMULH_V2_I]]
1231 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {  // Codegen test: sqdmulh.v4i16(a, splat of v[3]).
1232   return vqdmulh_lane_s16(a, v, 3);  // selects lane 3 of v
1233 }
1234
1235 // CHECK-LABEL: @test_vqdmulhq_lane_s16(
1236 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1237 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
1238 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
1239 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
1240 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
1241 // CHECK:   ret <8 x i16> [[VQDMULHQ_V2_I]]
1242 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {  // Codegen test: sqdmulh.v8i16(a, 8-lane splat of v[3]) with a 4-lane v.
1243   return vqdmulhq_lane_s16(a, v, 3);  // selects lane 3 of v
1244 }
1245
1246 // CHECK-LABEL: @test_vqdmulh_lane_s32(
1247 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1248 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1249 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1250 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
1251 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
1252 // CHECK:   ret <2 x i32> [[VQDMULH_V2_I]]
1253 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {  // Codegen test: sqdmulh.v2i32(a, splat of v[1]).
1254   return vqdmulh_lane_s32(a, v, 1);  // selects lane 1 of v
1255 }
1256
1257 // CHECK-LABEL: @test_vqdmulhq_lane_s32(
1258 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1259 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1260 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
1261 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
1262 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
1263 // CHECK:   ret <4 x i32> [[VQDMULHQ_V2_I]]
1264 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {  // Codegen test: sqdmulh.v4i32(a, 4-lane splat of v[1]) with a 2-lane v.
1265   return vqdmulhq_lane_s32(a, v, 1);  // selects lane 1 of v
1266 }
1267
1268 // CHECK-LABEL: @test_vqrdmulh_lane_s16(
1269 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1270 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1271 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1272 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
1273 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
1274 // CHECK:   ret <4 x i16> [[VQRDMULH_V2_I]]
1275 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {  // Codegen test: sqrdmulh.v4i16(a, splat of v[3]).
1276   return vqrdmulh_lane_s16(a, v, 3);  // selects lane 3 of v
1277 }
1278
1279 // CHECK-LABEL: @test_vqrdmulhq_lane_s16(
1280 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1281 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
1282 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
1283 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
1284 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
1285 // CHECK:   ret <8 x i16> [[VQRDMULHQ_V2_I]]
1286 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {  // Codegen test: sqrdmulh.v8i16(a, 8-lane splat of v[3]) with a 4-lane v.
1287   return vqrdmulhq_lane_s16(a, v, 3);  // selects lane 3 of v
1288 }
1289
1290 // CHECK-LABEL: @test_vqrdmulh_lane_s32(
1291 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1292 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1293 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1294 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
1295 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
1296 // CHECK:   ret <2 x i32> [[VQRDMULH_V2_I]]
1297 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {  // Codegen test: sqrdmulh.v2i32(a, splat of v[1]).
1298   return vqrdmulh_lane_s32(a, v, 1);  // selects lane 1 of v
1299 }
1300
1301 // CHECK-LABEL: @test_vqrdmulhq_lane_s32(
1302 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1303 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1304 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
1305 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
1306 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
1307 // CHECK:   ret <4 x i32> [[VQRDMULHQ_V2_I]]
1308 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {  // Codegen test: sqrdmulh.v4i32(a, 4-lane splat of v[1]) with a 2-lane v.
1309   return vqrdmulhq_lane_s32(a, v, 1);  // selects lane 1 of v
1310 }
1311
1312 // CHECK-LABEL: @test_vmul_lane_f32(
1313 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
1314 // CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
1315 // CHECK:   ret <2 x float> [[MUL]]
1316 float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {  // Codegen test: plain fmul of a by a splat of v[1] (no intrinsic call expected).
1317   return vmul_lane_f32(a, v, 1);  // selects lane 1 of v
1318 }
1319
1320 // CHECK-LABEL: @test_vmul_lane_f64(
1321 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
1322 // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %v to <8 x i8>
1323 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
1324 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
1325 // CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
1326 // CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
1327 // CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
1328 // CHECK:   ret <1 x double> [[TMP5]]
1329
1330 float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {  // Codegen test: scalar fmul of a (as double) by element 0 of v, result bitcast back to <1 x double>.
1331   return vmul_lane_f64(a, v, 0);  // selects lane 0 of v (extractelement in the expected IR)
1332 }
1333
1334 // CHECK-LABEL: @test_vmulq_lane_f32(
1335 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1336 // CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
1337 // CHECK:   ret <4 x float> [[MUL]]
1338
1339 float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {  // Codegen test: fmul of a by a 4-lane splat of v[1] taken from a 2-lane v.
1340   return vmulq_lane_f32(a, v, 1);  // selects lane 1 of v
1341 }
1342
1343 // CHECK-LABEL: @test_vmulq_lane_f64(
1344 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
1345 // CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
1346 // CHECK:   ret <2 x double> [[MUL]]
1347 float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {  // Codegen test: fmul of a by a 2-lane splat of the single element of v.
1348   return vmulq_lane_f64(a, v, 0);  // selects lane 0 of v (zeroinitializer shuffle mask)
1349 }
1350
1351 // CHECK-LABEL: @test_vmul_laneq_f32(
1352 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
1353 // CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
1354 // CHECK:   ret <2 x float> [[MUL]]
1355 float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {  // Codegen test: fmul of a by a 2-lane splat of v[3] taken from a 4-lane v.
1356   return vmul_laneq_f32(a, v, 3);  // selects lane 3 of v
1357 }
1358
1359 // CHECK-LABEL: @test_vmul_laneq_f64(
1360 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
1361 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
1362 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
1363 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
1364 // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
1365 // CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
1366 // CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
1367 // CHECK:   ret <1 x double> [[TMP5]]
1368 float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {  // Codegen test: scalar fmul of a (as double) by element 1 of v, result bitcast back to <1 x double>.
1369   return vmul_laneq_f64(a, v, 1);  // selects lane 1 of v (extractelement in the expected IR)
1370 }
1371
1372 // CHECK-LABEL: @test_vmulq_laneq_f32(
1373 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1374 // CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
1375 // CHECK:   ret <4 x float> [[MUL]]
1376
1377 float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {  // Codegen test: fmul of a by a 4-lane splat of v[3].
1378   return vmulq_laneq_f32(a, v, 3);  // selects lane 3 of v
1379 }
1380
1381 // CHECK-LABEL: @test_vmulq_laneq_f64(
1382 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
1383 // CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
1384 // CHECK:   ret <2 x double> [[MUL]]
1385 float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {  // Codegen test: fmul of a by a 2-lane splat of v[1].
1386   return vmulq_laneq_f64(a, v, 1);  // selects lane 1 of v
1387 }
1388
1389 // CHECK-LABEL: @test_vmulx_lane_f32(
1390 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
1391 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1392 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
1393 // CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]])
1394 // CHECK:   ret <2 x float> [[VMULX2_I]]
1395 float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {  // Codegen test: fmulx.v2f32(a, splat of v[1]).
1396   return vmulx_lane_f32(a, v, 1);  // selects lane 1 of v
1397 }
1398
1399 // CHECK-LABEL: @test_vmulxq_lane_f32(
1400 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1401 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1402 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
1403 // CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]])
1404 // CHECK:   ret <4 x float> [[VMULX2_I]]
1405 float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {  // Codegen test: fmulx.v4f32(a, 4-lane splat of v[1]) with a 2-lane v.
1406   return vmulxq_lane_f32(a, v, 1);  // selects lane 1 of v
1407 }
1408
1409 // CHECK-LABEL: @test_vmulxq_lane_f64(
1410 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
1411 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
1412 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
1413 // CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]])
1414 // CHECK:   ret <2 x double> [[VMULX2_I]]
1415 float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {  // Codegen test: fmulx.v2f64(a, 2-lane splat of the single element of v).
1416   return vmulxq_lane_f64(a, v, 0);  // selects lane 0 of v (zeroinitializer shuffle mask)
1417 }
1418
1419 // CHECK-LABEL: @test_vmulx_laneq_f32(
1420 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
1421 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1422 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
1423 // CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]])
1424 // CHECK:   ret <2 x float> [[VMULX2_I]]
1425 float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {  // Codegen test: fmulx.v2f32(a, 2-lane splat of v[3]) with a 4-lane v.
1426   return vmulx_laneq_f32(a, v, 3);  // selects lane 3 of v
1427 }
1428
1429 // CHECK-LABEL: @test_vmulxq_laneq_f32(
1430 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1431 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1432 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
1433 // CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]])
1434 // CHECK:   ret <4 x float> [[VMULX2_I]]
1435 float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {  // Codegen test: fmulx.v4f32(a, splat of v[3]).
1436   return vmulxq_laneq_f32(a, v, 3);  // selects lane 3 of v
1437 }
1438
1439 // CHECK-LABEL: @test_vmulxq_laneq_f64(
1440 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
1441 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
1442 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
1443 // CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]])
1444 // CHECK:   ret <2 x double> [[VMULX2_I]]
1445 float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {  // Codegen test: fmulx.v2f64(a, splat of v[1]).
1446   return vmulxq_laneq_f64(a, v, 1);  // selects lane 1 of v
1447 }
1448
1449 // CHECK-LABEL: @test_vmla_lane_s16_0(
1450 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1451 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1452 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
1453 // CHECK:   ret <4 x i16> [[ADD]]
1454 int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {  // Lane-0 variant: expects a + b * splat(v[0]) with a zeroinitializer shuffle mask.
1455   return vmla_lane_s16(a, b, v, 0);  // selects lane 0 of v
1456 }
1457
1458 // CHECK-LABEL: @test_vmlaq_lane_s16_0(
1459 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1460 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1461 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
1462 // CHECK:   ret <8 x i16> [[ADD]]
1463 int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {  // Lane-0 variant: expects a + b * 8-lane splat(v[0]) from a 4-lane v.
1464   return vmlaq_lane_s16(a, b, v, 0);  // selects lane 0 of v
1465 }
1466
1467 // CHECK-LABEL: @test_vmla_lane_s32_0(
1468 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1469 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1470 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
1471 // CHECK:   ret <2 x i32> [[ADD]]
1472 int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {  // Lane-0 variant: expects a + b * splat(v[0]) with a zeroinitializer shuffle mask.
1473   return vmla_lane_s32(a, b, v, 0);  // selects lane 0 of v
1474 }
1475
1476 // CHECK-LABEL: @test_vmlaq_lane_s32_0(
1477 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1478 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1479 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
1480 // CHECK:   ret <4 x i32> [[ADD]]
1481 int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {  // Lane-0 variant: expects a + b * 4-lane splat(v[0]) from a 2-lane v.
1482   return vmlaq_lane_s32(a, b, v, 0);  // selects lane 0 of v
1483 }
1484
1485 // CHECK-LABEL: @test_vmla_laneq_s16_0(
1486 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1487 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1488 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
1489 // CHECK:   ret <4 x i16> [[ADD]]
1490 int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {  // Lane-0 variant: expects a + b * 4-lane splat(v[0]) from an 8-lane v.
1491   return vmla_laneq_s16(a, b, v, 0);  // selects lane 0 of v
1492 }
1493
1494 // CHECK-LABEL: @test_vmlaq_laneq_s16_0(
1495 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1496 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1497 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
1498 // CHECK:   ret <8 x i16> [[ADD]]
1499 int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {  // Lane-0 variant: expects a + b * splat(v[0]) on <8 x i16>.
1500   return vmlaq_laneq_s16(a, b, v, 0);  // selects lane 0 of v
1501 }
1502
1503 // CHECK-LABEL: @test_vmla_laneq_s32_0(
1504 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1505 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1506 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
1507 // CHECK:   ret <2 x i32> [[ADD]]
1508 int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {  // Lane-0 variant: expects a + b * 2-lane splat(v[0]) from a 4-lane v.
1509   return vmla_laneq_s32(a, b, v, 0);  // selects lane 0 of v
1510 }
1511
1512 // CHECK-LABEL: @test_vmlaq_laneq_s32_0(
1513 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1514 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1515 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
1516 // CHECK:   ret <4 x i32> [[ADD]]
1517 int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {  // Lane-0 variant: expects a + b * splat(v[0]) on <4 x i32>.
1518   return vmlaq_laneq_s32(a, b, v, 0);  // selects lane 0 of v
1519 }
1520
1521 // CHECK-LABEL: @test_vmls_lane_s16_0(
1522 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1523 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1524 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
1525 // CHECK:   ret <4 x i16> [[SUB]]
1526 int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {  // Lane-0 variant: expects a - b * splat(v[0]) with a zeroinitializer shuffle mask.
1527   return vmls_lane_s16(a, b, v, 0);  // selects lane 0 of v
1528 }
1529
1530 // CHECK-LABEL: @test_vmlsq_lane_s16_0(
1531 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1532 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1533 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
1534 // CHECK:   ret <8 x i16> [[SUB]]
1535 int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {  // Lane-0 variant: expects a - b * 8-lane splat(v[0]) from a 4-lane v.
1536   return vmlsq_lane_s16(a, b, v, 0);  // selects lane 0 of v
1537 }
1538
1539 // CHECK-LABEL: @test_vmls_lane_s32_0(
1540 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1541 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1542 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
1543 // CHECK:   ret <2 x i32> [[SUB]]
1544 int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {  // Lane-0 variant: expects a - b * splat(v[0]) with a zeroinitializer shuffle mask.
1545   return vmls_lane_s32(a, b, v, 0);  // selects lane 0 of v
1546 }
1547
1548 // CHECK-LABEL: @test_vmlsq_lane_s32_0(
1549 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1550 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1551 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
1552 // CHECK:   ret <4 x i32> [[SUB]]
1553 int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {  // Lane-0 variant: expects a - b * 4-lane splat(v[0]) from a 2-lane v.
1554   return vmlsq_lane_s32(a, b, v, 0);  // selects lane 0 of v
1555 }
1556
1557 // CHECK-LABEL: @test_vmls_laneq_s16_0(
1558 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1559 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1560 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
1561 // CHECK:   ret <4 x i16> [[SUB]]
1562 int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {  // Lane-0 variant: expects a - b * 4-lane splat(v[0]) from an 8-lane v.
1563   return vmls_laneq_s16(a, b, v, 0);  // selects lane 0 of v
1564 }
1565
1566 // CHECK-LABEL: @test_vmlsq_laneq_s16_0(
1567 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1568 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1569 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
1570 // CHECK:   ret <8 x i16> [[SUB]]
1571 int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {  // Lane-0 variant: expects a - b * splat(v[0]) on <8 x i16>.
1572   return vmlsq_laneq_s16(a, b, v, 0);  // selects lane 0 of v
1573 }
1574
1575 // CHECK-LABEL: @test_vmls_laneq_s32_0(
1576 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1577 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1578 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
1579 // CHECK:   ret <2 x i32> [[SUB]]
1580 int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {  // Lane-0 variant: expects a - b * 2-lane splat(v[0]) from a 4-lane v.
1581   return vmls_laneq_s32(a, b, v, 0);  // selects lane 0 of v
1582 }
1583
1584 // CHECK-LABEL: @test_vmlsq_laneq_s32_0(
1585 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1586 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1587 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
1588 // CHECK:   ret <4 x i32> [[SUB]]
1589 int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {  // Lane-0 variant: expects a - b * splat(v[0]) on <4 x i32>.
1590   return vmlsq_laneq_s32(a, b, v, 0);  // selects lane 0 of v
1591 }
1592
1593 // CHECK-LABEL: @test_vmul_lane_s16_0(
1594 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1595 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1596 // CHECK:   ret <4 x i16> [[MUL]]
1597 int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {  // Lane-0 variant: expects mul of a by splat(v[0]) on <4 x i16>.
1598   return vmul_lane_s16(a, v, 0);  // selects lane 0 of v
1599 }
1600
1601 // CHECK-LABEL: @test_vmulq_lane_s16_0(
1602 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1603 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1604 // CHECK:   ret <8 x i16> [[MUL]]
1605 int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {  // Lane-0 variant: expects mul of a by an 8-lane splat(v[0]) from a 4-lane v.
1606   return vmulq_lane_s16(a, v, 0);  // selects lane 0 of v
1607 }
1608
1609 // CHECK-LABEL: @test_vmul_lane_s32_0(
1610 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1611 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1612 // CHECK:   ret <2 x i32> [[MUL]]
1613 int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {  // Lane-0 variant: expects mul of a by splat(v[0]) on <2 x i32>.
1614   return vmul_lane_s32(a, v, 0);  // selects lane 0 of v
1615 }
1616
1617 // CHECK-LABEL: @test_vmulq_lane_s32_0(
1618 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1619 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1620 // CHECK:   ret <4 x i32> [[MUL]]
1621 int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {  // Lane-0 variant: expects mul of a by a 4-lane splat(v[0]) from a 2-lane v.
1622   return vmulq_lane_s32(a, v, 0);  // selects lane 0 of v
1623 }
1624
1625 // CHECK-LABEL: @test_vmul_lane_u16_0(
1626 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1627 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1628 // CHECK:   ret <4 x i16> [[MUL]]
1629 uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {  // Unsigned lane-0 variant: expected IR is the same mul on <4 x i16> as the signed test.
1630   return vmul_lane_u16(a, v, 0);  // selects lane 0 of v
1631 }
1632
1633 // CHECK-LABEL: @test_vmulq_lane_u16_0(
1634 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1635 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1636 // CHECK:   ret <8 x i16> [[MUL]]
1637 uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {  // Unsigned lane-0 variant: expects mul of a by an 8-lane splat(v[0]) from a 4-lane v.
1638   return vmulq_lane_u16(a, v, 0);  // selects lane 0 of v
1639 }
1640
1641 // CHECK-LABEL: @test_vmul_lane_u32_0(
1642 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1643 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1644 // CHECK:   ret <2 x i32> [[MUL]]
1645 uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {  // Unsigned lane-0 variant: expected IR is the same mul on <2 x i32> as the signed test.
1646   return vmul_lane_u32(a, v, 0);  // selects lane 0 of v
1647 }
1648
1649 // CHECK-LABEL: @test_vmulq_lane_u32_0(
1650 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1651 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1652 // CHECK:   ret <4 x i32> [[MUL]]
1653 uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {  // Unsigned lane-0 variant: expects mul of a by a 4-lane splat(v[0]) from a 2-lane v.
1654   return vmulq_lane_u32(a, v, 0);  // selects lane 0 of v
1655 }
1656
1657 // CHECK-LABEL: @test_vmul_laneq_s16_0(
1658 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1659 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1660 // CHECK:   ret <4 x i16> [[MUL]]
// Multiplies the four elements of a by lane 0 of the eight-element v. The
// expected IR above narrows v to <4 x i16> with an all-zero shuffle mask.
1661 int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
1662   return vmul_laneq_s16(a, v, 0);
1663 }
1664
1665 // CHECK-LABEL: @test_vmulq_laneq_s16_0(
1666 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1667 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1668 // CHECK:   ret <8 x i16> [[MUL]]
// Multiplies a by lane 0 of v, both eight-element vectors. The expected IR
// above is a zeroinitializer splat of v followed by an <8 x i16> mul.
1669 int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
1670   return vmulq_laneq_s16(a, v, 0);
1671 }
1672
1673 // CHECK-LABEL: @test_vmul_laneq_s32_0(
1674 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1675 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1676 // CHECK:   ret <2 x i32> [[MUL]]
// Multiplies the two elements of a by lane 0 of the four-element v. The
// expected IR above narrows v to <2 x i32> with an all-zero shuffle mask.
1677 int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
1678   return vmul_laneq_s32(a, v, 0);
1679 }
1680
1681 // CHECK-LABEL: @test_vmulq_laneq_s32_0(
1682 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1683 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1684 // CHECK:   ret <4 x i32> [[MUL]]
// Multiplies a by lane 0 of v, both four-element vectors. The expected IR
// above is a zeroinitializer splat of v followed by a <4 x i32> mul.
1685 int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
1686   return vmulq_laneq_s32(a, v, 0);
1687 }
1688
1689 // CHECK-LABEL: @test_vmul_laneq_u16_0(
1690 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1691 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1692 // CHECK:   ret <4 x i16> [[MUL]]
// Unsigned 16-bit form: four-element a times lane 0 of the eight-element v.
// The expected IR above narrows v to <4 x i16> with an all-zero shuffle mask.
1693 uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
1694   return vmul_laneq_u16(a, v, 0);
1695 }
1696
1697 // CHECK-LABEL: @test_vmulq_laneq_u16_0(
1698 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1699 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1700 // CHECK:   ret <8 x i16> [[MUL]]
// Unsigned 16-bit form with eight-element a and v. The expected IR above is a
// zeroinitializer splat of v followed by an <8 x i16> mul.
1701 uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
1702   return vmulq_laneq_u16(a, v, 0);
1703 }
1704
1705 // CHECK-LABEL: @test_vmul_laneq_u32_0(
1706 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1707 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1708 // CHECK:   ret <2 x i32> [[MUL]]
// Unsigned 32-bit form: two-element a times lane 0 of the four-element v.
// The expected IR above narrows v to <2 x i32> with an all-zero shuffle mask.
1709 uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
1710   return vmul_laneq_u32(a, v, 0);
1711 }
1712
1713 // CHECK-LABEL: @test_vmulq_laneq_u32_0(
1714 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1715 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1716 // CHECK:   ret <4 x i32> [[MUL]]
// Unsigned 32-bit form with four-element a and v. The expected IR above is a
// zeroinitializer splat of v followed by a <4 x i32> mul.
1717 uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
1718   return vmulq_laneq_u32(a, v, 0);
1719 }
1720
1721 // CHECK-LABEL: @test_vfma_lane_f32_0(
1722 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1723 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
1724 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1725 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1726 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
1727 // CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1728 // CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1729 // CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
1730 // CHECK:   ret <2 x float> [[FMLA2]]
// Fused multiply-add of b and lane 0 of v into accumulator a. The expected IR
// above calls llvm.fma.v2f32 with operands (b, splat of v lane 0, a).
1731 float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
1732   return vfma_lane_f32(a, b, v, 0);
1733 }
1734
1735 // CHECK-LABEL: @test_vfmaq_lane_f32_0(
1736 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1737 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
1738 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1739 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1740 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
1741 // CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1742 // CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1743 // CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
1744 // CHECK:   ret <4 x float> [[FMLA2]]
// Four-element fused multiply-add using lane 0 of the two-element v. The
// expected IR above splats v to <4 x float> and calls llvm.fma.v4f32 with
// operands (b, splat, a).
1745 float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
1746   return vfmaq_lane_f32(a, b, v, 0);
1747 }
1748
1749 // CHECK-LABEL: @test_vfma_laneq_f32_0(
1750 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1751 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
1752 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
1753 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1754 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1755 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
1756 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
1757 // CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
1758 // CHECK:   ret <2 x float> [[TMP6]]
// Two-element fused multiply-add using lane 0 of the four-element v. Unlike
// the vfma_lane_f32 test, the expected IR above passes the splat as the first
// llvm.fma.v2f32 operand: (splat, b, a).
1759 float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
1760   return vfma_laneq_f32(a, b, v, 0);
1761 }
1762
1763 // CHECK-LABEL: @test_vfmaq_laneq_f32_0(
1764 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1765 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
1766 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
1767 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1768 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1769 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
1770 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
1771 // CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
1772 // CHECK:   ret <4 x float> [[TMP6]]
// Four-element fused multiply-add using lane 0 of the four-element v. The
// expected IR above calls llvm.fma.v4f32 with operands (splat, b, a).
1773 float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
1774   return vfmaq_laneq_f32(a, b, v, 0);
1775 }
1776
1777 // CHECK-LABEL: @test_vfms_lane_f32_0(
1778 // CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
1779 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1780 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
1781 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1782 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1783 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
1784 // CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1785 // CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1786 // CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
1787 // CHECK:   ret <2 x float> [[FMLA2]]
// Fused multiply-subtract with lane 0 of v. The expected IR above negates b
// (fsub from -0.0) and then calls llvm.fma.v2f32 with (negated b, splat, a).
1788 float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
1789   return vfms_lane_f32(a, b, v, 0);
1790 }
1791
1792 // CHECK-LABEL: @test_vfmsq_lane_f32_0(
1793 // CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
1794 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1795 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
1796 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1797 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1798 // CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
1799 // CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1800 // CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1801 // CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
1802 // CHECK:   ret <4 x float> [[FMLA2]]
// Four-element fused multiply-subtract using lane 0 of the two-element v. The
// expected IR above negates b, splats v to <4 x float>, and calls
// llvm.fma.v4f32 with (negated b, splat, a).
1803 float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
1804   return vfmsq_lane_f32(a, b, v, 0);
1805 }
1806
1807 // CHECK-LABEL: @test_vfms_laneq_f32_0(
1808 // CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
1809 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1810 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
1811 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
1812 // CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1813 // CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1814 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
1815 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
1816 // CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
1817 // CHECK:   ret <2 x float> [[TMP6]]
// Two-element fused multiply-subtract using lane 0 of the four-element v. The
// expected IR above negates b and calls llvm.fma.v2f32 with
// (splat, negated b, a).
1818 float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
1819   return vfms_laneq_f32(a, b, v, 0);
1820 }
1821
1822 // CHECK-LABEL: @test_vfmsq_laneq_f32_0(
1823 // CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
1824 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1825 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
1826 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
1827 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1828 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1829 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
1830 // CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
1831 // CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
1832 // CHECK:   ret <4 x float> [[TMP6]]
// Four-element fused multiply-subtract using lane 0 of the four-element v.
// The expected IR above negates b and calls llvm.fma.v4f32 with
// (splat, negated b, a).
1833 float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
1834   return vfmsq_laneq_f32(a, b, v, 0);
1835 }
1836
1837 // CHECK-LABEL: @test_vfmaq_laneq_f64_0(
1838 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
1839 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
1840 // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
1841 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
1842 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
1843 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
1844 // CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
1845 // CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
1846 // CHECK:   ret <2 x double> [[TMP6]]
// Double-precision fused multiply-add using lane 0 of v. The expected IR
// above calls llvm.fma.v2f64 with operands (splat of v lane 0, b, a).
1847 float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
1848   return vfmaq_laneq_f64(a, b, v, 0);
1849 }
1850
1851 // CHECK-LABEL: @test_vfmsq_laneq_f64_0(
1852 // CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
1853 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
1854 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
1855 // CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
1856 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
1857 // CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
1858 // CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
1859 // CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
1860 // CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
1861 // CHECK:   ret <2 x double> [[TMP6]]
// Double-precision fused multiply-subtract using lane 0 of v. The expected IR
// above negates b (fsub from -0.0) and calls llvm.fma.v2f64 with
// (splat, negated b, a).
1862 float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
1863   return vfmsq_laneq_f64(a, b, v, 0);
1864 }
1865
1866 // CHECK-LABEL: @test_vmlal_lane_s16_0(
1867 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1868 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1869 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1870 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
1871 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
1872 // CHECK:   ret <4 x i32> [[ADD]]
// Signed widening multiply-accumulate: a + smull(b, splat of v lane 0). The
// expected IR above uses llvm.aarch64.neon.smull.v4i32 followed by an add.
1873 int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
1874   return vmlal_lane_s16(a, b, v, 0);
1875 }
1876
1877 // CHECK-LABEL: @test_vmlal_lane_s32_0(
1878 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1879 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1880 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1881 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
1882 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
1883 // CHECK:   ret <2 x i64> [[ADD]]
// Signed widening multiply-accumulate: a + smull(b, splat of v lane 0). The
// expected IR above uses llvm.aarch64.neon.smull.v2i64 followed by an add.
1884 int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
1885   return vmlal_lane_s32(a, b, v, 0);
1886 }
1887
1888 // CHECK-LABEL: @test_vmlal_laneq_s16_0(
1889 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1890 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1891 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1892 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
1893 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
1894 // CHECK:   ret <4 x i32> [[ADD]]
// Signed widening multiply-accumulate with an eight-element v. The expected
// IR above narrows lane 0 of v to a <4 x i16> splat, then smull.v4i32 and add.
1895 int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
1896   return vmlal_laneq_s16(a, b, v, 0);
1897 }
1898
1899 // CHECK-LABEL: @test_vmlal_laneq_s32_0(
1900 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1901 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1902 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1903 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
1904 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
1905 // CHECK:   ret <2 x i64> [[ADD]]
// Signed widening multiply-accumulate with a four-element v. The expected IR
// above narrows lane 0 of v to a <2 x i32> splat, then smull.v2i64 and add.
1906 int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
1907   return vmlal_laneq_s32(a, b, v, 0);
1908 }
1909
1910 // CHECK-LABEL: @test_vmlal_high_lane_s16_0(
1911 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1912 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1913 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1914 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1915 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1916 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
1917 // CHECK:   ret <4 x i32> [[ADD]]
// "High" signed widening multiply-accumulate. The expected IR above extracts
// elements 4..7 of b, splats lane 0 of v, then emits smull.v4i32 and an add.
1918 int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
1919   return vmlal_high_lane_s16(a, b, v, 0);
1920 }
1921
1922 // CHECK-LABEL: @test_vmlal_high_lane_s32_0(
1923 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
1924 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1925 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1926 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1927 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1928 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
1929 // CHECK:   ret <2 x i64> [[ADD]]
// "High" signed widening multiply-accumulate. The expected IR above extracts
// elements 2..3 of b, splats lane 0 of v, then emits smull.v2i64 and an add.
1930 int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
1931   return vmlal_high_lane_s32(a, b, v, 0);
1932 }
1933
1934 // CHECK-LABEL: @test_vmlal_high_laneq_s16_0(
1935 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1936 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1937 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1938 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1939 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
1940 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
1941 // CHECK:   ret <4 x i32> [[ADD]]
// "High" signed widening multiply-accumulate with an eight-element v. The
// expected IR above takes elements 4..7 of b and a <4 x i16> splat of v lane
// 0, then emits smull.v4i32 and an add.
1942 int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
1943   return vmlal_high_laneq_s16(a, b, v, 0);
1944 }
1945
1946 // CHECK-LABEL: @test_vmlal_high_laneq_s32_0(
1947 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
1948 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1949 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1950 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1951 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
1952 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
1953 // CHECK:   ret <2 x i64> [[ADD]]
// "High" signed widening multiply-accumulate with a four-element v. The
// expected IR above takes elements 2..3 of b and a <2 x i32> splat of v lane
// 0, then emits smull.v2i64 and an add.
1954 int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
1955   return vmlal_high_laneq_s32(a, b, v, 0);
1956 }
1957
1958 // CHECK-LABEL: @test_vmlsl_lane_s16_0(
1959 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1960 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1961 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1962 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
1963 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
1964 // CHECK:   ret <4 x i32> [[SUB]]
// Signed widening multiply-subtract: a - smull(b, splat of v lane 0). The
// expected IR above uses llvm.aarch64.neon.smull.v4i32 followed by a sub.
1965 int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
1966   return vmlsl_lane_s16(a, b, v, 0);
1967 }
1968
1969 // CHECK-LABEL: @test_vmlsl_lane_s32_0(
1970 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1971 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1972 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1973 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
1974 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
1975 // CHECK:   ret <2 x i64> [[SUB]]
// Signed widening multiply-subtract: a - smull(b, splat of v lane 0). The
// expected IR above uses llvm.aarch64.neon.smull.v2i64 followed by a sub.
1976 int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
1977   return vmlsl_lane_s32(a, b, v, 0);
1978 }
1979
1980 // CHECK-LABEL: @test_vmlsl_laneq_s16_0(
1981 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1982 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1983 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1984 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
1985 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
1986 // CHECK:   ret <4 x i32> [[SUB]]
// Signed widening multiply-subtract with an eight-element v. The expected IR
// above narrows lane 0 of v to a <4 x i16> splat, then smull.v4i32 and sub.
1987 int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
1988   return vmlsl_laneq_s16(a, b, v, 0);
1989 }
1990
1991 // CHECK-LABEL: @test_vmlsl_laneq_s32_0(
1992 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1993 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1994 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1995 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
1996 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
1997 // CHECK:   ret <2 x i64> [[SUB]]
// Signed widening multiply-subtract with a four-element v. The expected IR
// above narrows lane 0 of v to a <2 x i32> splat, then smull.v2i64 and sub.
1998 int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
1999   return vmlsl_laneq_s32(a, b, v, 0);
2000 }
2001
2002 // CHECK-LABEL: @test_vmlsl_high_lane_s16_0(
2003 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2004 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2005 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2006 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2007 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2008 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2009 // CHECK:   ret <4 x i32> [[SUB]]
// "High" signed widening multiply-subtract. The expected IR above extracts
// elements 4..7 of b, splats lane 0 of v, then emits smull.v4i32 and a sub.
2010 int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2011   return vmlsl_high_lane_s16(a, b, v, 0);
2012 }
2013
2014 // CHECK-LABEL: @test_vmlsl_high_lane_s32_0(
2015 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2016 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2017 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2018 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2019 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2020 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2021 // CHECK:   ret <2 x i64> [[SUB]]
// "High" signed widening multiply-subtract. The expected IR above extracts
// elements 2..3 of b, splats lane 0 of v, then emits smull.v2i64 and a sub.
2022 int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2023   return vmlsl_high_lane_s32(a, b, v, 0);
2024 }
2025
2026 // CHECK-LABEL: @test_vmlsl_high_laneq_s16_0(
2027 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2028 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2029 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2030 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2031 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2032 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2033 // CHECK:   ret <4 x i32> [[SUB]]
// "High" signed widening multiply-subtract with an eight-element v. The
// expected IR above takes elements 4..7 of b and a <4 x i16> splat of v lane
// 0, then emits smull.v4i32 and a sub.
2034 int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2035   return vmlsl_high_laneq_s16(a, b, v, 0);
2036 }
2037
2038 // CHECK-LABEL: @test_vmlsl_high_laneq_s32_0(
2039 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2040 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2041 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2042 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2043 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2044 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2045 // CHECK:   ret <2 x i64> [[SUB]]
// "High" signed widening multiply-subtract with a four-element v. The
// expected IR above takes elements 2..3 of b and a <2 x i32> splat of v lane
// 0, then emits smull.v2i64 and a sub.
2046 int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2047   return vmlsl_high_laneq_s32(a, b, v, 0);
2048 }
2049
2050 // CHECK-LABEL: @test_vmlal_lane_u16_0(
2051 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2052 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2053 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2054 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
2055 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2056 // CHECK:   ret <4 x i32> [[ADD]]
// Unsigned widening multiply-accumulate: a + umull(b, splat of v lane 0). The
// expected IR above uses llvm.aarch64.neon.umull.v4i32 followed by an add.
// NOTE(review): parameters and result use signed vector typedefs although the
// intrinsic is the _u16 form (the vmul_lane_u16 test uses uint16x4_t);
// presumably this relies on lax vector conversions - confirm it is intended.
2057 int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2058   return vmlal_lane_u16(a, b, v, 0);
2059 }
2060
2061 // CHECK-LABEL: @test_vmlal_lane_u32_0(
2062 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2063 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2064 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2065 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
2066 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2067 // CHECK:   ret <2 x i64> [[ADD]]
// Unsigned widening multiply-accumulate: a + umull(b, splat of v lane 0). The
// expected IR above uses llvm.aarch64.neon.umull.v2i64 followed by an add.
// NOTE(review): parameters and result use signed vector typedefs although the
// intrinsic is the _u32 form (the vmul_lane_u32 test uses uint32x2_t);
// presumably this relies on lax vector conversions - confirm it is intended.
2068 int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2069   return vmlal_lane_u32(a, b, v, 0);
2070 }
2071
2072 // CHECK-LABEL: @test_vmlal_laneq_u16_0(
2073 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2074 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2075 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2076 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
2077 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2078 // CHECK:   ret <4 x i32> [[ADD]]
// Unsigned widening multiply-accumulate with an eight-element v. The expected
// IR above narrows lane 0 of v to a <4 x i16> splat, then umull.v4i32 and add.
// NOTE(review): signed vector typedefs are used with the _u16 intrinsic,
// unlike the vmul_laneq_u16 test; presumably relies on lax vector
// conversions - confirm it is intended.
2079 int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2080   return vmlal_laneq_u16(a, b, v, 0);
2081 }
2082
2083 // CHECK-LABEL: @test_vmlal_laneq_u32_0(
2084 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2085 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2086 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2087 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
2088 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2089 // CHECK:   ret <2 x i64> [[ADD]]
// Unsigned widening multiply-accumulate with a four-element v. The expected
// IR above narrows lane 0 of v to a <2 x i32> splat, then umull.v2i64 and add.
// NOTE(review): signed vector typedefs are used with the _u32 intrinsic,
// unlike the vmul_laneq_u32 test; presumably relies on lax vector
// conversions - confirm it is intended.
2090 int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2091   return vmlal_laneq_u32(a, b, v, 0);
2092 }
2093
2094 // CHECK-LABEL: @test_vmlal_high_lane_u16_0(
2095 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2096 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2097 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2098 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2099 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2100 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2101 // CHECK:   ret <4 x i32> [[ADD]]
// "High" unsigned widening multiply-accumulate. The expected IR above extracts
// elements 4..7 of b, splats lane 0 of v, then emits umull.v4i32 and an add.
// NOTE(review): signed vector typedefs are used with the _u16 intrinsic;
// presumably relies on lax vector conversions - confirm it is intended.
2102 int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2103   return vmlal_high_lane_u16(a, b, v, 0);
2104 }
2105
2106 // CHECK-LABEL: @test_vmlal_high_lane_u32_0(
2107 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2108 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2109 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2110 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2111 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2112 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2113 // CHECK:   ret <2 x i64> [[ADD]]
// "High" unsigned widening multiply-accumulate. The expected IR above extracts
// elements 2..3 of b, splats lane 0 of v, then emits umull.v2i64 and an add.
// NOTE(review): signed vector typedefs are used with the _u32 intrinsic;
// presumably relies on lax vector conversions - confirm it is intended.
2114 int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2115   return vmlal_high_lane_u32(a, b, v, 0);
2116 }
2117
2118 // CHECK-LABEL: @test_vmlal_high_laneq_u16_0(
2119 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2120 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2121 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2122 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2123 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2124 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2125 // CHECK:   ret <4 x i32> [[ADD]]
// "High" unsigned widening multiply-accumulate with an eight-element v. The
// expected IR above takes elements 4..7 of b and a <4 x i16> splat of v lane
// 0, then emits umull.v4i32 and an add.
// NOTE(review): signed vector typedefs are used with the _u16 intrinsic;
// presumably relies on lax vector conversions - confirm it is intended.
2126 int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2127   return vmlal_high_laneq_u16(a, b, v, 0);
2128 }
2129
2130 // CHECK-LABEL: @test_vmlal_high_laneq_u32_0(
2131 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2132 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2133 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2134 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2135 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2136 // CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2137 // CHECK:   ret <2 x i64> [[ADD]]
// "High" unsigned widening multiply-accumulate with a four-element v. The
// expected IR above takes elements 2..3 of b and a <2 x i32> splat of v lane
// 0, then emits umull.v2i64 and an add.
// NOTE(review): signed vector typedefs are used with the _u32 intrinsic;
// presumably relies on lax vector conversions - confirm it is intended.
2138 int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2139   return vmlal_high_laneq_u32(a, b, v, 0);
2140 }
2141
2142 // CHECK-LABEL: @test_vmlsl_lane_u16_0(
2143 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2144 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2145 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2146 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
2147 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2148 // CHECK:   ret <4 x i32> [[SUB]]
// Unsigned widening multiply-subtract: a - umull(b, splat of v lane 0). The
// expected IR above uses llvm.aarch64.neon.umull.v4i32 followed by a sub.
// NOTE(review): signed vector typedefs are used with the _u16 intrinsic,
// unlike the vmul_lane_u16 test; presumably relies on lax vector
// conversions - confirm it is intended.
2149 int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2150   return vmlsl_lane_u16(a, b, v, 0);
2151 }
2152
2153 // CHECK-LABEL: @test_vmlsl_lane_u32_0(
2154 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2155 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2156 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2157 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
2158 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2159 // CHECK:   ret <2 x i64> [[SUB]]
// Unsigned widening multiply-subtract: a - umull(b, splat of v lane 0). The
// expected IR above uses llvm.aarch64.neon.umull.v2i64 followed by a sub.
// NOTE(review): signed vector typedefs are used with the _u32 intrinsic,
// unlike the vmul_lane_u32 test; presumably relies on lax vector
// conversions - confirm it is intended.
2160 int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2161   return vmlsl_lane_u32(a, b, v, 0);
2162 }
2163
2164 // CHECK-LABEL: @test_vmlsl_laneq_u16_0(
2165 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2166 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2167 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2168 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
2169 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2170 // CHECK:   ret <4 x i32> [[SUB]]
// Unsigned widening multiply-subtract with an eight-element v. The expected
// IR above narrows lane 0 of v to a <4 x i16> splat, then umull.v4i32 and sub.
// NOTE(review): signed vector typedefs are used with the _u16 intrinsic,
// unlike the vmul_laneq_u16 test; presumably relies on lax vector
// conversions - confirm it is intended.
2171 int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2172   return vmlsl_laneq_u16(a, b, v, 0);
2173 }
2174
2175 // CHECK-LABEL: @test_vmlsl_laneq_u32_0(
2176 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2177 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2178 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2179 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
2180 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2181 // CHECK:   ret <2 x i64> [[SUB]]
// Returns vmlsl_laneq_u32(a, b, v, 0); v is the 4-element (q) vector.
// Expected IR (directives above): element 0 of the <4 x i32> %v broadcast to
// <2 x i32> with a zeroinitializer mask, umull.v2i64 on %b and that shuffle,
// then a sub of the product from %a.
// NOTE(review): signed vector types are passed to a _u32 intrinsic here, whereas
// test_vmull_laneq_u32_0 uses unsigned vector types; confirm this is intended.
2182 int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2183   return vmlsl_laneq_u32(a, b, v, 0);
2184 }
2185
2186 // CHECK-LABEL: @test_vmlsl_high_lane_u16_0(
2187 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2188 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2189 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2190 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2191 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2192 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2193 // CHECK:   ret <4 x i32> [[SUB]]
// Returns vmlsl_high_lane_u16(a, b, v, 0).
// Expected IR (directives above): elements 4-7 of %b extracted by a shuffle,
// element 0 of %v broadcast with a zeroinitializer mask, umull.v4i32 on those
// two values, then a sub of the product from %a.
// NOTE(review): signed vector types are passed to a _u16 intrinsic here, whereas
// test_vmull_high_lane_u16_0 uses unsigned vector types; confirm this is intended.
2194 int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2195   return vmlsl_high_lane_u16(a, b, v, 0);
2196 }
2197
2198 // CHECK-LABEL: @test_vmlsl_high_lane_u32_0(
2199 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2200 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2201 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2202 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2203 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2204 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2205 // CHECK:   ret <2 x i64> [[SUB]]
// Returns vmlsl_high_lane_u32(a, b, v, 0).
// Expected IR (directives above): elements 2-3 of %b extracted by a shuffle,
// element 0 of %v broadcast with a zeroinitializer mask, umull.v2i64 on those
// two values, then a sub of the product from %a.
// NOTE(review): signed vector types are passed to a _u32 intrinsic here, whereas
// test_vmull_high_lane_u32_0 uses unsigned vector types; confirm this is intended.
2206 int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2207   return vmlsl_high_lane_u32(a, b, v, 0);
2208 }
2209
2210 // CHECK-LABEL: @test_vmlsl_high_laneq_u16_0(
2211 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2212 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2213 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2214 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2215 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2216 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2217 // CHECK:   ret <4 x i32> [[SUB]]
// Returns vmlsl_high_laneq_u16(a, b, v, 0); both b and v are 8-element vectors.
// Expected IR (directives above): elements 4-7 of %b extracted by a shuffle,
// element 0 of %v broadcast to <4 x i16> with a zeroinitializer mask,
// umull.v4i32 on those two values, then a sub of the product from %a.
// NOTE(review): signed vector types are passed to a _u16 intrinsic here, whereas
// test_vmull_high_laneq_u16_0 uses unsigned vector types; confirm this is intended.
2218 int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2219   return vmlsl_high_laneq_u16(a, b, v, 0);
2220 }
2221
2222 // CHECK-LABEL: @test_vmlsl_high_laneq_u32_0(
2223 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2224 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2225 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2226 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2227 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2228 // CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2229 // CHECK:   ret <2 x i64> [[SUB]]
// Returns vmlsl_high_laneq_u32(a, b, v, 0); both b and v are 4-element vectors.
// Expected IR (directives above): elements 2-3 of %b extracted by a shuffle,
// element 0 of %v broadcast to <2 x i32> with a zeroinitializer mask,
// umull.v2i64 on those two values, then a sub of the product from %a.
// NOTE(review): signed vector types are passed to a _u32 intrinsic here, whereas
// test_vmull_high_laneq_u32_0 uses unsigned vector types; confirm this is intended.
2230 int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2231   return vmlsl_high_laneq_u32(a, b, v, 0);
2232 }
2233
2234 // CHECK-LABEL: @test_vmull_lane_s16_0(
2235 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2236 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2237 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2238 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2239 // CHECK:   ret <4 x i32> [[VMULL2_I]]
// Returns vmull_lane_s16(a, v, 0), i.e. the lane-0 case of this intrinsic.
// Expected IR (directives above): element 0 of %v broadcast with a
// zeroinitializer shuffle mask, then smull.v4i32 on %a and that shuffle,
// returned directly.
2240 int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
2241   return vmull_lane_s16(a, v, 0);
2242 }
2243
2244 // CHECK-LABEL: @test_vmull_lane_s32_0(
2245 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2246 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2247 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2248 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2249 // CHECK:   ret <2 x i64> [[VMULL2_I]]
// Returns vmull_lane_s32(a, v, 0), i.e. the lane-0 case of this intrinsic.
// Expected IR (directives above): element 0 of %v broadcast with a
// zeroinitializer shuffle mask, then smull.v2i64 on %a and that shuffle,
// returned directly.
2250 int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
2251   return vmull_lane_s32(a, v, 0);
2252 }
2253
2254 // CHECK-LABEL: @test_vmull_lane_u16_0(
2255 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2256 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2257 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2258 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2259 // CHECK:   ret <4 x i32> [[VMULL2_I]]
// Returns vmull_lane_u16(a, v, 0), i.e. the lane-0 case of this intrinsic.
// Expected IR (directives above): element 0 of %v broadcast with a
// zeroinitializer shuffle mask, then umull.v4i32 on %a and that shuffle,
// returned directly.
2260 uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
2261   return vmull_lane_u16(a, v, 0);
2262 }
2263
2264 // CHECK-LABEL: @test_vmull_lane_u32_0(
2265 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2266 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2267 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2268 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2269 // CHECK:   ret <2 x i64> [[VMULL2_I]]
// Returns vmull_lane_u32(a, v, 0), i.e. the lane-0 case of this intrinsic.
// Expected IR (directives above): element 0 of %v broadcast with a
// zeroinitializer shuffle mask, then umull.v2i64 on %a and that shuffle,
// returned directly.
2270 uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
2271   return vmull_lane_u32(a, v, 0);
2272 }
2273
2274 // CHECK-LABEL: @test_vmull_high_lane_s16_0(
2275 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2276 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2277 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2278 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2279 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2280 // CHECK:   ret <4 x i32> [[VMULL2_I]]
// Returns vmull_high_lane_s16(a, v, 0).
// Expected IR (directives above): elements 4-7 of %a extracted by a shuffle,
// element 0 of %v broadcast with a zeroinitializer mask, then smull.v4i32 on
// those two values, returned directly.
2281 int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
2282   return vmull_high_lane_s16(a, v, 0);
2283 }
2284
2285 // CHECK-LABEL: @test_vmull_high_lane_s32_0(
2286 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2287 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2288 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2289 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2290 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2291 // CHECK:   ret <2 x i64> [[VMULL2_I]]
// Returns vmull_high_lane_s32(a, v, 0).
// Expected IR (directives above): elements 2-3 of %a extracted by a shuffle,
// element 0 of %v broadcast with a zeroinitializer mask, then smull.v2i64 on
// those two values, returned directly.
2292 int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
2293   return vmull_high_lane_s32(a, v, 0);
2294 }
2295
2296 // CHECK-LABEL: @test_vmull_high_lane_u16_0(
2297 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2298 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2299 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2300 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2301 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2302 // CHECK:   ret <4 x i32> [[VMULL2_I]]
// Returns vmull_high_lane_u16(a, v, 0).
// Expected IR (directives above): elements 4-7 of %a extracted by a shuffle,
// element 0 of %v broadcast with a zeroinitializer mask, then umull.v4i32 on
// those two values, returned directly.
2303 uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
2304   return vmull_high_lane_u16(a, v, 0);
2305 }
2306
2307 // CHECK-LABEL: @test_vmull_high_lane_u32_0(
2308 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2309 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2310 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2311 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2312 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2313 // CHECK:   ret <2 x i64> [[VMULL2_I]]
// Returns vmull_high_lane_u32(a, v, 0).
// Expected IR (directives above): elements 2-3 of %a extracted by a shuffle,
// element 0 of %v broadcast with a zeroinitializer mask, then umull.v2i64 on
// those two values, returned directly.
2314 uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
2315   return vmull_high_lane_u32(a, v, 0);
2316 }
2317
2318 // CHECK-LABEL: @test_vmull_laneq_s16_0(
2319 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2320 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2321 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2322 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2323 // CHECK:   ret <4 x i32> [[VMULL2_I]]
// Returns vmull_laneq_s16(a, v, 0); v is the 8-element (q) vector.
// Expected IR (directives above): element 0 of the <8 x i16> %v broadcast to
// <4 x i16> with a zeroinitializer mask, then smull.v4i32 on %a and that
// shuffle, returned directly.
2324 int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
2325   return vmull_laneq_s16(a, v, 0);
2326 }
2327
2328 // CHECK-LABEL: @test_vmull_laneq_s32_0(
2329 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2330 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2331 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2332 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2333 // CHECK:   ret <2 x i64> [[VMULL2_I]]
// Returns vmull_laneq_s32(a, v, 0); v is the 4-element (q) vector.
// Expected IR (directives above): element 0 of the <4 x i32> %v broadcast to
// <2 x i32> with a zeroinitializer mask, then smull.v2i64 on %a and that
// shuffle, returned directly.
2334 int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
2335   return vmull_laneq_s32(a, v, 0);
2336 }
2337
2338 // CHECK-LABEL: @test_vmull_laneq_u16_0(
2339 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2340 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2341 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2342 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2343 // CHECK:   ret <4 x i32> [[VMULL2_I]]
// Returns vmull_laneq_u16(a, v, 0); v is the 8-element (q) vector.
// Expected IR (directives above): element 0 of the <8 x i16> %v broadcast to
// <4 x i16> with a zeroinitializer mask, then umull.v4i32 on %a and that
// shuffle, returned directly.
2344 uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
2345   return vmull_laneq_u16(a, v, 0);
2346 }
2347
2348 // CHECK-LABEL: @test_vmull_laneq_u32_0(
2349 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2350 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2351 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2352 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2353 // CHECK:   ret <2 x i64> [[VMULL2_I]]
// Returns vmull_laneq_u32(a, v, 0); v is the 4-element (q) vector.
// Expected IR (directives above): element 0 of the <4 x i32> %v broadcast to
// <2 x i32> with a zeroinitializer mask, then umull.v2i64 on %a and that
// shuffle, returned directly.
2354 uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
2355   return vmull_laneq_u32(a, v, 0);
2356 }
2357
2358 // CHECK-LABEL: @test_vmull_high_laneq_s16_0(
2359 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2360 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2361 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2362 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2363 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2364 // CHECK:   ret <4 x i32> [[VMULL2_I]]
// Returns vmull_high_laneq_s16(a, v, 0); both a and v are 8-element vectors.
// Expected IR (directives above): elements 4-7 of %a extracted by a shuffle,
// element 0 of %v broadcast to <4 x i16> with a zeroinitializer mask, then
// smull.v4i32 on those two values, returned directly.
2365 int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
2366   return vmull_high_laneq_s16(a, v, 0);
2367 }
2368
2369 // CHECK-LABEL: @test_vmull_high_laneq_s32_0(
2370 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2371 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2372 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2373 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2374 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2375 // CHECK:   ret <2 x i64> [[VMULL2_I]]
// Returns vmull_high_laneq_s32(a, v, 0); both a and v are 4-element vectors.
// Expected IR (directives above): elements 2-3 of %a extracted by a shuffle,
// element 0 of %v broadcast to <2 x i32> with a zeroinitializer mask, then
// smull.v2i64 on those two values, returned directly.
2376 int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
2377   return vmull_high_laneq_s32(a, v, 0);
2378 }
2379
2380 // CHECK-LABEL: @test_vmull_high_laneq_u16_0(
2381 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2382 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2383 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2384 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2385 // CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2386 // CHECK:   ret <4 x i32> [[VMULL2_I]]
// Returns vmull_high_laneq_u16(a, v, 0); both a and v are 8-element vectors.
// Expected IR (directives above): elements 4-7 of %a extracted by a shuffle,
// element 0 of %v broadcast to <4 x i16> with a zeroinitializer mask, then
// umull.v4i32 on those two values, returned directly.
2387 uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
2388   return vmull_high_laneq_u16(a, v, 0);
2389 }
2390
2391 // CHECK-LABEL: @test_vmull_high_laneq_u32_0(
2392 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2393 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2394 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2395 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2396 // CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2397 // CHECK:   ret <2 x i64> [[VMULL2_I]]
// Returns vmull_high_laneq_u32(a, v, 0); both a and v are 4-element vectors.
// Expected IR (directives above): elements 2-3 of %a extracted by a shuffle,
// element 0 of %v broadcast to <2 x i32> with a zeroinitializer mask, then
// umull.v2i64 on those two values, returned directly.
2398 uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
2399   return vmull_high_laneq_u32(a, v, 0);
2400 }
2401
2402 // CHECK-LABEL: @test_vqdmlal_lane_s16_0(
2403 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2404 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2405 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2406 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2407 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
2408 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
2409 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
// Returns vqdmlal_lane_s16(a, b, v, 0), i.e. the lane-0 case of this intrinsic.
// Expected IR (directives above): element 0 of %v broadcast with a
// zeroinitializer mask, sqdmull.v4i32 on %b and that shuffle, then
// sqadd.v4i32 of %a and the sqdmull result.
2410 int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2411   return vqdmlal_lane_s16(a, b, v, 0);
2412 }
2413
2414 // CHECK-LABEL: @test_vqdmlal_lane_s32_0(
2415 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2416 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2417 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2418 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2419 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
2420 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
2421 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
// Returns vqdmlal_lane_s32(a, b, v, 0), i.e. the lane-0 case of this intrinsic.
// Expected IR (directives above): element 0 of %v broadcast with a
// zeroinitializer mask, sqdmull.v2i64 on %b and that shuffle, then
// sqadd.v2i64 of %a and the sqdmull result.
2422 int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2423   return vqdmlal_lane_s32(a, b, v, 0);
2424 }
2425
2426 // CHECK-LABEL: @test_vqdmlal_high_lane_s16_0(
2427 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2428 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2429 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2430 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2431 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2432 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2433 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
2434 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
// Returns vqdmlal_high_lane_s16(a, b, v, 0).
// Expected IR (directives above): elements 4-7 of %b extracted by a shuffle,
// element 0 of %v broadcast with a zeroinitializer mask, sqdmull.v4i32 on
// those two values, then sqadd.v4i32 of %a and the sqdmull result.
2435 int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2436   return vqdmlal_high_lane_s16(a, b, v, 0);
2437 }
2438
2439 // CHECK-LABEL: @test_vqdmlal_high_lane_s32_0(
2440 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2441 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2442 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2443 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2444 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2445 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2446 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
2447 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
// Returns vqdmlal_high_lane_s32(a, b, v, 0).
// Expected IR (directives above): elements 2-3 of %b extracted by a shuffle,
// element 0 of %v broadcast with a zeroinitializer mask, sqdmull.v2i64 on
// those two values, then sqadd.v2i64 of %a and the sqdmull result.
2448 int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2449   return vqdmlal_high_lane_s32(a, b, v, 0);
2450 }
2451
2452 // CHECK-LABEL: @test_vqdmlsl_lane_s16_0(
2453 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2454 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2455 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2456 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2457 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
2458 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
2459 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
// Returns vqdmlsl_lane_s16(a, b, v, 0), i.e. the lane-0 case of this intrinsic.
// Expected IR (directives above): element 0 of %v broadcast with a
// zeroinitializer mask, sqdmull.v4i32 on %b and that shuffle, then
// sqsub.v4i32 of %a and the sqdmull result.
2460 int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2461   return vqdmlsl_lane_s16(a, b, v, 0);
2462 }
2463
2464 // CHECK-LABEL: @test_vqdmlsl_lane_s32_0(
2465 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2466 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2467 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2468 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2469 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
2470 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
2471 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
// Returns vqdmlsl_lane_s32(a, b, v, 0), i.e. the lane-0 case of this intrinsic.
// Expected IR (directives above): element 0 of %v broadcast with a
// zeroinitializer mask, sqdmull.v2i64 on %b and that shuffle, then
// sqsub.v2i64 of %a and the sqdmull result.
2472 int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2473   return vqdmlsl_lane_s32(a, b, v, 0);
2474 }
2475
2476 // CHECK-LABEL: @test_vqdmlsl_high_lane_s16_0(
2477 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2478 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2479 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2480 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2481 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2482 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2483 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
2484 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
// Returns vqdmlsl_high_lane_s16(a, b, v, 0).
// Expected IR (directives above): elements 4-7 of %b extracted by a shuffle,
// element 0 of %v broadcast with a zeroinitializer mask, sqdmull.v4i32 on
// those two values, then sqsub.v4i32 of %a and the sqdmull result.
2485 int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2486   return vqdmlsl_high_lane_s16(a, b, v, 0);
2487 }
2488
2489 // CHECK-LABEL: @test_vqdmlsl_high_lane_s32_0(
2490 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2491 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2492 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2493 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2494 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2495 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2496 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
2497 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
// Returns vqdmlsl_high_lane_s32(a, b, v, 0).
// Expected IR (directives above): elements 2-3 of %b extracted by a shuffle,
// element 0 of %v broadcast with a zeroinitializer mask, sqdmull.v2i64 on
// those two values, then sqsub.v2i64 of %a and the sqdmull result.
2498 int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2499   return vqdmlsl_high_lane_s32(a, b, v, 0);
2500 }
2501
2502 // CHECK-LABEL: @test_vqdmull_lane_s16_0(
2503 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2504 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2505 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2506 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2507 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2508 // CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
// Returns vqdmull_lane_s16(a, v, 0), i.e. the lane-0 case of this intrinsic.
// Expected IR (directives above): element 0 of %v broadcast with a
// zeroinitializer mask, sqdmull.v4i32 on %a and that shuffle, a bitcast of
// the result to <16 x i8>, and a ret of the sqdmull value itself.
2509 int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
2510   return vqdmull_lane_s16(a, v, 0);
2511 }
2512
2513 // CHECK-LABEL: @test_vqdmull_lane_s32_0(
2514 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2515 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2516 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2517 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2518 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2519 // CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
// Returns vqdmull_lane_s32(a, v, 0), i.e. the lane-0 case of this intrinsic.
// Expected IR (directives above): element 0 of %v broadcast with a
// zeroinitializer mask, sqdmull.v2i64 on %a and that shuffle, a bitcast of
// the result to <16 x i8>, and a ret of the sqdmull value itself.
2520 int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
2521   return vqdmull_lane_s32(a, v, 0);
2522 }
2523
2524 // CHECK-LABEL: @test_vqdmull_laneq_s16_0(
2525 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2526 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2527 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2528 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2529 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2530 // CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
// Returns vqdmull_laneq_s16(a, v, 0); v is the 8-element (q) vector.
// Expected IR (directives above): element 0 of the <8 x i16> %v broadcast to
// <4 x i16> with a zeroinitializer mask, sqdmull.v4i32 on %a and that shuffle,
// a bitcast of the result to <16 x i8>, and a ret of the sqdmull value itself.
2531 int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
2532   return vqdmull_laneq_s16(a, v, 0);
2533 }
2534
2535 // CHECK-LABEL: @test_vqdmull_laneq_s32_0(
2536 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2537 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2538 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2539 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2540 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2541 // CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
// Returns vqdmull_laneq_s32(a, v, 0); v is the 4-element (q) vector.
// Expected IR (directives above): element 0 of the <4 x i32> %v broadcast to
// <2 x i32> with a zeroinitializer mask, sqdmull.v2i64 on %a and that shuffle,
// a bitcast of the result to <16 x i8>, and a ret of the sqdmull value itself.
2542 int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
2543   return vqdmull_laneq_s32(a, v, 0);
2544 }
2545
2546 // CHECK-LABEL: @test_vqdmull_high_lane_s16_0(
2547 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2548 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2549 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2550 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2551 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2552 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2553 // CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
// Returns vqdmull_high_lane_s16(a, v, 0).
// Expected IR (directives above): elements 4-7 of %a extracted by a shuffle,
// element 0 of %v broadcast with a zeroinitializer mask, sqdmull.v4i32 on
// those two values, a bitcast of the result to <16 x i8>, and a ret of the
// sqdmull value itself.
2554 int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
2555   return vqdmull_high_lane_s16(a, v, 0);
2556 }
2557
2558 // CHECK-LABEL: @test_vqdmull_high_lane_s32_0(
2559 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2560 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2561 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2562 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2563 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2564 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2565 // CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
// Returns vqdmull_high_lane_s32(a, v, 0).
// Expected IR (directives above): elements 2-3 of %a extracted by a shuffle,
// element 0 of %v broadcast with a zeroinitializer mask, sqdmull.v2i64 on
// those two values, a bitcast of the result to <16 x i8>, and a ret of the
// sqdmull value itself.
2566 int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
2567   return vqdmull_high_lane_s32(a, v, 0);
2568 }
2569
2570 // CHECK-LABEL: @test_vqdmull_high_laneq_s16_0(
2571 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2572 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2573 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2574 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2575 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
2576 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2577 // CHECK:   ret <4 x i32> [[VQDMULL_V2_I]]
// Returns vqdmull_high_laneq_s16(a, v, 0); both a and v are 8-element vectors.
// Expected IR (directives above): elements 4-7 of %a extracted by a shuffle,
// element 0 of %v broadcast to <4 x i16> with a zeroinitializer mask,
// sqdmull.v4i32 on those two values, a bitcast of the result to <16 x i8>,
// and a ret of the sqdmull value itself.
2578 int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
2579   return vqdmull_high_laneq_s16(a, v, 0);
2580 }
2581
2582 // CHECK-LABEL: @test_vqdmull_high_laneq_s32_0(
2583 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2584 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2585 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2586 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2587 // CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
2588 // CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2589 // CHECK:   ret <2 x i64> [[VQDMULL_V2_I]]
// Returns vqdmull_high_laneq_s32(a, v, 0); both a and v are 4-element vectors.
// Expected IR (directives above): elements 2-3 of %a extracted by a shuffle,
// element 0 of %v broadcast to <2 x i32> with a zeroinitializer mask,
// sqdmull.v2i64 on those two values, a bitcast of the result to <16 x i8>,
// and a ret of the sqdmull value itself.
2590 int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
2591   return vqdmull_high_laneq_s32(a, v, 0);
2592 }
2593
2594 // CHECK-LABEL: @test_vqdmulh_lane_s16_0(
2595 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2596 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2597 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2598 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2599 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
2600 // CHECK:   ret <4 x i16> [[VQDMULH_V2_I]]
// Returns vqdmulh_lane_s16(a, v, 0), i.e. the lane-0 case of this intrinsic.
// Expected IR (directives above): element 0 of %v broadcast with a
// zeroinitializer mask, sqdmulh.v4i16 on %a and that shuffle, a bitcast of
// the result to <8 x i8>, and a ret of the sqdmulh value itself.
2601 int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
2602   return vqdmulh_lane_s16(a, v, 0);
2603 }
2604
2605 // CHECK-LABEL: @test_vqdmulhq_lane_s16_0(
2606 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
2607 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
2608 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
2609 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
2610 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
2611 // CHECK:   ret <8 x i16> [[VQDMULHQ_V2_I]]
// Returns vqdmulhq_lane_s16(a, v, 0); a is 8 elements wide, v is 4.
// Expected IR (directives above): element 0 of the <4 x i16> %v broadcast to
// <8 x i16> with a zeroinitializer mask, sqdmulh.v8i16 on %a and that shuffle,
// a bitcast of the result to <16 x i8>, and a ret of the sqdmulh value itself.
2612 int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
2613   return vqdmulhq_lane_s16(a, v, 0);
2614 }
2615
2616 // CHECK-LABEL: @test_vqdmulh_lane_s32_0(
2617 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2618 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2619 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2620 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2621 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
2622 // CHECK:   ret <2 x i32> [[VQDMULH_V2_I]]
// Returns vqdmulh_lane_s32(a, v, 0), i.e. the lane-0 case of this intrinsic.
// Expected IR (directives above): element 0 of %v broadcast with a
// zeroinitializer mask, sqdmulh.v2i32 on %a and that shuffle, a bitcast of
// the result to <8 x i8>, and a ret of the sqdmulh value itself.
2623 int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
2624   return vqdmulh_lane_s32(a, v, 0);
2625 }
2626
2627 // CHECK-LABEL: @test_vqdmulhq_lane_s32_0(
2628 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
2629 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2630 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
2631 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
2632 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
2633 // CHECK:   ret <4 x i32> [[VQDMULHQ_V2_I]]
// Returns vqdmulhq_lane_s32(a, v, 0); a is 4 elements wide, v is 2.
// Expected IR (directives above): element 0 of the <2 x i32> %v broadcast to
// <4 x i32> with a zeroinitializer mask, sqdmulh.v4i32 on %a and that shuffle,
// a bitcast of the result to <16 x i8>, and a ret of the sqdmulh value itself.
2634 int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
2635   return vqdmulhq_lane_s32(a, v, 0);
2636 }
2637
2638 // CHECK-LABEL: @test_vqrdmulh_lane_s16_0(
2639 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2640 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2641 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2642 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
2643 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
2644 // CHECK:   ret <4 x i16> [[VQRDMULH_V2_I]]
// Returns vqrdmulh_lane_s16(a, v, 0), i.e. the lane-0 case of this intrinsic.
// Expected IR (directives above): element 0 of %v broadcast with a
// zeroinitializer mask, sqrdmulh.v4i16 on %a and that shuffle, a bitcast of
// the result to <8 x i8>, and a ret of the sqrdmulh value itself.
2645 int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
2646   return vqrdmulh_lane_s16(a, v, 0);
2647 }
2648
2649 // CHECK-LABEL: @test_vqrdmulhq_lane_s16_0(
2650 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
2651 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
2652 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
2653 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
2654 // CHECK:   ret <8 x i16> [[VQRDMULHQ_V2_I]]
// Returns vqrdmulhq_lane_s16(a, v, 0); a is 8 elements wide, v is 4.
// Expected IR (directives above): element 0 of the <4 x i16> %v broadcast to
// <8 x i16> with a zeroinitializer mask, then sqrdmulh.v8i16 on %a and that
// shuffle, returned directly.
2655 int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
2656   return vqrdmulhq_lane_s16(a, v, 0);
2657 }
2658
2659 // CHECK-LABEL: @test_vqrdmulh_lane_s32_0(
2660 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2661 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2662 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2663 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
2664 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
2665 // CHECK:   ret <2 x i32> [[VQRDMULH_V2_I]]
// Returns vqrdmulh_lane_s32(a, v, 0), i.e. the lane-0 case of this intrinsic.
// Expected IR (directives above): element 0 of %v broadcast with a
// zeroinitializer mask, sqrdmulh.v2i32 on %a and that shuffle, a bitcast of
// the result to <8 x i8>, and a ret of the sqrdmulh value itself.
2666 int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
2667   return vqrdmulh_lane_s32(a, v, 0);
2668 }
2669
2670 // CHECK-LABEL: @test_vqrdmulhq_lane_s32_0(
2671 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
2672 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2673 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
2674 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
2675 // CHECK:   ret <4 x i32> [[VQRDMULHQ_V2_I]]
// Returns vqrdmulhq_lane_s32(a, v, 0); a is 4 elements wide, v is 2.
// Expected IR (directives above): element 0 of the <2 x i32> %v broadcast to
// <4 x i32> with a zeroinitializer mask, then sqrdmulh.v4i32 on %a and that
// shuffle, returned directly.
2676 int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
2677   return vqrdmulhq_lane_s32(a, v, 0);
2678 }
2679
2680 // CHECK-LABEL: @test_vmul_lane_f32_0(
2681 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
2682 // CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
2683 // CHECK:   ret <2 x float> [[MUL]]
// Exercises vmul_lane_f32 with lane index 0. The expected IR above is an
// all-zero-mask shuffle of %v followed by a plain <2 x float> fmul with %a.
2684 float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
2685   return vmul_lane_f32(a, v, 0);
2686 }
2687
2688 // CHECK-LABEL: @test_vmulq_lane_f32_0(
2689 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
2690 // CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
2691 // CHECK:   ret <4 x float> [[MUL]]
// Exercises vmulq_lane_f32 with lane index 0. The expected IR above widens
// the 2-element %v to <4 x float> with an all-zero shuffle mask, then
// multiplies it with %a using fmul.
2692 float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
2693   return vmulq_lane_f32(a, v, 0);
2694 }
2695
2696 // CHECK-LABEL: @test_vmul_laneq_f32_0(
2697 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
2698 // CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
2699 // CHECK:   ret <2 x float> [[MUL]]
// Exercises vmul_laneq_f32 with lane index 0. The expected IR above narrows
// the 4-element %v to <2 x float> with an all-zero shuffle mask, then
// multiplies it with %a using fmul.
2700 float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) {
2701   return vmul_laneq_f32(a, v, 0);
2702 }
2703
2704 // CHECK-LABEL: @test_vmul_laneq_f64_0(
2705 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
2706 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
2707 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
2708 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
2709 // CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
2710 // CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
2711 // CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
2712 // CHECK:   ret <1 x double> [[TMP5]]
// Exercises vmul_laneq_f64 with lane index 0. Unlike the shuffle-based cases,
// the expected IR above is scalar: %a is bitcast to a double, element 0 is
// extracted from %v, the two are multiplied with fmul double, and the product
// is bitcast back to <1 x double>.
2713 float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
2714   return vmul_laneq_f64(a, v, 0);
2715 }
2716
2717 // CHECK-LABEL: @test_vmulq_laneq_f32_0(
2718 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
2719 // CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
2720 // CHECK:   ret <4 x float> [[MUL]]
// Exercises vmulq_laneq_f32 with lane index 0. The expected IR above is an
// all-zero-mask shuffle of %v followed by a <4 x float> fmul with %a.
2721 float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
2722   return vmulq_laneq_f32(a, v, 0);
2723 }
2724
2725 // CHECK-LABEL: @test_vmulq_laneq_f64_0(
2726 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
2727 // CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
2728 // CHECK:   ret <2 x double> [[MUL]]
// Exercises vmulq_laneq_f64 with lane index 0. The expected IR above is an
// all-zero-mask shuffle of %v followed by a <2 x double> fmul with %a.
2729 float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
2730   return vmulq_laneq_f64(a, v, 0);
2731 }
2732
2733 // CHECK-LABEL: @test_vmulx_lane_f32_0(
2734 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
2735 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
2736 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
2737 // CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]])
2738 // CHECK:   ret <2 x float> [[VMULX2_I]]
// Exercises vmulx_lane_f32 with lane index 0. The expected IR above shuffles
// %v with an all-zero mask and passes the result with %a to
// @llvm.aarch64.neon.fmulx.v2f32.
2739 float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
2740   return vmulx_lane_f32(a, v, 0);
2741 }
2742
2743 // CHECK-LABEL: @test_vmulxq_lane_f32_0(
2744 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
2745 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
2746 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
2747 // CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]])
2748 // CHECK:   ret <4 x float> [[VMULX2_I]]
// Exercises vmulxq_lane_f32 with lane index 0. The expected IR above widens
// the 2-element %v to <4 x float> with an all-zero shuffle mask and passes it
// with %a to @llvm.aarch64.neon.fmulx.v4f32.
2749 float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
2750   return vmulxq_lane_f32(a, v, 0);
2751 }
2752
2753 // CHECK-LABEL: @test_vmulxq_lane_f64_0(
2754 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
2755 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
2756 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
2757 // CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]])
2758 // CHECK:   ret <2 x double> [[VMULX2_I]]
// Exercises vmulxq_lane_f64 with lane index 0. The expected IR above widens
// the single-element %v to <2 x double> with an all-zero shuffle mask and
// passes it with %a to @llvm.aarch64.neon.fmulx.v2f64.
2759 float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
2760   return vmulxq_lane_f64(a, v, 0);
2761 }
2762
2763 // CHECK-LABEL: @test_vmulx_laneq_f32_0(
2764 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
2765 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
2766 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
2767 // CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> [[SHUFFLE]])
2768 // CHECK:   ret <2 x float> [[VMULX2_I]]
// Exercises vmulx_laneq_f32 with lane index 0. The expected IR above narrows
// the 4-element %v to <2 x float> with an all-zero shuffle mask and passes it
// with %a to @llvm.aarch64.neon.fmulx.v2f32.
2769 float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
2770   return vmulx_laneq_f32(a, v, 0);
2771 }
2772
2773 // CHECK-LABEL: @test_vmulxq_laneq_f32_0(
2774 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
2775 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
2776 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
2777 // CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> [[SHUFFLE]])
2778 // CHECK:   ret <4 x float> [[VMULX2_I]]
// Exercises vmulxq_laneq_f32 with lane index 0. The expected IR above shuffles
// %v with an all-zero mask and passes the result with %a to
// @llvm.aarch64.neon.fmulx.v4f32.
2779 float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
2780   return vmulxq_laneq_f32(a, v, 0);
2781 }
2782
2783 // CHECK-LABEL: @test_vmulxq_laneq_f64_0(
2784 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
2785 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
2786 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
2787 // CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> [[SHUFFLE]])
2788 // CHECK:   ret <2 x double> [[VMULX2_I]]
// Exercises vmulxq_laneq_f64 with lane index 0. The expected IR above shuffles
// %v with an all-zero mask and passes the result with %a to
// @llvm.aarch64.neon.fmulx.v2f64.
2789 float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
2790   return vmulxq_laneq_f64(a, v, 0);
2791 }
2792
2793 // CHECK-LABEL: @test_vmull_high_n_s16(
2794 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2795 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2796 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
2797 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
2798 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
2799 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
2800 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2801 // CHECK:   [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2802 // CHECK:   ret <4 x i32> [[VMULL5_I_I]]
// Exercises vmull_high_n_s16. The expected IR above selects elements 4-7 of
// %a, builds a <4 x i16> vector with %b inserted into every lane, and passes
// both to @llvm.aarch64.neon.smull.v4i32.
2803 int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
2804   return vmull_high_n_s16(a, b);
2805 }
2806
2807 // CHECK-LABEL: @test_vmull_high_n_s32(
2808 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2809 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2810 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
2811 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
2812 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2813 // CHECK:   [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2814 // CHECK:   ret <2 x i64> [[VMULL3_I_I]]
// Exercises vmull_high_n_s32. The expected IR above selects elements 2-3 of
// %a, builds a <2 x i32> vector with %b inserted into both lanes, and passes
// both to @llvm.aarch64.neon.smull.v2i64.
2815 int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
2816   return vmull_high_n_s32(a, b);
2817 }
2818
2819 // CHECK-LABEL: @test_vmull_high_n_u16(
2820 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2821 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2822 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
2823 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
2824 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
2825 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
2826 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2827 // CHECK:   [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2828 // CHECK:   ret <4 x i32> [[VMULL5_I_I]]
// Exercises vmull_high_n_u16, the unsigned counterpart of the s16 case. The
// expected IR above has the same shape but calls
// @llvm.aarch64.neon.umull.v4i32.
2829 uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
2830   return vmull_high_n_u16(a, b);
2831 }
2832
2833 // CHECK-LABEL: @test_vmull_high_n_u32(
2834 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2835 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2836 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
2837 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
2838 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2839 // CHECK:   [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2840 // CHECK:   ret <2 x i64> [[VMULL3_I_I]]
// Exercises vmull_high_n_u32, the unsigned counterpart of the s32 case. The
// expected IR above has the same shape but calls
// @llvm.aarch64.neon.umull.v2i64.
2841 uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
2842   return vmull_high_n_u32(a, b);
2843 }
2844
2845 // CHECK-LABEL: @test_vqdmull_high_n_s16(
2846 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2847 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2848 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
2849 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
2850 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
2851 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
2852 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2853 // CHECK:   [[VQDMULL_V5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2854 // CHECK:   [[VQDMULL_V6_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I_I]] to <16 x i8>
2855 // CHECK:   ret <4 x i32> [[VQDMULL_V5_I_I]]
// Exercises vqdmull_high_n_s16. The expected IR above selects elements 4-7 of
// %a, builds a <4 x i16> vector with %b in every lane, and passes both to
// @llvm.aarch64.neon.sqdmull.v4i32.
2856 int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
2857   return vqdmull_high_n_s16(a, b);
2858 }
2859
2860 // CHECK-LABEL: @test_vqdmull_high_n_s32(
2861 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2862 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2863 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
2864 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
2865 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2866 // CHECK:   [[VQDMULL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2867 // CHECK:   [[VQDMULL_V4_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I_I]] to <16 x i8>
2868 // CHECK:   ret <2 x i64> [[VQDMULL_V3_I_I]]
// Exercises vqdmull_high_n_s32. The expected IR above selects elements 2-3 of
// %a, builds a <2 x i32> vector with %b in both lanes, and passes both to
// @llvm.aarch64.neon.sqdmull.v2i64.
2869 int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
2870   return vqdmull_high_n_s32(a, b);
2871 }
2872
2873 // CHECK-LABEL: @test_vmlal_high_n_s16(
2874 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2875 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2876 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2877 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2878 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2879 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2880 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2881 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2882 // CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
2883 // CHECK:   ret <4 x i32> [[ADD_I_I]]
// Exercises vmlal_high_n_s16. The expected IR above multiplies elements 4-7 of
// %b by a <4 x i16> vector holding %c in every lane, using
// @llvm.aarch64.neon.smull.v4i32, then adds the product to %a.
2884 int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
2885   return vmlal_high_n_s16(a, b, c);
2886 }
2887
2888 // CHECK-LABEL: @test_vmlal_high_n_s32(
2889 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2890 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
2891 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
2892 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2893 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2894 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2895 // CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
2896 // CHECK:   ret <2 x i64> [[ADD_I_I]]
// Exercises vmlal_high_n_s32. The expected IR above multiplies elements 2-3 of
// %b by a <2 x i32> vector holding %c in both lanes, using
// @llvm.aarch64.neon.smull.v2i64, then adds the product to %a.
2897 int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
2898   return vmlal_high_n_s32(a, b, c);
2899 }
2900
2901 // CHECK-LABEL: @test_vmlal_high_n_u16(
2902 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2903 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2904 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2905 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2906 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2907 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2908 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2909 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2910 // CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
2911 // CHECK:   ret <4 x i32> [[ADD_I_I]]
// Exercises vmlal_high_n_u16, the unsigned counterpart of the s16 case. The
// expected IR above calls @llvm.aarch64.neon.umull.v4i32 and adds the product
// to %a.
2912 uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
2913   return vmlal_high_n_u16(a, b, c);
2914 }
2915
2916 // CHECK-LABEL: @test_vmlal_high_n_u32(
2917 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2918 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
2919 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
2920 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2921 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2922 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2923 // CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
2924 // CHECK:   ret <2 x i64> [[ADD_I_I]]
// Exercises vmlal_high_n_u32, the unsigned counterpart of the s32 case. The
// expected IR above calls @llvm.aarch64.neon.umull.v2i64 and adds the product
// to %a.
2925 uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
2926   return vmlal_high_n_u32(a, b, c);
2927 }
2928
2929 // CHECK-LABEL: @test_vqdmlal_high_n_s16(
2930 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2931 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2932 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2933 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2934 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2935 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2936 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2937 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2938 // CHECK:   [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2939 // CHECK:   [[VQDMLAL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I_I]])
2940 // CHECK:   ret <4 x i32> [[VQDMLAL_V6_I_I]]
// Exercises vqdmlal_high_n_s16. The expected IR above passes elements 4-7 of
// %b and a <4 x i16> vector holding %c in every lane to
// @llvm.aarch64.neon.sqdmull.v4i32, then combines the product with %a through
// @llvm.aarch64.neon.sqadd.v4i32.
2941 int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
2942   return vqdmlal_high_n_s16(a, b, c);
2943 }
2944
2945 // CHECK-LABEL: @test_vqdmlal_high_n_s32(
2946 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2947 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2948 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2949 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
2950 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
2951 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2952 // CHECK:   [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2953 // CHECK:   [[VQDMLAL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I_I]])
2954 // CHECK:   ret <2 x i64> [[VQDMLAL_V4_I_I]]
// Exercises vqdmlal_high_n_s32. The expected IR above passes elements 2-3 of
// %b and a <2 x i32> vector holding %c in both lanes to
// @llvm.aarch64.neon.sqdmull.v2i64, then combines the product with %a through
// @llvm.aarch64.neon.sqadd.v2i64.
2955 int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
2956   return vqdmlal_high_n_s32(a, b, c);
2957 }
2958
2959 // CHECK-LABEL: @test_vmlsl_high_n_s16(
2960 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2961 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2962 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2963 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2964 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2965 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2966 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2967 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2968 // CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
2969 // CHECK:   ret <4 x i32> [[SUB_I_I]]
// Exercises vmlsl_high_n_s16. The expected IR above multiplies elements 4-7 of
// %b by a <4 x i16> vector holding %c in every lane, using
// @llvm.aarch64.neon.smull.v4i32, then subtracts the product from %a.
2970 int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
2971   return vmlsl_high_n_s16(a, b, c);
2972 }
2973
2974 // CHECK-LABEL: @test_vmlsl_high_n_s32(
2975 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2976 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
2977 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
2978 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
2979 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
2980 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
2981 // CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
2982 // CHECK:   ret <2 x i64> [[SUB_I_I]]
// Exercises vmlsl_high_n_s32. The expected IR above multiplies elements 2-3 of
// %b by a <2 x i32> vector holding %c in both lanes, using
// @llvm.aarch64.neon.smull.v2i64, then subtracts the product from %a.
2983 int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
2984   return vmlsl_high_n_s32(a, b, c);
2985 }
2986
2987 // CHECK-LABEL: @test_vmlsl_high_n_u16(
2988 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2989 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
2990 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
2991 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
2992 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
2993 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
2994 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
2995 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
2996 // CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
2997 // CHECK:   ret <4 x i32> [[SUB_I_I]]
// Exercises vmlsl_high_n_u16, the unsigned counterpart of the s16 case. The
// expected IR above calls @llvm.aarch64.neon.umull.v4i32 and subtracts the
// product from %a.
2998 uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
2999   return vmlsl_high_n_u16(a, b, c);
3000 }
3001
3002 // CHECK-LABEL: @test_vmlsl_high_n_u32(
3003 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
3004 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3005 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
3006 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3007 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
3008 // CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
3009 // CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
3010 // CHECK:   ret <2 x i64> [[SUB_I_I]]
// Exercises vmlsl_high_n_u32, the unsigned counterpart of the s32 case. The
// expected IR above calls @llvm.aarch64.neon.umull.v2i64 and subtracts the
// product from %a.
3011 uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
3012   return vmlsl_high_n_u32(a, b, c);
3013 }
3014
3015 // CHECK-LABEL: @test_vqdmlsl_high_n_s16(
3016 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3017 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3018 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3019 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3020 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
3021 // CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
3022 // CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
3023 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
3024 // CHECK:   [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I_I]])
3025 // CHECK:   [[VQDMLSL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I_I]])
3026 // CHECK:   ret <4 x i32> [[VQDMLSL_V6_I_I]]
// Exercises vqdmlsl_high_n_s16. The expected IR above passes elements 4-7 of
// %b and a <4 x i16> vector holding %c in every lane to
// @llvm.aarch64.neon.sqdmull.v4i32, then combines the product with %a through
// @llvm.aarch64.neon.sqsub.v4i32.
3027 int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
3028   return vqdmlsl_high_n_s16(a, b, c);
3029 }
3030
3031 // CHECK-LABEL: @test_vqdmlsl_high_n_s32(
3032 // CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
3033 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3034 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3035 // CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3036 // CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
3037 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
3038 // CHECK:   [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I_I]])
3039 // CHECK:   [[VQDMLSL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I_I]])
3040 // CHECK:   ret <2 x i64> [[VQDMLSL_V4_I_I]]
// Exercises vqdmlsl_high_n_s32. The expected IR above passes elements 2-3 of
// %b and a <2 x i32> vector holding %c in both lanes to
// @llvm.aarch64.neon.sqdmull.v2i64, then combines the product with %a through
// @llvm.aarch64.neon.sqsub.v2i64.
3041 int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
3042   return vqdmlsl_high_n_s32(a, b, c);
3043 }
3044
3045 // CHECK-LABEL: @test_vmul_n_f32(
3046 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0
3047 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
3048 // CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
3049 // CHECK:   ret <2 x float> [[MUL_I]]
// Exercises vmul_n_f32. The expected IR above inserts %b into both lanes of a
// <2 x float> vector and multiplies that vector with %a using fmul.
3050 float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
3051   return vmul_n_f32(a, b);
3052 }
3053
3054 // CHECK-LABEL: @test_vmulq_n_f32(
3055 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0
3056 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
3057 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
3058 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
3059 // CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
3060 // CHECK:   ret <4 x float> [[MUL_I]]
// Exercises vmulq_n_f32. The expected IR above inserts %b into all four lanes
// of a <4 x float> vector and multiplies that vector with %a using fmul.
3061 float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
3062   return vmulq_n_f32(a, b);
3063 }
3064
3065 // CHECK-LABEL: @test_vmulq_n_f64(
3066 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %b, i32 0
3067 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %b, i32 1
3068 // CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %a, [[VECINIT1_I]]
3069 // CHECK:   ret <2 x double> [[MUL_I]]
// Exercises vmulq_n_f64. The expected IR above inserts %b into both lanes of
// a <2 x double> vector and multiplies that vector with %a using fmul.
3070 float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {
3071   return vmulq_n_f64(a, b);
3072 }
3073
3074 // CHECK-LABEL: @test_vfma_n_f32(
3075 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
3076 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
3077 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3078 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
3079 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
3080 // CHECK:   [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> [[VECINIT1_I]], <2 x float> %a)
3081 // CHECK:   ret <2 x float> [[TMP3]]
// Exercises vfma_n_f32. The expected IR above inserts %n into both lanes of a
// <2 x float> vector and calls @llvm.fma.v2f32 with operands
// (%b, that vector, %a), so the accumulator %a is the last operand.
3082 float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
3083   return vfma_n_f32(a, b, n);
3084 }
3085
3086 // CHECK-LABEL: @test_vfma_n_f64(
3087 // CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x double> undef, double %n, i32 0
3088 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
3089 // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
3090 // CHECK:   [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
3091 // CHECK:   [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> %b, <1 x double> [[VECINIT_I]], <1 x double> %a)
3092 // CHECK:   ret <1 x double> [[TMP3]]
// Exercises vfma_n_f64. The expected IR above inserts %n into a single-lane
// <1 x double> vector and calls @llvm.fma.v1f64 with operands
// (%b, that vector, %a).
3093 float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
3094   return vfma_n_f64(a, b, n);
3095 }
3096
3097 // CHECK-LABEL: @test_vfmaq_n_f32(
3098 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
3099 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
3100 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
3101 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
3102 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3103 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
3104 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
3105 // CHECK:   [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> [[VECINIT3_I]], <4 x float> %a)
3106 // CHECK:   ret <4 x float> [[TMP3]]
// Exercises vfmaq_n_f32. The expected IR above inserts %n into all four lanes
// of a <4 x float> vector and calls @llvm.fma.v4f32 with operands
// (%b, that vector, %a).
3107 float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
3108   return vfmaq_n_f32(a, b, n);
3109 }
3110
3111 // CHECK-LABEL: @test_vfms_n_f32(
3112 // CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
3113 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
3114 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
3115 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3116 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
3117 // CHECK:   [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
3118 // CHECK:   [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> [[VECINIT1_I]], <2 x float> %a)
3119 // CHECK:   ret <2 x float> [[TMP3]]
// Exercises vfms_n_f32. The expected IR above first computes -0.0 - %b with
// fsub, then calls @llvm.fma.v2f32 with operands (that result, a <2 x float>
// vector holding %n in both lanes, %a).
3120 float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
3121   return vfms_n_f32(a, b, n);
3122 }
3123
3124 // CHECK-LABEL: @test_vfms_n_f64(
3125 // CHECK:   [[SUB_I:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
3126 // CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x double> undef, double %n, i32 0
3127 // CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
3128 // CHECK:   [[TMP1:%.*]] = bitcast <1 x double> [[SUB_I]] to <8 x i8>
3129 // CHECK:   [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
3130 // CHECK:   [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[SUB_I]], <1 x double> [[VECINIT_I]], <1 x double> %a)
3131 // CHECK:   ret <1 x double> [[TMP3]]
// Exercises vfms_n_f64. The expected IR above first computes -0.0 - %b with
// fsub, then calls @llvm.fma.v1f64 with operands (that result, a
// <1 x double> vector holding %n, %a).
3132 float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
3133   return vfms_n_f64(a, b, n);
3134 }
3135
3136 // CHECK-LABEL: @test_vfmsq_n_f32(
3137 // CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
3138 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
3139 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
3140 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
3141 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
3142 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3143 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
3144 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
3145 // CHECK:   [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> [[VECINIT3_I]], <4 x float> %a)
3146 // CHECK:   ret <4 x float> [[TMP3]]
// Exercises vfmsq_n_f32. The expected IR above first computes -0.0 - %b with
// fsub, then calls @llvm.fma.v4f32 with operands (that result, a <4 x float>
// vector holding %n in all four lanes, %a).
3147 float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
3148   return vfmsq_n_f32(a, b, n);
3149 }
3150
3151 // CHECK-LABEL: @test_vmul_n_s16(
3152 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3153 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3154 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3155 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3156 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
3157 // CHECK:   ret <4 x i16> [[MUL_I]]
// Exercises vmul_n_s16. The expected IR above inserts %b into all four lanes
// of a <4 x i16> vector and multiplies that vector with %a using mul.
3158 int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
3159   return vmul_n_s16(a, b);
3160 }
3161
3162 // CHECK-LABEL: @test_vmulq_n_s16(
3163 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3164 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3165 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3166 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3167 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3168 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3169 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3170 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3171 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
3172 // CHECK:   ret <8 x i16> [[MUL_I]]
// Exercises vmulq_n_s16. The expected IR above inserts %b into all eight
// lanes of an <8 x i16> vector and multiplies that vector with %a using mul.
3173 int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
3174   return vmulq_n_s16(a, b);
3175 }
3176
3177 // CHECK-LABEL: @test_vmul_n_s32(
3178 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3179 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3180 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
3181 // CHECK:   ret <2 x i32> [[MUL_I]]
// Codegen input for vmul_n_s32. Per the expectations above, scalar `b` is
// inserted into both lanes of a <2 x i32> and the result is a plain IR `mul`
// of `a` with that splat.
3182 int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
3183   return vmul_n_s32(a, b);
3184 }
3185
3186 // CHECK-LABEL: @test_vmulq_n_s32(
3187 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3188 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3189 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3190 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3191 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
3192 // CHECK:   ret <4 x i32> [[MUL_I]]
// Codegen input for vmulq_n_s32. Per the expectations above, scalar `b` is
// inserted into all 4 lanes of a <4 x i32> and the result is a plain IR `mul`
// of `a` with that splat.
3193 int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
3194   return vmulq_n_s32(a, b);
3195 }
3196
3197 // CHECK-LABEL: @test_vmul_n_u16(
3198 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3199 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3200 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3201 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3202 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
3203 // CHECK:   ret <4 x i16> [[MUL_I]]
// Codegen input for vmul_n_u16. The expectations above are the same shape as
// for the signed 16x4 case: a 4-lane splat of `b` followed by `mul <4 x i16>`.
3204 uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
3205   return vmul_n_u16(a, b);
3206 }
3207
3208 // CHECK-LABEL: @test_vmulq_n_u16(
3209 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3210 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3211 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3212 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3213 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3214 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3215 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3216 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3217 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
3218 // CHECK:   ret <8 x i16> [[MUL_I]]
// Codegen input for vmulq_n_u16. The expectations above require an 8-lane
// splat of `b` followed by `mul <8 x i16>` with `a`.
3219 uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
3220   return vmulq_n_u16(a, b);
3221 }
3222
3223 // CHECK-LABEL: @test_vmul_n_u32(
3224 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3225 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3226 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
3227 // CHECK:   ret <2 x i32> [[MUL_I]]
// Codegen input for vmul_n_u32. The expectations above require a 2-lane splat
// of `b` followed by `mul <2 x i32>` with `a`.
3228 uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
3229   return vmul_n_u32(a, b);
3230 }
3231
3232 // CHECK-LABEL: @test_vmulq_n_u32(
3233 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3234 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3235 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3236 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3237 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
3238 // CHECK:   ret <4 x i32> [[MUL_I]]
// Codegen input for vmulq_n_u32. The expectations above require a 4-lane splat
// of `b` followed by `mul <4 x i32>` with `a`.
3239 uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
3240   return vmulq_n_u32(a, b);
3241 }
3242
3243 // CHECK-LABEL: @test_vmull_n_s16(
3244 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3245 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3246 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3247 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3248 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3249 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3250 // CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
3251 // CHECK:   ret <4 x i32> [[VMULL5_I]]
// Codegen input for vmull_n_s16 (widening multiply by scalar). The
// expectations above require a 4-lane splat of `b` and a call to
// @llvm.aarch64.neon.smull.v4i32 on `a` and the splat. The <8 x i8> bitcasts
// are matched too, but the matched call takes the un-cast operands.
3252 int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
3253   return vmull_n_s16(a, b);
3254 }
3255
3256 // CHECK-LABEL: @test_vmull_n_s32(
3257 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3258 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3259 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3260 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3261 // CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
3262 // CHECK:   ret <2 x i64> [[VMULL3_I]]
// Codegen input for vmull_n_s32 (widening multiply by scalar). The
// expectations above require a 2-lane splat of `b` and a call to
// @llvm.aarch64.neon.smull.v2i64 on `a` and the splat.
3263 int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
3264   return vmull_n_s32(a, b);
3265 }
3266
3267 // CHECK-LABEL: @test_vmull_n_u16(
3268 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3269 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3270 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3271 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3272 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3273 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3274 // CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
3275 // CHECK:   ret <4 x i32> [[VMULL5_I]]
// Codegen input for vmull_n_u16. Same shape as the signed variant, except the
// expectations above require the unsigned intrinsic
// @llvm.aarch64.neon.umull.v4i32.
3276 uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
3277   return vmull_n_u16(a, b);
3278 }
3279
3280 // CHECK-LABEL: @test_vmull_n_u32(
3281 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3282 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3283 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3284 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3285 // CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
3286 // CHECK:   ret <2 x i64> [[VMULL3_I]]
// Codegen input for vmull_n_u32. The expectations above require a 2-lane splat
// of `b` and a call to the unsigned intrinsic @llvm.aarch64.neon.umull.v2i64.
3287 uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
3288   return vmull_n_u32(a, b);
3289 }
3290
3291 // CHECK-LABEL: @test_vqdmull_n_s16(
3292 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3293 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3294 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3295 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3296 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3297 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3298 // CHECK:   [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
3299 // CHECK:   [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
3300 // CHECK:   ret <4 x i32> [[VQDMULL_V5_I]]
// Codegen input for vqdmull_n_s16. The expectations above require a 4-lane
// splat of `b` and a call to @llvm.aarch64.neon.sqdmull.v4i32 on `a` and the
// splat. The trailing bitcast to <16 x i8> is matched, but the function
// returns the call result itself.
3301 int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
3302   return vqdmull_n_s16(a, b);
3303 }
3304
3305 // CHECK-LABEL: @test_vqdmull_n_s32(
3306 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3307 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3308 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3309 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3310 // CHECK:   [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
3311 // CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
3312 // CHECK:   ret <2 x i64> [[VQDMULL_V3_I]]
// Codegen input for vqdmull_n_s32. The expectations above require a 2-lane
// splat of `b` and a call to @llvm.aarch64.neon.sqdmull.v2i64 on `a` and the
// splat; the call result is what gets returned.
3313 int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
3314   return vqdmull_n_s32(a, b);
3315 }
3316
3317 // CHECK-LABEL: @test_vqdmulh_n_s16(
3318 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3319 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3320 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3321 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3322 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3323 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3324 // CHECK:   [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
3325 // CHECK:   [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
3326 // CHECK:   ret <4 x i16> [[VQDMULH_V5_I]]
// Codegen input for vqdmulh_n_s16. The expectations above require a 4-lane
// splat of `b` and a call to @llvm.aarch64.neon.sqdmulh.v4i16 on `a` and the
// splat; the call result is what gets returned.
3327 int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
3328   return vqdmulh_n_s16(a, b);
3329 }
3330
3331 // CHECK-LABEL: @test_vqdmulhq_n_s16(
3332 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3333 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3334 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3335 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3336 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3337 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3338 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3339 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3340 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3341 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
3342 // CHECK:   [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]])
3343 // CHECK:   [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
3344 // CHECK:   ret <8 x i16> [[VQDMULHQ_V9_I]]
// Codegen input for vqdmulhq_n_s16. The expectations above require an 8-lane
// splat of `b` and a call to @llvm.aarch64.neon.sqdmulh.v8i16 on `a` and the
// splat; the call result is what gets returned.
3345 int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
3346   return vqdmulhq_n_s16(a, b);
3347 }
3348
3349 // CHECK-LABEL: @test_vqdmulh_n_s32(
3350 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3351 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3352 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3353 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3354 // CHECK:   [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
3355 // CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
3356 // CHECK:   ret <2 x i32> [[VQDMULH_V3_I]]
// Codegen input for vqdmulh_n_s32. The expectations above require a 2-lane
// splat of `b` and a call to @llvm.aarch64.neon.sqdmulh.v2i32 on `a` and the
// splat; the call result is what gets returned.
3357 int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
3358   return vqdmulh_n_s32(a, b);
3359 }
3360
3361 // CHECK-LABEL: @test_vqdmulhq_n_s32(
3362 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3363 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3364 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3365 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3366 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3367 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
3368 // CHECK:   [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]])
3369 // CHECK:   [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
3370 // CHECK:   ret <4 x i32> [[VQDMULHQ_V5_I]]
// Codegen input for vqdmulhq_n_s32. The expectations above require a 4-lane
// splat of `b` and a call to @llvm.aarch64.neon.sqdmulh.v4i32 on `a` and the
// splat; the call result is what gets returned.
3371 int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
3372   return vqdmulhq_n_s32(a, b);
3373 }
3374
3375 // CHECK-LABEL: @test_vqrdmulh_n_s16(
3376 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3377 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3378 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3379 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3380 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3381 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3382 // CHECK:   [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
3383 // CHECK:   [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
3384 // CHECK:   ret <4 x i16> [[VQRDMULH_V5_I]]
// Codegen input for vqrdmulh_n_s16. The expectations above require a 4-lane
// splat of `b` and a call to @llvm.aarch64.neon.sqrdmulh.v4i16 on `a` and the
// splat; the call result is what gets returned.
3385 int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
3386   return vqrdmulh_n_s16(a, b);
3387 }
3388
3389 // CHECK-LABEL: @test_vqrdmulhq_n_s16(
3390 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3391 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3392 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3393 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3394 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3395 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3396 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3397 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3398 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3399 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
3400 // CHECK:   [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]])
3401 // CHECK:   [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
3402 // CHECK:   ret <8 x i16> [[VQRDMULHQ_V9_I]]
// Codegen input for vqrdmulhq_n_s16. The expectations above require an 8-lane
// splat of `b` and a call to @llvm.aarch64.neon.sqrdmulh.v8i16 on `a` and the
// splat; the call result is what gets returned.
3403 int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
3404   return vqrdmulhq_n_s16(a, b);
3405 }
3406
3407 // CHECK-LABEL: @test_vqrdmulh_n_s32(
3408 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3409 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3410 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3411 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3412 // CHECK:   [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
3413 // CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
3414 // CHECK:   ret <2 x i32> [[VQRDMULH_V3_I]]
// Codegen input for vqrdmulh_n_s32. The expectations above require a 2-lane
// splat of `b` and a call to @llvm.aarch64.neon.sqrdmulh.v2i32 on `a` and the
// splat; the call result is what gets returned.
3415 int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
3416   return vqrdmulh_n_s32(a, b);
3417 }
3418
3419 // CHECK-LABEL: @test_vqrdmulhq_n_s32(
3420 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3421 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3422 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3423 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3424 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3425 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
3426 // CHECK:   [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]])
3427 // CHECK:   [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
3428 // CHECK:   ret <4 x i32> [[VQRDMULHQ_V5_I]]
// Codegen input for vqrdmulhq_n_s32. The expectations above require a 4-lane
// splat of `b` and a call to @llvm.aarch64.neon.sqrdmulh.v4i32 on `a` and the
// splat; the call result is what gets returned.
3429 int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
3430   return vqrdmulhq_n_s32(a, b);
3431 }
3432
3433 // CHECK-LABEL: @test_vmla_n_s16(
3434 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3435 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3436 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3437 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3438 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3439 // CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
3440 // CHECK:   ret <4 x i16> [[ADD_I]]
// Codegen input for vmla_n_s16 (multiply-accumulate by scalar). The
// expectations above require a 4-lane splat of `c`, `mul <4 x i16>` of `b`
// with the splat, then `add` of `a` and the product.
3441 int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
3442   return vmla_n_s16(a, b, c);
3443 }
3444
3445 // CHECK-LABEL: @test_vmlaq_n_s16(
3446 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3447 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3448 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3449 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3450 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3451 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3452 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3453 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3454 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3455 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
3456 // CHECK:   ret <8 x i16> [[ADD_I]]
// Codegen input for vmlaq_n_s16. The expectations above require an 8-lane
// splat of `c`, `mul <8 x i16>` of `b` with the splat, then `add` of `a` and
// the product.
3457 int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
3458   return vmlaq_n_s16(a, b, c);
3459 }
3460
3461 // CHECK-LABEL: @test_vmla_n_s32(
3462 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3463 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3464 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3465 // CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
3466 // CHECK:   ret <2 x i32> [[ADD_I]]
// Codegen input for vmla_n_s32. The expectations above require a 2-lane splat
// of `c`, `mul <2 x i32>` of `b` with the splat, then `add` of `a` and the
// product.
3467 int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
3468   return vmla_n_s32(a, b, c);
3469 }
3470
3471 // CHECK-LABEL: @test_vmlaq_n_s32(
3472 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3473 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3474 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3475 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3476 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3477 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
3478 // CHECK:   ret <4 x i32> [[ADD_I]]
// Codegen input for vmlaq_n_s32. The expectations above require a 4-lane splat
// of `c`, `mul <4 x i32>` of `b` with the splat, then `add` of `a` and the
// product.
3479 int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
3480   return vmlaq_n_s32(a, b, c);
3481 }
3482
3483 // CHECK-LABEL: @test_vmla_n_u16(
3484 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3485 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3486 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3487 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3488 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3489 // CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
3490 // CHECK:   ret <4 x i16> [[ADD_I]]
// Codegen input for vmla_n_u16. The expectations above require a 4-lane splat
// of `c`, `mul <4 x i16>` of `b` with the splat, then `add` of `a` and the
// product.
3491 uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
3492   return vmla_n_u16(a, b, c);
3493 }
3494
3495 // CHECK-LABEL: @test_vmlaq_n_u16(
3496 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3497 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3498 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3499 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3500 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3501 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3502 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3503 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3504 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3505 // CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
3506 // CHECK:   ret <8 x i16> [[ADD_I]]
// Codegen input for vmlaq_n_u16. The expectations above require an 8-lane
// splat of `c`, `mul <8 x i16>` of `b` with the splat, then `add` of `a` and
// the product.
3507 uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
3508   return vmlaq_n_u16(a, b, c);
3509 }
3510
3511 // CHECK-LABEL: @test_vmla_n_u32(
3512 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3513 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3514 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3515 // CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
3516 // CHECK:   ret <2 x i32> [[ADD_I]]
// Codegen input for vmla_n_u32. The expectations above require a 2-lane splat
// of `c`, `mul <2 x i32>` of `b` with the splat, then `add` of `a` and the
// product.
3517 uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
3518   return vmla_n_u32(a, b, c);
3519 }
3520
3521 // CHECK-LABEL: @test_vmlaq_n_u32(
3522 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3523 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3524 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3525 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3526 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3527 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
3528 // CHECK:   ret <4 x i32> [[ADD_I]]
// Codegen input for vmlaq_n_u32. The expectations above require a 4-lane splat
// of `c`, `mul <4 x i32>` of `b` with the splat, then `add` of `a` and the
// product.
3529 uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
3530   return vmlaq_n_u32(a, b, c);
3531 }
3532
3533 // CHECK-LABEL: @test_vmlal_n_s16(
3534 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3535 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3536 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3537 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3538 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3539 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3540 // CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
3541 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
3542 // CHECK:   ret <4 x i32> [[ADD_I]]
// Codegen input for vmlal_n_s16 (widening multiply-accumulate by scalar). The
// expectations above require a 4-lane splat of `c`, a call to
// @llvm.aarch64.neon.smull.v4i32 on `b` and the splat, then
// `add <4 x i32>` of `a` and the widened product.
3543 int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
3544   return vmlal_n_s16(a, b, c);
3545 }
3546
3547 // CHECK-LABEL: @test_vmlal_n_s32(
3548 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3549 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3550 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3551 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3552 // CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
3553 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
3554 // CHECK:   ret <2 x i64> [[ADD_I]]
// Codegen input for vmlal_n_s32. The expectations above require a 2-lane splat
// of `c`, a call to @llvm.aarch64.neon.smull.v2i64 on `b` and the splat, then
// `add <2 x i64>` of `a` and the widened product.
3555 int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
3556   return vmlal_n_s32(a, b, c);
3557 }
3558
3559 // CHECK-LABEL: @test_vmlal_n_u16(
3560 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3561 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3562 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3563 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3564 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3565 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3566 // CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
3567 // CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
3568 // CHECK:   ret <4 x i32> [[ADD_I]]
// Codegen input for vmlal_n_u16. The expectations above require a 4-lane splat
// of `c`, a call to the unsigned intrinsic @llvm.aarch64.neon.umull.v4i32 on
// `b` and the splat, then `add <4 x i32>` of `a` and the widened product.
3569 uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
3570   return vmlal_n_u16(a, b, c);
3571 }
3572
3573 // CHECK-LABEL: @test_vmlal_n_u32(
3574 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3575 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3576 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3577 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3578 // CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
3579 // CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
3580 // CHECK:   ret <2 x i64> [[ADD_I]]
// Codegen input for vmlal_n_u32. The expectations above require a 2-lane splat
// of `c`, a call to the unsigned intrinsic @llvm.aarch64.neon.umull.v2i64 on
// `b` and the splat, then `add <2 x i64>` of `a` and the widened product.
3581 uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
3582   return vmlal_n_u32(a, b, c);
3583 }
3584
3585 // CHECK-LABEL: @test_vqdmlal_n_s16(
3586 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3587 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3588 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3589 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3590 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3591 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3592 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3593 // CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
3594 // CHECK:   [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]])
3595 // CHECK:   ret <4 x i32> [[VQDMLAL_V6_I]]
// Codegen input for vqdmlal_n_s16. The expectations above require a 4-lane
// splat of `c`, a call to @llvm.aarch64.neon.sqdmull.v4i32 on `b` and the
// splat, then a call to @llvm.aarch64.neon.sqadd.v4i32 combining `a` with
// that product.
3596 int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
3597   return vqdmlal_n_s16(a, b, c);
3598 }
3599
3600 // CHECK-LABEL: @test_vqdmlal_n_s32(
3601 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3602 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3603 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3604 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3605 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3606 // CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
3607 // CHECK:   [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]])
3608 // CHECK:   ret <2 x i64> [[VQDMLAL_V4_I]]
// Codegen input for vqdmlal_n_s32. The expectations above require a 2-lane
// splat of `c`, a call to @llvm.aarch64.neon.sqdmull.v2i64 on `b` and the
// splat, then a call to @llvm.aarch64.neon.sqadd.v2i64 combining `a` with
// that product.
3609 int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
3610   return vqdmlal_n_s32(a, b, c);
3611 }
3612
3613 // CHECK-LABEL: @test_vmls_n_s16(
3614 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3615 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3616 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3617 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3618 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3619 // CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
3620 // CHECK:   ret <4 x i16> [[SUB_I]]
// Codegen input for vmls_n_s16 (multiply-subtract by scalar). The expectations
// above require a 4-lane splat of `c`, `mul <4 x i16>` of `b` with the splat,
// then `sub` of the product from `a`.
3621 int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
3622   return vmls_n_s16(a, b, c);
3623 }
3624
3625 // CHECK-LABEL: @test_vmlsq_n_s16(
3626 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3627 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3628 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3629 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3630 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3631 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3632 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3633 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3634 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3635 // CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
3636 // CHECK:   ret <8 x i16> [[SUB_I]]
// Codegen input for vmlsq_n_s16. The expectations above require an 8-lane
// splat of `c`, `mul <8 x i16>` of `b` with the splat, then `sub` of the
// product from `a`.
3637 int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
3638   return vmlsq_n_s16(a, b, c);
3639 }
3640
3641 // CHECK-LABEL: @test_vmls_n_s32(
3642 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3643 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3644 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3645 // CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
3646 // CHECK:   ret <2 x i32> [[SUB_I]]
// Codegen input for vmls_n_s32. The expectations above require a 2-lane splat
// of `c`, `mul <2 x i32>` of `b` with the splat, then `sub` of the product
// from `a`.
3647 int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
3648   return vmls_n_s32(a, b, c);
3649 }
3650
3651 // CHECK-LABEL: @test_vmlsq_n_s32(
3652 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3653 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3654 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3655 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3656 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3657 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
3658 // CHECK:   ret <4 x i32> [[SUB_I]]
// Codegen input for vmlsq_n_s32. The expectations above require a 4-lane splat
// of `c`, `mul <4 x i32>` of `b` with the splat, then `sub` of the product
// from `a`.
3659 int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
3660   return vmlsq_n_s32(a, b, c);
3661 }
3662
3663 // CHECK-LABEL: @test_vmls_n_u16(
3664 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3665 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3666 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3667 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3668 // CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3669 // CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
3670 // CHECK:   ret <4 x i16> [[SUB_I]]
// Codegen input for vmls_n_u16. The expectations above require a 4-lane splat
// of `c`, `mul <4 x i16>` of `b` with the splat, then `sub` of the product
// from `a`.
3671 uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
3672   return vmls_n_u16(a, b, c);
3673 }
3674
3675 // CHECK-LABEL: @test_vmlsq_n_u16(
3676 // CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3677 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3678 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3679 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3680 // CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3681 // CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3682 // CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3683 // CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3684 // CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3685 // CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
3686 // CHECK:   ret <8 x i16> [[SUB_I]]
// Codegen input for vmlsq_n_u16. The expectations above require an 8-lane
// splat of `c`, `mul <8 x i16>` of `b` with the splat, then `sub` of the
// product from `a`.
3687 uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
3688   return vmlsq_n_u16(a, b, c);
3689 }
3690
3691 // CHECK-LABEL: @test_vmls_n_u32(
3692 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3693 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3694 // CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3695 // CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
3696 // CHECK:   ret <2 x i32> [[SUB_I]]
// Codegen input for vmls_n_u32. The expectations above require a 2-lane splat
// of `c`, `mul <2 x i32>` of `b` with the splat, then `sub` of the product
// from `a`.
3697 uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
3698   return vmls_n_u32(a, b, c);
3699 }
3700
3701 // CHECK-LABEL: @test_vmlsq_n_u32(
3702 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3703 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3704 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3705 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3706 // CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3707 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
3708 // CHECK:   ret <4 x i32> [[SUB_I]]
// Forwards a, b and the scalar c unchanged to vmlsq_n_u32. The FileCheck
// patterns above expect c to be inserted into all four i32 lanes, that
// vector multiplied by b, and the product subtracted from a.
3709 uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
3710   return vmlsq_n_u32(a, b, c);
3711 }
3712
3713 // CHECK-LABEL: @test_vmlsl_n_s16(
3714 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3715 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3716 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3717 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3718 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3719 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3720 // CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
3721 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
3722 // CHECK:   ret <4 x i32> [[SUB_I]]
// Forwards a, b and the scalar c unchanged to vmlsl_n_s16. The FileCheck
// patterns above expect c to be inserted into four i16 lanes, a call to
// @llvm.aarch64.neon.smull.v4i32 on b and that vector, and the <4 x i32>
// result subtracted from a.
3723 int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
3724   return vmlsl_n_s16(a, b, c);
3725 }
3726
3727 // CHECK-LABEL: @test_vmlsl_n_s32(
3728 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3729 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3730 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3731 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3732 // CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
3733 // CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
3734 // CHECK:   ret <2 x i64> [[SUB_I]]
// Forwards a, b and the scalar c unchanged to vmlsl_n_s32. The FileCheck
// patterns above expect c to be inserted into two i32 lanes, a call to
// @llvm.aarch64.neon.smull.v2i64 on b and that vector, and the <2 x i64>
// result subtracted from a.
3735 int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
3736   return vmlsl_n_s32(a, b, c);
3737 }
3738
3739 // CHECK-LABEL: @test_vmlsl_n_u16(
3740 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3741 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3742 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3743 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3744 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3745 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3746 // CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
3747 // CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
3748 // CHECK:   ret <4 x i32> [[SUB_I]]
// Forwards a, b and the scalar c unchanged to vmlsl_n_u16. The FileCheck
// patterns above expect c to be inserted into four i16 lanes, a call to
// @llvm.aarch64.neon.umull.v4i32 on b and that vector, and the <4 x i32>
// result subtracted from a.
3749 uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
3750   return vmlsl_n_u16(a, b, c);
3751 }
3752
3753 // CHECK-LABEL: @test_vmlsl_n_u32(
3754 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3755 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3756 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3757 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3758 // CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
3759 // CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
3760 // CHECK:   ret <2 x i64> [[SUB_I]]
// Forwards a, b and the scalar c unchanged to vmlsl_n_u32. The FileCheck
// patterns above expect c to be inserted into two i32 lanes, a call to
// @llvm.aarch64.neon.umull.v2i64 on b and that vector, and the <2 x i64>
// result subtracted from a.
3761 uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
3762   return vmlsl_n_u32(a, b, c);
3763 }
3764
3765 // CHECK-LABEL: @test_vqdmlsl_n_s16(
3766 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3767 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3768 // CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3769 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3770 // CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3771 // CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3772 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3773 // CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
3774 // CHECK:   [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]])
3775 // CHECK:   ret <4 x i32> [[VQDMLSL_V6_I]]
// Forwards a, b and the scalar c unchanged to vqdmlsl_n_s16. The FileCheck
// patterns above expect c to be inserted into four i16 lanes, a call to
// @llvm.aarch64.neon.sqdmull.v4i32 on b and that vector, and then a call to
// @llvm.aarch64.neon.sqsub.v4i32 on a and the sqdmull result.
3776 int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
3777   return vqdmlsl_n_s16(a, b, c);
3778 }
3779
3780 // CHECK-LABEL: @test_vqdmlsl_n_s32(
3781 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3782 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3783 // CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3784 // CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3785 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3786 // CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
3787 // CHECK:   [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]])
3788 // CHECK:   ret <2 x i64> [[VQDMLSL_V4_I]]
// Forwards a, b and the scalar c unchanged to vqdmlsl_n_s32. The FileCheck
// patterns above expect c to be inserted into two i32 lanes, a call to
// @llvm.aarch64.neon.sqdmull.v2i64 on b and that vector, and then a call to
// @llvm.aarch64.neon.sqsub.v2i64 on a and the sqdmull result.
3789 int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
3790   return vqdmlsl_n_s32(a, b, c);
3791 }
3792
3793 // CHECK-LABEL: @test_vmla_lane_u16_0(
3794 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
3795 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
3796 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
3797 // CHECK:   ret <4 x i16> [[ADD]]
// Calls vmla_lane_u16 with lane index 0. The FileCheck patterns above expect
// a shufflevector of v with a zeroinitializer mask (lane 0 in every element),
// multiplied by b and added to a as <4 x i16>.
3798 uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
3799   return vmla_lane_u16(a, b, v, 0);
3800 }
3801
3802 // CHECK-LABEL: @test_vmlaq_lane_u16_0(
3803 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
3804 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
3805 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
3806 // CHECK:   ret <8 x i16> [[ADD]]
// Calls vmlaq_lane_u16 with lane index 0. The FileCheck patterns above expect
// the <4 x i16> v to be shuffled with a zeroinitializer mask into <8 x i16>,
// multiplied by b and added to a.
3807 uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
3808   return vmlaq_lane_u16(a, b, v, 0);
3809 }
3810
3811 // CHECK-LABEL: @test_vmla_lane_u32_0(
3812 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
3813 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
3814 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
3815 // CHECK:   ret <2 x i32> [[ADD]]
// Calls vmla_lane_u32 with lane index 0. The FileCheck patterns above expect
// a shufflevector of v with a zeroinitializer mask, multiplied by b and added
// to a as <2 x i32>.
3816 uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
3817   return vmla_lane_u32(a, b, v, 0);
3818 }
3819
3820 // CHECK-LABEL: @test_vmlaq_lane_u32_0(
3821 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
3822 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
3823 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
3824 // CHECK:   ret <4 x i32> [[ADD]]
// Calls vmlaq_lane_u32 with lane index 0. The FileCheck patterns above expect
// the <2 x i32> v to be shuffled with a zeroinitializer mask into <4 x i32>,
// multiplied by b and added to a.
3825 uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
3826   return vmlaq_lane_u32(a, b, v, 0);
3827 }
3828
3829 // CHECK-LABEL: @test_vmla_laneq_u16_0(
3830 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3831 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
3832 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
3833 // CHECK:   ret <4 x i16> [[ADD]]
// Calls vmla_laneq_u16 with lane index 0. The FileCheck patterns above expect
// the <8 x i16> v to be shuffled with a zeroinitializer mask down to
// <4 x i16>, multiplied by b and added to a.
3834 uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
3835   return vmla_laneq_u16(a, b, v, 0);
3836 }
3837
3838 // CHECK-LABEL: @test_vmlaq_laneq_u16_0(
3839 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
3840 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
3841 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
3842 // CHECK:   ret <8 x i16> [[ADD]]
// Calls vmlaq_laneq_u16 with lane index 0. The FileCheck patterns above expect
// a shufflevector of v with a zeroinitializer mask, multiplied by b and added
// to a as <8 x i16>.
3843 uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
3844   return vmlaq_laneq_u16(a, b, v, 0);
3845 }
3846
3847 // CHECK-LABEL: @test_vmla_laneq_u32_0(
3848 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
3849 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
3850 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
3851 // CHECK:   ret <2 x i32> [[ADD]]
// Calls vmla_laneq_u32 with lane index 0. The FileCheck patterns above expect
// the <4 x i32> v to be shuffled with a zeroinitializer mask down to
// <2 x i32>, multiplied by b and added to a.
3852 uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
3853   return vmla_laneq_u32(a, b, v, 0);
3854 }
3855
3856 // CHECK-LABEL: @test_vmlaq_laneq_u32_0(
3857 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
3858 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
3859 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
3860 // CHECK:   ret <4 x i32> [[ADD]]
// Calls vmlaq_laneq_u32 with lane index 0. The FileCheck patterns above expect
// a shufflevector of v with a zeroinitializer mask, multiplied by b and added
// to a as <4 x i32>.
3861 uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
3862   return vmlaq_laneq_u32(a, b, v, 0);
3863 }
3864
3865 // CHECK-LABEL: @test_vqdmlal_laneq_s16_0(
3866 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3867 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3868 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3869 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
3870 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
3871 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
3872 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
// Calls vqdmlal_laneq_s16 with lane index 0. The FileCheck patterns above
// expect v shuffled with a zeroinitializer mask to <4 x i16>, a call to
// @llvm.aarch64.neon.sqdmull.v4i32 on b and the shuffle, then a call to
// @llvm.aarch64.neon.sqadd.v4i32 on a and that result.
3873 int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
3874   return vqdmlal_laneq_s16(a, b, v, 0);
3875 }
3876
3877 // CHECK-LABEL: @test_vqdmlal_laneq_s32_0(
3878 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
3879 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3880 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3881 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
3882 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
3883 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
3884 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
// Calls vqdmlal_laneq_s32 with lane index 0. The FileCheck patterns above
// expect v shuffled with a zeroinitializer mask to <2 x i32>, a call to
// @llvm.aarch64.neon.sqdmull.v2i64 on b and the shuffle, then a call to
// @llvm.aarch64.neon.sqadd.v2i64 on a and that result.
3885 int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
3886   return vqdmlal_laneq_s32(a, b, v, 0);
3887 }
3888
3889 // CHECK-LABEL: @test_vqdmlal_high_laneq_s16_0(
3890 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3891 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3892 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3893 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
3894 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
3895 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
3896 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
3897 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
// Calls vqdmlal_high_laneq_s16 with lane index 0. The FileCheck patterns above
// expect elements 4..7 of b to be extracted by one shufflevector, v shuffled
// with a zeroinitializer mask by another, both fed to
// @llvm.aarch64.neon.sqdmull.v4i32, and the result combined with a through
// @llvm.aarch64.neon.sqadd.v4i32.
3898 int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
3899   return vqdmlal_high_laneq_s16(a, b, v, 0);
3900 }
3901
3902 // CHECK-LABEL: @test_vqdmlal_high_laneq_s32_0(
3903 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
3904 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
3905 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3906 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
3907 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
3908 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
3909 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
3910 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
// Calls vqdmlal_high_laneq_s32 with lane index 0. The FileCheck patterns above
// expect elements 2..3 of b to be extracted by one shufflevector, v shuffled
// with a zeroinitializer mask by another, both fed to
// @llvm.aarch64.neon.sqdmull.v2i64, and the result combined with a through
// @llvm.aarch64.neon.sqadd.v2i64.
3911 int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
3912   return vqdmlal_high_laneq_s32(a, b, v, 0);
3913 }
3914
3915 // CHECK-LABEL: @test_vmls_lane_u16_0(
3916 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
3917 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
3918 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
3919 // CHECK:   ret <4 x i16> [[SUB]]
// Calls vmls_lane_u16 with lane index 0. The FileCheck patterns above expect
// a shufflevector of v with a zeroinitializer mask, multiplied by b and
// subtracted from a as <4 x i16>.
3920 uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
3921   return vmls_lane_u16(a, b, v, 0);
3922 }
3923
3924 // CHECK-LABEL: @test_vmlsq_lane_u16_0(
3925 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
3926 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
3927 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
3928 // CHECK:   ret <8 x i16> [[SUB]]
// Calls vmlsq_lane_u16 with lane index 0. The FileCheck patterns above expect
// the <4 x i16> v to be shuffled with a zeroinitializer mask into <8 x i16>,
// multiplied by b and subtracted from a.
3929 uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
3930   return vmlsq_lane_u16(a, b, v, 0);
3931 }
3932
3933 // CHECK-LABEL: @test_vmls_lane_u32_0(
3934 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
3935 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
3936 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
3937 // CHECK:   ret <2 x i32> [[SUB]]
// Calls vmls_lane_u32 with lane index 0. The FileCheck patterns above expect
// a shufflevector of v with a zeroinitializer mask, multiplied by b and
// subtracted from a as <2 x i32>.
3938 uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
3939   return vmls_lane_u32(a, b, v, 0);
3940 }
3941
3942 // CHECK-LABEL: @test_vmlsq_lane_u32_0(
3943 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
3944 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
3945 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
3946 // CHECK:   ret <4 x i32> [[SUB]]
// Calls vmlsq_lane_u32 with lane index 0. The FileCheck patterns above expect
// the <2 x i32> v to be shuffled with a zeroinitializer mask into <4 x i32>,
// multiplied by b and subtracted from a.
3947 uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
3948   return vmlsq_lane_u32(a, b, v, 0);
3949 }
3950
3951 // CHECK-LABEL: @test_vmls_laneq_u16_0(
3952 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3953 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
3954 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
3955 // CHECK:   ret <4 x i16> [[SUB]]
// Calls vmls_laneq_u16 with lane index 0. The FileCheck patterns above expect
// the <8 x i16> v to be shuffled with a zeroinitializer mask down to
// <4 x i16>, multiplied by b and subtracted from a.
3956 uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
3957   return vmls_laneq_u16(a, b, v, 0);
3958 }
3959
3960 // CHECK-LABEL: @test_vmlsq_laneq_u16_0(
3961 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
3962 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
3963 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
3964 // CHECK:   ret <8 x i16> [[SUB]]
// Calls vmlsq_laneq_u16 with lane index 0. The FileCheck patterns above expect
// a shufflevector of v with a zeroinitializer mask, multiplied by b and
// subtracted from a as <8 x i16>.
3965 uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
3966   return vmlsq_laneq_u16(a, b, v, 0);
3967 }
3968
3969 // CHECK-LABEL: @test_vmls_laneq_u32_0(
3970 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
3971 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
3972 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
3973 // CHECK:   ret <2 x i32> [[SUB]]
// Calls vmls_laneq_u32 with lane index 0. The FileCheck patterns above expect
// the <4 x i32> v to be shuffled with a zeroinitializer mask down to
// <2 x i32>, multiplied by b and subtracted from a.
3974 uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
3975   return vmls_laneq_u32(a, b, v, 0);
3976 }
3977
3978 // CHECK-LABEL: @test_vmlsq_laneq_u32_0(
3979 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
3980 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
3981 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
3982 // CHECK:   ret <4 x i32> [[SUB]]
// Calls vmlsq_laneq_u32 with lane index 0. The FileCheck patterns above expect
// a shufflevector of v with a zeroinitializer mask, multiplied by b and
// subtracted from a as <4 x i32>.
3983 uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
3984   return vmlsq_laneq_u32(a, b, v, 0);
3985 }
3986
3987 // CHECK-LABEL: @test_vqdmlsl_laneq_s16_0(
3988 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
3989 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3990 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3991 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
3992 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
3993 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
3994 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_laneq_s16 with lane index 0. The FileCheck patterns above
// expect v shuffled with a zeroinitializer mask to <4 x i16>, a call to
// @llvm.aarch64.neon.sqdmull.v4i32 on b and the shuffle, then a call to
// @llvm.aarch64.neon.sqsub.v4i32 on a and that result.
3995 int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
3996   return vqdmlsl_laneq_s16(a, b, v, 0);
3997 }
3998
3999 // CHECK-LABEL: @test_vqdmlsl_laneq_s32_0(
4000 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4001 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4002 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4003 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4004 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
4005 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
4006 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_laneq_s32 with lane index 0. The FileCheck patterns above
// expect v shuffled with a zeroinitializer mask to <2 x i32>, a call to
// @llvm.aarch64.neon.sqdmull.v2i64 on b and the shuffle, then a call to
// @llvm.aarch64.neon.sqsub.v2i64 on a and that result.
4007 int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
4008   return vqdmlsl_laneq_s32(a, b, v, 0);
4009 }
4010
4011 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16_0(
4012 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
4013 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
4014 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4015 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
4016 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4017 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
4018 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
4019 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_high_laneq_s16 with lane index 0. The FileCheck patterns above
// expect elements 4..7 of b to be extracted by one shufflevector, v shuffled
// with a zeroinitializer mask by another, both fed to
// @llvm.aarch64.neon.sqdmull.v4i32, and the result combined with a through
// @llvm.aarch64.neon.sqsub.v4i32.
4020 int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
4021   return vqdmlsl_high_laneq_s16(a, b, v, 0);
4022 }
4023
4024 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32_0(
4025 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
4026 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4027 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4028 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
4029 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4030 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
4031 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
4032 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
// Calls vqdmlsl_high_laneq_s32 with lane index 0. The FileCheck patterns above
// expect elements 2..3 of b to be extracted by one shufflevector, v shuffled
// with a zeroinitializer mask by another, both fed to
// @llvm.aarch64.neon.sqdmull.v2i64, and the result combined with a through
// @llvm.aarch64.neon.sqsub.v2i64.
4033 int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
4034   return vqdmlsl_high_laneq_s32(a, b, v, 0);
4035 }
4036
4037 // CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
4038 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
4039 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4040 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4041 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
4042 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
4043 // CHECK:   ret <4 x i16> [[VQDMULH_V2_I]]
// Calls vqdmulh_laneq_s16 with lane index 0. The FileCheck patterns above
// expect v shuffled with a zeroinitializer mask to <4 x i16> and passed with a
// to @llvm.aarch64.neon.sqdmulh.v4i16, whose result is returned.
4044 int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
4045   return vqdmulh_laneq_s16(a, v, 0);
4046 }
4047
4048 // CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
4049 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
4050 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4051 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4052 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
4053 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
4054 // CHECK:   ret <8 x i16> [[VQDMULHQ_V2_I]]
// Calls vqdmulhq_laneq_s16 with lane index 0. The FileCheck patterns above
// expect v shuffled with a zeroinitializer mask to <8 x i16> and passed with a
// to @llvm.aarch64.neon.sqdmulh.v8i16, whose result is returned.
4055 int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
4056   return vqdmulhq_laneq_s16(a, v, 0);
4057 }
4058
4059 // CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
4060 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4061 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4062 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4063 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
4064 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
4065 // CHECK:   ret <2 x i32> [[VQDMULH_V2_I]]
// Calls vqdmulh_laneq_s32 with lane index 0. The FileCheck patterns above
// expect v shuffled with a zeroinitializer mask to <2 x i32> and passed with a
// to @llvm.aarch64.neon.sqdmulh.v2i32, whose result is returned.
4066 int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
4067   return vqdmulh_laneq_s32(a, v, 0);
4068 }
4069
4070 // CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
4071 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
4072 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4073 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
4074 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
4075 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
4076 // CHECK:   ret <4 x i32> [[VQDMULHQ_V2_I]]
// Calls vqdmulhq_laneq_s32 with lane index 0. The FileCheck patterns above
// expect v shuffled with a zeroinitializer mask to <4 x i32> and passed with a
// to @llvm.aarch64.neon.sqdmulh.v4i32, whose result is returned.
4077 int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
4078   return vqdmulhq_laneq_s32(a, v, 0);
4079 }
4080
4081 // CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
4082 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
4083 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4084 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4085 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
4086 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
4087 // CHECK:   ret <4 x i16> [[VQRDMULH_V2_I]]
// Calls vqrdmulh_laneq_s16 with lane index 0. The FileCheck patterns above
// expect v shuffled with a zeroinitializer mask to <4 x i16> and passed with a
// to @llvm.aarch64.neon.sqrdmulh.v4i16, whose result is returned.
4088 int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
4089   return vqrdmulh_laneq_s16(a, v, 0);
4090 }
4091
4092 // CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
4093 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
4094 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4095 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4096 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
4097 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
4098 // CHECK:   ret <8 x i16> [[VQRDMULHQ_V2_I]]
// Calls vqrdmulhq_laneq_s16 with lane index 0. The FileCheck patterns above
// expect v shuffled with a zeroinitializer mask to <8 x i16> and passed with a
// to @llvm.aarch64.neon.sqrdmulh.v8i16, whose result is returned.
4099 int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
4100   return vqrdmulhq_laneq_s16(a, v, 0);
4101 }
4102
4103 // CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
4104 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4105 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4106 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4107 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
4108 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
4109 // CHECK:   ret <2 x i32> [[VQRDMULH_V2_I]]
// Calls vqrdmulh_laneq_s32 with lane index 0. The FileCheck patterns above
// expect v shuffled with a zeroinitializer mask to <2 x i32> and passed with a
// to @llvm.aarch64.neon.sqrdmulh.v2i32, whose result is returned.
4110 int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
4111   return vqrdmulh_laneq_s32(a, v, 0);
4112 }
4113
4114 // CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
4115 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
4116 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4117 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
4118 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
4119 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
4120 // CHECK:   ret <4 x i32> [[VQRDMULHQ_V2_I]]
// Calls vqrdmulhq_laneq_s32 with lane index 0. The FileCheck patterns above
// expect v shuffled with a zeroinitializer mask to <4 x i32> and passed with a
// to @llvm.aarch64.neon.sqrdmulh.v4i32, whose result is returned.
4121 int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
4122   return vqrdmulhq_laneq_s32(a, v, 0);
4123 }
4124
4125 // CHECK-LABEL: @test_vmla_lane_u16(
4126 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4127 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4128 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
4129 // CHECK:   ret <4 x i16> [[ADD]]
// Calls vmla_lane_u16 with lane index 3. The FileCheck patterns above expect
// a shufflevector of v whose mask is 3 in every element, multiplied by b and
// added to a as <4 x i16>.
4130 uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
4131   return vmla_lane_u16(a, b, v, 3);
4132 }
4133
4134 // CHECK-LABEL: @test_vmlaq_lane_u16(
4135 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
4136 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4137 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
4138 // CHECK:   ret <8 x i16> [[ADD]]
// Calls vmlaq_lane_u16 with lane index 3. The FileCheck patterns above expect
// the <4 x i16> v to be shuffled into <8 x i16> with a mask that is 3 in every
// element, multiplied by b and added to a.
4139 uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
4140   return vmlaq_lane_u16(a, b, v, 3);
4141 }
4142
4143 // CHECK-LABEL: @test_vmla_lane_u32(
4144 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
4145 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4146 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
4147 // CHECK:   ret <2 x i32> [[ADD]]
// Calls vmla_lane_u32 with lane index 1. The FileCheck patterns above expect
// a shufflevector of v whose mask is 1 in every element, multiplied by b and
// added to a as <2 x i32>.
4148 uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
4149   return vmla_lane_u32(a, b, v, 1);
4150 }
4151
4152 // CHECK-LABEL: @test_vmlaq_lane_u32(
4153 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4154 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4155 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
4156 // CHECK:   ret <4 x i32> [[ADD]]
// Calls vmlaq_lane_u32 with lane index 1. The FileCheck patterns above expect
// the <2 x i32> v to be shuffled into <4 x i32> with a mask that is 1 in every
// element, multiplied by b and added to a.
4157 uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
4158   return vmlaq_lane_u32(a, b, v, 1);
4159 }
4160
4161 // CHECK-LABEL: @test_vmla_laneq_u16(
4162 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4163 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4164 // CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
4165 // CHECK:   ret <4 x i16> [[ADD]]
// Calls vmla_laneq_u16 with lane index 7. The FileCheck patterns above expect
// the <8 x i16> v to be shuffled down to <4 x i16> with a mask that is 7 in
// every element, multiplied by b and added to a.
4166 uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
4167   return vmla_laneq_u16(a, b, v, 7);
4168 }
4169
4170 // CHECK-LABEL: @test_vmlaq_laneq_u16(
4171 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4172 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4173 // CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
4174 // CHECK:   ret <8 x i16> [[ADD]]
// Calls vmlaq_laneq_u16 with lane index 7. The FileCheck patterns above expect
// a shufflevector of v whose mask is 7 in every element, multiplied by b and
// added to a as <8 x i16>.
4175 uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
4176   return vmlaq_laneq_u16(a, b, v, 7);
4177 }
4178
4179 // CHECK-LABEL: @test_vmla_laneq_u32(
4180 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4181 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4182 // CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
4183 // CHECK:   ret <2 x i32> [[ADD]]
// Calls vmla_laneq_u32 with lane index 3. The FileCheck patterns above expect
// the <4 x i32> v to be shuffled down to <2 x i32> with a mask that is 3 in
// every element, multiplied by b and added to a.
4184 uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
4185   return vmla_laneq_u32(a, b, v, 3);
4186 }
4187
4188 // CHECK-LABEL: @test_vmlaq_laneq_u32(
4189 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4190 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4191 // CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
4192 // CHECK:   ret <4 x i32> [[ADD]]
// Calls vmlaq_laneq_u32 with lane index 3. The FileCheck patterns above expect
// a shufflevector of v whose mask is 3 in every element, multiplied by b and
// added to a as <4 x i32>.
4193 uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
4194   return vmlaq_laneq_u32(a, b, v, 3);
4195 }
4196
4197 // CHECK-LABEL: @test_vqdmlal_laneq_s16(
4198 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4199 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4200 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4201 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4202 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
4203 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
4204 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
// Calls vqdmlal_laneq_s16 with lane index 7. The FileCheck patterns above
// expect v shuffled to <4 x i16> with a mask that is 7 in every element, a
// call to @llvm.aarch64.neon.sqdmull.v4i32 on b and the shuffle, then a call
// to @llvm.aarch64.neon.sqadd.v4i32 on a and that result.
4205 int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
4206   return vqdmlal_laneq_s16(a, b, v, 7);
4207 }
4208
4209 // CHECK-LABEL: @test_vqdmlal_laneq_s32(
4210 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4211 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4212 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4213 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4214 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
4215 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
4216 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
// Calls vqdmlal_laneq_s32 with lane index 3. The FileCheck patterns above
// expect v shuffled to <2 x i32> with a mask that is 3 in every element, a
// call to @llvm.aarch64.neon.sqdmull.v2i64 on b and the shuffle, then a call
// to @llvm.aarch64.neon.sqadd.v2i64 on a and that result.
4217 int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
4218   return vqdmlal_laneq_s32(a, b, v, 3);
4219 }
4220
4221 // CHECK-LABEL: @test_vqdmlal_high_laneq_s16(
4222 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
4223 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4224 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4225 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
4226 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4227 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
4228 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
4229 // CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
// Calls vqdmlal_high_laneq_s16 with lane index 7. The FileCheck patterns above
// expect elements 4..7 of b to be extracted by one shufflevector, v shuffled
// with a mask that is 7 in every element by another, both fed to
// @llvm.aarch64.neon.sqdmull.v4i32, and the result combined with a through
// @llvm.aarch64.neon.sqadd.v4i32.
4230 int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
4231   return vqdmlal_high_laneq_s16(a, b, v, 7);
4232 }
4233
4234 // CHECK-LABEL: @test_vqdmlal_high_laneq_s32(
4235 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
4236 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4237 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4238 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
4239 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4240 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
4241 // CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
4242 // CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
// Calls vqdmlal_high_laneq_s32 with lane index 3. The FileCheck patterns above
// expect elements 2..3 of b to be extracted by one shufflevector, v shuffled
// with a mask that is 3 in every element by another, both fed to
// @llvm.aarch64.neon.sqdmull.v2i64, and the result combined with a through
// @llvm.aarch64.neon.sqadd.v2i64.
4243 int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
4244   return vqdmlal_high_laneq_s32(a, b, v, 3);
4245 }
4246
4247 // CHECK-LABEL: @test_vmls_lane_u16(
4248 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4249 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4250 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
4251 // CHECK:   ret <4 x i16> [[SUB]]
// Exercises vmls_lane_u16 with lane index 3 of the 4-lane vector v.
// The FileCheck patterns above expect plain IR (no intrinsic call): lane 3 of
// v splatted to 4 lanes, multiplied by b, and the product subtracted from a.
4252 uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
4253   return vmls_lane_u16(a, b, v, 3);
4254 }
4255
4256 // CHECK-LABEL: @test_vmlsq_lane_u16(
4257 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
4258 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4259 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
4260 // CHECK:   ret <8 x i16> [[SUB]]
// Exercises vmlsq_lane_u16 with lane index 3 of the 4-lane vector v.
// The FileCheck patterns above expect lane 3 of v splatted out to 8 lanes,
// multiplied by b, and the product subtracted from a (all as <8 x i16>).
4261 uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
4262   return vmlsq_lane_u16(a, b, v, 3);
4263 }
4264
4265 // CHECK-LABEL: @test_vmls_lane_u32(
4266 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
4267 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4268 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
4269 // CHECK:   ret <2 x i32> [[SUB]]
// Exercises vmls_lane_u32 with lane index 1 of the 2-lane vector v.
// The FileCheck patterns above expect lane 1 of v splatted to 2 lanes,
// multiplied by b, and the product subtracted from a (all as <2 x i32>).
4270 uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
4271   return vmls_lane_u32(a, b, v, 1);
4272 }
4273
4274 // CHECK-LABEL: @test_vmlsq_lane_u32(
4275 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4276 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4277 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
4278 // CHECK:   ret <4 x i32> [[SUB]]
// Exercises vmlsq_lane_u32 with lane index 1 of the 2-lane vector v.
// The FileCheck patterns above expect lane 1 of v splatted out to 4 lanes,
// multiplied by b, and the product subtracted from a (all as <4 x i32>).
4279 uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
4280   return vmlsq_lane_u32(a, b, v, 1);
4281 }
4282
4283 // CHECK-LABEL: @test_vmls_laneq_u16(
4284 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4285 // CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4286 // CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
4287 // CHECK:   ret <4 x i16> [[SUB]]
// Exercises vmls_laneq_u16 with lane index 7 of the 8-lane vector v.
// The FileCheck patterns above expect lane 7 of v narrowed into a 4-lane
// splat, multiplied by b, and the product subtracted from a (<4 x i16>).
4288 uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
4289   return vmls_laneq_u16(a, b, v, 7);
4290 }
4291
4292 // CHECK-LABEL: @test_vmlsq_laneq_u16(
4293 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4294 // CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4295 // CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
4296 // CHECK:   ret <8 x i16> [[SUB]]
// Exercises vmlsq_laneq_u16 with lane index 7 of the 8-lane vector v.
// The FileCheck patterns above expect lane 7 of v splatted to 8 lanes,
// multiplied by b, and the product subtracted from a (all as <8 x i16>).
4297 uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
4298   return vmlsq_laneq_u16(a, b, v, 7);
4299 }
4300
4301 // CHECK-LABEL: @test_vmls_laneq_u32(
4302 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4303 // CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4304 // CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
4305 // CHECK:   ret <2 x i32> [[SUB]]
// Exercises vmls_laneq_u32 with lane index 3 of the 4-lane vector v.
// The FileCheck patterns above expect lane 3 of v narrowed into a 2-lane
// splat, multiplied by b, and the product subtracted from a (<2 x i32>).
4306 uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
4307   return vmls_laneq_u32(a, b, v, 3);
4308 }
4309
4310 // CHECK-LABEL: @test_vmlsq_laneq_u32(
4311 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4312 // CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4313 // CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
4314 // CHECK:   ret <4 x i32> [[SUB]]
// Exercises vmlsq_laneq_u32 with lane index 3 of the 4-lane vector v.
// The FileCheck patterns above expect lane 3 of v splatted to 4 lanes,
// multiplied by b, and the product subtracted from a (all as <4 x i32>).
4315 uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
4316   return vmlsq_laneq_u32(a, b, v, 3);
4317 }
4318
4319 // CHECK-LABEL: @test_vqdmlsl_laneq_s16(
4320 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4321 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4322 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4323 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4324 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]])
4325 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
4326 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
// Exercises vqdmlsl_laneq_s16 with lane index 7 of the 8-lane vector v.
// The FileCheck patterns above expect: lane 7 of v splatted to 4 lanes, b and
// that splat passed to llvm.aarch64.neon.sqdmull.v4i32, and the result
// combined with a through llvm.aarch64.neon.sqsub.v4i32.
4327 int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
4328   return vqdmlsl_laneq_s16(a, b, v, 7);
4329 }
4330
4331 // CHECK-LABEL: @test_vqdmlsl_laneq_s32(
4332 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4333 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4334 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4335 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4336 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]])
4337 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
4338 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
// Exercises vqdmlsl_laneq_s32 with lane index 3 of the 4-lane vector v.
// The FileCheck patterns above expect: lane 3 of v splatted to 2 lanes, b and
// that splat passed to llvm.aarch64.neon.sqdmull.v2i64, and the result
// combined with a through llvm.aarch64.neon.sqsub.v2i64.
4339 int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
4340   return vqdmlsl_laneq_s32(a, b, v, 3);
4341 }
4342
4343 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16(
4344 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
4345 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4346 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4347 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
4348 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4349 // CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]])
4350 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
4351 // CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
// Exercises vqdmlsl_high_laneq_s16 with lane index 7 of the 8-lane vector v.
// The FileCheck patterns above expect: lanes 4-7 of b extracted, lane 7 of v
// splatted to 4 lanes, both passed to llvm.aarch64.neon.sqdmull.v4i32, and
// that result combined with a through llvm.aarch64.neon.sqsub.v4i32.
4352 int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
4353   return vqdmlsl_high_laneq_s16(a, b, v, 7);
4354 }
4355
4356 // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32(
4357 // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
4358 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4359 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4360 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
4361 // CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4362 // CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]])
4363 // CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
4364 // CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
// Exercises vqdmlsl_high_laneq_s32 with lane index 3 of the 4-lane vector v.
// The FileCheck patterns above expect: lanes 2-3 of b extracted, lane 3 of v
// splatted to 2 lanes, both passed to llvm.aarch64.neon.sqdmull.v2i64, and
// that result combined with a through llvm.aarch64.neon.sqsub.v2i64.
4365 int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
4366   return vqdmlsl_high_laneq_s32(a, b, v, 3);
4367 }
4368
4369 // CHECK-LABEL: @test_vqdmulh_laneq_s16(
4370 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4371 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4372 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4373 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
4374 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
4375 // CHECK:   ret <4 x i16> [[VQDMULH_V2_I]]
// Exercises vqdmulh_laneq_s16 with lane index 7 of the 8-lane vector v.
// The FileCheck patterns above expect lane 7 of v splatted to 4 lanes and
// passed, with a, to llvm.aarch64.neon.sqdmulh.v4i16, whose result is returned.
4376 int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
4377   return vqdmulh_laneq_s16(a, v, 7);
4378 }
4379
4380 // CHECK-LABEL: @test_vqdmulhq_laneq_s16(
4381 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4382 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4383 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4384 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
4385 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
4386 // CHECK:   ret <8 x i16> [[VQDMULHQ_V2_I]]
// Exercises vqdmulhq_laneq_s16 with lane index 7 of the 8-lane vector v.
// The FileCheck patterns above expect lane 7 of v splatted to 8 lanes and
// passed, with a, to llvm.aarch64.neon.sqdmulh.v8i16, whose result is returned.
4387 int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
4388   return vqdmulhq_laneq_s16(a, v, 7);
4389 }
4390
4391 // CHECK-LABEL: @test_vqdmulh_laneq_s32(
4392 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4393 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4394 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4395 // CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
4396 // CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
4397 // CHECK:   ret <2 x i32> [[VQDMULH_V2_I]]
// Exercises vqdmulh_laneq_s32 with lane index 3 of the 4-lane vector v.
// The FileCheck patterns above expect lane 3 of v splatted to 2 lanes and
// passed, with a, to llvm.aarch64.neon.sqdmulh.v2i32, whose result is returned.
4398 int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
4399   return vqdmulh_laneq_s32(a, v, 3);
4400 }
4401
4402 // CHECK-LABEL: @test_vqdmulhq_laneq_s32(
4403 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4404 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4405 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
4406 // CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
4407 // CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
4408 // CHECK:   ret <4 x i32> [[VQDMULHQ_V2_I]]
// Exercises vqdmulhq_laneq_s32 with lane index 3 of the 4-lane vector v.
// The FileCheck patterns above expect lane 3 of v splatted to 4 lanes and
// passed, with a, to llvm.aarch64.neon.sqdmulh.v4i32, whose result is returned.
4409 int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
4410   return vqdmulhq_laneq_s32(a, v, 3);
4411 }
4412
4413 // CHECK-LABEL: @test_vqrdmulh_laneq_s16(
4414 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4415 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4416 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4417 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]])
4418 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
4419 // CHECK:   ret <4 x i16> [[VQRDMULH_V2_I]]
// Exercises vqrdmulh_laneq_s16 with lane index 7 of the 8-lane vector v.
// The FileCheck patterns above expect lane 7 of v splatted to 4 lanes and
// passed, with a, to llvm.aarch64.neon.sqrdmulh.v4i16, whose result is returned.
4420 int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
4421   return vqrdmulh_laneq_s16(a, v, 7);
4422 }
4423
4424 // CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
4425 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4426 // CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4427 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4428 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]])
4429 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
4430 // CHECK:   ret <8 x i16> [[VQRDMULHQ_V2_I]]
// Exercises vqrdmulhq_laneq_s16 with lane index 7 of the 8-lane vector v.
// The FileCheck patterns above expect lane 7 of v splatted to 8 lanes and
// passed, with a, to llvm.aarch64.neon.sqrdmulh.v8i16, whose result is returned.
4431 int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
4432   return vqrdmulhq_laneq_s16(a, v, 7);
4433 }
4434
4435 // CHECK-LABEL: @test_vqrdmulh_laneq_s32(
4436 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4437 // CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4438 // CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4439 // CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]])
4440 // CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
4441 // CHECK:   ret <2 x i32> [[VQRDMULH_V2_I]]
// Exercises vqrdmulh_laneq_s32 with lane index 3 of the 4-lane vector v.
// The FileCheck patterns above expect lane 3 of v splatted to 2 lanes and
// passed, with a, to llvm.aarch64.neon.sqrdmulh.v2i32, whose result is returned.
4442 int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
4443   return vqrdmulh_laneq_s32(a, v, 3);
4444 }
4445
4446 // CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
4447 // CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4448 // CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4449 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
4450 // CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]])
4451 // CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
4452 // CHECK:   ret <4 x i32> [[VQRDMULHQ_V2_I]]
// Exercises vqrdmulhq_laneq_s32 with lane index 3 of the 4-lane vector v.
// The FileCheck patterns above expect lane 3 of v splatted to 4 lanes and
// passed, with a, to llvm.aarch64.neon.sqrdmulh.v4i32, whose result is returned.
4453 int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
4454   return vqrdmulhq_laneq_s32(a, v, 3);
4455 }