// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s

// Test new aarch64 intrinsics and types

#include <arm_neon.h>

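// Each test below exercises one scalar-by-lane intrinsic and checks the IR
// Clang emits for it: the vector operand is bitcast through an <N x i8>
// value, the requested lane is extracted, and the scalar operation (or NEON
// intrinsic call) is applied to the extracted element.
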
// CHECK-LABEL: define float @test_vmuls_lane_f32(float %a, <2 x float> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
// CHECK: [[MUL:%.*]] = fmul float %a, [[VGET_LANE]]
// CHECK: ret float [[MUL]]
float32_t test_vmuls_lane_f32(float32_t a, float32x2_t b) {
  return vmuls_lane_f32(a, b, 1);
}

// CHECK-LABEL: define double @test_vmuld_lane_f64(double %a, <1 x double> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK: [[MUL:%.*]] = fmul double %a, [[VGET_LANE]]
// CHECK: ret double [[MUL]]
float64_t test_vmuld_lane_f64(float64_t a, float64x1_t b) {
  return vmuld_lane_f64(a, b, 0);
}

// CHECK-LABEL: define float @test_vmuls_laneq_f32(float %a, <4 x float> %b) #1 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
// CHECK: [[MUL:%.*]] = fmul float %a, [[VGETQ_LANE]]
// CHECK: ret float [[MUL]]
float32_t test_vmuls_laneq_f32(float32_t a, float32x4_t b) {
  return vmuls_laneq_f32(a, b, 3);
}

// CHECK-LABEL: define double @test_vmuld_laneq_f64(double %a, <2 x double> %b) #1 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
// CHECK: [[MUL:%.*]] = fmul double %a, [[VGETQ_LANE]]
// CHECK: ret double [[MUL]]
float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) {
  return vmuld_laneq_f64(a, b, 1);
}

// CHECK-LABEL: define <1 x double> @test_vmul_n_f64(<1 x double> %a, double %b) #0 {
// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %a to double
// CHECK: [[TMP3:%.*]] = fmul double [[TMP2]], %b
// CHECK: [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double>
// CHECK: ret <1 x double> [[TMP4]]
float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) {
  return vmul_n_f64(a, b);
}

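// vmulx maps to the AArch64 FMULX instruction, which multiplies like fmul
// except that 0.0 * infinity returns 2.0 (with the appropriate sign) rather
// than NaN, so it is emitted as a call to @llvm.aarch64.neon.fmulx.* instead
// of a plain fmul.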
// CHECK-LABEL: define float @test_vmulxs_lane_f32(float %a, <2 x float> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGET_LANE]])
// CHECK: ret float [[VMULXS_F32_I]]
float32_t test_vmulxs_lane_f32(float32_t a, float32x2_t b) {
  return vmulxs_lane_f32(a, b, 1);
}

// CHECK-LABEL: define float @test_vmulxs_laneq_f32(float %a, <4 x float> %b) #1 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGETQ_LANE]])
// CHECK: ret float [[VMULXS_F32_I]]
float32_t test_vmulxs_laneq_f32(float32_t a, float32x4_t b) {
  return vmulxs_laneq_f32(a, b, 3);
}

// CHECK-LABEL: define double @test_vmulxd_lane_f64(double %a, <1 x double> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGET_LANE]])
// CHECK: ret double [[VMULXD_F64_I]]
float64_t test_vmulxd_lane_f64(float64_t a, float64x1_t b) {
  return vmulxd_lane_f64(a, b, 0);
}

// CHECK-LABEL: define double @test_vmulxd_laneq_f64(double %a, <2 x double> %b) #1 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGETQ_LANE]])
// CHECK: ret double [[VMULXD_F64_I]]
float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
  return vmulxd_laneq_f64(a, b, 1);
}

// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64(<1 x double> %a, <1 x double> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK: [[VGET_LANE6:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE6]])
// CHECK: [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
// CHECK: ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_lane_f64(float64x1_t a, float64x1_t b) {
  return vmulx_lane_f64(a, b, 0);
}

// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_0(<1 x double> %a, <2 x double> %b) #1 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
// CHECK: [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
// CHECK: ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_laneq_f64_0(float64x1_t a, float64x2_t b) {
  return vmulx_laneq_f64(a, b, 0);
}

// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_1(<1 x double> %a, <2 x double> %b) #1 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
// CHECK: [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
// CHECK: ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, float64x2_t b) {
  return vmulx_laneq_f64(a, b, 1);
}

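// The fused multiply-add lane forms lower to @llvm.fma.*; the vfms variants
// are the same fma with the multiplicand negated (the fsub from -0.0 in the
// checks below), matching FMLS (by element) semantics.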
// CHECK-LABEL: define float @test_vfmas_lane_f32(float %a, float %b, <2 x float> %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %c to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
// CHECK: ret float [[TMP2]]
float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
  return vfmas_lane_f32(a, b, c, 1);
}

// CHECK-LABEL: define double @test_vfmad_lane_f64(double %a, double %b, <1 x double> %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %c to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
// CHECK: ret double [[TMP2]]
float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
  return vfmad_lane_f64(a, b, c, 0);
}

// CHECK-LABEL: define double @test_vfmad_laneq_f64(double %a, double %b, <2 x double> %c) #1 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %c to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
// CHECK: ret double [[TMP2]]
float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
  return vfmad_laneq_f64(a, b, c, 1);
}

// CHECK-LABEL: define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %c) #0 {
// CHECK: [[SUB:%.*]] = fsub float -0.000000e+00, %b
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %c to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
// CHECK: ret float [[TMP2]]
float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
  return vfmss_lane_f32(a, b, c, 1);
}

// CHECK-LABEL: define <1 x double> @test_vfma_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
// CHECK: ret <1 x double> [[FMLA2]]
float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
  return vfma_lane_f64(a, b, v, 0);
}

// CHECK-LABEL: define <1 x double> @test_vfms_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
// CHECK: [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
// CHECK: ret <1 x double> [[FMLA2]]
float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
  return vfms_lane_f64(a, b, v, 0);
}

// CHECK-LABEL: define <1 x double> @test_vfma_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #1 {
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
// CHECK: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
// CHECK: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
// CHECK: ret <1 x double> [[TMP7]]
float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
  return vfma_laneq_f64(a, b, v, 0);
}

// CHECK-LABEL: define <1 x double> @test_vfms_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #1 {
// CHECK: [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
// CHECK: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
// CHECK: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
// CHECK: ret <1 x double> [[TMP7]]
float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
  return vfms_laneq_f64(a, b, v, 0);
}

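// Scalar saturating doubling multiply-long. There is no scalar sqdmull
// intrinsic for i16, so the operands are inserted into <4 x i16> vectors,
// @llvm.aarch64.neon.sqdmull.v4i32 is called, and element 0 of the result
// is extracted; the i32 variants use the dedicated sqdmulls.scalar intrinsic.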
// CHECK-LABEL: define i32 @test_vqdmullh_lane_s16(i16 %a, <4 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK: [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
// CHECK: ret i32 [[TMP4]]
int32_t test_vqdmullh_lane_s16(int16_t a, int16x4_t b) {
  return vqdmullh_lane_s16(a, b, 3);
}

// CHECK-LABEL: define i64 @test_vqdmulls_lane_s32(i32 %a, <2 x i32> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGET_LANE]])
// CHECK: ret i64 [[VQDMULLS_S32_I]]
int64_t test_vqdmulls_lane_s32(int32_t a, int32x2_t b) {
  return vqdmulls_lane_s32(a, b, 1);
}

// CHECK-LABEL: define i32 @test_vqdmullh_laneq_s16(i16 %a, <8 x i16> %b) #1 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK: [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
// CHECK: ret i32 [[TMP4]]
int32_t test_vqdmullh_laneq_s16(int16_t a, int16x8_t b) {
  return vqdmullh_laneq_s16(a, b, 7);
}

// CHECK-LABEL: define i64 @test_vqdmulls_laneq_s32(i32 %a, <4 x i32> %b) #1 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGETQ_LANE]])
// CHECK: ret i64 [[VQDMULLS_S32_I]]
int64_t test_vqdmulls_laneq_s32(int32_t a, int32x4_t b) {
  return vqdmulls_laneq_s32(a, b, 3);
}

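// Scalar saturating doubling multiply returning the high half, using the
// same widen-to-vector pattern for the i16 variants as sqdmull above.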
// CHECK-LABEL: define i16 @test_vqdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
// CHECK: ret i16 [[TMP4]]
int16_t test_vqdmulhh_lane_s16(int16_t a, int16x4_t b) {
  return vqdmulhh_lane_s16(a, b, 3);
}

// CHECK-LABEL: define i32 @test_vqdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGET_LANE]])
// CHECK: ret i32 [[VQDMULHS_S32_I]]
int32_t test_vqdmulhs_lane_s32(int32_t a, int32x2_t b) {
  return vqdmulhs_lane_s32(a, b, 1);
}

// CHECK-LABEL: define i16 @test_vqdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #1 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
// CHECK: ret i16 [[TMP4]]
int16_t test_vqdmulhh_laneq_s16(int16_t a, int16x8_t b) {
  return vqdmulhh_laneq_s16(a, b, 7);
}

// CHECK-LABEL: define i32 @test_vqdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #1 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGETQ_LANE]])
// CHECK: ret i32 [[VQDMULHS_S32_I]]
int32_t test_vqdmulhs_laneq_s32(int32_t a, int32x4_t b) {
  return vqdmulhs_laneq_s32(a, b, 3);
}

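// Rounding variants: sqrdmulh adds a rounding constant before taking the
// high half of the doubled product.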
// CHECK-LABEL: define i16 @test_vqrdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
// CHECK: ret i16 [[TMP4]]
int16_t test_vqrdmulhh_lane_s16(int16_t a, int16x4_t b) {
  return vqrdmulhh_lane_s16(a, b, 3);
}

// CHECK-LABEL: define i32 @test_vqrdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGET_LANE]])
// CHECK: ret i32 [[VQRDMULHS_S32_I]]
int32_t test_vqrdmulhs_lane_s32(int32_t a, int32x2_t b) {
  return vqrdmulhs_lane_s32(a, b, 1);
}

// CHECK-LABEL: define i16 @test_vqrdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #1 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
// CHECK: ret i16 [[TMP4]]
int16_t test_vqrdmulhh_laneq_s16(int16_t a, int16x8_t b) {
  return vqrdmulhh_laneq_s16(a, b, 7);
}

// CHECK-LABEL: define i32 @test_vqrdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #1 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGETQ_LANE]])
// CHECK: ret i32 [[VQRDMULHS_S32_I]]
int32_t test_vqrdmulhs_laneq_s32(int32_t a, int32x4_t b) {
  return vqrdmulhs_laneq_s32(a, b, 3);
}

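// Saturating doubling multiply-accumulate: the product is formed with
// sqdmull/sqdmulls and then accumulated with a saturating add (sqadd).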
// CHECK-LABEL: define i32 @test_vqdmlalh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
// CHECK: ret i32 [[VQDMLXL1]]
int32_t test_vqdmlalh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
  return vqdmlalh_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: define i64 @test_vqdmlals_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK: ret i64 [[VQDMLXL1]]
int64_t test_vqdmlals_lane_s32(int64_t a, int32_t b, int32x2_t c) {
  return vqdmlals_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: define i32 @test_vqdmlalh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #1 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
// CHECK: ret i32 [[VQDMLXL1]]
int32_t test_vqdmlalh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
  return vqdmlalh_laneq_s16(a, b, c, 7);
}

// CHECK-LABEL: define i64 @test_vqdmlals_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #1 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK: ret i64 [[VQDMLXL1]]
int64_t test_vqdmlals_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
  return vqdmlals_laneq_s32(a, b, c, 3);
}

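// The subtracting counterparts: identical to vqdmlal except that the
// accumulate step is a saturating subtract (sqsub).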
// CHECK-LABEL: define i32 @test_vqdmlslh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK: [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
// CHECK: ret i32 [[VQDMLXL1]]
int32_t test_vqdmlslh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
  return vqdmlslh_lane_s16(a, b, c, 3);
}

// CHECK-LABEL: define i64 @test_vqdmlsls_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK: [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK: ret i64 [[VQDMLXL1]]
int64_t test_vqdmlsls_lane_s32(int64_t a, int32_t b, int32x2_t c) {
  return vqdmlsls_lane_s32(a, b, c, 1);
}

// CHECK-LABEL: define i32 @test_vqdmlslh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #1 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
// CHECK: ret i32 [[VQDMLXL1]]
int32_t test_vqdmlslh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
  return vqdmlslh_laneq_s16(a, b, c, 7);
}

// CHECK-LABEL: define i64 @test_vqdmlsls_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #1 {
// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK: [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
// CHECK: ret i64 [[VQDMLXL1]]
int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
  return vqdmlsls_laneq_s32(a, b, c, 3);
}

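// The remaining tests build their inputs from fixed bit patterns with
// vcreate_f64, so after mem2reg the operands appear as constant
// i64-to-<1 x double> bitcasts and the lane handling is checked on constants.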
// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64_0() #0 {
// CHECK: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
// CHECK: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP1]] to <8 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
// CHECK: [[VGET_LANE7:%.*]] = extractelement <1 x double> [[TMP5]], i32 0
// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE7]])
// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0
// CHECK: ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_lane_f64_0() {
  float64x1_t arg1;
  float64x1_t arg2;
  float64x1_t result;
  float64_t sarg1, sarg2, sres;
  arg1 = vcreate_f64(UINT64_C(0x3fd6304bc43ab5c2));
  arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
  result = vmulx_lane_f64(arg1, arg2, 0);
  return result;
}

// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_2() #1 {
// CHECK: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
// CHECK: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
// CHECK: [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[SHUFFLE_I]] to <16 x i8>
// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0
// CHECK: ret <1 x double> [[VSET_LANE]]
float64x1_t test_vmulx_laneq_f64_2() {
  float64x1_t arg1;
  float64x1_t arg2;
  float64x2_t arg3;
  float64x1_t result;
  float64_t sarg1, sarg2, sres;
  arg1 = vcreate_f64(UINT64_C(0x3fd6304bc43ab5c2));
  arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
  arg3 = vcombine_f64(arg1, arg2);
  result = vmulx_laneq_f64(arg1, arg3, 1);
  return result;
}

// CHECK: attributes #0 ={{.*}}"min-legal-vector-width"="64"
// CHECK: attributes #1 ={{.*}}"min-legal-vector-width"="128"