test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll

   1 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
   2
   3 define void @test_ldst1_v16i8(<16 x i8>* %ptr, <16 x i8>* %ptr2) { ; plain <16 x i8> load/store -> ld1/st1 .16b (base register: xN or sp)
   4 ; CHECK-LABEL: test_ldst1_v16i8:
   5 ; CHECK: ld1 {v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
   6 ; CHECK: st1 {v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
   7   %tmp = load <16 x i8>* %ptr
   8   store <16 x i8> %tmp, <16 x i8>* %ptr2
   9   ret void
  10 }
  11
  12 define void @test_ldst1_v8i16(<8 x i16>* %ptr, <8 x i16>* %ptr2) { ; plain <8 x i16> load/store -> ld1/st1 .8h (base register: xN or sp)
  13 ; CHECK-LABEL: test_ldst1_v8i16:
  14 ; CHECK: ld1 {v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
  15 ; CHECK: st1 {v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
  16   %tmp = load <8 x i16>* %ptr
  17   store <8 x i16> %tmp, <8 x i16>* %ptr2
  18   ret void
  19 }
  20
  21 define void @test_ldst1_v4i32(<4 x i32>* %ptr, <4 x i32>* %ptr2) { ; plain <4 x i32> load/store -> ld1/st1 .4s (base register: xN or sp)
  22 ; CHECK-LABEL: test_ldst1_v4i32:
  23 ; CHECK: ld1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
  24 ; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
  25   %tmp = load <4 x i32>* %ptr
  26   store <4 x i32> %tmp, <4 x i32>* %ptr2
  27   ret void
  28 }
  29
  30 define void @test_ldst1_v2i64(<2 x i64>* %ptr, <2 x i64>* %ptr2) { ; plain <2 x i64> load/store -> ld1/st1 .2d (base register: xN or sp)
  31 ; CHECK-LABEL: test_ldst1_v2i64:
  32 ; CHECK: ld1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
  33 ; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
  34   %tmp = load <2 x i64>* %ptr
  35   store <2 x i64> %tmp, <2 x i64>* %ptr2
  36   ret void
  37 }
  38
  39 define void @test_ldst1_v8i8(<8 x i8>* %ptr, <8 x i8>* %ptr2) { ; plain <8 x i8> load/store -> ld1/st1 .8b (base register: xN or sp)
  40 ; CHECK-LABEL: test_ldst1_v8i8:
  41 ; CHECK: ld1 {v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
  42 ; CHECK: st1 {v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
  43   %tmp = load <8 x i8>* %ptr
  44   store <8 x i8> %tmp, <8 x i8>* %ptr2
  45   ret void
  46 }
  47
  48 define void @test_ldst1_v4i16(<4 x i16>* %ptr, <4 x i16>* %ptr2) { ; plain <4 x i16> load/store -> ld1/st1 .4h (base register: xN or sp)
  49 ; CHECK-LABEL: test_ldst1_v4i16:
  50 ; CHECK: ld1 {v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
  51 ; CHECK: st1 {v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
  52   %tmp = load <4 x i16>* %ptr
  53   store <4 x i16> %tmp, <4 x i16>* %ptr2
  54   ret void
  55 }
  56
  57 define void @test_ldst1_v2i32(<2 x i32>* %ptr, <2 x i32>* %ptr2) { ; plain <2 x i32> load/store -> ld1/st1 .2s (base register: xN or sp)
  58 ; CHECK-LABEL: test_ldst1_v2i32:
  59 ; CHECK: ld1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
  60 ; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
  61   %tmp = load <2 x i32>* %ptr
  62   store <2 x i32> %tmp, <2 x i32>* %ptr2
  63   ret void
  64 }
  65
  66 define void @test_ldst1_v1i64(<1 x i64>* %ptr, <1 x i64>* %ptr2) { ; plain <1 x i64> load/store -> ld1/st1 .1d (base register: xN or sp)
  67 ; CHECK-LABEL: test_ldst1_v1i64:
  68 ; CHECK: ld1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
  69 ; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
  70   %tmp = load <1 x i64>* %ptr
  71   store <1 x i64> %tmp, <1 x i64>* %ptr2
  72   ret void
  73 }
  74
  75 %struct.int8x16x2_t = type { [2 x <16 x i8>] } ; x2 group: structs wrapping an array of two vectors (return types of the test_vld2* functions)
  76 %struct.int16x8x2_t = type { [2 x <8 x i16>] }
  77 %struct.int32x4x2_t = type { [2 x <4 x i32>] }
  78 %struct.int64x2x2_t = type { [2 x <2 x i64>] }
  79 %struct.float32x4x2_t = type { [2 x <4 x float>] }
  80 %struct.float64x2x2_t = type { [2 x <2 x double>] }
  81 %struct.int8x8x2_t = type { [2 x <8 x i8>] }
  82 %struct.int16x4x2_t = type { [2 x <4 x i16>] }
  83 %struct.int32x2x2_t = type { [2 x <2 x i32>] }
  84 %struct.int64x1x2_t = type { [2 x <1 x i64>] }
  85 %struct.float32x2x2_t = type { [2 x <2 x float>] }
  86 %struct.float64x1x2_t = type { [2 x <1 x double>] }
  87 %struct.int8x16x3_t = type { [3 x <16 x i8>] } ; x3 group: structs wrapping an array of three vectors (return types of the test_vld3* functions)
  88 %struct.int16x8x3_t = type { [3 x <8 x i16>] }
  89 %struct.int32x4x3_t = type { [3 x <4 x i32>] }
  90 %struct.int64x2x3_t = type { [3 x <2 x i64>] }
  91 %struct.float32x4x3_t = type { [3 x <4 x float>] }
  92 %struct.float64x2x3_t = type { [3 x <2 x double>] }
  93 %struct.int8x8x3_t = type { [3 x <8 x i8>] }
  94 %struct.int16x4x3_t = type { [3 x <4 x i16>] }
  95 %struct.int32x2x3_t = type { [3 x <2 x i32>] }
  96 %struct.int64x1x3_t = type { [3 x <1 x i64>] }
  97 %struct.float32x2x3_t = type { [3 x <2 x float>] }
  98 %struct.float64x1x3_t = type { [3 x <1 x double>] }
  99 %struct.int8x16x4_t = type { [4 x <16 x i8>] } ; x4 group: structs wrapping an array of four vectors (return types of the test_vld4* functions)
 100 %struct.int16x8x4_t = type { [4 x <8 x i16>] }
 101 %struct.int32x4x4_t = type { [4 x <4 x i32>] }
 102 %struct.int64x2x4_t = type { [4 x <2 x i64>] }
 103 %struct.float32x4x4_t = type { [4 x <4 x float>] }
 104 %struct.float64x2x4_t = type { [4 x <2 x double>] }
 105 %struct.int8x8x4_t = type { [4 x <8 x i8>] }
 106 %struct.int16x4x4_t = type { [4 x <4 x i16>] }
 107 %struct.int32x2x4_t = type { [4 x <2 x i32>] }
 108 %struct.int64x1x4_t = type { [4 x <1 x i64>] }
 109 %struct.float32x2x4_t = type { [4 x <2 x float>] }
 110 %struct.float64x1x4_t = type { [4 x <1 x double>] }
 111
 112
 113 define <16 x i8> @test_vld1q_s8(i8* readonly %a) { ; llvm.arm.neon.vld1.v16i8 -> single-register ld1 .16b (base register: xN or sp)
 114 ; CHECK-LABEL: test_vld1q_s8:
 115 ; CHECK: ld1 {v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
 116   %vld1 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %a, i32 1)
 117   ret <16 x i8> %vld1
 118 }
 119
 120 define <8 x i16> @test_vld1q_s16(i16* readonly %a) { ; llvm.arm.neon.vld1.v8i16 -> single-register ld1 .8h (base register: xN or sp)
 121 ; CHECK-LABEL: test_vld1q_s16:
 122 ; CHECK: ld1 {v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
 123   %1 = bitcast i16* %a to i8*
 124   %vld1 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %1, i32 2)
 125   ret <8 x i16> %vld1
 126 }
 127
 128 define <4 x i32> @test_vld1q_s32(i32* readonly %a) { ; llvm.arm.neon.vld1.v4i32 -> single-register ld1 .4s (base register: xN or sp)
 129 ; CHECK-LABEL: test_vld1q_s32:
 130 ; CHECK: ld1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 131   %1 = bitcast i32* %a to i8*
 132   %vld1 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %1, i32 4)
 133   ret <4 x i32> %vld1
 134 }
 135
 136 define <2 x i64> @test_vld1q_s64(i64* readonly %a) { ; llvm.arm.neon.vld1.v2i64 -> single-register ld1 .2d (base register: xN or sp)
 137 ; CHECK-LABEL: test_vld1q_s64:
 138 ; CHECK: ld1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 139   %1 = bitcast i64* %a to i8*
 140   %vld1 = tail call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %1, i32 8)
 141   ret <2 x i64> %vld1
 142 }
 143
 144 define <4 x float> @test_vld1q_f32(float* readonly %a) { ; llvm.arm.neon.vld1.v4f32 -> single-register ld1 .4s (base register: xN or sp)
 145 ; CHECK-LABEL: test_vld1q_f32:
 146 ; CHECK: ld1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 147   %1 = bitcast float* %a to i8*
 148   %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %1, i32 4)
 149   ret <4 x float> %vld1
 150 }
 151
 152 define <2 x double> @test_vld1q_f64(double* readonly %a) { ; llvm.arm.neon.vld1.v2f64 -> single-register ld1 .2d (base register: xN or sp)
 153 ; CHECK-LABEL: test_vld1q_f64:
 154 ; CHECK: ld1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 155   %1 = bitcast double* %a to i8*
 156   %vld1 = tail call <2 x double> @llvm.arm.neon.vld1.v2f64(i8* %1, i32 8)
 157   ret <2 x double> %vld1
 158 }
 159
 160 define <8 x i8> @test_vld1_s8(i8* readonly %a) { ; llvm.arm.neon.vld1.v8i8 -> single-register ld1 .8b (base register: xN or sp)
 161 ; CHECK-LABEL: test_vld1_s8:
 162 ; CHECK: ld1 {v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
 163   %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1)
 164   ret <8 x i8> %vld1
 165 }
 166
 167 define <4 x i16> @test_vld1_s16(i16* readonly %a) { ; llvm.arm.neon.vld1.v4i16 -> single-register ld1 .4h (base register: xN or sp)
 168 ; CHECK-LABEL: test_vld1_s16:
 169 ; CHECK: ld1 {v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
 170   %1 = bitcast i16* %a to i8*
 171   %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2)
 172   ret <4 x i16> %vld1
 173 }
 174
 175 define <2 x i32> @test_vld1_s32(i32* readonly %a) { ; llvm.arm.neon.vld1.v2i32 -> single-register ld1 .2s (base register: xN or sp)
 176 ; CHECK-LABEL: test_vld1_s32:
 177 ; CHECK: ld1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 178   %1 = bitcast i32* %a to i8*
 179   %vld1 = tail call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %1, i32 4)
 180   ret <2 x i32> %vld1
 181 }
 182
 183 define <1 x i64> @test_vld1_s64(i64* readonly %a) { ; llvm.arm.neon.vld1.v1i64 -> single-register ld1 .1d (base register: xN or sp)
 184 ; CHECK-LABEL: test_vld1_s64:
 185 ; CHECK: ld1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 186   %1 = bitcast i64* %a to i8*
 187   %vld1 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %1, i32 8)
 188   ret <1 x i64> %vld1
 189 }
 190
 191 define <2 x float> @test_vld1_f32(float* readonly %a) { ; llvm.arm.neon.vld1.v2f32 -> single-register ld1 .2s (base register: xN or sp)
 192 ; CHECK-LABEL: test_vld1_f32:
 193 ; CHECK: ld1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 194   %1 = bitcast float* %a to i8*
 195   %vld1 = tail call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %1, i32 4)
 196   ret <2 x float> %vld1
 197 }
 198
 199 define <1 x double> @test_vld1_f64(double* readonly %a) { ; llvm.arm.neon.vld1.v1f64 -> single-register ld1 .1d (base register: xN or sp)
 200 ; CHECK-LABEL: test_vld1_f64:
 201 ; CHECK: ld1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 202   %1 = bitcast double* %a to i8*
 203   %vld1 = tail call <1 x double> @llvm.arm.neon.vld1.v1f64(i8* %1, i32 8)
 204   ret <1 x double> %vld1
 205 }
 206
 207 define <8 x i8> @test_vld1_p8(i8* readonly %a) { ; same vld1.v8i8 call as test_vld1_s8 under the _p8 name -> ld1 .8b (base register: xN or sp)
 208 ; CHECK-LABEL: test_vld1_p8:
 209 ; CHECK: ld1 {v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
 210   %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1)
 211   ret <8 x i8> %vld1
 212 }
 213
 214 define <4 x i16> @test_vld1_p16(i16* readonly %a) { ; same vld1.v4i16 call as test_vld1_s16 under the _p16 name -> ld1 .4h (base register: xN or sp)
 215 ; CHECK-LABEL: test_vld1_p16:
 216 ; CHECK: ld1 {v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
 217   %1 = bitcast i16* %a to i8*
 218   %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2)
 219   ret <4 x i16> %vld1
 220 }
 221
 222 define %struct.int8x16x2_t @test_vld2q_s8(i8* readonly %a) { ; llvm.arm.neon.vld2.v16i8 -> two-register ld2 .16b; result repacked into %struct.int8x16x2_t
 223 ; CHECK-LABEL: test_vld2q_s8:
 224 ; CHECK: ld2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
 225   %vld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %a, i32 1)
 226   %vld2.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 0
 227   %vld2.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 1
 228   %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2.fca.0.extract, 0, 0
 229   %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2.fca.1.extract, 0, 1
 230   ret %struct.int8x16x2_t %.fca.0.1.insert
 231 }
 232
 233 define %struct.int16x8x2_t @test_vld2q_s16(i16* readonly %a) { ; llvm.arm.neon.vld2.v8i16 -> two-register ld2 .8h; result repacked into %struct.int16x8x2_t
 234 ; CHECK-LABEL: test_vld2q_s16:
 235 ; CHECK: ld2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
 236   %1 = bitcast i16* %a to i8*
 237   %vld2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8* %1, i32 2)
 238   %vld2.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 0
 239   %vld2.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 1
 240   %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2.fca.0.extract, 0, 0
 241   %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2.fca.1.extract, 0, 1
 242   ret %struct.int16x8x2_t %.fca.0.1.insert
 243 }
 244
 245 define %struct.int32x4x2_t @test_vld2q_s32(i32* readonly %a) { ; llvm.arm.neon.vld2.v4i32 -> two-register ld2 .4s; result repacked into %struct.int32x4x2_t
 246 ; CHECK-LABEL: test_vld2q_s32:
 247 ; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 248   %1 = bitcast i32* %a to i8*
 249   %vld2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8* %1, i32 4)
 250   %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
 251   %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
 252   %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2.fca.0.extract, 0, 0
 253   %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2.fca.1.extract, 0, 1
 254   ret %struct.int32x4x2_t %.fca.0.1.insert
 255 }
 256
 257 define %struct.int64x2x2_t @test_vld2q_s64(i64* readonly %a) { ; llvm.arm.neon.vld2.v2i64 -> two-register ld2 .2d; result repacked into %struct.int64x2x2_t
 258 ; CHECK-LABEL: test_vld2q_s64:
 259 ; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 260   %1 = bitcast i64* %a to i8*
 261   %vld2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8* %1, i32 8)
 262   %vld2.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 0
 263   %vld2.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 1
 264   %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2.fca.0.extract, 0, 0
 265   %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2.fca.1.extract, 0, 1
 266   ret %struct.int64x2x2_t %.fca.0.1.insert
 267 }
 268
 269 define %struct.float32x4x2_t @test_vld2q_f32(float* readonly %a) { ; llvm.arm.neon.vld2.v4f32 -> two-register ld2 .4s; result repacked into %struct.float32x4x2_t
 270 ; CHECK-LABEL: test_vld2q_f32:
 271 ; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 272   %1 = bitcast float* %a to i8*
 273   %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4)
 274   %vld2.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 0
 275   %vld2.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 1
 276   %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2.fca.0.extract, 0, 0
 277   %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2.fca.1.extract, 0, 1
 278   ret %struct.float32x4x2_t %.fca.0.1.insert
 279 }
 280
 281 define %struct.float64x2x2_t @test_vld2q_f64(double* readonly %a) { ; llvm.arm.neon.vld2.v2f64 -> two-register ld2 .2d; result repacked into %struct.float64x2x2_t
 282 ; CHECK-LABEL: test_vld2q_f64:
 283 ; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 284   %1 = bitcast double* %a to i8*
 285   %vld2 = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8* %1, i32 8)
 286   %vld2.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 0
 287   %vld2.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 1
 288   %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2.fca.0.extract, 0, 0
 289   %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2.fca.1.extract, 0, 1
 290   ret %struct.float64x2x2_t %.fca.0.1.insert
 291 }
 292
 293 define %struct.int8x8x2_t @test_vld2_s8(i8* readonly %a) { ; llvm.arm.neon.vld2.v8i8 -> two-register ld2 .8b; result repacked into %struct.int8x8x2_t
 294 ; CHECK-LABEL: test_vld2_s8:
 295 ; CHECK: ld2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
 296   %vld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8* %a, i32 1)
 297   %vld2.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 0
 298   %vld2.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 1
 299   %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2.fca.0.extract, 0, 0
 300   %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2.fca.1.extract, 0, 1
 301   ret %struct.int8x8x2_t %.fca.0.1.insert
 302 }
 303
 304 define %struct.int16x4x2_t @test_vld2_s16(i16* readonly %a) { ; llvm.arm.neon.vld2.v4i16 -> two-register ld2 .4h; result repacked into %struct.int16x4x2_t
 305 ; CHECK-LABEL: test_vld2_s16:
 306 ; CHECK: ld2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
 307   %1 = bitcast i16* %a to i8*
 308   %vld2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8* %1, i32 2)
 309   %vld2.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 0
 310   %vld2.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 1
 311   %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2.fca.0.extract, 0, 0
 312   %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2.fca.1.extract, 0, 1
 313   ret %struct.int16x4x2_t %.fca.0.1.insert
 314 }
 315
 316 define %struct.int32x2x2_t @test_vld2_s32(i32* readonly %a) { ; llvm.arm.neon.vld2.v2i32 -> two-register ld2 .2s; result repacked into %struct.int32x2x2_t
 317 ; CHECK-LABEL: test_vld2_s32:
 318 ; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 319   %1 = bitcast i32* %a to i8*
 320   %vld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8* %1, i32 4)
 321   %vld2.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 0
 322   %vld2.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 1
 323   %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2.fca.0.extract, 0, 0
 324   %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2.fca.1.extract, 0, 1
 325   ret %struct.int32x2x2_t %.fca.0.1.insert
 326 }
 327
 328 define %struct.int64x1x2_t @test_vld2_s64(i64* readonly %a) { ; llvm.arm.neon.vld2.v1i64 -> expected as a two-register ld1 .1d (not ld2); result repacked into %struct.int64x1x2_t
 329 ; CHECK-LABEL: test_vld2_s64:
 330 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 331   %1 = bitcast i64* %a to i8*
 332   %vld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %1, i32 8)
 333   %vld2.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 0
 334   %vld2.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 1
 335   %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2.fca.0.extract, 0, 0
 336   %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2.fca.1.extract, 0, 1
 337   ret %struct.int64x1x2_t %.fca.0.1.insert
 338 }
 339
 340 define %struct.float32x2x2_t @test_vld2_f32(float* readonly %a) { ; llvm.arm.neon.vld2.v2f32 -> two-register ld2 .2s; result repacked into %struct.float32x2x2_t
 341 ; CHECK-LABEL: test_vld2_f32:
 342 ; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 343   %1 = bitcast float* %a to i8*
 344   %vld2 = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8* %1, i32 4)
 345   %vld2.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 0
 346   %vld2.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 1
 347   %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2.fca.0.extract, 0, 0
 348   %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2.fca.1.extract, 0, 1
 349   ret %struct.float32x2x2_t %.fca.0.1.insert
 350 }
 351
 352 define %struct.float64x1x2_t @test_vld2_f64(double* readonly %a) { ; llvm.arm.neon.vld2.v1f64 -> expected as a two-register ld1 .1d (not ld2); result repacked into %struct.float64x1x2_t
 353 ; CHECK-LABEL: test_vld2_f64:
 354 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 355   %1 = bitcast double* %a to i8*
 356   %vld2 = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %1, i32 8)
 357   %vld2.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 0
 358   %vld2.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 1
 359   %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2.fca.0.extract, 0, 0
 360   %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2.fca.1.extract, 0, 1
 361   ret %struct.float64x1x2_t %.fca.0.1.insert
 362 }
 363
 364 define %struct.int8x16x3_t @test_vld3q_s8(i8* readonly %a) { ; llvm.arm.neon.vld3.v16i8 -> three-register ld3 .16b; result repacked into %struct.int8x16x3_t
 365 ; CHECK-LABEL: test_vld3q_s8:
 366 ; CHECK: ld3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
 367   %vld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %a, i32 1)
 368   %vld3.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 0
 369   %vld3.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 1
 370   %vld3.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 2
 371   %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3.fca.0.extract, 0, 0
 372   %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3.fca.1.extract, 0, 1
 373   %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3.fca.2.extract, 0, 2
 374   ret %struct.int8x16x3_t %.fca.0.2.insert
 375 }
 376
 377 define %struct.int16x8x3_t @test_vld3q_s16(i16* readonly %a) { ; llvm.arm.neon.vld3.v8i16 -> three-register ld3 .8h; result repacked into %struct.int16x8x3_t
 378 ; CHECK-LABEL: test_vld3q_s16:
 379 ; CHECK: ld3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
 380   %1 = bitcast i16* %a to i8*
 381   %vld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8* %1, i32 2)
 382   %vld3.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 0
 383   %vld3.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 1
 384   %vld3.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 2
 385   %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3.fca.0.extract, 0, 0
 386   %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3.fca.1.extract, 0, 1
 387   %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3.fca.2.extract, 0, 2
 388   ret %struct.int16x8x3_t %.fca.0.2.insert
 389 }
 390
 391 define %struct.int32x4x3_t @test_vld3q_s32(i32* readonly %a) { ; llvm.arm.neon.vld3.v4i32 -> three-register ld3 .4s; result repacked into %struct.int32x4x3_t
 392 ; CHECK-LABEL: test_vld3q_s32:
 393 ; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 394   %1 = bitcast i32* %a to i8*
 395   %vld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8* %1, i32 4)
 396   %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
 397   %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
 398   %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
 399   %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3.fca.0.extract, 0, 0
 400   %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3.fca.1.extract, 0, 1
 401   %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3.fca.2.extract, 0, 2
 402   ret %struct.int32x4x3_t %.fca.0.2.insert
 403 }
 404
 405 define %struct.int64x2x3_t @test_vld3q_s64(i64* readonly %a) { ; llvm.arm.neon.vld3.v2i64 -> three-register ld3 .2d; result repacked into %struct.int64x2x3_t
 406 ; CHECK-LABEL: test_vld3q_s64:
 407 ; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 408   %1 = bitcast i64* %a to i8*
 409   %vld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8* %1, i32 8)
 410   %vld3.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 0
 411   %vld3.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 1
 412   %vld3.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 2
 413   %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3.fca.0.extract, 0, 0
 414   %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3.fca.1.extract, 0, 1
 415   %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3.fca.2.extract, 0, 2
 416   ret %struct.int64x2x3_t %.fca.0.2.insert
 417 }
 418
 419 define %struct.float32x4x3_t @test_vld3q_f32(float* readonly %a) { ; llvm.arm.neon.vld3.v4f32 -> three-register ld3 .4s; result repacked into %struct.float32x4x3_t
 420 ; CHECK-LABEL: test_vld3q_f32:
 421 ; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 422   %1 = bitcast float* %a to i8*
 423   %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8* %1, i32 4)
 424   %vld3.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 0
 425   %vld3.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 1
 426   %vld3.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 2
 427   %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3.fca.0.extract, 0, 0
 428   %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3.fca.1.extract, 0, 1
 429   %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3.fca.2.extract, 0, 2
 430   ret %struct.float32x4x3_t %.fca.0.2.insert
 431 }
 432
 433 define %struct.float64x2x3_t @test_vld3q_f64(double* readonly %a) { ; llvm.arm.neon.vld3.v2f64 -> three-register ld3 .2d; result repacked into %struct.float64x2x3_t
 434 ; CHECK-LABEL: test_vld3q_f64:
 435 ; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 436   %1 = bitcast double* %a to i8*
 437   %vld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8* %1, i32 8)
 438   %vld3.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 0
 439   %vld3.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 1
 440   %vld3.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 2
 441   %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3.fca.0.extract, 0, 0
 442   %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3.fca.1.extract, 0, 1
 443   %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3.fca.2.extract, 0, 2
 444   ret %struct.float64x2x3_t %.fca.0.2.insert
 445 }
 446
 447 define %struct.int8x8x3_t @test_vld3_s8(i8* readonly %a) { ; llvm.arm.neon.vld3.v8i8 -> three-register ld3 .8b; result repacked into %struct.int8x8x3_t
 448 ; CHECK-LABEL: test_vld3_s8:
 449 ; CHECK: ld3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
 450   %vld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8* %a, i32 1)
 451   %vld3.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 0
 452   %vld3.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 1
 453   %vld3.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 2
 454   %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3.fca.0.extract, 0, 0
 455   %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3.fca.1.extract, 0, 1
 456   %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3.fca.2.extract, 0, 2
 457   ret %struct.int8x8x3_t %.fca.0.2.insert
 458 }
 459
 460 define %struct.int16x4x3_t @test_vld3_s16(i16* readonly %a) { ; llvm.arm.neon.vld3.v4i16 -> three-register ld3 .4h; result repacked into %struct.int16x4x3_t
 461 ; CHECK-LABEL: test_vld3_s16:
 462 ; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
 463   %1 = bitcast i16* %a to i8*
 464   %vld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %1, i32 2)
 465   %vld3.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 0
 466   %vld3.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 1
 467   %vld3.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 2
 468   %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3.fca.0.extract, 0, 0
 469   %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3.fca.1.extract, 0, 1
 470   %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3.fca.2.extract, 0, 2
 471   ret %struct.int16x4x3_t %.fca.0.2.insert
 472 }
 473
 474 define %struct.int32x2x3_t @test_vld3_s32(i32* readonly %a) { ; llvm.arm.neon.vld3.v2i32 -> three-register ld3 .2s; result repacked into %struct.int32x2x3_t
 475 ; CHECK-LABEL: test_vld3_s32:
 476 ; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 477   %1 = bitcast i32* %a to i8*
 478   %vld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8* %1, i32 4)
 479   %vld3.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 0
 480   %vld3.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 1
 481   %vld3.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 2
 482   %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3.fca.0.extract, 0, 0
 483   %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3.fca.1.extract, 0, 1
 484   %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3.fca.2.extract, 0, 2
 485   ret %struct.int32x2x3_t %.fca.0.2.insert
 486 }
 487
 488 define %struct.int64x1x3_t @test_vld3_s64(i64* readonly %a) { ; llvm.arm.neon.vld3.v1i64 -> expected as a three-register ld1 .1d (not ld3); result repacked into %struct.int64x1x3_t
 489 ; CHECK-LABEL: test_vld3_s64:
 490 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 491   %1 = bitcast i64* %a to i8*
 492   %vld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %1, i32 8)
 493   %vld3.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 0
 494   %vld3.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 1
 495   %vld3.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 2
 496   %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3.fca.0.extract, 0, 0
 497   %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3.fca.1.extract, 0, 1
 498   %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3.fca.2.extract, 0, 2
 499   ret %struct.int64x1x3_t %.fca.0.2.insert
 500 }
 501
 502 define %struct.float32x2x3_t @test_vld3_f32(float* readonly %a) { ; llvm.arm.neon.vld3.v2f32 -> three-register ld3 .2s; result repacked into %struct.float32x2x3_t
 503 ; CHECK-LABEL: test_vld3_f32:
 504 ; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 505   %1 = bitcast float* %a to i8*
 506   %vld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8* %1, i32 4)
 507   %vld3.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 0
 508   %vld3.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 1
 509   %vld3.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 2
 510   %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3.fca.0.extract, 0, 0
 511   %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3.fca.1.extract, 0, 1
 512   %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3.fca.2.extract, 0, 2
 513   ret %struct.float32x2x3_t %.fca.0.2.insert
 514 }
 515
 516 define %struct.float64x1x3_t @test_vld3_f64(double* readonly %a) { ; llvm.arm.neon.vld3.v1f64 -> expected as a three-register ld1 .1d (not ld3); result repacked into %struct.float64x1x3_t
 517 ; CHECK-LABEL: test_vld3_f64:
 518 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 519   %1 = bitcast double* %a to i8*
 520   %vld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %1, i32 8)
 521   %vld3.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 0
 522   %vld3.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 1
 523   %vld3.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 2
 524   %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3.fca.0.extract, 0, 0
 525   %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3.fca.1.extract, 0, 1
 526   %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3.fca.2.extract, 0, 2
 527   ret %struct.float64x1x3_t %.fca.0.2.insert
 528 }
 529
 530 define %struct.int8x16x4_t @test_vld4q_s8(i8* readonly %a) { ; llvm.arm.neon.vld4.v16i8 -> four-register ld4 .16b; result repacked into %struct.int8x16x4_t
 531 ; CHECK-LABEL: test_vld4q_s8:
 532 ; CHECK: ld4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
 533   %vld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %a, i32 1)
 534   %vld4.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 0
 535   %vld4.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 1
 536   %vld4.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 2
 537   %vld4.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 3
 538   %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld4.fca.0.extract, 0, 0
 539   %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld4.fca.1.extract, 0, 1
 540   %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld4.fca.2.extract, 0, 2
 541   %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld4.fca.3.extract, 0, 3
 542   ret %struct.int8x16x4_t %.fca.0.3.insert
 543 }
 544
 545 define %struct.int16x8x4_t @test_vld4q_s16(i16* readonly %a) { ; llvm.arm.neon.vld4.v8i16 -> four-register ld4 .8h; result repacked into %struct.int16x8x4_t
 546 ; CHECK-LABEL: test_vld4q_s16:
 547 ; CHECK: ld4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
 548   %1 = bitcast i16* %a to i8*
 549   %vld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8* %1, i32 2)
 550   %vld4.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 0
 551   %vld4.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 1
 552   %vld4.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 2
 553   %vld4.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 3
 554   %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld4.fca.0.extract, 0, 0
 555   %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld4.fca.1.extract, 0, 1
 556   %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld4.fca.2.extract, 0, 2
 557   %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld4.fca.3.extract, 0, 3
 558   ret %struct.int16x8x4_t %.fca.0.3.insert
 559 }
 560
 561 define %struct.int32x4x4_t @test_vld4q_s32(i32* readonly %a) { ; llvm.arm.neon.vld4.v4i32 -> four-register ld4 .4s; result repacked into %struct.int32x4x4_t
 562 ; CHECK-LABEL: test_vld4q_s32:
 563 ; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 564   %1 = bitcast i32* %a to i8*
 565   %vld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8* %1, i32 4)
 566   %vld4.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 0
 567   %vld4.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 1
 568   %vld4.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 2
 569   %vld4.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 3
 570   %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld4.fca.0.extract, 0, 0
 571   %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld4.fca.1.extract, 0, 1
 572   %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld4.fca.2.extract, 0, 2
 573   %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld4.fca.3.extract, 0, 3
 574   ret %struct.int32x4x4_t %.fca.0.3.insert
 575 }
 576
 577 define %struct.int64x2x4_t @test_vld4q_s64(i64* readonly %a) { ; expects vld4 of four <2 x i64> vectors to be emitted as one ld4 on .2d registers
 578 ; CHECK-LABEL: test_vld4q_s64:
 579 ; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 580   %1 = bitcast i64* %a to i8* ; the intrinsic call takes its address as i8*
 581   %vld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8* %1, i32 8)
 582   %vld4.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 0
 583   %vld4.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 1
 584   %vld4.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 2
 585   %vld4.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 3
 586   %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld4.fca.0.extract, 0, 0 ; repack the four loaded vectors into the named struct return type
 587   %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld4.fca.1.extract, 0, 1
 588   %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld4.fca.2.extract, 0, 2
 589   %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld4.fca.3.extract, 0, 3
 590   ret %struct.int64x2x4_t %.fca.0.3.insert
 591 }
 592
 593 define %struct.float32x4x4_t @test_vld4q_f32(float* readonly %a) { ; expects vld4 of four <4 x float> vectors to be emitted as one ld4 on .4s registers
 594 ; CHECK-LABEL: test_vld4q_f32:
 595 ; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 596   %1 = bitcast float* %a to i8* ; the intrinsic call takes its address as i8*
 597   %vld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4)
 598   %vld4.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 0
 599   %vld4.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 1
 600   %vld4.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 2
 601   %vld4.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 3
 602   %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld4.fca.0.extract, 0, 0 ; repack the four loaded vectors into the named struct return type
 603   %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld4.fca.1.extract, 0, 1
 604   %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld4.fca.2.extract, 0, 2
 605   %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld4.fca.3.extract, 0, 3
 606   ret %struct.float32x4x4_t %.fca.0.3.insert
 607 }
 608
 609 define %struct.float64x2x4_t @test_vld4q_f64(double* readonly %a) { ; expects vld4 of four <2 x double> vectors to be emitted as one ld4 on .2d registers
 610 ; CHECK-LABEL: test_vld4q_f64:
 611 ; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 612   %1 = bitcast double* %a to i8* ; the intrinsic call takes its address as i8*
 613   %vld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8* %1, i32 8)
 614   %vld4.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 0
 615   %vld4.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 1
 616   %vld4.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 2
 617   %vld4.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 3
 618   %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld4.fca.0.extract, 0, 0 ; repack the four loaded vectors into the named struct return type
 619   %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld4.fca.1.extract, 0, 1
 620   %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld4.fca.2.extract, 0, 2
 621   %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld4.fca.3.extract, 0, 3
 622   ret %struct.float64x2x4_t %.fca.0.3.insert
 623 }
 624
 625 define %struct.int8x8x4_t @test_vld4_s8(i8* readonly %a) { ; expects vld4 of four <8 x i8> vectors to be emitted as one ld4 on .8b registers
 626 ; CHECK-LABEL: test_vld4_s8:
 627 ; CHECK: ld4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
 628   %vld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %a, i32 1) ; %a is already i8*, so no pointer cast is needed
 629   %vld4.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 0
 630   %vld4.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 1
 631   %vld4.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 2
 632   %vld4.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 3
 633   %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld4.fca.0.extract, 0, 0 ; repack the four loaded vectors into the named struct return type
 634   %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld4.fca.1.extract, 0, 1
 635   %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld4.fca.2.extract, 0, 2
 636   %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld4.fca.3.extract, 0, 3
 637   ret %struct.int8x8x4_t %.fca.0.3.insert
 638 }
 639
 640 define %struct.int16x4x4_t @test_vld4_s16(i16* readonly %a) { ; expects vld4 of four <4 x i16> vectors to be emitted as one ld4 on .4h registers
 641 ; CHECK-LABEL: test_vld4_s16:
 642 ; CHECK: ld4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
 643   %1 = bitcast i16* %a to i8* ; the intrinsic call takes its address as i8*
 644   %vld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8* %1, i32 2)
 645   %vld4.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 0
 646   %vld4.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 1
 647   %vld4.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 2
 648   %vld4.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 3
 649   %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld4.fca.0.extract, 0, 0 ; repack the four loaded vectors into the named struct return type
 650   %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld4.fca.1.extract, 0, 1
 651   %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld4.fca.2.extract, 0, 2
 652   %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld4.fca.3.extract, 0, 3
 653   ret %struct.int16x4x4_t %.fca.0.3.insert
 654 }
 655
 656 define %struct.int32x2x4_t @test_vld4_s32(i32* readonly %a) { ; expects vld4 of four <2 x i32> vectors to be emitted as one ld4 on .2s registers
 657 ; CHECK-LABEL: test_vld4_s32:
 658 ; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 659   %1 = bitcast i32* %a to i8* ; the intrinsic call takes its address as i8*
 660   %vld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8* %1, i32 4)
 661   %vld4.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 0
 662   %vld4.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 1
 663   %vld4.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 2
 664   %vld4.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 3
 665   %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld4.fca.0.extract, 0, 0 ; repack the four loaded vectors into the named struct return type
 666   %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld4.fca.1.extract, 0, 1
 667   %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld4.fca.2.extract, 0, 2
 668   %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld4.fca.3.extract, 0, 3
 669   ret %struct.int32x2x4_t %.fca.0.3.insert
 670 }
 671
 672 define %struct.int64x1x4_t @test_vld4_s64(i64* readonly %a) { ; expects vld4 of four <1 x i64> vectors to be emitted as a four-register ld1 on .1d (not an ld4)
 673 ; CHECK-LABEL: test_vld4_s64:
 674 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 675   %1 = bitcast i64* %a to i8* ; the intrinsic call takes its address as i8*
 676   %vld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %1, i32 8)
 677   %vld4.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 0
 678   %vld4.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 1
 679   %vld4.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 2
 680   %vld4.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 3
 681   %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld4.fca.0.extract, 0, 0 ; repack the four loaded vectors into the named struct return type
 682   %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld4.fca.1.extract, 0, 1
 683   %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld4.fca.2.extract, 0, 2
 684   %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld4.fca.3.extract, 0, 3
 685   ret %struct.int64x1x4_t %.fca.0.3.insert
 686 }
 687
 688 define %struct.float32x2x4_t @test_vld4_f32(float* readonly %a) { ; expects vld4 of four <2 x float> vectors to be emitted as one ld4 on .2s registers
 689 ; CHECK-LABEL: test_vld4_f32:
 690 ; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 691   %1 = bitcast float* %a to i8* ; the intrinsic call takes its address as i8*
 692   %vld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8* %1, i32 4)
 693   %vld4.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 0
 694   %vld4.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 1
 695   %vld4.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 2
 696   %vld4.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 3
 697   %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld4.fca.0.extract, 0, 0 ; repack the four loaded vectors into the named struct return type
 698   %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld4.fca.1.extract, 0, 1
 699   %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld4.fca.2.extract, 0, 2
 700   %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld4.fca.3.extract, 0, 3
 701   ret %struct.float32x2x4_t %.fca.0.3.insert
 702 }
 703
 704 define %struct.float64x1x4_t @test_vld4_f64(double* readonly %a) { ; expects vld4 of four <1 x double> vectors to be emitted as a four-register ld1 on .1d (not an ld4)
 705 ; CHECK-LABEL: test_vld4_f64:
 706 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 707   %1 = bitcast double* %a to i8* ; the intrinsic call takes its address as i8*
 708   %vld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %1, i32 8)
 709   %vld4.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 0
 710   %vld4.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 1
 711   %vld4.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 2
 712   %vld4.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 3
 713   %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld4.fca.0.extract, 0, 0 ; repack the four loaded vectors into the named struct return type
 714   %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld4.fca.1.extract, 0, 1
 715   %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld4.fca.2.extract, 0, 2
 716   %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld4.fca.3.extract, 0, 3
 717   ret %struct.float64x1x4_t %.fca.0.3.insert
 718 }
 719
 720 declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32) ; vld1 family: (i8*, i32) -> one vector; one declaration per vector type. Presumably used by vld1 tests earlier in the file (not visible here).
 721 declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32)
 722 declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32)
 723 declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32)
 724 declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32)
 725 declare <2 x double> @llvm.arm.neon.vld1.v2f64(i8*, i32)
 726 declare <8 x i8> @llvm.arm.neon.vld1.v8i8(i8*, i32)
 727 declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32)
 728 declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32)
 729 declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*, i32)
 730 declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32)
 731 declare <1 x double> @llvm.arm.neon.vld1.v1f64(i8*, i32)
 732 declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32) ; vld2 family: (i8*, i32) -> literal struct of two vectors of the same type
 733 declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8*, i32)
 734 declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8*, i32)
 735 declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8*, i32)
 736 declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32)
 737 declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8*, i32)
 738 declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8*, i32)
 739 declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8*, i32)
 740 declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8*, i32)
 741 declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
 742 declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8*, i32)
 743 declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
 744 declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32) ; vld3 family: (i8*, i32) -> literal struct of three vectors of the same type
 745 declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8*, i32)
 746 declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8*, i32)
 747 declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8*, i32)
 748 declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8*, i32)
 749 declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8*, i32)
 750 declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8*, i32)
 751 declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32)
 752 declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8*, i32)
 753 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
 754 declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8*, i32)
 755 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
 756 declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32) ; vld4 family: (i8*, i32) -> literal struct of four vectors; the test_vld4* callers pass the element size in bytes as the i32 (presumably the alignment - confirm against the intrinsic definition)
 757 declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8*, i32)
 758 declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8*, i32)
 759 declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8*, i32)
 760 declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32)
 761 declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8*, i32)
 762 declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32)
 763 declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8*, i32)
 764 declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8*, i32)
 765 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
 766 declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8*, i32)
 767 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
 768
 769 define void @test_vst1q_s8(i8* %a, <16 x i8> %b) { ; expects vst1 of a <16 x i8> vector to be emitted as a one-register st1 on .16b
 770 ; CHECK-LABEL: test_vst1q_s8:
 771 ; CHECK: st1 {v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
 772   tail call void @llvm.arm.neon.vst1.v16i8(i8* %a, <16 x i8> %b, i32 1) ; %a is already i8*, so no pointer cast is needed
 773   ret void
 774 }
 775
 776 define void @test_vst1q_s16(i16* %a, <8 x i16> %b) { ; expects vst1 of a <8 x i16> vector to be emitted as a one-register st1 on .8h
 777 ; CHECK-LABEL: test_vst1q_s16:
 778 ; CHECK: st1 {v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
 779   %1 = bitcast i16* %a to i8* ; the intrinsic call takes its address as i8*
 780   tail call void @llvm.arm.neon.vst1.v8i16(i8* %1, <8 x i16> %b, i32 2)
 781   ret void
 782 }
 783
 784 define void @test_vst1q_s32(i32* %a, <4 x i32> %b) { ; expects vst1 of a <4 x i32> vector to be emitted as a one-register st1 on .4s
 785 ; CHECK-LABEL: test_vst1q_s32:
 786 ; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 787   %1 = bitcast i32* %a to i8* ; the intrinsic call takes its address as i8*
 788   tail call void @llvm.arm.neon.vst1.v4i32(i8* %1, <4 x i32> %b, i32 4)
 789   ret void
 790 }
 791
 792 define void @test_vst1q_s64(i64* %a, <2 x i64> %b) { ; expects vst1 of a <2 x i64> vector to be emitted as a one-register st1 on .2d
 793 ; CHECK-LABEL: test_vst1q_s64:
 794 ; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 795   %1 = bitcast i64* %a to i8* ; the intrinsic call takes its address as i8*
 796   tail call void @llvm.arm.neon.vst1.v2i64(i8* %1, <2 x i64> %b, i32 8)
 797   ret void
 798 }
 799
 800 define void @test_vst1q_f32(float* %a, <4 x float> %b) { ; expects vst1 of a <4 x float> vector to be emitted as a one-register st1 on .4s
 801 ; CHECK-LABEL: test_vst1q_f32:
 802 ; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 803   %1 = bitcast float* %a to i8* ; the intrinsic call takes its address as i8*
 804   tail call void @llvm.arm.neon.vst1.v4f32(i8* %1, <4 x float> %b, i32 4)
 805   ret void
 806 }
 807
 808 define void @test_vst1q_f64(double* %a, <2 x double> %b) { ; expects vst1 of a <2 x double> vector to be emitted as a one-register st1 on .2d
 809 ; CHECK-LABEL: test_vst1q_f64:
 810 ; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 811   %1 = bitcast double* %a to i8* ; the intrinsic call takes its address as i8*
 812   tail call void @llvm.arm.neon.vst1.v2f64(i8* %1, <2 x double> %b, i32 8)
 813   ret void
 814 }
 815
 816 define void @test_vst1_s8(i8* %a, <8 x i8> %b) { ; expects vst1 of a <8 x i8> vector to be emitted as a one-register st1 on .8b
 817 ; CHECK-LABEL: test_vst1_s8:
 818 ; CHECK: st1 {v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
 819   tail call void @llvm.arm.neon.vst1.v8i8(i8* %a, <8 x i8> %b, i32 1) ; %a is already i8*, so no pointer cast is needed
 820   ret void
 821 }
 822
 823 define void @test_vst1_s16(i16* %a, <4 x i16> %b) { ; expects vst1 of a <4 x i16> vector to be emitted as a one-register st1 on .4h
 824 ; CHECK-LABEL: test_vst1_s16:
 825 ; CHECK: st1 {v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
 826   %1 = bitcast i16* %a to i8* ; the intrinsic call takes its address as i8*
 827   tail call void @llvm.arm.neon.vst1.v4i16(i8* %1, <4 x i16> %b, i32 2)
 828   ret void
 829 }
 830
 831 define void @test_vst1_s32(i32* %a, <2 x i32> %b) { ; expects vst1 of a <2 x i32> vector to be emitted as a one-register st1 on .2s
 832 ; CHECK-LABEL: test_vst1_s32:
 833 ; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 834   %1 = bitcast i32* %a to i8* ; the intrinsic call takes its address as i8*
 835   tail call void @llvm.arm.neon.vst1.v2i32(i8* %1, <2 x i32> %b, i32 4)
 836   ret void
 837 }
 838
 839 define void @test_vst1_s64(i64* %a, <1 x i64> %b) { ; expects vst1 of a <1 x i64> vector to be emitted as a one-register st1 on .1d
 840 ; CHECK-LABEL: test_vst1_s64:
 841 ; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 842   %1 = bitcast i64* %a to i8* ; the intrinsic call takes its address as i8*
 843   tail call void @llvm.arm.neon.vst1.v1i64(i8* %1, <1 x i64> %b, i32 8)
 844   ret void
 845 }
 846
 847 define void @test_vst1_f32(float* %a, <2 x float> %b) { ; expects vst1 of a <2 x float> vector to be emitted as a one-register st1 on .2s
 848 ; CHECK-LABEL: test_vst1_f32:
 849 ; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 850   %1 = bitcast float* %a to i8* ; the intrinsic call takes its address as i8*
 851   tail call void @llvm.arm.neon.vst1.v2f32(i8* %1, <2 x float> %b, i32 4)
 852   ret void
 853 }
 854
 855 define void @test_vst1_f64(double* %a, <1 x double> %b) { ; expects vst1 of a <1 x double> vector to be emitted as a one-register st1 on .1d
 856 ; CHECK-LABEL: test_vst1_f64:
 857 ; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 858   %1 = bitcast double* %a to i8* ; the intrinsic call takes its address as i8*
 859   tail call void @llvm.arm.neon.vst1.v1f64(i8* %1, <1 x double> %b, i32 8)
 860   ret void
 861 }
 862
 863 define void @test_vst2q_s8(i8* %a, [2 x <16 x i8>] %b.coerce) { ; expects vst2 of two <16 x i8> vectors to be emitted as one st2 on .16b registers
 864 ; CHECK-LABEL: test_vst2q_s8:
 865 ; CHECK: st2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
 866   %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
 867   %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1
 868   tail call void @llvm.arm.neon.vst2.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 1)
 869   ret void
 870 }
 871
 872 define void @test_vst2q_s16(i16* %a, [2 x <8 x i16>] %b.coerce) { ; expects vst2 of two <8 x i16> vectors to be emitted as one st2 on .8h registers
 873 ; CHECK-LABEL: test_vst2q_s16:
 874 ; CHECK: st2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
 875   %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
 876   %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
 877   %1 = bitcast i16* %a to i8* ; the intrinsic call takes its address as i8*
 878   tail call void @llvm.arm.neon.vst2.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 2)
 879   ret void
 880 }
 881
 882 define void @test_vst2q_s32(i32* %a, [2 x <4 x i32>] %b.coerce) { ; expects vst2 of two <4 x i32> vectors to be emitted as one st2 on .4s registers
 883 ; CHECK-LABEL: test_vst2q_s32:
 884 ; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 885   %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
 886   %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
 887   %1 = bitcast i32* %a to i8* ; the intrinsic call takes its address as i8*
 888   tail call void @llvm.arm.neon.vst2.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 4)
 889   ret void
 890 }
 891
 892 define void @test_vst2q_s64(i64* %a, [2 x <2 x i64>] %b.coerce) { ; expects vst2 of two <2 x i64> vectors to be emitted as one st2 on .2d registers
 893 ; CHECK-LABEL: test_vst2q_s64:
 894 ; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 895   %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
 896   %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
 897   %1 = bitcast i64* %a to i8* ; the intrinsic call takes its address as i8*
 898   tail call void @llvm.arm.neon.vst2.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 8)
 899   ret void
 900 }
 901
 902 define void @test_vst2q_f32(float* %a, [2 x <4 x float>] %b.coerce) { ; expects vst2 of two <4 x float> vectors to be emitted as one st2 on .4s registers
 903 ; CHECK-LABEL: test_vst2q_f32:
 904 ; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 905   %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
 906   %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
 907   %1 = bitcast float* %a to i8* ; the intrinsic call takes its address as i8*
 908   tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 4)
 909   ret void
 910 }
 911
 912 define void @test_vst2q_f64(double* %a, [2 x <2 x double>] %b.coerce) { ; expects vst2 of two <2 x double> vectors to be emitted as one st2 on .2d registers
 913 ; CHECK-LABEL: test_vst2q_f64:
 914 ; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 915   %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
 916   %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
 917   %1 = bitcast double* %a to i8* ; the intrinsic call takes its address as i8*
 918   tail call void @llvm.arm.neon.vst2.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 8)
 919   ret void
 920 }
 921
 922 define void @test_vst2_s8(i8* %a, [2 x <8 x i8>] %b.coerce) { ; expects vst2 of two <8 x i8> vectors to be emitted as one st2 on .8b registers
 923 ; CHECK-LABEL: test_vst2_s8:
 924 ; CHECK: st2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
 925   %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
 926   %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
 927   tail call void @llvm.arm.neon.vst2.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 1)
 928   ret void
 929 }
 930
 931 define void @test_vst2_s16(i16* %a, [2 x <4 x i16>] %b.coerce) { ; expects vst2 of two <4 x i16> vectors to be emitted as one st2 on .4h registers
 932 ; CHECK-LABEL: test_vst2_s16:
 933 ; CHECK: st2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
 934   %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
 935   %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
 936   %1 = bitcast i16* %a to i8* ; the intrinsic call takes its address as i8*
 937   tail call void @llvm.arm.neon.vst2.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 2)
 938   ret void
 939 }
 940
 941 define void @test_vst2_s32(i32* %a, [2 x <2 x i32>] %b.coerce) { ; expects vst2 of two <2 x i32> vectors to be emitted as one st2 on .2s registers
 942 ; CHECK-LABEL: test_vst2_s32:
 943 ; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 944   %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
 945   %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
 946   %1 = bitcast i32* %a to i8* ; the intrinsic call takes its address as i8*
 947   tail call void @llvm.arm.neon.vst2.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 4)
 948   ret void
 949 }
 950
 951 define void @test_vst2_s64(i64* %a, [2 x <1 x i64>] %b.coerce) { ; expects vst2 of two <1 x i64> vectors to be emitted as a two-register st1 on .1d (not an st2)
 952 ; CHECK-LABEL: test_vst2_s64:
 953 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 954   %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
 955   %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
 956   %1 = bitcast i64* %a to i8* ; the intrinsic call takes its address as i8*
 957   tail call void @llvm.arm.neon.vst2.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 8)
 958   ret void
 959 }
 960
 961 define void @test_vst2_f32(float* %a, [2 x <2 x float>] %b.coerce) { ; expects vst2 of two <2 x float> vectors to be emitted as one st2 on .2s registers
 962 ; CHECK-LABEL: test_vst2_f32:
 963 ; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 964   %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
 965   %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
 966   %1 = bitcast float* %a to i8* ; the intrinsic call takes its address as i8*
 967   tail call void @llvm.arm.neon.vst2.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 4)
 968   ret void
 969 }
 970
 971 define void @test_vst2_f64(double* %a, [2 x <1 x double>] %b.coerce) { ; expects vst2 of two <1 x double> vectors to be emitted as a two-register st1 on .1d (not an st2)
 972 ; CHECK-LABEL: test_vst2_f64:
 973 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 974   %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
 975   %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
 976   %1 = bitcast double* %a to i8* ; the intrinsic call takes its address as i8*
 977   tail call void @llvm.arm.neon.vst2.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 8)
 978   ret void
 979 }
 980
 981 define void @test_vst3q_s8(i8* %a, [3 x <16 x i8>] %b.coerce) { ; expects vst3 of three <16 x i8> vectors to be emitted as one st3 on .16b registers
 982 ; CHECK-LABEL: test_vst3q_s8:
 983 ; CHECK: st3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
 984   %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
 985   %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1
 986   %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2
 987   tail call void @llvm.arm.neon.vst3.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 1)
 988   ret void
 989 }
 990
 991 define void @test_vst3q_s16(i16* %a, [3 x <8 x i16>] %b.coerce) { ; expects vst3 of three <8 x i16> vectors to be emitted as one st3 on .8h registers
 992 ; CHECK-LABEL: test_vst3q_s16:
 993 ; CHECK: st3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
 994   %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
 995   %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
 996   %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
 997   %1 = bitcast i16* %a to i8* ; the intrinsic call takes its address as i8*
 998   tail call void @llvm.arm.neon.vst3.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 2)
 999   ret void
1000 }
1001
1002 define void @test_vst3q_s32(i32* %a, [3 x <4 x i32>] %b.coerce) { ; expects vst3 of three <4 x i32> vectors to be emitted as one st3 on .4s registers
1003 ; CHECK-LABEL: test_vst3q_s32:
1004 ; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
1005   %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
1006   %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
1007   %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
1008   %1 = bitcast i32* %a to i8* ; the intrinsic call takes its address as i8*
1009   tail call void @llvm.arm.neon.vst3.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 4)
1010   ret void
1011 }
1012
1013 define void @test_vst3q_s64(i64* %a, [3 x <2 x i64>] %b.coerce) { ; expects vst3 of three <2 x i64> vectors to be emitted as one st3 on .2d registers
1014 ; CHECK-LABEL: test_vst3q_s64:
1015 ; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
1016   %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
1017   %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
1018   %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
1019   %1 = bitcast i64* %a to i8* ; the intrinsic call takes its address as i8*
1020   tail call void @llvm.arm.neon.vst3.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 8)
1021   ret void
1022 }
1023
1024 define void @test_vst3q_f32(float* %a, [3 x <4 x float>] %b.coerce) { ; expects vst3 of three <4 x float> vectors to be emitted as one st3 on .4s registers
1025 ; CHECK-LABEL: test_vst3q_f32:
1026 ; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
1027   %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
1028   %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
1029   %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
1030   %1 = bitcast float* %a to i8* ; the intrinsic call takes its address as i8*
1031   tail call void @llvm.arm.neon.vst3.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 4)
1032   ret void
1033 }
1034
1035 define void @test_vst3q_f64(double* %a, [3 x <2 x double>] %b.coerce) { ; expects vst3 of three <2 x double> vectors to be emitted as one st3 on .2d registers
1036 ; CHECK-LABEL: test_vst3q_f64:
1037 ; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
1038   %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
1039   %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
1040   %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
1041   %1 = bitcast double* %a to i8* ; the intrinsic call takes its address as i8*
1042   tail call void @llvm.arm.neon.vst3.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 8)
1043   ret void
1044 }
1045
1046 define void @test_vst3_s8(i8* %a, [3 x <8 x i8>] %b.coerce) { ; expects vst3 of three <8 x i8> vectors to be emitted as one st3 on .8b registers
1047 ; CHECK-LABEL: test_vst3_s8:
1048 ; CHECK: st3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
1049   %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
1050   %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
1051   %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
1052   tail call void @llvm.arm.neon.vst3.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 1)
1053   ret void
1054 }
1055
1056 define void @test_vst3_s16(i16* %a, [3 x <4 x i16>] %b.coerce) { ; expects vst3 of three <4 x i16> vectors to be emitted as one st3 on .4h registers
1057 ; CHECK-LABEL: test_vst3_s16:
1058 ; CHECK: st3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
1059   %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
1060   %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
1061   %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
1062   %1 = bitcast i16* %a to i8* ; the intrinsic call takes its address as i8*
1063   tail call void @llvm.arm.neon.vst3.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 2)
1064   ret void
1065 }
1066
1067 define void @test_vst3_s32(i32* %a, [3 x <2 x i32>] %b.coerce) { ; expects vst3 of three <2 x i32> vectors to be emitted as one st3 on .2s registers
1068 ; CHECK-LABEL: test_vst3_s32:
1069 ; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
1070   %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
1071   %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
1072   %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
1073   %1 = bitcast i32* %a to i8* ; the intrinsic call takes its address as i8*
1074   tail call void @llvm.arm.neon.vst3.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 4)
1075   ret void
1076 }
1077
1078 define void @test_vst3_s64(i64* %a, [3 x <1 x i64>] %b.coerce) { ; expects vst3 of three <1 x i64> vectors to be emitted as a three-register st1 on .1d (not an st3)
1079 ; CHECK-LABEL: test_vst3_s64:
1080 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
1081   %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0 ; split the array argument into the per-register vectors the intrinsic takes
1082   %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
1083   %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
1084   %1 = bitcast i64* %a to i8* ; the intrinsic call takes its address as i8*
1085   tail call void @llvm.arm.neon.vst3.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 8)
1086   ret void
1087 }
1088
1089 define void @test_vst3_f32(float* %a, [3 x <2 x float>] %b.coerce) { ; vst3 of three <2 x float> vectors: expect one st3 on .2s registers
1090 ; CHECK-LABEL: test_vst3_f32:
1091 ; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
1092   %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
1093   %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
1094   %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
1095   %1 = bitcast float* %a to i8* ; the intrinsic takes an i8* address
1096   tail call void @llvm.arm.neon.vst3.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 4)
1097   ret void
1098 }
1099
1100 define void @test_vst3_f64(double* %a, [3 x <1 x double>] %b.coerce) { ; vst3 of three <1 x double> vectors: expect a 3-register st1 on .1d (not st3)
1101 ; CHECK-LABEL: test_vst3_f64:
1102 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
1103   %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
1104   %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
1105   %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
1106   %1 = bitcast double* %a to i8* ; the intrinsic takes an i8* address
1107   tail call void @llvm.arm.neon.vst3.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 8)
1108   ret void
1109 }
1110
1111 define void @test_vst4q_s8(i8* %a, [4 x <16 x i8>] %b.coerce) { ; vst4 of four <16 x i8> vectors: expect one st4 on .16b registers
1112 ; CHECK-LABEL: test_vst4q_s8:
1113 ; CHECK: st4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
1114   %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
1115   %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
1116   %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
1117   %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
1118   tail call void @llvm.arm.neon.vst4.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 1)
1119   ret void
1120 }
1121
1122 define void @test_vst4q_s16(i16* %a, [4 x <8 x i16>] %b.coerce) { ; vst4 of four <8 x i16> vectors: expect one st4 on .8h registers
1123 ; CHECK-LABEL: test_vst4q_s16:
1124 ; CHECK: st4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
1125   %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
1126   %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
1127   %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
1128   %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
1129   %1 = bitcast i16* %a to i8* ; the intrinsic takes an i8* address
1130   tail call void @llvm.arm.neon.vst4.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 2)
1131   ret void
1132 }
1133
1134 define void @test_vst4q_s32(i32* %a, [4 x <4 x i32>] %b.coerce) { ; vst4 of four <4 x i32> vectors: expect one st4 on .4s registers
1135 ; CHECK-LABEL: test_vst4q_s32:
1136 ; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
1137   %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
1138   %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
1139   %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
1140   %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
1141   %1 = bitcast i32* %a to i8* ; the intrinsic takes an i8* address
1142   tail call void @llvm.arm.neon.vst4.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 4)
1143   ret void
1144 }
1145
1146 define void @test_vst4q_s64(i64* %a, [4 x <2 x i64>] %b.coerce) { ; vst4 of four <2 x i64> vectors: expect one st4 on .2d registers
1147 ; CHECK-LABEL: test_vst4q_s64:
1148 ; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
1149   %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
1150   %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
1151   %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
1152   %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
1153   %1 = bitcast i64* %a to i8* ; the intrinsic takes an i8* address
1154   tail call void @llvm.arm.neon.vst4.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 8)
1155   ret void
1156 }
1157
1158 define void @test_vst4q_f32(float* %a, [4 x <4 x float>] %b.coerce) { ; vst4 of four <4 x float> vectors: expect one st4 on .4s registers
1159 ; CHECK-LABEL: test_vst4q_f32:
1160 ; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
1161   %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
1162   %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
1163   %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
1164   %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
1165   %1 = bitcast float* %a to i8* ; the intrinsic takes an i8* address
1166   tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 4)
1167   ret void
1168 }
1169
1170 define void @test_vst4q_f64(double* %a, [4 x <2 x double>] %b.coerce) { ; vst4 of four <2 x double> vectors: expect one st4 on .2d registers
1171 ; CHECK-LABEL: test_vst4q_f64:
1172 ; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
1173   %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
1174   %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
1175   %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
1176   %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
1177   %1 = bitcast double* %a to i8* ; the intrinsic takes an i8* address
1178   tail call void @llvm.arm.neon.vst4.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 8)
1179   ret void
1180 }
1181
1182 define void @test_vst4_s8(i8* %a, [4 x <8 x i8>] %b.coerce) { ; vst4 of four <8 x i8> vectors: expect one st4 on .8b registers
1183 ; CHECK-LABEL: test_vst4_s8:
1184 ; CHECK: st4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
1185   %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
1186   %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
1187   %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
1188   %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
1189   tail call void @llvm.arm.neon.vst4.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 1)
1190   ret void
1191 }
1192
1193 define void @test_vst4_s16(i16* %a, [4 x <4 x i16>] %b.coerce) { ; vst4 of four <4 x i16> vectors: expect one st4 on .4h registers
1194 ; CHECK-LABEL: test_vst4_s16:
1195 ; CHECK: st4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
1196   %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
1197   %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
1198   %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
1199   %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
1200   %1 = bitcast i16* %a to i8* ; the intrinsic takes an i8* address
1201   tail call void @llvm.arm.neon.vst4.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 2)
1202   ret void
1203 }
1204
1205 define void @test_vst4_s32(i32* %a, [4 x <2 x i32>] %b.coerce) { ; vst4 of four <2 x i32> vectors: expect one st4 on .2s registers
1206 ; CHECK-LABEL: test_vst4_s32:
1207 ; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
1208   %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
1209   %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
1210   %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
1211   %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
1212   %1 = bitcast i32* %a to i8* ; the intrinsic takes an i8* address
1213   tail call void @llvm.arm.neon.vst4.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 4)
1214   ret void
1215 }
1216
1217 define void @test_vst4_s64(i64* %a, [4 x <1 x i64>] %b.coerce) { ; vst4 of four <1 x i64> vectors: expect a 4-register st1 on .1d (not st4)
1218 ; CHECK-LABEL: test_vst4_s64:
1219 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
1220   %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
1221   %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
1222   %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
1223   %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
1224   %1 = bitcast i64* %a to i8* ; the intrinsic takes an i8* address
1225   tail call void @llvm.arm.neon.vst4.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 8)
1226   ret void
1227 }
1228
1229 define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) { ; vst4 of four <2 x float> vectors: expect one st4 on .2s registers
1230 ; CHECK-LABEL: test_vst4_f32:
1231 ; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
1232   %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
1233   %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
1234   %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
1235   %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
1236   %1 = bitcast float* %a to i8* ; the intrinsic takes an i8* address
1237   tail call void @llvm.arm.neon.vst4.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 4)
1238   ret void
1239 }
1240
1241 define void @test_vst4_f64(double* %a, [4 x <1 x double>] %b.coerce) { ; vst4 of four <1 x double> vectors: expect a 4-register st1 on .1d (not st4)
1242 ; CHECK-LABEL: test_vst4_f64:
1243 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
1244   %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
1245   %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
1246   %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
1247   %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
1248   %1 = bitcast double* %a to i8* ; the intrinsic takes an i8* address
1249   tail call void @llvm.arm.neon.vst4.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 8)
1250   ret void
1251 }
1252
1253 declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) ; vst1.*: i8* address, one vector, trailing i32 (presumably the alignment; visible callers of vst3/vst4 pass the element size in bytes - TODO confirm)
1254 declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32)
1255 declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32)
1256 declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>, i32)
1257 declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32)
1258 declare void @llvm.arm.neon.vst1.v2f64(i8*, <2 x double>, i32)
1259 declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32)
1260 declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>, i32)
1261 declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32)
1262 declare void @llvm.arm.neon.vst1.v1i64(i8*, <1 x i64>, i32)
1263 declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32)
1264 declare void @llvm.arm.neon.vst1.v1f64(i8*, <1 x double>, i32)
1265 declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32) ; vst2.*: i8* address, two vectors of the same type, trailing i32
1266 declare void @llvm.arm.neon.vst2.v8i16(i8*, <8 x i16>, <8 x i16>, i32)
1267 declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>, i32)
1268 declare void @llvm.arm.neon.vst2.v2i64(i8*, <2 x i64>, <2 x i64>, i32)
1269 declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32)
1270 declare void @llvm.arm.neon.vst2.v2f64(i8*, <2 x double>, <2 x double>, i32)
1271 declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
1272 declare void @llvm.arm.neon.vst2.v4i16(i8*, <4 x i16>, <4 x i16>, i32)
1273 declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>, i32)
1274 declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32)
1275 declare void @llvm.arm.neon.vst2.v2f32(i8*, <2 x float>, <2 x float>, i32)
1276 declare void @llvm.arm.neon.vst2.v1f64(i8*, <1 x double>, <1 x double>, i32)
1277 declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32) ; vst3.*: i8* address, three vectors of the same type, trailing i32
1278 declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32)
1279 declare void @llvm.arm.neon.vst3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32)
1280 declare void @llvm.arm.neon.vst3.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32)
1281 declare void @llvm.arm.neon.vst3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32)
1282 declare void @llvm.arm.neon.vst3.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32)
1283 declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
1284 declare void @llvm.arm.neon.vst3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32)
1285 declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
1286 declare void @llvm.arm.neon.vst3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32)
1287 declare void @llvm.arm.neon.vst3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32)
1288 declare void @llvm.arm.neon.vst3.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32)
1289 declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32) ; vst4.*: i8* address, four vectors of the same type, trailing i32
1290 declare void @llvm.arm.neon.vst4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32)
1291 declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32)
1292 declare void @llvm.arm.neon.vst4.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32)
1293 declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32)
1294 declare void @llvm.arm.neon.vst4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32)
1295 declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
1296 declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32)
1297 declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32)
1298 declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32)
1299 declare void @llvm.arm.neon.vst4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32)
1300 declare void @llvm.arm.neon.vst4.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32)
1301
1302 define %struct.int8x16x2_t @test_vld1q_s8_x2(i8* %a)  { ; vld1x2 of two <16 x i8> vectors: expect one 2-register ld1 on .16b
1303 ; CHECK-LABEL: test_vld1q_s8_x2:
1304 ; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
1305   %1 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8* %a, i32 1)
1306   %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0
1307   %3 = extractvalue { <16 x i8>, <16 x i8> } %1, 1
1308   %4 = insertvalue %struct.int8x16x2_t undef, <16 x i8> %2, 0, 0 ; repack the loaded vectors into the returned struct
1309   %5 = insertvalue %struct.int8x16x2_t %4, <16 x i8> %3, 0, 1
1310   ret %struct.int8x16x2_t %5
1311 }
1312
1313 define %struct.int16x8x2_t @test_vld1q_s16_x2(i16* %a)  { ; vld1x2 of two <8 x i16> vectors: expect one 2-register ld1 on .8h
1314 ; CHECK-LABEL: test_vld1q_s16_x2:
1315 ; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
1316   %1 = bitcast i16* %a to i8* ; the intrinsic takes an i8* address
1317   %2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8* %1, i32 2)
1318   %3 = extractvalue { <8 x i16>, <8 x i16> } %2, 0
1319   %4 = extractvalue { <8 x i16>, <8 x i16> } %2, 1
1320   %5 = insertvalue %struct.int16x8x2_t undef, <8 x i16> %3, 0, 0 ; repack the loaded vectors into the returned struct
1321   %6 = insertvalue %struct.int16x8x2_t %5, <8 x i16> %4, 0, 1
1322   ret %struct.int16x8x2_t %6
1323 }
1324
1325 define %struct.int32x4x2_t @test_vld1q_s32_x2(i32* %a)  { ; vld1x2 of two <4 x i32> vectors: expect one 2-register ld1 on .4s
1326 ; CHECK-LABEL: test_vld1q_s32_x2:
1327 ; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
1328   %1 = bitcast i32* %a to i8* ; the intrinsic takes an i8* address
1329   %2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x2.v4i32(i8* %1, i32 4)
1330   %3 = extractvalue { <4 x i32>, <4 x i32> } %2, 0
1331   %4 = extractvalue { <4 x i32>, <4 x i32> } %2, 1
1332   %5 = insertvalue %struct.int32x4x2_t undef, <4 x i32> %3, 0, 0 ; repack the loaded vectors into the returned struct
1333   %6 = insertvalue %struct.int32x4x2_t %5, <4 x i32> %4, 0, 1
1334   ret %struct.int32x4x2_t %6
1335 }
1336
1337 define %struct.int64x2x2_t @test_vld1q_s64_x2(i64* %a)  { ; vld1x2 of two <2 x i64> vectors: expect one 2-register ld1 on .2d
1338 ; CHECK-LABEL: test_vld1q_s64_x2:
1339 ; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
1340   %1 = bitcast i64* %a to i8* ; the intrinsic takes an i8* address
1341   %2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x2.v2i64(i8* %1, i32 8)
1342   %3 = extractvalue { <2 x i64>, <2 x i64> } %2, 0
1343   %4 = extractvalue { <2 x i64>, <2 x i64> } %2, 1
1344   %5 = insertvalue %struct.int64x2x2_t undef, <2 x i64> %3, 0, 0 ; repack the loaded vectors into the returned struct
1345   %6 = insertvalue %struct.int64x2x2_t %5, <2 x i64> %4, 0, 1
1346   ret %struct.int64x2x2_t %6
1347 }
1348
1349 define %struct.float32x4x2_t @test_vld1q_f32_x2(float* %a)  { ; vld1x2 of two <4 x float> vectors: expect one 2-register ld1 on .4s
1350 ; CHECK-LABEL: test_vld1q_f32_x2:
1351 ; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
1352   %1 = bitcast float* %a to i8* ; the intrinsic takes an i8* address
1353   %2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x2.v4f32(i8* %1, i32 4)
1354   %3 = extractvalue { <4 x float>, <4 x float> } %2, 0
1355   %4 = extractvalue { <4 x float>, <4 x float> } %2, 1
1356   %5 = insertvalue %struct.float32x4x2_t undef, <4 x float> %3, 0, 0 ; repack the loaded vectors into the returned struct
1357   %6 = insertvalue %struct.float32x4x2_t %5, <4 x float> %4, 0, 1
1358   ret %struct.float32x4x2_t %6
1359 }
1360
1361
1362 define %struct.float64x2x2_t @test_vld1q_f64_x2(double* %a)  { ; vld1x2 of two <2 x double> vectors: expect one 2-register ld1 on .2d
1363 ; CHECK-LABEL: test_vld1q_f64_x2:
1364 ; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
1365   %1 = bitcast double* %a to i8* ; the intrinsic takes an i8* address
1366   %2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x2.v2f64(i8* %1, i32 8)
1367   %3 = extractvalue { <2 x double>, <2 x double> } %2, 0
1368   %4 = extractvalue { <2 x double>, <2 x double> } %2, 1
1369   %5 = insertvalue %struct.float64x2x2_t undef, <2 x double> %3, 0, 0 ; repack the loaded vectors into the returned struct
1370   %6 = insertvalue %struct.float64x2x2_t %5, <2 x double> %4, 0, 1
1371   ret %struct.float64x2x2_t %6
1372 }
1373
1374 define %struct.int8x8x2_t @test_vld1_s8_x2(i8* %a)  { ; vld1x2 of two <8 x i8> vectors: expect one 2-register ld1 on .8b
1375 ; CHECK-LABEL: test_vld1_s8_x2:
1376 ; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
1377   %1 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x2.v8i8(i8* %a, i32 1)
1378   %2 = extractvalue { <8 x i8>, <8 x i8> } %1, 0
1379   %3 = extractvalue { <8 x i8>, <8 x i8> } %1, 1
1380   %4 = insertvalue %struct.int8x8x2_t undef, <8 x i8> %2, 0, 0 ; repack the loaded vectors into the returned struct
1381   %5 = insertvalue %struct.int8x8x2_t %4, <8 x i8> %3, 0, 1
1382   ret %struct.int8x8x2_t %5
1383 }
1384
1385 define %struct.int16x4x2_t @test_vld1_s16_x2(i16* %a)  { ; vld1x2 of two <4 x i16> vectors: expect one 2-register ld1 on .4h
1386 ; CHECK-LABEL: test_vld1_s16_x2:
1387 ; CHECK: ld1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
1388   %1 = bitcast i16* %a to i8* ; the intrinsic takes an i8* address
1389   %2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x2.v4i16(i8* %1, i32 2)
1390   %3 = extractvalue { <4 x i16>, <4 x i16> } %2, 0
1391   %4 = extractvalue { <4 x i16>, <4 x i16> } %2, 1
1392   %5 = insertvalue %struct.int16x4x2_t undef, <4 x i16> %3, 0, 0 ; repack the loaded vectors into the returned struct
1393   %6 = insertvalue %struct.int16x4x2_t %5, <4 x i16> %4, 0, 1
1394   ret %struct.int16x4x2_t %6
1395 }
1396
1397 define %struct.int32x2x2_t @test_vld1_s32_x2(i32* %a)  { ; vld1x2 of two <2 x i32> vectors: expect one 2-register ld1 on .2s
1398 ; CHECK-LABEL: test_vld1_s32_x2:
1399 ; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
1400   %1 = bitcast i32* %a to i8* ; the intrinsic takes an i8* address
1401   %2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x2.v2i32(i8* %1, i32 4)
1402   %3 = extractvalue { <2 x i32>, <2 x i32> } %2, 0
1403   %4 = extractvalue { <2 x i32>, <2 x i32> } %2, 1
1404   %5 = insertvalue %struct.int32x2x2_t undef, <2 x i32> %3, 0, 0 ; repack the loaded vectors into the returned struct
1405   %6 = insertvalue %struct.int32x2x2_t %5, <2 x i32> %4, 0, 1
1406   ret %struct.int32x2x2_t %6
1407 }
1408
1409 define %struct.int64x1x2_t @test_vld1_s64_x2(i64* %a)  { ; vld1x2 of two <1 x i64> vectors: expect one 2-register ld1 on .1d
1410 ; CHECK-LABEL: test_vld1_s64_x2:
1411 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
1412   %1 = bitcast i64* %a to i8* ; the intrinsic takes an i8* address
1413   %2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x2.v1i64(i8* %1, i32 8)
1414   %3 = extractvalue { <1 x i64>, <1 x i64> } %2, 0
1415   %4 = extractvalue { <1 x i64>, <1 x i64> } %2, 1
1416   %5 = insertvalue %struct.int64x1x2_t undef, <1 x i64> %3, 0, 0 ; repack the loaded vectors into the returned struct
1417   %6 = insertvalue %struct.int64x1x2_t %5, <1 x i64> %4, 0, 1
1418   ret %struct.int64x1x2_t %6
1419 }
1420
1421 define %struct.float32x2x2_t @test_vld1_f32_x2(float* %a)  { ; vld1x2 of two <2 x float> vectors: expect one 2-register ld1 on .2s
1422 ; CHECK-LABEL: test_vld1_f32_x2:
1423 ; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
1424   %1 = bitcast float* %a to i8* ; the intrinsic takes an i8* address
1425   %2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x2.v2f32(i8* %1, i32 4)
1426   %3 = extractvalue { <2 x float>, <2 x float> } %2, 0
1427   %4 = extractvalue { <2 x float>, <2 x float> } %2, 1
1428   %5 = insertvalue %struct.float32x2x2_t undef, <2 x float> %3, 0, 0 ; repack the loaded vectors into the returned struct
1429   %6 = insertvalue %struct.float32x2x2_t %5, <2 x float> %4, 0, 1
1430   ret %struct.float32x2x2_t %6
1431 }
1432
1433 define %struct.float64x1x2_t @test_vld1_f64_x2(double* %a)  { ; vld1x2 of two <1 x double> vectors: expect one 2-register ld1 on .1d
1434 ; CHECK-LABEL: test_vld1_f64_x2:
1435 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
1436   %1 = bitcast double* %a to i8* ; the intrinsic takes an i8* address
1437   %2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x2.v1f64(i8* %1, i32 8)
1438   %3 = extractvalue { <1 x double>, <1 x double> } %2, 0
1439   %4 = extractvalue { <1 x double>, <1 x double> } %2, 1
1440   %5 = insertvalue %struct.float64x1x2_t undef, <1 x double> %3, 0, 0 ; repack the loaded vectors into the returned struct
1441   %6 = insertvalue %struct.float64x1x2_t %5, <1 x double> %4, 0, 1
1442   ret %struct.float64x1x2_t %6
1443 }
1444
1445 define %struct.int8x16x3_t @test_vld1q_s8_x3(i8* %a)  { ; vld1x3 of three <16 x i8> vectors: expect one 3-register ld1 on .16b
1446 ; CHECK-LABEL: test_vld1q_s8_x3:
1447 ; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
1448 ; Keep the whole pattern on the line above: FileCheck ignores wrapped continuation comments.
1449   %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x3.v16i8(i8* %a, i32 1)
1450   %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 0
1451   %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 1
1452   %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 2
1453   %5 = insertvalue %struct.int8x16x3_t undef, <16 x i8> %2, 0, 0 ; repack the loaded vectors into the returned struct
1454   %6 = insertvalue %struct.int8x16x3_t %5, <16 x i8> %3, 0, 1
1455   %7 = insertvalue %struct.int8x16x3_t %6, <16 x i8> %4, 0, 2
1456   ret %struct.int8x16x3_t %7
1457 }
1458
1459 define %struct.int16x8x3_t @test_vld1q_s16_x3(i16* %a)  { ; vld1x3 of three <8 x i16> vectors: expect one 3-register ld1 on .8h
1460 ; CHECK-LABEL: test_vld1q_s16_x3:
1461 ; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
1462 ; Keep the whole pattern on the line above: FileCheck ignores wrapped continuation comments.
1463   %1 = bitcast i16* %a to i8* ; the intrinsic takes an i8* address
1464   %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8* %1, i32 2)
1465   %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 0
1466   %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 1
1467   %5 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 2
1468   %6 = insertvalue %struct.int16x8x3_t undef, <8 x i16> %3, 0, 0 ; repack the loaded vectors into the returned struct
1469   %7 = insertvalue %struct.int16x8x3_t %6, <8 x i16> %4, 0, 1
1470   %8 = insertvalue %struct.int16x8x3_t %7, <8 x i16> %5, 0, 2
1471   ret %struct.int16x8x3_t %8
1472 }
1473
1474 define %struct.int32x4x3_t @test_vld1q_s32_x3(i32* %a)  { ; vld1x3 of three <4 x i32> vectors: expect one 3-register ld1 on .4s
1475 ; CHECK-LABEL: test_vld1q_s32_x3:
1476 ; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
1477 ; Keep the whole pattern on the line above: FileCheck ignores wrapped continuation comments.
1478   %1 = bitcast i32* %a to i8* ; the intrinsic takes an i8* address
1479   %2 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x3.v4i32(i8* %1, i32 4)
1480   %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 0
1481   %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 1
1482   %5 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 2
1483   %6 = insertvalue %struct.int32x4x3_t undef, <4 x i32> %3, 0, 0 ; repack the loaded vectors into the returned struct
1484   %7 = insertvalue %struct.int32x4x3_t %6, <4 x i32> %4, 0, 1
1485   %8 = insertvalue %struct.int32x4x3_t %7, <4 x i32> %5, 0, 2
1486   ret %struct.int32x4x3_t %8
1487 }
1488
1489 define %struct.int64x2x3_t @test_vld1q_s64_x3(i64* %a)  { ; vld1x3 of three <2 x i64> vectors: expect one 3-register ld1 on .2d
1490 ; CHECK-LABEL: test_vld1q_s64_x3:
1491 ; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
1492 ; Keep the whole pattern on the line above: FileCheck ignores wrapped continuation comments.
1493   %1 = bitcast i64* %a to i8* ; the intrinsic takes an i8* address
1494   %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8* %1, i32 8)
1495   %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 0
1496   %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 1
1497   %5 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 2
1498   %6 = insertvalue %struct.int64x2x3_t undef, <2 x i64> %3, 0, 0 ; repack the loaded vectors into the returned struct
1499   %7 = insertvalue %struct.int64x2x3_t %6, <2 x i64> %4, 0, 1
1500   %8 = insertvalue %struct.int64x2x3_t %7, <2 x i64> %5, 0, 2
1501   ret %struct.int64x2x3_t %8
1502 }
1503
1504 define %struct.float32x4x3_t @test_vld1q_f32_x3(float* %a)  { ; vld1x3 of three <4 x float> vectors: expect one 3-register ld1 on .4s
1505 ; CHECK-LABEL: test_vld1q_f32_x3:
1506 ; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
1507 ; Keep the whole pattern on the line above: FileCheck ignores wrapped continuation comments.
1508   %1 = bitcast float* %a to i8* ; the intrinsic takes an i8* address
1509   %2 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x3.v4f32(i8* %1, i32 4)
1510   %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 0
1511   %4 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 1
1512   %5 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 2
1513   %6 = insertvalue %struct.float32x4x3_t undef, <4 x float> %3, 0, 0 ; repack the loaded vectors into the returned struct
1514   %7 = insertvalue %struct.float32x4x3_t %6, <4 x float> %4, 0, 1
1515   %8 = insertvalue %struct.float32x4x3_t %7, <4 x float> %5, 0, 2
1516   ret %struct.float32x4x3_t %8
1517 }
1518
1519
1520 define %struct.float64x2x3_t @test_vld1q_f64_x3(double* %a)  { ; vld1x3 of three <2 x double> vectors: expect one 3-register ld1 on .2d
1521 ; CHECK-LABEL: test_vld1q_f64_x3:
1522 ; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
1523 ; Keep the whole pattern on the line above: FileCheck ignores wrapped continuation comments.
1524   %1 = bitcast double* %a to i8* ; the intrinsic takes an i8* address
1525   %2 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x3.v2f64(i8* %1, i32 8)
1526   %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 0
1527   %4 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 1
1528   %5 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 2
1529   %6 = insertvalue %struct.float64x2x3_t undef, <2 x double> %3, 0, 0 ; repack the loaded vectors into the returned struct
1530   %7 = insertvalue %struct.float64x2x3_t %6, <2 x double> %4, 0, 1
1531   %8 = insertvalue %struct.float64x2x3_t %7, <2 x double> %5, 0, 2
1532   ret %struct.float64x2x3_t %8
1533 }
1534
1535 define %struct.int8x8x3_t @test_vld1_s8_x3(i8* %a)  { ; vld1x3 of three <8 x i8> vectors: expect one 3-register ld1 on .8b
1536 ; CHECK-LABEL: test_vld1_s8_x3:
1537 ; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
1538 ; Keep the whole pattern on the line above: FileCheck ignores wrapped continuation comments.
1539   %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x3.v8i8(i8* %a, i32 1)
1540   %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
1541   %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
1542   %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
1543   %5 = insertvalue %struct.int8x8x3_t undef, <8 x i8> %2, 0, 0 ; repack the loaded vectors into the returned struct
1544   %6 = insertvalue %struct.int8x8x3_t %5, <8 x i8> %3, 0, 1
1545   %7 = insertvalue %struct.int8x8x3_t %6, <8 x i8> %4, 0, 2
1546   ret %struct.int8x8x3_t %7
1547 }
1548
1549 define %struct.int16x4x3_t @test_vld1_s16_x3(i16* %a)  { ; vld1x3 of three <4 x i16> vectors: expect one 3-register ld1 on .4h
1550 ; CHECK-LABEL: test_vld1_s16_x3:
1551 ; CHECK: ld1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
1552 ; Keep the whole pattern on the line above: FileCheck ignores wrapped continuation comments.
1553   %1 = bitcast i16* %a to i8* ; the intrinsic takes an i8* address
1554   %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x3.v4i16(i8* %1, i32 2)
1555   %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 0
1556   %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 1
1557   %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 2
1558   %6 = insertvalue %struct.int16x4x3_t undef, <4 x i16> %3, 0, 0 ; repack the loaded vectors into the returned struct
1559   %7 = insertvalue %struct.int16x4x3_t %6, <4 x i16> %4, 0, 1
1560   %8 = insertvalue %struct.int16x4x3_t %7, <4 x i16> %5, 0, 2
1561   ret %struct.int16x4x3_t %8
1562 }
1563
1564 define %struct.int32x2x3_t @test_vld1_s32_x3(i32* %a)  { ; vld1x3 of three <2 x i32> vectors: expect one 3-register ld1 on .2s
1565 ; CHECK-LABEL: test_vld1_s32_x3:
1566 ; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
1567 ; Keep the whole pattern on the line above: FileCheck ignores wrapped continuation comments.
1568   %1 = bitcast i32* %a to i8* ; the intrinsic takes an i8* address
1569   %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x3.v2i32(i8* %1, i32 4)
1570   %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 0
1571   %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 1
1572   %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 2
1573   %6 = insertvalue %struct.int32x2x3_t undef, <2 x i32> %3, 0, 0 ; repack the loaded vectors into the returned struct
1574   %7 = insertvalue %struct.int32x2x3_t %6, <2 x i32> %4, 0, 1
1575   %8 = insertvalue %struct.int32x2x3_t %7, <2 x i32> %5, 0, 2
1576   ret %struct.int32x2x3_t %8
1577 }
1578
1579 define %struct.int64x1x3_t @test_vld1_s64_x3(i64* %a)  { ; vld1x3 of three <1 x i64> vectors: expect one 3-register ld1 on .1d
1580 ; CHECK-LABEL: test_vld1_s64_x3:
1581 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
1582 ; Keep the whole pattern on the line above: FileCheck ignores wrapped continuation comments.
1583   %1 = bitcast i64* %a to i8* ; the intrinsic takes an i8* address
1584   %2 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x3.v1i64(i8* %1, i32 8)
1585   %3 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 0
1586   %4 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 1
1587   %5 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 2
1588   %6 = insertvalue %struct.int64x1x3_t undef, <1 x i64> %3, 0, 0 ; repack the loaded vectors into the returned struct
1589   %7 = insertvalue %struct.int64x1x3_t %6, <1 x i64> %4, 0, 1
1590   %8 = insertvalue %struct.int64x1x3_t %7, <1 x i64> %5, 0, 2
1591   ret %struct.int64x1x3_t %8
1592 }
1593
1594 define %struct.float32x2x3_t @test_vld1_f32_x3(float* %a)  { ; vld1x3 of three <2 x float> vectors: expect one 3-register ld1 on .2s
1595 ; CHECK-LABEL: test_vld1_f32_x3:
1596 ; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
1597 ; Keep the whole pattern on the line above: FileCheck ignores wrapped continuation comments.
1598   %1 = bitcast float* %a to i8* ; the intrinsic takes an i8* address
1599   %2 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x3.v2f32(i8* %1, i32 4)
1600   %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 0
1601   %4 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 1
1602   %5 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 2
1603   %6 = insertvalue %struct.float32x2x3_t undef, <2 x float> %3, 0, 0 ; repack the loaded vectors into the returned struct
1604   %7 = insertvalue %struct.float32x2x3_t %6, <2 x float> %4, 0, 1
1605   %8 = insertvalue %struct.float32x2x3_t %7, <2 x float> %5, 0, 2
1606   ret %struct.float32x2x3_t %8
1607 }
1608
1609
1610 define %struct.float64x1x3_t @test_vld1_f64_x3(double* %a)  { ; vld1x3 of three <1 x double> vectors: expect one 3-register ld1 on .1d
1611 ; CHECK-LABEL: test_vld1_f64_x3:
1612 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
1613 ; Keep the whole pattern on the line above: FileCheck ignores wrapped continuation comments.
1614   %1 = bitcast double* %a to i8* ; the intrinsic takes an i8* address
1615   %2 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x3.v1f64(i8* %1, i32 8)
1616   %3 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 0
1617   %4 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 1
1618   %5 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 2
1619   %6 = insertvalue %struct.float64x1x3_t undef, <1 x double> %3, 0, 0 ; repack the loaded vectors into the returned struct
1620   %7 = insertvalue %struct.float64x1x3_t %6, <1 x double> %4, 0, 1
1621   %8 = insertvalue %struct.float64x1x3_t %7, <1 x double> %5, 0, 2
1622   ret %struct.float64x1x3_t %8
1623 }
1624
1625 define %struct.int8x16x4_t @test_vld1q_s8_x4(i8* %a)  { ; vld1x4 of four <16 x i8> vectors: expect one 4-register ld1 on .16b
1626 ; CHECK-LABEL: test_vld1q_s8_x4:
1627 ; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
1628 ; Keep the whole pattern on the line above: FileCheck ignores wrapped continuation comments.
1629   %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x4.v16i8(i8* %a, i32 1)
1630   %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 0
1631   %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 1
1632   %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 2
1633   %5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 3
1634   %6 = insertvalue %struct.int8x16x4_t undef, <16 x i8> %2, 0, 0 ; repack the loaded vectors into the returned struct
1635   %7 = insertvalue %struct.int8x16x4_t %6, <16 x i8> %3, 0, 1
1636   %8 = insertvalue %struct.int8x16x4_t %7, <16 x i8> %4, 0, 2
1637   %9 = insertvalue %struct.int8x16x4_t %8, <16 x i8> %5, 0, 3
1638   ret %struct.int8x16x4_t %9
1639 }
1640
1641 define %struct.int16x8x4_t @test_vld1q_s16_x4(i16* %a)  {
1642 ; CHECK-LABEL: test_vld1q_s16_x4
1643 ; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
1644 ; vld1x4 of <8 x i16> must select one ld1; all four registers and the [base] operand are matched together above.
1645   %1 = bitcast i16* %a to i8*
1646   %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x4.v8i16(i8* %1, i32 2)
1647   %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 0
1648   %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 1
1649   %5 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 2
1650   %6 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 3
1651   %7 = insertvalue %struct.int16x8x4_t undef, <8 x i16> %3, 0, 0
1652   %8 = insertvalue %struct.int16x8x4_t %7, <8 x i16> %4, 0, 1
1653   %9 = insertvalue %struct.int16x8x4_t %8, <8 x i16> %5, 0, 2
1654   %10 = insertvalue %struct.int16x8x4_t %9, <8 x i16> %6, 0, 3
1655   ret %struct.int16x8x4_t %10
1656 }
1657
1658 define %struct.int32x4x4_t @test_vld1q_s32_x4(i32* %a)  {
1659 ; CHECK-LABEL: test_vld1q_s32_x4
1660 ; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
1661 ; vld1x4 of <4 x i32> must select one ld1; all four registers and the [base] operand are matched together above.
1662   %1 = bitcast i32* %a to i8*
1663   %2 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x4.v4i32(i8* %1, i32 4)
1664   %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 0
1665   %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 1
1666   %5 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 2
1667   %6 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 3
1668   %7 = insertvalue %struct.int32x4x4_t undef, <4 x i32> %3, 0, 0
1669   %8 = insertvalue %struct.int32x4x4_t %7, <4 x i32> %4, 0, 1
1670   %9 = insertvalue %struct.int32x4x4_t %8, <4 x i32> %5, 0, 2
1671   %10 = insertvalue %struct.int32x4x4_t %9, <4 x i32> %6, 0, 3
1672   ret %struct.int32x4x4_t %10
1673 }
1674
1675 define %struct.int64x2x4_t @test_vld1q_s64_x4(i64* %a)  {
1676 ; CHECK-LABEL: test_vld1q_s64_x4
1677 ; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
1678 ; vld1x4 of <2 x i64> must select one ld1; all four registers and the [base] operand are matched together above.
1679   %1 = bitcast i64* %a to i8*
1680   %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x4.v2i64(i8* %1, i32 8)
1681   %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 0
1682   %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 1
1683   %5 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 2
1684   %6 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 3
1685   %7 = insertvalue %struct.int64x2x4_t undef, <2 x i64> %3, 0, 0
1686   %8 = insertvalue %struct.int64x2x4_t %7, <2 x i64> %4, 0, 1
1687   %9 = insertvalue %struct.int64x2x4_t %8, <2 x i64> %5, 0, 2
1688   %10 = insertvalue %struct.int64x2x4_t %9, <2 x i64> %6, 0, 3
1689   ret %struct.int64x2x4_t %10
1690 }
1691
1692 define %struct.float32x4x4_t @test_vld1q_f32_x4(float* %a)  {
1693 ; CHECK-LABEL: test_vld1q_f32_x4
1694 ; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
1695 ; vld1x4 of <4 x float> must select one ld1; all four registers and the [base] operand are matched together above.
1696   %1 = bitcast float* %a to i8*
1697   %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8* %1, i32 4)
1698   %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0
1699   %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 1
1700   %5 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 2
1701   %6 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 3
1702   %7 = insertvalue %struct.float32x4x4_t undef, <4 x float> %3, 0, 0
1703   %8 = insertvalue %struct.float32x4x4_t %7, <4 x float> %4, 0, 1
1704   %9 = insertvalue %struct.float32x4x4_t %8, <4 x float> %5, 0, 2
1705   %10 = insertvalue %struct.float32x4x4_t %9, <4 x float> %6, 0, 3
1706   ret %struct.float32x4x4_t %10
1707 }
1708
1709 define %struct.float64x2x4_t @test_vld1q_f64_x4(double* %a)  {
1710 ; CHECK-LABEL: test_vld1q_f64_x4
1711 ; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
1712 ; vld1x4 of <2 x double> must select one ld1 with four .2d registers; the [base] operand is matched together with them above.
1713   %1 = bitcast double* %a to i8*
1714   %2 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x4.v2f64(i8* %1, i32 8)
1715   %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 0
1716   %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 1
1717   %5 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 2
1718   %6 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 3
1719   %7 = insertvalue %struct.float64x2x4_t undef, <2 x double> %3, 0, 0
1720   %8 = insertvalue %struct.float64x2x4_t %7, <2 x double> %4, 0, 1
1721   %9 = insertvalue %struct.float64x2x4_t %8, <2 x double> %5, 0, 2
1722   %10 = insertvalue %struct.float64x2x4_t %9, <2 x double> %6, 0, 3
1723   ret %struct.float64x2x4_t %10
1724 }
1725
1726 define %struct.int8x8x4_t @test_vld1_s8_x4(i8* %a)  {
1727 ; CHECK-LABEL: test_vld1_s8_x4
1728 ; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
1729 ; vld1x4 of <8 x i8> must select one ld1; all four registers and the [base] operand are matched together above.
1730   %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8* %a, i32 1)
1731   %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
1732   %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
1733   %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
1734   %5 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 3
1735   %6 = insertvalue %struct.int8x8x4_t undef, <8 x i8> %2, 0, 0
1736   %7 = insertvalue %struct.int8x8x4_t %6, <8 x i8> %3, 0, 1
1737   %8 = insertvalue %struct.int8x8x4_t %7, <8 x i8> %4, 0, 2
1738   %9 = insertvalue %struct.int8x8x4_t %8, <8 x i8> %5, 0, 3
1739   ret %struct.int8x8x4_t %9
1740 }
1741
1742 define %struct.int16x4x4_t @test_vld1_s16_x4(i16* %a)  {
1743 ; CHECK-LABEL: test_vld1_s16_x4
1744 ; CHECK: ld1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
1745 ; vld1x4 of <4 x i16> must select one ld1; all four registers and the [base] operand are matched together above.
1746   %1 = bitcast i16* %a to i8*
1747   %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x4.v4i16(i8* %1, i32 2)
1748   %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 0
1749   %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 1
1750   %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 2
1751   %6 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 3
1752   %7 = insertvalue %struct.int16x4x4_t undef, <4 x i16> %3, 0, 0
1753   %8 = insertvalue %struct.int16x4x4_t %7, <4 x i16> %4, 0, 1
1754   %9 = insertvalue %struct.int16x4x4_t %8, <4 x i16> %5, 0, 2
1755   %10 = insertvalue %struct.int16x4x4_t %9, <4 x i16> %6, 0, 3
1756   ret %struct.int16x4x4_t %10
1757 }
1758
1759 define %struct.int32x2x4_t @test_vld1_s32_x4(i32* %a)  {
1760 ; CHECK-LABEL: test_vld1_s32_x4
1761 ; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
1762 ; vld1x4 of <2 x i32> must select one ld1; all four registers and the [base] operand are matched together above.
1763   %1 = bitcast i32* %a to i8*
1764   %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x4.v2i32(i8* %1, i32 4)
1765   %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 0
1766   %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 1
1767   %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 2
1768   %6 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 3
1769   %7 = insertvalue %struct.int32x2x4_t undef, <2 x i32> %3, 0, 0
1770   %8 = insertvalue %struct.int32x2x4_t %7, <2 x i32> %4, 0, 1
1771   %9 = insertvalue %struct.int32x2x4_t %8, <2 x i32> %5, 0, 2
1772   %10 = insertvalue %struct.int32x2x4_t %9, <2 x i32> %6, 0, 3
1773   ret %struct.int32x2x4_t %10
1774 }
1775
1776 define %struct.int64x1x4_t @test_vld1_s64_x4(i64* %a)  {
1777 ; CHECK-LABEL: test_vld1_s64_x4
1778 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
1779 ; vld1x4 of <1 x i64> must select one ld1; all four registers and the [base] operand are matched together above.
1780   %1 = bitcast i64* %a to i8*
1781   %2 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x4.v1i64(i8* %1, i32 8)
1782   %3 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 0
1783   %4 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 1
1784   %5 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 2
1785   %6 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 3
1786   %7 = insertvalue %struct.int64x1x4_t undef, <1 x i64> %3, 0, 0
1787   %8 = insertvalue %struct.int64x1x4_t %7, <1 x i64> %4, 0, 1
1788   %9 = insertvalue %struct.int64x1x4_t %8, <1 x i64> %5, 0, 2
1789   %10 = insertvalue %struct.int64x1x4_t %9, <1 x i64> %6, 0, 3
1790   ret %struct.int64x1x4_t %10
1791 }
1792
1793 define %struct.float32x2x4_t @test_vld1_f32_x4(float* %a)  {
1794 ; CHECK-LABEL: test_vld1_f32_x4
1795 ; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
1796 ; vld1x4 of <2 x float> must select one ld1; all four registers and the [base] operand are matched together above.
1797   %1 = bitcast float* %a to i8*
1798   %2 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x4.v2f32(i8* %1, i32 4)
1799   %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 0
1800   %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 1
1801   %5 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 2
1802   %6 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 3
1803   %7 = insertvalue %struct.float32x2x4_t undef, <2 x float> %3, 0, 0
1804   %8 = insertvalue %struct.float32x2x4_t %7, <2 x float> %4, 0, 1
1805   %9 = insertvalue %struct.float32x2x4_t %8, <2 x float> %5, 0, 2
1806   %10 = insertvalue %struct.float32x2x4_t %9, <2 x float> %6, 0, 3
1807   ret %struct.float32x2x4_t %10
1808 }
1809
1810
1811 define %struct.float64x1x4_t @test_vld1_f64_x4(double* %a)  {
1812 ; CHECK-LABEL: test_vld1_f64_x4
1813 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
1814 ; vld1x4 of <1 x double> must select one ld1; all four registers and the [base] operand are matched together above.
1815   %1 = bitcast double* %a to i8*
1816   %2 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x4.v1f64(i8* %1, i32 8)
1817   %3 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 0
1818   %4 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 1
1819   %5 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 2
1820   %6 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 3
1821   %7 = insertvalue %struct.float64x1x4_t undef, <1 x double> %3, 0, 0
1822   %8 = insertvalue %struct.float64x1x4_t %7, <1 x double> %4, 0, 1
1823   %9 = insertvalue %struct.float64x1x4_t %8, <1 x double> %5, 0, 2
1824   %10 = insertvalue %struct.float64x1x4_t %9, <1 x double> %6, 0, 3
1825   ret %struct.float64x1x4_t %10
1826 }
1827
1828 define void @test_vst1q_s8_x2(i8* %a, [2 x <16 x i8>] %b)  {
1829 ; CHECK-LABEL: test_vst1q_s8_x2
1830 ; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
1831   %vec0 = extractvalue [2 x <16 x i8>] %b, 0 ; first vector of the pair
1832   %vec1 = extractvalue [2 x <16 x i8>] %b, 1 ; second vector of the pair
1833   tail call void @llvm.aarch64.neon.vst1x2.v16i8(i8* %a, <16 x i8> %vec0, <16 x i8> %vec1, i32 1)
1834   ret void
1835 }
1836
1837 define void @test_vst1q_s16_x2(i16* %a, [2 x <8 x i16>] %b)  {
1838 ; CHECK-LABEL: test_vst1q_s16_x2
1839 ; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
1840   %vec0 = extractvalue [2 x <8 x i16>] %b, 0 ; first vector of the pair
1841   %vec1 = extractvalue [2 x <8 x i16>] %b, 1 ; second vector of the pair
1842   %base = bitcast i16* %a to i8* ; i8* view of %a handed to the intrinsic
1843   tail call void @llvm.aarch64.neon.vst1x2.v8i16(i8* %base, <8 x i16> %vec0, <8 x i16> %vec1, i32 2)
1844   ret void
1845 }
1846
1847 define void @test_vst1q_s32_x2(i32* %a, [2 x <4 x i32>] %b)  {
1848 ; CHECK-LABEL: test_vst1q_s32_x2
1849 ; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
1850   %vec0 = extractvalue [2 x <4 x i32>] %b, 0 ; first vector of the pair
1851   %vec1 = extractvalue [2 x <4 x i32>] %b, 1 ; second vector of the pair
1852   %base = bitcast i32* %a to i8* ; i8* view of %a handed to the intrinsic
1853   tail call void @llvm.aarch64.neon.vst1x2.v4i32(i8* %base, <4 x i32> %vec0, <4 x i32> %vec1, i32 4)
1854   ret void
1855 }
1856
1857 define void @test_vst1q_s64_x2(i64* %a, [2 x <2 x i64>] %b)  {
1858 ; CHECK-LABEL: test_vst1q_s64_x2
1859 ; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
1860   %vec0 = extractvalue [2 x <2 x i64>] %b, 0 ; first vector of the pair
1861   %vec1 = extractvalue [2 x <2 x i64>] %b, 1 ; second vector of the pair
1862   %base = bitcast i64* %a to i8* ; i8* view of %a handed to the intrinsic
1863   tail call void @llvm.aarch64.neon.vst1x2.v2i64(i8* %base, <2 x i64> %vec0, <2 x i64> %vec1, i32 8)
1864   ret void
1865 }
1866
1867 define void @test_vst1q_f32_x2(float* %a, [2 x <4 x float>] %b)  {
1868 ; CHECK-LABEL: test_vst1q_f32_x2
1869 ; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
1870   %vec0 = extractvalue [2 x <4 x float>] %b, 0 ; first vector of the pair
1871   %vec1 = extractvalue [2 x <4 x float>] %b, 1 ; second vector of the pair
1872   %base = bitcast float* %a to i8* ; i8* view of %a handed to the intrinsic
1873   tail call void @llvm.aarch64.neon.vst1x2.v4f32(i8* %base, <4 x float> %vec0, <4 x float> %vec1, i32 4)
1874   ret void
1875 }
1876
1877
1878 define void @test_vst1q_f64_x2(double* %a, [2 x <2 x double>] %b)  {
1879 ; CHECK-LABEL: test_vst1q_f64_x2
1880 ; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
1881   %vec0 = extractvalue [2 x <2 x double>] %b, 0 ; first vector of the pair
1882   %vec1 = extractvalue [2 x <2 x double>] %b, 1 ; second vector of the pair
1883   %base = bitcast double* %a to i8* ; i8* view of %a handed to the intrinsic
1884   tail call void @llvm.aarch64.neon.vst1x2.v2f64(i8* %base, <2 x double> %vec0, <2 x double> %vec1, i32 8)
1885   ret void
1886 }
1887
1888 define void @test_vst1_s8_x2(i8* %a, [2 x <8 x i8>] %b)  {
1889 ; CHECK-LABEL: test_vst1_s8_x2
1890 ; CHECK: st1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
1891   %vec0 = extractvalue [2 x <8 x i8>] %b, 0 ; first vector of the pair
1892   %vec1 = extractvalue [2 x <8 x i8>] %b, 1 ; second vector of the pair
1893   tail call void @llvm.aarch64.neon.vst1x2.v8i8(i8* %a, <8 x i8> %vec0, <8 x i8> %vec1, i32 1)
1894   ret void
1895 }
1896
1897 define void @test_vst1_s16_x2(i16* %a, [2 x <4 x i16>] %b)  {
1898 ; CHECK-LABEL: test_vst1_s16_x2
1899 ; CHECK: st1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
1900   %vec0 = extractvalue [2 x <4 x i16>] %b, 0 ; first vector of the pair
1901   %vec1 = extractvalue [2 x <4 x i16>] %b, 1 ; second vector of the pair
1902   %base = bitcast i16* %a to i8* ; i8* view of %a handed to the intrinsic
1903   tail call void @llvm.aarch64.neon.vst1x2.v4i16(i8* %base, <4 x i16> %vec0, <4 x i16> %vec1, i32 2)
1904   ret void
1905 }
1906
1907 define void @test_vst1_s32_x2(i32* %a, [2 x <2 x i32>] %b)  {
1908 ; CHECK-LABEL: test_vst1_s32_x2
1909 ; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
1910   %vec0 = extractvalue [2 x <2 x i32>] %b, 0 ; first vector of the pair
1911   %vec1 = extractvalue [2 x <2 x i32>] %b, 1 ; second vector of the pair
1912   %base = bitcast i32* %a to i8* ; i8* view of %a handed to the intrinsic
1913   tail call void @llvm.aarch64.neon.vst1x2.v2i32(i8* %base, <2 x i32> %vec0, <2 x i32> %vec1, i32 4)
1914   ret void
1915 }
1916
1917 define void @test_vst1_s64_x2(i64* %a, [2 x <1 x i64>] %b)  {
1918 ; CHECK-LABEL: test_vst1_s64_x2
1919 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
1920   %vec0 = extractvalue [2 x <1 x i64>] %b, 0 ; first vector of the pair
1921   %vec1 = extractvalue [2 x <1 x i64>] %b, 1 ; second vector of the pair
1922   %base = bitcast i64* %a to i8* ; i8* view of %a handed to the intrinsic
1923   tail call void @llvm.aarch64.neon.vst1x2.v1i64(i8* %base, <1 x i64> %vec0, <1 x i64> %vec1, i32 8)
1924   ret void
1925 }
1926
1927 define void @test_vst1_f32_x2(float* %a, [2 x <2 x float>] %b)  {
1928 ; CHECK-LABEL: test_vst1_f32_x2
1929 ; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
1930   %vec0 = extractvalue [2 x <2 x float>] %b, 0 ; first vector of the pair
1931   %vec1 = extractvalue [2 x <2 x float>] %b, 1 ; second vector of the pair
1932   %base = bitcast float* %a to i8* ; i8* view of %a handed to the intrinsic
1933   tail call void @llvm.aarch64.neon.vst1x2.v2f32(i8* %base, <2 x float> %vec0, <2 x float> %vec1, i32 4)
1934   ret void
1935 }
1936
1937 define void @test_vst1_f64_x2(double* %a, [2 x <1 x double>] %b)  {
1938 ; CHECK-LABEL: test_vst1_f64_x2
1939 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
1940   %vec0 = extractvalue [2 x <1 x double>] %b, 0 ; first vector of the pair
1941   %vec1 = extractvalue [2 x <1 x double>] %b, 1 ; second vector of the pair
1942   %base = bitcast double* %a to i8* ; i8* view of %a handed to the intrinsic
1943   tail call void @llvm.aarch64.neon.vst1x2.v1f64(i8* %base, <1 x double> %vec0, <1 x double> %vec1, i32 8)
1944   ret void
1945 }
1946
1947 define void @test_vst1q_s8_x3(i8* %a, [3 x <16 x i8>] %b)  {
1948 ; CHECK-LABEL: test_vst1q_s8_x3
1949 ; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
1950 ; vst1x3 of <16 x i8> must select one st1; its register list and [base] operand are matched together above.
1951   %1 = extractvalue [3 x <16 x i8>] %b, 0
1952   %2 = extractvalue [3 x <16 x i8>] %b, 1
1953   %3 = extractvalue [3 x <16 x i8>] %b, 2
1954   tail call void @llvm.aarch64.neon.vst1x3.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3, i32 1)
1955   ret void
1956 }
1957
1958 define void @test_vst1q_s16_x3(i16* %a, [3 x <8 x i16>] %b)  {
1959 ; CHECK-LABEL: test_vst1q_s16_x3
1960 ; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
1961 ; vst1x3 of <8 x i16> must select one st1; its register list and [base] operand are matched together above.
1962   %1 = extractvalue [3 x <8 x i16>] %b, 0
1963   %2 = extractvalue [3 x <8 x i16>] %b, 1
1964   %3 = extractvalue [3 x <8 x i16>] %b, 2
1965   %4 = bitcast i16* %a to i8*
1966   tail call void @llvm.aarch64.neon.vst1x3.v8i16(i8* %4, <8 x i16> %1, <8 x i16> %2, <8 x i16> %3, i32 2)
1967   ret void
1968 }
1969
1970 define void @test_vst1q_s32_x3(i32* %a, [3 x <4 x i32>] %b)  {
1971 ; CHECK-LABEL: test_vst1q_s32_x3
1972 ; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
1973 ; vst1x3 of <4 x i32> must select one st1; its register list and [base] operand are matched together above.
1974   %1 = extractvalue [3 x <4 x i32>] %b, 0
1975   %2 = extractvalue [3 x <4 x i32>] %b, 1
1976   %3 = extractvalue [3 x <4 x i32>] %b, 2
1977   %4 = bitcast i32* %a to i8*
1978   tail call void @llvm.aarch64.neon.vst1x3.v4i32(i8* %4, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, i32 4)
1979   ret void
1980 }
1981
1982 define void @test_vst1q_s64_x3(i64* %a, [3 x <2 x i64>] %b)  {
1983 ; CHECK-LABEL: test_vst1q_s64_x3
1984 ; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
1985 ; vst1x3 of <2 x i64> must select one st1; its register list and [base] operand are matched together above.
1986   %1 = extractvalue [3 x <2 x i64>] %b, 0
1987   %2 = extractvalue [3 x <2 x i64>] %b, 1
1988   %3 = extractvalue [3 x <2 x i64>] %b, 2
1989   %4 = bitcast i64* %a to i8*
1990   tail call void @llvm.aarch64.neon.vst1x3.v2i64(i8* %4, <2 x i64> %1, <2 x i64> %2, <2 x i64> %3, i32 8)
1991   ret void
1992 }
1993
1994 define void @test_vst1q_f32_x3(float* %a, [3 x <4 x float>] %b)  {
1995 ; CHECK-LABEL: test_vst1q_f32_x3
1996 ; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
1997 ; vst1x3 of <4 x float> must select one st1; its register list and [base] operand are matched together above.
1998   %1 = extractvalue [3 x <4 x float>] %b, 0
1999   %2 = extractvalue [3 x <4 x float>] %b, 1
2000   %3 = extractvalue [3 x <4 x float>] %b, 2
2001   %4 = bitcast float* %a to i8*
2002   tail call void @llvm.aarch64.neon.vst1x3.v4f32(i8* %4, <4 x float> %1, <4 x float> %2, <4 x float> %3, i32 4)
2003   ret void
2004 }
2005
2006 define void @test_vst1q_f64_x3(double* %a, [3 x <2 x double>] %b)  {
2007 ; CHECK-LABEL: test_vst1q_f64_x3
2008 ; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
2009 ; vst1x3 of <2 x double> must select one st1; its register list and [base] operand are matched together above.
2010   %1 = extractvalue [3 x <2 x double>] %b, 0
2011   %2 = extractvalue [3 x <2 x double>] %b, 1
2012   %3 = extractvalue [3 x <2 x double>] %b, 2
2013   %4 = bitcast double* %a to i8*
2014   tail call void @llvm.aarch64.neon.vst1x3.v2f64(i8* %4, <2 x double> %1, <2 x double> %2, <2 x double> %3, i32 8)
2015   ret void
2016 }
2017
2018 define void @test_vst1_s8_x3(i8* %a, [3 x <8 x i8>] %b)  {
2019 ; CHECK-LABEL: test_vst1_s8_x3
2020 ; CHECK: st1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
2021 ; vst1x3 of <8 x i8> must select one st1; its register list and [base] operand are matched together above.
2022   %1 = extractvalue [3 x <8 x i8>] %b, 0
2023   %2 = extractvalue [3 x <8 x i8>] %b, 1
2024   %3 = extractvalue [3 x <8 x i8>] %b, 2
2025   tail call void @llvm.aarch64.neon.vst1x3.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, i32 1)
2026   ret void
2027 }
2028
2029 define void @test_vst1_s16_x3(i16* %a, [3 x <4 x i16>] %b)  {
2030 ; CHECK-LABEL: test_vst1_s16_x3
2031 ; CHECK: st1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
2032 ; vst1x3 of <4 x i16> must select one st1; its register list and [base] operand are matched together above.
2033   %1 = extractvalue [3 x <4 x i16>] %b, 0
2034   %2 = extractvalue [3 x <4 x i16>] %b, 1
2035   %3 = extractvalue [3 x <4 x i16>] %b, 2
2036   %4 = bitcast i16* %a to i8*
2037   tail call void @llvm.aarch64.neon.vst1x3.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 2)
2038   ret void
2039 }
2040
2041 define void @test_vst1_s32_x3(i32* %a, [3 x <2 x i32>] %b)  {
2042 ; CHECK-LABEL: test_vst1_s32_x3
2043 ; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
2044 ; vst1x3 of <2 x i32> must select one st1; its register list and [base] operand are matched together above.
2045   %1 = extractvalue [3 x <2 x i32>] %b, 0
2046   %2 = extractvalue [3 x <2 x i32>] %b, 1
2047   %3 = extractvalue [3 x <2 x i32>] %b, 2
2048   %4 = bitcast i32* %a to i8*
2049   tail call void @llvm.aarch64.neon.vst1x3.v2i32(i8* %4, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, i32 4)
2050   ret void
2051 }
2052
2053 define void @test_vst1_s64_x3(i64* %a, [3 x <1 x i64>] %b)  {
2054 ; CHECK-LABEL: test_vst1_s64_x3
2055 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
2056 ; vst1x3 of <1 x i64> must select one st1; its register list and [base] operand are matched together above.
2057   %1 = extractvalue [3 x <1 x i64>] %b, 0
2058   %2 = extractvalue [3 x <1 x i64>] %b, 1
2059   %3 = extractvalue [3 x <1 x i64>] %b, 2
2060   %4 = bitcast i64* %a to i8*
2061   tail call void @llvm.aarch64.neon.vst1x3.v1i64(i8* %4, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, i32 8)
2062   ret void
2063 }
2064
2065 define void @test_vst1_f32_x3(float* %a, [3 x <2 x float>] %b)  {
2066 ; CHECK-LABEL: test_vst1_f32_x3
2067 ; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
2068 ; vst1x3 of <2 x float> must select one st1; its register list and [base] operand are matched together above.
2069   %1 = extractvalue [3 x <2 x float>] %b, 0
2070   %2 = extractvalue [3 x <2 x float>] %b, 1
2071   %3 = extractvalue [3 x <2 x float>] %b, 2
2072   %4 = bitcast float* %a to i8*
2073   tail call void @llvm.aarch64.neon.vst1x3.v2f32(i8* %4, <2 x float> %1, <2 x float> %2, <2 x float> %3, i32 4)
2074   ret void
2075 }
2076
2077 define void @test_vst1_f64_x3(double* %a, [3 x <1 x double>] %b)  {
2078 ; CHECK-LABEL: test_vst1_f64_x3
2079 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
2080 ; vst1x3 of <1 x double> must select one st1; its register list and [base] operand are matched together above.
2081   %1 = extractvalue [3 x <1 x double>] %b, 0
2082   %2 = extractvalue [3 x <1 x double>] %b, 1
2083   %3 = extractvalue [3 x <1 x double>] %b, 2
2084   %4 = bitcast double* %a to i8*
2085   tail call void @llvm.aarch64.neon.vst1x3.v1f64(i8* %4, <1 x double> %1, <1 x double> %2, <1 x double> %3, i32 8)
2086   ret void
2087 }
2088
2089 define void @test_vst1q_s8_x4(i8* %a, [4 x <16 x i8>] %b)  {
2090 ; CHECK-LABEL: test_vst1q_s8_x4
2091 ; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
2092 ; vst1x4 of <16 x i8> must select one st1; all four registers and the [base] operand are matched together above.
2093   %1 = extractvalue [4 x <16 x i8>] %b, 0
2094   %2 = extractvalue [4 x <16 x i8>] %b, 1
2095   %3 = extractvalue [4 x <16 x i8>] %b, 2
2096   %4 = extractvalue [4 x <16 x i8>] %b, 3
2097   tail call void @llvm.aarch64.neon.vst1x4.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3, <16 x i8> %4, i32 1)
2098   ret void
2099 }
2100
2101 define void @test_vst1q_s16_x4(i16* %a, [4 x <8 x i16>] %b)  {
2102 ; CHECK-LABEL: test_vst1q_s16_x4
2103 ; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
2104 ; vst1x4 of <8 x i16> must select one st1; all four registers and the [base] operand are matched together above.
2105   %1 = extractvalue [4 x <8 x i16>] %b, 0
2106   %2 = extractvalue [4 x <8 x i16>] %b, 1
2107   %3 = extractvalue [4 x <8 x i16>] %b, 2
2108   %4 = extractvalue [4 x <8 x i16>] %b, 3
2109   %5 = bitcast i16* %a to i8*
2110   tail call void @llvm.aarch64.neon.vst1x4.v8i16(i8* %5, <8 x i16> %1, <8 x i16> %2, <8 x i16> %3, <8 x i16> %4, i32 2)
2111   ret void
2112 }
2113
2114 define void @test_vst1q_s32_x4(i32* %a, [4 x <4 x i32>] %b)  {
2115 ; CHECK-LABEL: test_vst1q_s32_x4
2116 ; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
2117 ; vst1x4 of <4 x i32> must select one st1; all four registers and the [base] operand are matched together above.
2118   %1 = extractvalue [4 x <4 x i32>] %b, 0
2119   %2 = extractvalue [4 x <4 x i32>] %b, 1
2120   %3 = extractvalue [4 x <4 x i32>] %b, 2
2121   %4 = extractvalue [4 x <4 x i32>] %b, 3
2122   %5 = bitcast i32* %a to i8*
2123   tail call void @llvm.aarch64.neon.vst1x4.v4i32(i8* %5, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, i32 4)
2124   ret void
2125 }
2126
2127 define void @test_vst1q_s64_x4(i64* %a, [4 x <2 x i64>] %b)  {
2128 ; CHECK-LABEL: test_vst1q_s64_x4
2129 ; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
2130 ; vst1x4 of <2 x i64> must select one st1; all four registers and the [base] operand are matched together above.
2131   %1 = extractvalue [4 x <2 x i64>] %b, 0
2132   %2 = extractvalue [4 x <2 x i64>] %b, 1
2133   %3 = extractvalue [4 x <2 x i64>] %b, 2
2134   %4 = extractvalue [4 x <2 x i64>] %b, 3
2135   %5 = bitcast i64* %a to i8*
2136   tail call void @llvm.aarch64.neon.vst1x4.v2i64(i8* %5, <2 x i64> %1, <2 x i64> %2, <2 x i64> %3, <2 x i64> %4, i32 8)
2137   ret void
2138 }
2139
2140 define void @test_vst1q_f32_x4(float* %a, [4 x <4 x float>] %b)  {
2141 ; CHECK-LABEL: test_vst1q_f32_x4
2142 ; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
2143 ; vst1x4 of <4 x float> must select one st1; all four registers and the [base] operand are matched together above.
2144   %1 = extractvalue [4 x <4 x float>] %b, 0
2145   %2 = extractvalue [4 x <4 x float>] %b, 1
2146   %3 = extractvalue [4 x <4 x float>] %b, 2
2147   %4 = extractvalue [4 x <4 x float>] %b, 3
2148   %5 = bitcast float* %a to i8*
2149   tail call void @llvm.aarch64.neon.vst1x4.v4f32(i8* %5, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, i32 4)
2150   ret void
2151 }
2152
2153 define void @test_vst1q_f64_x4(double* %a, [4 x <2 x double>] %b)  {
2154 ; CHECK-LABEL: test_vst1q_f64_x4
2155 ; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
2156 ; vst1x4 of <2 x double> must select one st1; all four registers and the [base] operand are matched together above.
2157   %1 = extractvalue [4 x <2 x double>] %b, 0
2158   %2 = extractvalue [4 x <2 x double>] %b, 1
2159   %3 = extractvalue [4 x <2 x double>] %b, 2
2160   %4 = extractvalue [4 x <2 x double>] %b, 3
2161   %5 = bitcast double* %a to i8*
2162   tail call void @llvm.aarch64.neon.vst1x4.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 8)
2163   ret void
2164 }
2165
2166 define void @test_vst1_s8_x4(i8* %a, [4 x <8 x i8>] %b)  {
2167 ; CHECK-LABEL: test_vst1_s8_x4
2168 ; CHECK: st1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
2169 ; vst1x4 of <8 x i8> must select one st1; all four registers and the [base] operand are matched together above.
2170   %1 = extractvalue [4 x <8 x i8>] %b, 0
2171   %2 = extractvalue [4 x <8 x i8>] %b, 1
2172   %3 = extractvalue [4 x <8 x i8>] %b, 2
2173   %4 = extractvalue [4 x <8 x i8>] %b, 3
2174   tail call void @llvm.aarch64.neon.vst1x4.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, <8 x i8> %4, i32 1)
2175   ret void
2176 }
2177
2178 define void @test_vst1_s16_x4(i16* %a, [4 x <4 x i16>] %b)  {
2179 ; CHECK-LABEL: test_vst1_s16_x4
2180 ; CHECK: st1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
2181 ; vst1x4 of <4 x i16> must select one st1; all four registers and the [base] operand are matched together above.
2182   %1 = extractvalue [4 x <4 x i16>] %b, 0
2183   %2 = extractvalue [4 x <4 x i16>] %b, 1
2184   %3 = extractvalue [4 x <4 x i16>] %b, 2
2185   %4 = extractvalue [4 x <4 x i16>] %b, 3
2186   %5 = bitcast i16* %a to i8*
2187   tail call void @llvm.aarch64.neon.vst1x4.v4i16(i8* %5, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, <4 x i16> %4, i32 2)
2188   ret void
2189 }
2190
2191 define void @test_vst1_s32_x4(i32* %a, [4 x <2 x i32>] %b)  {
2192 ; CHECK-LABEL: test_vst1_s32_x4
2193 ; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
2194 ; vst1x4 of <2 x i32> must select one st1; all four registers and the [base] operand are matched together above.
2195   %1 = extractvalue [4 x <2 x i32>] %b, 0
2196   %2 = extractvalue [4 x <2 x i32>] %b, 1
2197   %3 = extractvalue [4 x <2 x i32>] %b, 2
2198   %4 = extractvalue [4 x <2 x i32>] %b, 3
2199   %5 = bitcast i32* %a to i8*
2200   tail call void @llvm.aarch64.neon.vst1x4.v2i32(i8* %5, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, <2 x i32> %4, i32 4)
2201   ret void
2202 }
2203
2204 define void @test_vst1_s64_x4(i64* %a, [4 x <1 x i64>] %b)  {
2205 ; CHECK-LABEL: test_vst1_s64_x4
2206 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
2207 ; vst1x4 of <1 x i64> must select one st1; all four registers and the [base] operand are matched together above.
2208   %1 = extractvalue [4 x <1 x i64>] %b, 0
2209   %2 = extractvalue [4 x <1 x i64>] %b, 1
2210   %3 = extractvalue [4 x <1 x i64>] %b, 2
2211   %4 = extractvalue [4 x <1 x i64>] %b, 3
2212   %5 = bitcast i64* %a to i8*
2213   tail call void @llvm.aarch64.neon.vst1x4.v1i64(i8* %5, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, <1 x i64> %4, i32 8)
2214   ret void
2215 }
2216
2217 define void @test_vst1_f32_x4(float* %a, [4 x <2 x float>] %b)  {
2218 ; CHECK-LABEL: test_vst1_f32_x4
2219 ; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
2220 ; vst1x4 of <2 x float> must select one st1; all four registers and the [base] operand are matched together above.
2221   %1 = extractvalue [4 x <2 x float>] %b, 0
2222   %2 = extractvalue [4 x <2 x float>] %b, 1
2223   %3 = extractvalue [4 x <2 x float>] %b, 2
2224   %4 = extractvalue [4 x <2 x float>] %b, 3
2225   %5 = bitcast float* %a to i8*
2226   tail call void @llvm.aarch64.neon.vst1x4.v2f32(i8* %5, <2 x float> %1, <2 x float> %2, <2 x float> %3, <2 x float> %4, i32 4)
2227   ret void
2228 }
2229
2230 define void @test_vst1_f64_x4(double* %a, [4 x <1 x double>] %b)  {
2231 ; CHECK-LABEL: test_vst1_f64_x4
2232 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
2233 ; vst1x4 of <1 x double> must select one st1; all four registers and the [base] operand are matched together above.
2234   %1 = extractvalue [4 x <1 x double>] %b, 0
2235   %2 = extractvalue [4 x <1 x double>] %b, 1
2236   %3 = extractvalue [4 x <1 x double>] %b, 2
2237   %4 = extractvalue [4 x <1 x double>] %b, 3
2238   %5 = bitcast double* %a to i8*
2239   tail call void @llvm.aarch64.neon.vst1x4.v1f64(i8* %5, <1 x double> %1, <1 x double> %2, <1 x double> %3, <1 x double> %4, i32 8)
2240   ret void
2241 }
2242
2243 declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8*, i32)
2244 declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8*, i32)
2245 declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x2.v4i32(i8*, i32)
2246 declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x2.v2i64(i8*, i32)
2247 declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x2.v4f32(i8*, i32)
2248 declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x2.v2f64(i8*, i32)
2249 declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x2.v8i8(i8*, i32)
2250 declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x2.v4i16(i8*, i32)
2251 declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x2.v2i32(i8*, i32)
2252 declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x2.v1i64(i8*, i32)
2253 declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x2.v2f32(i8*, i32)
2254 declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x2.v1f64(i8*, i32)
2255 declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x3.v16i8(i8*, i32)
2256 declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8*, i32)
2257 declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x3.v4i32(i8*, i32)
2258 declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8*, i32)
2259 declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x3.v4f32(i8*, i32)
2260 declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x3.v2f64(i8*, i32)
2261 declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x3.v8i8(i8*, i32)
2262 declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x3.v4i16(i8*, i32)
2263 declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x3.v2i32(i8*, i32)
2264 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x3.v1i64(i8*, i32)
2265 declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x3.v2f32(i8*, i32)
2266 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x3.v1f64(i8*, i32)
; llvm.aarch64.neon.vld1x4.* declarations, one per vector type: six 128-bit
; types (v16i8 .. v2f64) followed by six 64-bit types (v8i8 .. v1f64).
; Each takes (i8*, i32) and returns a struct of four vectors of one type; the
; name suffix matches that vector type.
; NOTE(review): the i8* is presumably the base address and the trailing i32 an
; alignment operand -- confirm against the intrinsic definition.
2267 declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x4.v16i8(i8*, i32)
2268 declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x4.v8i16(i8*, i32)
2269 declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x4.v4i32(i8*, i32)
2270 declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x4.v2i64(i8*, i32)
2271 declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8*, i32)
2272 declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x4.v2f64(i8*, i32)
2273 declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8*, i32)
2274 declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x4.v4i16(i8*, i32)
2275 declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x4.v2i32(i8*, i32)
2276 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x4.v1i64(i8*, i32)
2277 declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x4.v2f32(i8*, i32)
2278 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x4.v1f64(i8*, i32)
; llvm.aarch64.neon.vst1x2.* declarations, one per vector type: six 128-bit
; types (v16i8 .. v2f64) followed by six 64-bit types (v8i8 .. v1f64).
; Each returns void and takes (i8*, two vectors of one type, i32); the name
; suffix matches that vector type.
; NOTE(review): the i8* is presumably the destination address and the trailing
; i32 an alignment operand -- confirm against the intrinsic definition.
2279 declare void @llvm.aarch64.neon.vst1x2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
2280 declare void @llvm.aarch64.neon.vst1x2.v8i16(i8*, <8 x i16>, <8 x i16>, i32)
2281 declare void @llvm.aarch64.neon.vst1x2.v4i32(i8*, <4 x i32>, <4 x i32>, i32)
2282 declare void @llvm.aarch64.neon.vst1x2.v2i64(i8*, <2 x i64>, <2 x i64>, i32)
2283 declare void @llvm.aarch64.neon.vst1x2.v4f32(i8*, <4 x float>, <4 x float>, i32)
2284 declare void @llvm.aarch64.neon.vst1x2.v2f64(i8*, <2 x double>, <2 x double>, i32)
2285 declare void @llvm.aarch64.neon.vst1x2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
2286 declare void @llvm.aarch64.neon.vst1x2.v4i16(i8*, <4 x i16>, <4 x i16>, i32)
2287 declare void @llvm.aarch64.neon.vst1x2.v2i32(i8*, <2 x i32>, <2 x i32>, i32)
2288 declare void @llvm.aarch64.neon.vst1x2.v1i64(i8*, <1 x i64>, <1 x i64>, i32)
2289 declare void @llvm.aarch64.neon.vst1x2.v2f32(i8*, <2 x float>, <2 x float>, i32)
2290 declare void @llvm.aarch64.neon.vst1x2.v1f64(i8*, <1 x double>, <1 x double>, i32)
; llvm.aarch64.neon.vst1x3.* declarations, one per vector type: six 128-bit
; types (v16i8 .. v2f64) followed by six 64-bit types (v8i8 .. v1f64).
; Each returns void and takes (i8*, three vectors of one type, i32); the name
; suffix matches that vector type.
; NOTE(review): the i8* is presumably the destination address and the trailing
; i32 an alignment operand -- confirm against the intrinsic definition.
2291 declare void @llvm.aarch64.neon.vst1x3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32)
2292 declare void @llvm.aarch64.neon.vst1x3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32)
2293 declare void @llvm.aarch64.neon.vst1x3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32)
2294 declare void @llvm.aarch64.neon.vst1x3.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32)
2295 declare void @llvm.aarch64.neon.vst1x3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32)
2296 declare void @llvm.aarch64.neon.vst1x3.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32)
2297 declare void @llvm.aarch64.neon.vst1x3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
2298 declare void @llvm.aarch64.neon.vst1x3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32)
2299 declare void @llvm.aarch64.neon.vst1x3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
2300 declare void @llvm.aarch64.neon.vst1x3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32)
2301 declare void @llvm.aarch64.neon.vst1x3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32)
2302 declare void @llvm.aarch64.neon.vst1x3.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32)
; llvm.aarch64.neon.vst1x4.* declarations, one per vector type: six 128-bit
; types (v16i8 .. v2f64) followed by six 64-bit types (v8i8 .. v1f64).
; Each returns void and takes (i8*, four vectors of one type, i32); the name
; suffix matches that vector type.
; NOTE(review): the i8* is presumably the destination address and the trailing
; i32 an alignment operand -- confirm against the intrinsic definition.
2303 declare void @llvm.aarch64.neon.vst1x4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)
2304 declare void @llvm.aarch64.neon.vst1x4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32)
2305 declare void @llvm.aarch64.neon.vst1x4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32)
2306 declare void @llvm.aarch64.neon.vst1x4.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32)
2307 declare void @llvm.aarch64.neon.vst1x4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32)
2308 declare void @llvm.aarch64.neon.vst1x4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32)
2309 declare void @llvm.aarch64.neon.vst1x4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
2310 declare void @llvm.aarch64.neon.vst1x4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32)
2311 declare void @llvm.aarch64.neon.vst1x4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32)
2312 declare void @llvm.aarch64.neon.vst1x4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32)
2313 declare void @llvm.aarch64.neon.vst1x4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32)
2314 declare void @llvm.aarch64.neon.vst1x4.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32)