]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/llvm/lib/Target/ARM/ARMScheduleA57.td
dts: Update our copy to Linux 4.17
[FreeBSD/FreeBSD.git] / contrib / llvm / lib / Target / ARM / ARMScheduleA57.td
1 //=- ARMScheduleA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the machine model for ARM Cortex-A57 to support
11 // instruction scheduling and other instruction cost heuristics.
12 //
13 //===----------------------------------------------------------------------===//
14
15 //===----------------------------------------------------------------------===//
16 // *** Common description and scheduling model parameters taken from AArch64 ***
17 // The Cortex-A57 is a traditional superscalar microprocessor with a
18 // conservative 3-wide in-order stage for decode and dispatch. Combined with the
19 // much wider out-of-order issue stage, this produced a need to carefully
20 // schedule micro-ops so that all three decoded each cycle are successfully
21 // issued as the reservation station(s) simply don't stay occupied for long.
22 // Therefore, IssueWidth is set to the narrower of the two at three, while still
23 // modeling the machine as out-of-order.
24
25 def IsCPSRDefinedPred : SchedPredicate<[{TII->isCPSRDefined(*MI)}]>; // "setflags" forms
26 def IsCPSRDefinedAndPredicatedPred : // setflags and predicated at the same time
27   SchedPredicate<[{TII->isCPSRDefined(*MI) && TII->isPredicated(*MI)}]>;
28
29 // Cortex-A57 rev. r1p0 or later (constant false here = the model describes r0px)
30 def IsR1P0AndLaterPred : SchedPredicate<[{false}]>;
31
32 // True if Addrmode3 contains a register offset (i.e. not an immediate)
33 def IsLdrAm3RegOffPred :
34   SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 1)}]>;
35 // The same predicate with operand offset 2 and 3 (X2 / X3 name suffix):
36 def IsLdrAm3RegOffPredX2 :
37   SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 2)}]>;
38 def IsLdrAm3RegOffPredX3 :
39   SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 3)}]>;
40
41 // True if Addrmode3 contains "minus register"
42 def IsLdrAm3NegRegOffPred :
43   SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 1)}]>;
44 // The same predicate with operand offset 2 and 3 (X2 / X3 name suffix):
45 def IsLdrAm3NegRegOffPredX2 :
46   SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 2)}]>;
47 def IsLdrAm3NegRegOffPredX3 :
48   SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 3)}]>;
49
50 // Load/store, scaled register offset, not plus LSL2
51 def IsLdstsoScaledNotOptimalPredX0 : // X0 suffix: second argument is 0
52   SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 0)}]>;
53 def IsLdstsoScaledNotOptimalPred :   // no suffix: second argument is 1
54   SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 1)}]>;
55 def IsLdstsoScaledNotOptimalPredX2 : // X2 suffix: second argument is 2
56   SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 2)}]>;
57
58 // Load/store, scaled register offset (any scaling)
59 def IsLdstsoScaledPred :
60   SchedPredicate<[{TII->isLdstScaledReg(*MI, 1)}]>;
61 def IsLdstsoScaledPredX2 :
62   SchedPredicate<[{TII->isLdstScaledReg(*MI, 2)}]>;
63
64 def IsLdstsoMinusRegPredX0 : // "minus register" offset; X0/none/X2 suffixes as above
65   SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 0)}]>;
66 def IsLdstsoMinusRegPred :
67   SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 1)}]>;
68 def IsLdstsoMinusRegPredX2 :
69   SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 2)}]>;
70
71 // Load, scaled register offset (tested through TII->isAm2ScaledReg)
72 def IsLdrAm2ScaledPred :
73   SchedPredicate<[{TII->isAm2ScaledReg(*MI, 1)}]>;
74
75 // LDM, base reg in list
76 def IsLdmBaseRegInList :
77   SchedPredicate<[{TII->isLDMBaseRegInList(*MI)}]>;
78
79 class A57WriteLMOpsListType<list<SchedWriteRes> writes> { // holder for a list of writes
80   list <SchedWriteRes> Writes = writes; // sliced as Writes[a-b] by the LDM variants below
81   SchedMachineModel SchedModel = ?;     // left unset (?) in the class itself
82 }
83
84 // *** Common description and scheduling model parameters taken from AArch64 ***
85 // (AArch64SchedA57.td)
86 def CortexA57Model : SchedMachineModel {
87   let IssueWidth        =   3; // 3-way decode and dispatch
88   let MicroOpBufferSize = 128; // 128 micro-op re-order buffer
89   let LoadLatency       =   4; // Optimistic load latency
90   let MispredictPenalty =  16; // Fetch + Decode/Rename/Dispatch + Branch
91
92   // Enable partial & runtime unrolling.
93   let LoopMicroOpBufferSize = 16;
94   let CompleteModel = 1; // pseudos and special instrs are mapped to WriteNoop further down
95 }
96
97 //===----------------------------------------------------------------------===//
98 // Define each kind of processor resource and number available on Cortex-A57.
99 // Cortex-A57 has 8 pipelines that each has its own 8-entry queue where
100 // micro-ops wait for their operands and then issue out-of-order.
101
102 def A57UnitB : ProcResource<1>;  // Type B micro-ops
103 def A57UnitI : ProcResource<2>;  // Type I micro-ops (two units, I0/I1)
104 def A57UnitM : ProcResource<1>;  // Type M micro-ops
105 def A57UnitL : ProcResource<1>;  // Type L micro-ops
106 def A57UnitS : ProcResource<1>;  // Type S micro-ops
107
108 def A57UnitX : ProcResource<1>;  // Type X micro-ops (F1)
109 def A57UnitW : ProcResource<1>;  // Type W micro-ops (F0)
110
111 let SchedModel = CortexA57Model in {
112   def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>;    // Type V micro-ops (group of X and W, i.e. F0/F1)
113 }
114
115 let SchedModel = CortexA57Model in {
116
117 //===----------------------------------------------------------------------===//
118 // Define customized scheduler read/write types specific to the Cortex-A57.
119
120 include "ARMScheduleA57WriteRes.td"
121
122 // To satisfy "CompleteModel = 1": scheduling info for pseudos and special instructions
123 def : InstRW<[WriteNoop], (instregex "(t)?BKPT$", "(t2)?CDP(2)?$",
124   "(t2)?CLREX$", "CONSTPOOL_ENTRY$", "COPY_STRUCT_BYVAL_I32$",
125   "(t2)?CPS[123]p$", "(t2)?DBG$", "(t2)?DMB$", "(t2)?DSB$", "ERET$",
126   "(t2|t)?HINT$", "(t)?HLT$", "(t2)?HVC$", "(t2)?ISB$", "ITasm$",
127   "(t2)?RFE(DA|DB|IA|IB)", "(t)?SETEND", "(t2)?SETPAN", "(t2)?SMC", "SPACE",
128   "(t2)?SRS(DA|DB|IA|IB)", "SWP(B)?", "t?TRAP", "UDF$", "t2DCPS", "t2SG",
129   "t2TT", "tCPS", "CMP_SWAP", "t?SVC", "t2IT", "CompilerBarrier")>;
130
131 def : InstRW<[WriteNoop], (instregex "VMRS", "VMSR", "FMSTAT")>;
132
133 // Specific memory instrs
134 def : InstRW<[WriteNoop, WriteNoop], (instregex "(t2)?LDA", "(t2)?LDC", "(t2)?STC",
135   "(t2)?STL", "(t2)?LDREX", "(t2)?STREX", "MEMCPY")>;
136
137 // coprocessor moves
138 def : InstRW<[WriteNoop, WriteNoop], (instregex
139   "(t2)?MCR(2|R|R2)?$", "(t2)?MRC(2)?$",
140   "(t2)?MRRC(2)?$", "(t2)?MRS(banked|sys|_AR|_M|sys_AR)?$",
141   "(t2)?MSR(banked|i|_AR|_M)?$")>;
142
143 // Deprecated instructions
144 def : InstRW<[WriteNoop], (instregex "FLDM", "FSTM")>;
145
146 // Pseudos
147 def : InstRW<[WriteNoop], (instregex "(t2)?ABS$",
148   "(t)?ADJCALLSTACKDOWN$", "(t)?ADJCALLSTACKUP$", "(t2|t)?Int_eh_sjlj",
149   "tLDRpci_pic", "t2SUBS_PC_LR",
150   "JUMPTABLE", "tInt_WIN_eh_sjlj_longjmp",
151   "VLD(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
152   "VLD(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
153   "VST(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
154   "VST(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
155   "WIN__CHKSTK", "WIN__DBZCHK")>;
156
157 // Miscellaneous
158 // -----------------------------------------------------------------------------
159
160 def : InstRW<[A57Write_1cyc_1I], (instrs COPY)>;
161
162 // --- 3.2 Branch Instructions ---
163 // B, BX, BL, BLX (imm, reg != LR, reg == LR), CBZ, CBNZ
164
165 def : InstRW<[A57Write_1cyc_1B], (instregex "(t2|t)?B$", "t?BX", "(t2|t)?Bcc$",
166   "t?TAILJMP(d|r)", "TCRETURN(d|r)i", "tBfar", "tCBN?Z")>;
167 def : InstRW<[A57Write_1cyc_1B_1I],
168   (instregex "t?BL$", "BL_pred$", "t?BLXi", "t?TPsoft")>;
169 def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BLX", "tBLX(NS)?r")>;
170 // Pseudos
171 def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BCCi64", "BCCZi64")>;
172 def : InstRW<[A57Write_3cyc_1B_1I], (instregex "BR_JTadd", "t?BR_JTr",
173   "t2BR_JT", "t2BXJ", "(t2)?TB(B|H)(_JT)?$", "tBRIND")>;
174 def : InstRW<[A57Write_6cyc_1B_1L], (instregex "BR_JTm")>;
175
176 // --- 3.3 Arithmetic and Logical Instructions ---
177 // ADD{S}, ADC{S}, ADR, AND{S}, BIC{S}, CMN, CMP, EOR{S}, ORN{S}, ORR{S},
178 // RSB{S}, RSC{S}, SUB{S}, SBC{S}, TEQ, TST
179
180 def : InstRW<[A57Write_1cyc_1I], (instregex "tADDframe")>;
181
182 // shift by register, conditional or unconditional
183 // TODO: according to the doc, the conditional form uses I0/I1 and the
184 // unconditional form uses M. Why would the more complex instruction use the
185 // simpler pipeline? This may be an error in the doc.
186 def A57WriteALUsi : SchedWriteVariant<[
187   // lsl #2, lsl #1, or lsr #1.
188   SchedVar<IsPredicatedPred, [A57Write_2cyc_1M]>,
189   SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>  // same write in both variants
190 ]>;
191 def A57WriteALUsr : SchedWriteVariant<[
192   SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>, // conditional: I0/I1
193   SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>  // unconditional: M
194 ]>;
195 def A57WriteALUSsr : SchedWriteVariant<[
196   SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>, // conditional: I0/I1
197   SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>  // unconditional: M
198 ]>;
199 def A57ReadALUsr : SchedReadVariant<[
200   SchedVar<IsPredicatedPred, [ReadDefault]>,
201   SchedVar<NoSchedPred,      [ReadDefault]>       // same read in both variants
202 ]>;
203 def : SchedAlias<WriteALUsi,  A57WriteALUsi>;
204 def : SchedAlias<WriteALUsr,  A57WriteALUsr>;
205 def : SchedAlias<WriteALUSsr, A57WriteALUSsr>;
206 def : SchedAlias<ReadALUsr,   A57ReadALUsr>;
207
208 def A57WriteCMPsr : SchedWriteVariant<[
209   SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
210   SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>
211 ]>;
212 def : SchedAlias<WriteCMP,   A57Write_1cyc_1I>;
213 def : SchedAlias<WriteCMPsi, A57Write_2cyc_1M>;
214 def : SchedAlias<WriteCMPsr, A57WriteCMPsr>;
215
216 // --- 3.4 Move and Shift Instructions ---
217 // Move, basic
218 // MOV{S}, MOVW, MVN{S}
219 def : InstRW<[A57Write_1cyc_1I], (instregex "MOV(r|i|i16|r_TC)",
220   "(t2)?MVN(CC)?(r|i)", "BMOVPCB_CALL", "BMOVPCRX_CALL",
221   "MOVCC(r|i|i16|i32imm)", "tMOV", "tMVN")>;
222
223 // Move, shift by immed, setflags/no setflags
224 // (ASR, LSL, LSR, ROR, RRX)=MOVsi, MVN
225 // setflags = isCPSRDefined
226 def A57WriteMOVsi : SchedWriteVariant<[
227   SchedVar<IsCPSRDefinedPred,              [A57Write_2cyc_1M]>, // setflags
228   SchedVar<NoSchedPred,                    [A57Write_1cyc_1I]>  // no setflags
229 ]>;
230 def : InstRW<[A57WriteMOVsi], (instregex "MOV(CC)?si", "MVNsi",
231   "ASRi", "(t2|t)ASRri", "LSRi", "(t2|t)LSRri", "LSLi", "(t2|t)LSLri", "RORi",
232   "(t2|t)RORri", "(t2)?RRX", "t2MOV", "tROR")>;
233
234 // shift by register, conditional or unconditional, setflags/no setflags
235 def A57WriteMOVsr : SchedWriteVariant<[
236   SchedVar<IsCPSRDefinedAndPredicatedPred, [A57Write_2cyc_1I]>, // setflags + conditional
237   SchedVar<IsCPSRDefinedPred,              [A57Write_2cyc_1M]>, // setflags (listed after the combined case)
238   SchedVar<IsPredicatedPred,               [A57Write_2cyc_1I]>, // conditional
239   SchedVar<NoSchedPred,                    [A57Write_1cyc_1I]>  // none of the above
240 ]>;
241 def : InstRW<[A57WriteMOVsr], (instregex "MOV(CC)?sr", "MVNsr", "t2MVNs",
242   "ASRr", "(t2|t)ASRrr", "LSRr", "(t2|t)LSRrr", "LSLr", "(t2|t)?LSLrr", "RORr",
243   "(t2|t)RORrr")>;
244
245 // Move, top
246 // MOVT - A57Write_2cyc_1M for r0px, A57Write_1cyc_1I for r1p0 and later
247 def A57WriteMOVT : SchedWriteVariant<[
248   SchedVar<IsR1P0AndLaterPred,             [A57Write_1cyc_1I]>, // r1p0+ (predicate is constant false in this file)
249   SchedVar<NoSchedPred,                    [A57Write_2cyc_1M]>  // r0px
250 ]>;
251 def : InstRW<[A57WriteMOVT], (instregex "MOVTi16")>;
252
253 def A57WriteI2pc :
254   WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_1cyc_1I]>;
255 def A57WriteI2ld :
256   WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_4cyc_1L]>;
257 def : InstRW< [A57WriteI2pc], (instregex "MOV_ga_pcrel")>;
258 def : InstRW< [A57WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>;
259
260 // +2cyc for branch forms
261 def : InstRW<[A57Write_3cyc_1I], (instregex "MOVPC(LR|RX)")>;
262
263 // --- 3.5 Divide and Multiply Instructions ---
264 // Divide: SDIV, UDIV
265 // latency from documentation: 4-20, maximum taken
266 def : SchedAlias<WriteDIV, A57Write_20cyc_1M>;
267 // Multiply: tMul not bound to common WriteRes types
268 def : InstRW<[A57Write_3cyc_1M], (instregex "tMUL")>;
269 def : SchedAlias<WriteMUL16, A57Write_3cyc_1M>;
270 def : SchedAlias<WriteMUL32, A57Write_3cyc_1M>;
271 def : ReadAdvance<ReadMUL, 0>;
272
273 // Multiply accumulate: MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB,
274 // SMLAWT, SMLAD{X}, SMLSD{X}, SMMLA{R}, SMMLS{R}
275 // Multiply-accumulate pipelines support late-forwarding of accumulate operands
276 // from similar μops, allowing a typical sequence of multiply-accumulate μops
277 // to issue one every 1 cycle (sched advance = 2).
278 def A57WriteMLA : SchedWriteRes<[A57UnitM]> { let Latency = 3; }  // 3 cyc latency - 2 advance = 1 cyc
279 def A57WriteMLAL : SchedWriteRes<[A57UnitM]> { let Latency = 4; } // aliased to the 64-bit MAC writes below
280 def A57ReadMLA  : SchedReadAdvance<2, [A57WriteMLA, A57WriteMLAL]>; // advance listed for these two writes only
281
282 def : SchedAlias<WriteMAC16, A57WriteMLA>;
283 def : SchedAlias<WriteMAC32, A57WriteMLA>;
284 def : SchedAlias<ReadMAC,    A57ReadMLA>;
285
286 def : SchedAlias<WriteMAC64Lo, A57WriteMLAL>;
287 def : SchedAlias<WriteMAC64Hi, A57WriteMLAL>;
288
289 // Multiply long: SMULL, UMULL
290 def : SchedAlias<WriteMUL64Lo, A57Write_4cyc_1M>;
291 def : SchedAlias<WriteMUL64Hi, A57Write_4cyc_1M>;
292
293 // --- 3.6 Saturating and Parallel Arithmetic Instructions ---
294 // Parallel arith
295 // SADD16, SADD8, SSUB16, SSUB8, UADD16, UADD8, USUB16, USUB8
296 // Conditional GE-setting instructions require three extra μops
297 // and two additional cycles to conditionally update the GE field.
298 def A57WriteParArith : SchedWriteVariant<[
299   SchedVar<IsPredicatedPred, [A57Write_4cyc_1I_1M]>, // conditional: 2 + 2 cycles
300   SchedVar<NoSchedPred,      [A57Write_2cyc_1I_1M]>  // unconditional
301 ]>;
302 def : InstRW< [A57WriteParArith], (instregex
303   "(t2)?SADD(16|8)", "(t2)?SSUB(16|8)",
304   "(t2)?UADD(16|8)", "(t2)?USUB(16|8)")>;
305
306 // Parallel arith with exchange: SASX, SSAX, UASX, USAX
307 def A57WriteParArithExch : SchedWriteVariant<[
308   SchedVar<IsPredicatedPred, [A57Write_5cyc_1I_1M]>, // conditional: 3 + 2 cycles
309   SchedVar<NoSchedPred,      [A57Write_3cyc_1I_1M]>  // unconditional
310 ]>;
311 def : InstRW<[A57WriteParArithExch],
312   (instregex "(t2)?SASX", "(t2)?SSAX", "(t2)?UASX", "(t2)?USAX")>;
313
314 // Parallel halving arith
315 // SHADD16, SHADD8, SHSUB16, SHSUB8, UHADD16, UHADD8, UHSUB16, UHSUB8
316 def : InstRW<[A57Write_2cyc_1M], (instregex
317   "(t2)?SHADD(16|8)", "(t2)?SHSUB(16|8)",
318   "(t2)?UHADD(16|8)", "(t2)?UHSUB(16|8)")>;
319
320 // Parallel halving arith with exchange
321 // SHASX, SHSAX, UHASX, UHSAX
322 def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?SHASX", "(t2)?SHSAX",
323   "(t2)?UHASX", "(t2)?UHSAX")>;
324
325 // Parallel saturating arith
326 // QADD16, QADD8, QSUB16, QSUB8, UQADD16, UQADD8, UQSUB16, UQSUB8
327 def : InstRW<[A57Write_2cyc_1M], (instregex "QADD(16|8)", "QSUB(16|8)",
328   "UQADD(16|8)", "UQSUB(16|8)", "t2(U?)QADD", "t2(U?)QSUB")>;
329
330 // Parallel saturating arith with exchange
331 // QASX, QSAX, UQASX, UQSAX
332 def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QASX", "(t2)?QSAX",
333   "(t2)?UQASX", "(t2)?UQSAX")>;
334
335 // Saturate: SSAT, SSAT16, USAT, USAT16
336 def : InstRW<[A57Write_2cyc_1M],
337   (instregex "(t2)?SSAT(16)?", "(t2)?USAT(16)?")>;
338
339 // Saturating arith: QADD, QSUB
340 def : InstRW<[A57Write_2cyc_1M], (instregex "QADD$", "QSUB$")>;
341
342 // Saturating doubling arith: QDADD, QDSUB
343 def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QDADD", "(t2)?QDSUB")>;
344
345 // --- 3.7 Miscellaneous Data-Processing Instructions ---
346 // Bit field extract: SBFX, UBFX
347 def : InstRW<[A57Write_1cyc_1I], (instregex "(t2)?SBFX", "(t2)?UBFX")>;
348
349 // Bit field insert/clear: BFI, BFC
350 def : InstRW<[A57Write_2cyc_1M], (instregex "(t2)?BFI", "(t2)?BFC")>;
351
352 // Select bytes, conditional/unconditional
353 def A57WriteSEL : SchedWriteVariant<[
354   SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
355   SchedVar<NoSchedPred,      [A57Write_1cyc_1I]>
356 ]>;
357 def : InstRW<[A57WriteSEL], (instregex "(t2)?SEL")>;
358
359 // Sign/zero extend, normal: SXTB, SXTH, UXTB, UXTH
360 def : InstRW<[A57Write_1cyc_1I],
361   (instregex "(t2|t)?SXT(B|H)$", "(t2|t)?UXT(B|H)$")>;
362
363 // Sign/zero extend and add, normal: SXTAB, SXTAH, UXTAB, UXTAH
364 def : InstRW<[A57Write_2cyc_1M],
365   (instregex "(t2)?SXTA(B|H)$", "(t2)?UXTA(B|H)$")>;
366
367 // Sign/zero extend and add, parallel: SXTAB16, UXTAB16
368 def : InstRW<[A57Write_4cyc_1M], (instregex "(t2)?SXTAB16", "(t2)?UXTAB16")>;
369
370 // Sum of absolute differences: USAD8, USADA8
371 def : InstRW<[A57Write_3cyc_1M], (instregex "(t2)?USAD8", "(t2)?USADA8")>;
372
373 // --- 3.8 Load Instructions ---
374
375 // Load, immed offset
376 // LDR and LDRB have LDRi12 and LDRBi12 forms for immediate
377 def : InstRW<[A57Write_4cyc_1L], (instregex "LDRi12", "LDRBi12",
378   "LDRcp", "(t2|t)?LDRConstPool", "LDRLIT_ga_(pcrel|abs)",
379   "PICLDR", "tLDR")>;
380
381 def : InstRW<[A57Write_4cyc_1L],
382   (instregex "t2LDRS?(B|H)?(pcrel|T|i8|i12|pci|pci_pic|s)?$")>;
383
384 // For "Load, register offset, minus" we need +1cyc, +1I
385 def A57WriteLdrAm3 : SchedWriteVariant<[
386   SchedVar<IsLdrAm3NegRegOffPred, [A57Write_5cyc_1I_1L]>, // minus register: 5 cyc, I + L
387   SchedVar<NoSchedPred,           [A57Write_4cyc_1L]>     // otherwise: 4 cyc, L only
388 ]>;
389 def : InstRW<[A57WriteLdrAm3], (instregex "LDR(H|SH|SB)$")>;
390 def A57WriteLdrAm3X2 : SchedWriteVariant<[ // same as A57WriteLdrAm3, using the X2 predicate
391   SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_5cyc_1I_1L]>,
392   SchedVar<NoSchedPred,             [A57Write_4cyc_1L]>
393 ]>;
394 def : InstRW<[A57WriteLdrAm3X2, A57WriteLdrAm3X2], (instregex "LDRD$")>; // one write per loaded register
395 def : InstRW<[A57Write_4cyc_1L, A57Write_4cyc_1L], (instregex "t2LDRDi8")>;
396
397 def A57WriteLdrAmLDSTSO : SchedWriteVariant<[
398   SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_5cyc_1I_1L]>, // scaled, not plus lsl2
399   SchedVar<IsLdstsoMinusRegPred,         [A57Write_5cyc_1I_1L]>, // minus register
400   SchedVar<NoSchedPred,                  [A57Write_4cyc_1L]>     // otherwise
401 ]>;
402 def : InstRW<[A57WriteLdrAmLDSTSO], (instregex "LDRrs", "LDRBrs")>;
403
404 def A57WrBackOne : SchedWriteRes<[]> { // latency-only write (no resources), used for base writeback
405   let Latency = 1;
406   let NumMicroOps = 0;
407 }
408 def A57WrBackTwo : SchedWriteRes<[]> { // as A57WrBackOne, 2 cyc
409   let Latency = 2;
410   let NumMicroOps = 0;
411 }
412 def A57WrBackThree : SchedWriteRes<[]> { // as A57WrBackOne, 3 cyc
413   let Latency = 3;
414   let NumMicroOps = 0;
415 }
416
417 // --- LDR pre-indexed ---
418 // Load, immed pre-indexed (4 cyc for load result, 1 cyc for Base update)
419 def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR_PRE_IMM",
420   "LDRB_PRE_IMM", "t2LDRB_PRE")>;
421
422 // Load, register pre-indexed (4 cyc for load result, 2 cyc for Base update)
423 // (5 cyc load result for not-lsl2 scaled)
424 def A57WriteLdrAmLDSTSOPre : SchedWriteVariant<[
425   SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_5cyc_1I_1L]>, // scaled, not plus lsl2
426   SchedVar<NoSchedPred,                    [A57Write_4cyc_1L_1I]>  // otherwise
427 ]>;
428 def : InstRW<[A57WriteLdrAmLDSTSOPre, A57WrBackTwo],
429   (instregex "LDR_PRE_REG", "LDRB_PRE_REG")>;
430
431 def A57WriteLdrAm3PreWrBack : SchedWriteVariant<[ // base update latency for LDR(H|SH|SB)_PRE
432   SchedVar<IsLdrAm3RegOffPredX2, [A57WrBackTwo]>, // register offset: 2 cyc
433   SchedVar<NoSchedPred,          [A57WrBackOne]>  // immediate offset: 1 cyc
434 ]>;
435 def : InstRW<[A57Write_4cyc_1L, A57WriteLdrAm3PreWrBack],
436   (instregex "LDR(H|SH|SB)_PRE")>;
437 def : InstRW<[A57Write_4cyc_1L, A57WrBackOne],
438   (instregex "t2LDR(H|SH|SB)?_PRE")>;
439
440 // LDRD pre-indexed: 5(2) cyc for reg, 4(1) cyc for imm; N(M) = load result (Base update).
441 def A57WriteLdrDAm3Pre : SchedWriteVariant<[
442   SchedVar<IsLdrAm3RegOffPredX3, [A57Write_5cyc_1I_1L]>, // register offset
443   SchedVar<NoSchedPred,          [A57Write_4cyc_1L_1I]>  // immediate offset
444 ]>;
445 def A57WriteLdrDAm3PreWrBack : SchedWriteVariant<[
446   SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>, // register offset
447   SchedVar<NoSchedPred,          [A57WrBackOne]>  // immediate offset
448 ]>;
449 def : InstRW<[A57WriteLdrDAm3Pre, A57WriteLdrDAm3Pre, A57WriteLdrDAm3PreWrBack],
450   (instregex "LDRD_PRE")>;
451 def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne],
452   (instregex "t2LDRD_PRE")>;
453
454 // --- LDR post-indexed ---
455 def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR(T?)_POST_IMM",
456   "LDRB(T?)_POST_IMM", "LDR(SB|H|SH)Ti", "t2LDRB_POST")>;
457
458 def A57WriteLdrAm3PostWrBack : SchedWriteVariant<[
459   SchedVar<IsLdrAm3RegOffPred, [A57WrBackTwo]>,
460   SchedVar<NoSchedPred,        [A57WrBackOne]>
461 ]>;
462 def : InstRW<[A57Write_4cyc_1L_1I, A57WriteLdrAm3PostWrBack],
463   (instregex "LDR(H|SH|SB)_POST")>;
464 def : InstRW<[A57Write_4cyc_1L, A57WrBackOne],
465   (instregex "t2LDR(H|SH|SB)?_POST")>;
466
467 def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR_POST_REG",
468   "LDRB_POST_REG", "LDR(B?)T_POST$")>;
469
470 def A57WriteLdrTRegPost : SchedWriteVariant<[
471   SchedVar<IsLdrAm2ScaledPred, [A57Write_4cyc_1I_1L_1M]>,
472   SchedVar<NoSchedPred,        [A57Write_4cyc_1L_1I]>
473 ]>;
474 def A57WriteLdrTRegPostWrBack : SchedWriteVariant<[
475   SchedVar<IsLdrAm2ScaledPred, [A57WrBackThree]>,
476   SchedVar<NoSchedPred,        [A57WrBackTwo]>
477 ]>;
478 // 4(3) "I0/I1,L,M" for scaled register, otherwise 4(2) "I0/I1,L"
479 def : InstRW<[A57WriteLdrTRegPost, A57WriteLdrTRegPostWrBack],
480   (instregex "LDRT_POST_REG", "LDRBT_POST_REG")>;
481
482 def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR(SB|H|SH)Tr")>;
483
484 def A57WriteLdrAm3PostWrBackX3 : SchedWriteVariant<[
485   SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>,
486   SchedVar<NoSchedPred,          [A57WrBackOne]>
487 ]>;
488 // LDRD post-indexed: 4(2) cyc for reg, 4(1) cyc for imm.
489 def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
490   A57WriteLdrAm3PostWrBackX3], (instregex "LDRD_POST")>;
491 def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne],
492   (instregex "t2LDRD_POST")>;
493
494 // --- Preload instructions ---
495 // Preload, immed offset
496 def : InstRW<[A57Write_4cyc_1L], (instregex "(t2)?PLDi12", "(t2)?PLDWi12",
497   "t2PLDW?(i8|pci|s)", "(t2)?PLI")>;
498
499 // Preload, register offset,
500 // 5cyc "I0/I1,L" for minus reg or scaled not plus lsl2
501 // otherwise 4cyc "L"
502 def A57WritePLD : SchedWriteVariant<[
503   SchedVar<IsLdstsoScaledNotOptimalPredX0, [A57Write_5cyc_1I_1L]>,
504   SchedVar<IsLdstsoMinusRegPredX0,         [A57Write_5cyc_1I_1L]>,
505   SchedVar<NoSchedPred,                    [A57Write_4cyc_1L]>
506 ]>;
507 def : InstRW<[A57WritePLD], (instregex "PLDrs", "PLDWrs")>;
508
509 // --- Load multiple instructions ---
510 foreach NumAddr = 1-8 in { // defines A57LMAddrPred1 .. A57LMAddrPred8
511   def A57LMAddrPred#NumAddr : // true when (size+1)/2 == NumAddr (size/2 rounded up, for a non-negative size)
512     SchedPredicate<"(TII->getLDMVariableDefsSize(*MI)+1)/2 == "#NumAddr>;
513 }
514
515 def A57LDMOpsListNoregin : A57WriteLMOpsListType< // 16 writes in pairs, +1 cyc per pair, from 3 cyc
516                 [A57Write_3cyc_1L, A57Write_3cyc_1L,
517                  A57Write_4cyc_1L, A57Write_4cyc_1L,
518                  A57Write_5cyc_1L, A57Write_5cyc_1L,
519                  A57Write_6cyc_1L, A57Write_6cyc_1L,
520                  A57Write_7cyc_1L, A57Write_7cyc_1L,
521                  A57Write_8cyc_1L, A57Write_8cyc_1L,
522                  A57Write_9cyc_1L, A57Write_9cyc_1L,
523                  A57Write_10cyc_1L, A57Write_10cyc_1L]>;
524 def A57WriteLDMnoreginlist : SchedWriteVariant<[ // A57LMAddrPred<k> selects the first 2*k writes
525   SchedVar<A57LMAddrPred1,     A57LDMOpsListNoregin.Writes[0-1]>,
526   SchedVar<A57LMAddrPred2,     A57LDMOpsListNoregin.Writes[0-3]>,
527   SchedVar<A57LMAddrPred3,     A57LDMOpsListNoregin.Writes[0-5]>,
528   SchedVar<A57LMAddrPred4,     A57LDMOpsListNoregin.Writes[0-7]>,
529   SchedVar<A57LMAddrPred5,     A57LDMOpsListNoregin.Writes[0-9]>,
530   SchedVar<A57LMAddrPred6,     A57LDMOpsListNoregin.Writes[0-11]>,
531   SchedVar<A57LMAddrPred7,     A57LDMOpsListNoregin.Writes[0-13]>,
532   SchedVar<A57LMAddrPred8,     A57LDMOpsListNoregin.Writes[0-15]>,
533   SchedVar<NoSchedPred,        A57LDMOpsListNoregin.Writes[0-15]>  // fallback: all 16 writes
534 ]> { let Variadic=1; }
535
536 def A57LDMOpsListRegin : A57WriteLMOpsListType< // 16 writes in pairs, +1 cyc per pair, from 4 cyc, L + I
537                 [A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
538                  A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
539                  A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
540                  A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
541                  A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
542                  A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
543                  A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I,
544                  A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I]>;
545 def A57WriteLDMreginlist : SchedWriteVariant<[ // A57LMAddrPred<k> selects the first 2*k writes
546   SchedVar<A57LMAddrPred1,     A57LDMOpsListRegin.Writes[0-1]>,
547   SchedVar<A57LMAddrPred2,     A57LDMOpsListRegin.Writes[0-3]>,
548   SchedVar<A57LMAddrPred3,     A57LDMOpsListRegin.Writes[0-5]>,
549   SchedVar<A57LMAddrPred4,     A57LDMOpsListRegin.Writes[0-7]>,
550   SchedVar<A57LMAddrPred5,     A57LDMOpsListRegin.Writes[0-9]>,
551   SchedVar<A57LMAddrPred6,     A57LDMOpsListRegin.Writes[0-11]>,
552   SchedVar<A57LMAddrPred7,     A57LDMOpsListRegin.Writes[0-13]>,
553   SchedVar<A57LMAddrPred8,     A57LDMOpsListRegin.Writes[0-15]>,
554   SchedVar<NoSchedPred,        A57LDMOpsListRegin.Writes[0-15]>  // fallback: all 16 writes
555 ]> { let Variadic=1; }
556
557 def A57LDMOpsList_Upd : A57WriteLMOpsListType< // entry 0 is the writeback, then 16 load writes in pairs
558               [A57WrBackOne,
559                A57Write_3cyc_1L_1I, A57Write_3cyc_1L_1I,
560                A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
561                A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
562                A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
563                A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
564                A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
565                A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
566                A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I]>;
567 def A57WriteLDM_Upd : SchedWriteVariant<[ // A57LMAddrPred<k> selects the writeback plus 2*k load writes
568   SchedVar<A57LMAddrPred1,     A57LDMOpsList_Upd.Writes[0-2]>,
569   SchedVar<A57LMAddrPred2,     A57LDMOpsList_Upd.Writes[0-4]>,
570   SchedVar<A57LMAddrPred3,     A57LDMOpsList_Upd.Writes[0-6]>,
571   SchedVar<A57LMAddrPred4,     A57LDMOpsList_Upd.Writes[0-8]>,
572   SchedVar<A57LMAddrPred5,     A57LDMOpsList_Upd.Writes[0-10]>,
573   SchedVar<A57LMAddrPred6,     A57LDMOpsList_Upd.Writes[0-12]>,
574   SchedVar<A57LMAddrPred7,     A57LDMOpsList_Upd.Writes[0-14]>,
575   SchedVar<A57LMAddrPred8,     A57LDMOpsList_Upd.Writes[0-16]>,
576   SchedVar<NoSchedPred,        A57LDMOpsList_Upd.Writes[0-16]>  // fallback: all 17 writes
577 ]> { let Variadic=1; }
578
579 def A57WriteLDM : SchedWriteVariant<[ // picks one of the two nested LDM variants
580   SchedVar<IsLdmBaseRegInList, [A57WriteLDMreginlist]>,   // base register is in the register list
581   SchedVar<NoSchedPred,        [A57WriteLDMnoreginlist]>  // base register is not in the list
582 ]> { let Variadic=1; }
583
584 def : InstRW<[A57WriteLDM], (instregex "(t|t2|sys)?LDM(IA|DA|DB|IB)$")>;
585
586 // TODO: no writeback latency defined in documentation (implemented as 1 cyc, A57WrBackOne)
587 def : InstRW<[A57WriteLDM_Upd],
588   (instregex "(t|t2|sys)?LDM(IA_UPD|DA_UPD|DB_UPD|IB_UPD|IA_RET)", "tPOP")>;
589
590 // --- 3.9 Store Instructions ---
591
592 // Store, immed offset
593 def : InstRW<[A57Write_1cyc_1S], (instregex "STRi12", "STRBi12", "PICSTR",
594   "t2STR(B?)(T|i12|i8|s)", "t2STRDi8", "t2STRH(i12|i8|s)", "tSTR")>;
595
596 // Store, register offset
597 // For minus or for not plus lsl2 scaled we need 3cyc "I0/I1, S",
598 // otherwise 1cyc S.
599 def A57WriteStrAmLDSTSO : SchedWriteVariant<[
600   SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_3cyc_1I_1S]>,
601   SchedVar<IsLdstsoMinusRegPred,         [A57Write_3cyc_1I_1S]>,
602   SchedVar<NoSchedPred,                  [A57Write_1cyc_1S]>
603 ]>;
604 def : InstRW<[A57WriteStrAmLDSTSO], (instregex "STRrs", "STRBrs")>;
605
606 // STRH,STRD: 3cyc "I0/I1, S" for minus reg, 1cyc S for imm or for plus reg.
607 def A57WriteStrAm3 : SchedWriteVariant<[
608   SchedVar<IsLdrAm3NegRegOffPred, [A57Write_3cyc_1I_1S]>,
609   SchedVar<NoSchedPred,           [A57Write_1cyc_1S]>
610 ]>;
611 def : InstRW<[A57WriteStrAm3], (instregex "STRH$")>;
612 def A57WriteStrAm3X2 : SchedWriteVariant<[
613   SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>,
614   SchedVar<NoSchedPred,             [A57Write_1cyc_1S]>
615 ]>;
616 def : InstRW<[A57WriteStrAm3X2], (instregex "STRD$")>;
617
618 // Store, immed pre-indexed (1cyc "S, I0/I1", 1cyc writeback)
619 def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR_PRE_IMM", // writeback write is listed first
620   "STRB_PRE_IMM", "STR(B)?(r|i)_preidx", "(t2)?STRH_(preidx|PRE)",
621   "t2STR(B?)_(PRE|preidx)", "t2STRD_PRE")>;
622
623 // Store, register pre-indexed; N(M) = store latency (writeback latency):
624 // 1(1) "S, I0/I1" for plus reg
625 // 3(2) "I0/I1, S" for minus reg
626 // 1(2) "S, M" for scaled plus lsl2
627 // 3(2) "I0/I1, S" for other scaled
628 def A57WriteStrAmLDSTSOPre : SchedWriteVariant<[
629   SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_3cyc_1I_1S]>, // other scaled
630   SchedVar<IsLdstsoMinusRegPredX2,         [A57Write_3cyc_1I_1S]>, // minus reg
631   SchedVar<IsLdstsoScaledPredX2,           [A57Write_1cyc_1S_1M]>, // scaled plus lsl2 (other scaled handled above)
632   SchedVar<NoSchedPred,                    [A57Write_1cyc_1S_1I]>  // plus reg
633 ]>;
634 def A57WriteStrAmLDSTSOPreWrBack : SchedWriteVariant<[
635   SchedVar<IsLdstsoScaledPredX2,           [A57WrBackTwo]>, // any scaled: 2 cyc
636   SchedVar<IsLdstsoMinusRegPredX2,         [A57WrBackTwo]>, // minus reg: 2 cyc
637   SchedVar<NoSchedPred,                    [A57WrBackOne]>  // plus reg: 1 cyc
638 ]>;
639 def : InstRW<[A57WriteStrAmLDSTSOPreWrBack, A57WriteStrAmLDSTSOPre],
640   (instregex "STR_PRE_REG", "STRB_PRE_REG")>;
641
642 // pre-indexed STRH/STRD (STRH_PRE, STRD_PRE)
643 // 1(1) "S, I0/I1" for imm or reg plus
644 // 3(2) "I0/I1, S" for reg minus
645 def A57WriteStrAm3PreX2 : SchedWriteVariant<[
646   SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>,
647   SchedVar<NoSchedPred,             [A57Write_1cyc_1S_1I]>
648 ]>;
649 def A57WriteStrAm3PreWrBackX2 : SchedWriteVariant<[
650   SchedVar<IsLdrAm3NegRegOffPredX2, [A57WrBackTwo]>,
651   SchedVar<NoSchedPred,             [A57WrBackOne]>
652 ]>;
653 def : InstRW<[A57WriteStrAm3PreWrBackX2, A57WriteStrAm3PreX2],
654   (instregex "STRH_PRE")>;
655
656 def A57WriteStrAm3PreX3 : SchedWriteVariant<[
657   SchedVar<IsLdrAm3NegRegOffPredX3, [A57Write_3cyc_1I_1S]>,
658   SchedVar<NoSchedPred,             [A57Write_1cyc_1S_1I]>
659 ]>;
660 def A57WriteStrAm3PreWrBackX3 : SchedWriteVariant<[
661   SchedVar<IsLdrAm3NegRegOffPredX3, [A57WrBackTwo]>,
662   SchedVar<NoSchedPred,             [A57WrBackOne]>
663 ]>;
664 def : InstRW<[A57WriteStrAm3PreWrBackX3, A57WriteStrAm3PreX3],
665   (instregex "STRD_PRE")>;
666
667 def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR(T?)_POST_IMM",
668   "STRB(T?)_POST_IMM", "t2STR(B?)_POST")>;
669
670 // 1(2) "S, M" for STR/STRB register post-indexed (both scaled or not)
671 def : InstRW<[A57WrBackTwo, A57Write_1cyc_1S_1M], (instregex "STR(T?)_POST_REG",
672   "STRB(T?)_POST_REG", "STR(B?)T_POST$")>;
673
674 // post-indexed STRH/STRD(STRH_POST, STRD_POST), STRHTi, STRHTr
675 // 1(1) "S, I0/I1" both for reg or imm
676 def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I],
677   (instregex "(t2)?STR(H|D)_POST", "STRHT(i|r)", "t2STRHT")>;
678
679 // --- Store multiple instructions ---
680 // TODO: no writeback latency defined in documentation
681 def A57WriteSTM : SchedWriteVariant<[ // A57LMAddrPred<k> selects a k-cycle write on S
682     SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>,
683     SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>,
684     SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>,
685     SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>,
686     SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>,
687     SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>,
688     SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>,
689     SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>,
690     SchedVar<NoSchedPred,    [A57Write_2cyc_1S]>  // fallback when none of the predicates holds: 2 cyc
691 ]>;
692 def A57WriteSTM_Upd : SchedWriteVariant<[ // same as A57WriteSTM, with an additional I micro-op
693     SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>,
694     SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>,
695     SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>,
696     SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>,
697     SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>,
698     SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>,
699     SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>,
700     SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>,
701     SchedVar<NoSchedPred,    [A57Write_2cyc_1S_1I]>  // fallback when none of the predicates holds: 2 cyc
702 ]>;
703
704 def : InstRW<[A57WriteSTM], (instregex "(t2|sys|t)?STM(IA|DA|DB|IB)$")>;
705 def : InstRW<[A57WrBackOne, A57WriteSTM_Upd], // writeback modelled as 1 cyc (A57WrBackOne), listed first
706   (instregex "(t2|sys|t)?STM(IA_UPD|DA_UPD|DB_UPD|IB_UPD)", "tPUSH")>;
707
708 // --- 3.10 FP Data Processing Instructions ---
709 def : SchedAlias<WriteFPALU32, A57Write_5cyc_1V>;
710 def : SchedAlias<WriteFPALU64, A57Write_5cyc_1V>;
711
712 def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(S|D|H)")>;
713
714 // fp compare - 3cyc F1 for unconditional, 6cyc "F0/F1, F1" for conditional
715 def A57WriteVcmp : SchedWriteVariant<[
716   SchedVar<IsPredicatedPred, [A57Write_6cyc_1V_1X]>,
717   SchedVar<NoSchedPred,      [A57Write_3cyc_1X]>
718 ]>;
719 def : InstRW<[A57WriteVcmp],
720   (instregex "VCMP(D|S|H|ZD|ZS|ZH)$", "VCMPE(D|S|H|ZD|ZS|ZH)")>;
721
722 // fp convert
723 def : InstRW<[A57Write_5cyc_1V], (instregex
724   "VCVT(A|N|P|M)(SH|UH|SS|US|SD|UD)", "VCVT(BDH|THD|TDH)")>;
725
726 def : SchedAlias<WriteFPCVT, A57Write_5cyc_1V>;
727
728 // FP round to integral
729 def : InstRW<[A57Write_5cyc_1V], (instregex "VRINT(A|N|P|M|Z|R|X)(H|S|D)$")>;
730
731 // FP divide, FP square root
732 def : SchedAlias<WriteFPDIV32, A57Write_17cyc_1W>;
733 def : SchedAlias<WriteFPDIV64, A57Write_32cyc_1W>;
734 def : SchedAlias<WriteFPSQRT32, A57Write_17cyc_1W>;
735 def : SchedAlias<WriteFPSQRT64, A57Write_32cyc_1W>;
736
737 // FP max/min
738 def : InstRW<[A57Write_5cyc_1V], (instregex "VMAX", "VMIN")>;
739
740 // FP multiply-accumulate pipelines support late forwarding of the result
741 // from FP multiply μops to the accumulate operands of an
742 // FP multiply-accumulate μop. The latter can potentially be issued 1 cycle
743 // after the FP multiply μop has been issued
744 // FP multiply, FZ
745 def A57WriteVMUL : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
746
747 def : SchedAlias<WriteFPMUL32, A57WriteVMUL>;
748 def : SchedAlias<WriteFPMUL64, A57WriteVMUL>;
749 def : ReadAdvance<ReadFPMUL, 0>;
750
751 // FP multiply accumulate, FZ: 9cyc "F0/F1" or 4 cyc for sequenced accumulate
752 // VFMA, VFMS, VFNMA, VFNMS, VMLA, VMLS, VNMLA, VNMLS
753 def A57WriteVFMA : SchedWriteRes<[A57UnitV]> { let Latency = 9;  }
754
755 // VFMA takes 9 cyc for common case and 4 cyc for VFMA->VFMA chain (5 read adv.)
756 // VMUL takes 5 cyc for common case and 1 cyc for VMUL->VFMA chain (4 read adv.)
757 // Currently, there is no way to define different read advances for VFMA operand
758 // from VFMA or from VMUL, so a read advance of 5 is used for both.
759 // Zero latency (instead of one) for VMUL->VFMA shouldn't break anything.
760 // The same situation applies to ASIMD VMUL/VFMA instructions.
761 // def A57ReadVFMA : SchedRead;
762 // def : ReadAdvance<A57ReadVFMA, 5, [A57WriteVFMA]>;
763 // def : ReadAdvance<A57ReadVFMA, 4, [A57WriteVMUL]>;
764 def A57ReadVFMA5 : SchedReadAdvance<5, [A57WriteVFMA, A57WriteVMUL]>;
765
766 def : SchedAlias<WriteFPMAC32, A57WriteVFMA>;
767 def : SchedAlias<WriteFPMAC64, A57WriteVFMA>;
768 def : SchedAlias<ReadFPMAC, A57ReadVFMA5>;
769
770 def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG")>;
771 def : InstRW<[A57Write_3cyc_1V], (instregex "VSEL")>;
772
773 // --- 3.11 FP Miscellaneous Instructions ---
774 // VMOV: 3cyc "F0/F1" for imm/reg
775 def : InstRW<[A57Write_3cyc_1V], (instregex "FCONST(D|S|H)")>;
776 def : InstRW<[A57Write_3cyc_1V], (instregex "VMOV(D|S|H)(cc)?$")>;
777
778 // 5cyc L for FP transfer, vfp to core reg,
779 // 5cyc L for FP transfer, core reg to vfp
780 def : SchedAlias<WriteFPMOV, A57Write_5cyc_1L>;
781 // Common code declares VMOVRRS/VMOVRRD with one WriteFPMOV (instead of 2).
782 def : InstRW<[A57Write_5cyc_1L, A57Write_5cyc_1L], (instregex "VMOV(RRS|RRD)")>;
783
784 // 8cyc "L,F0/F1" for FP transfer, core reg to upper or lower half of vfp D-reg
785 def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VMOVDRR")>;  // 1V, as for VSETLN/VDUP
786
787 // --- 3.12 FP Load Instructions ---
788 def : InstRW<[A57Write_5cyc_1L], (instregex "VLDR(D|S|H)")>;
789
790 def : InstRW<[A57Write_5cyc_1L], (instregex "VLDMQIA$")>;
791
792 // FP load multiple (VLDM)
793
794 def A57VLDMOpsListUncond : A57WriteLMOpsListType<
795                [A57Write_5cyc_1L, A57Write_5cyc_1L,
796                 A57Write_6cyc_1L, A57Write_6cyc_1L,
797                 A57Write_7cyc_1L, A57Write_7cyc_1L,
798                 A57Write_8cyc_1L, A57Write_8cyc_1L,
799                 A57Write_9cyc_1L, A57Write_9cyc_1L,
800                 A57Write_10cyc_1L, A57Write_10cyc_1L,
801                 A57Write_11cyc_1L, A57Write_11cyc_1L,
802                 A57Write_12cyc_1L, A57Write_12cyc_1L]>;
803 def A57WriteVLDMuncond : SchedWriteVariant<[
804   SchedVar<A57LMAddrPred1,  A57VLDMOpsListUncond.Writes[0-1]>,
805   SchedVar<A57LMAddrPred2,  A57VLDMOpsListUncond.Writes[0-3]>,
806   SchedVar<A57LMAddrPred3,  A57VLDMOpsListUncond.Writes[0-5]>,
807   SchedVar<A57LMAddrPred4,  A57VLDMOpsListUncond.Writes[0-7]>,
808   SchedVar<A57LMAddrPred5,  A57VLDMOpsListUncond.Writes[0-9]>,
809   SchedVar<A57LMAddrPred6,  A57VLDMOpsListUncond.Writes[0-11]>,
810   SchedVar<A57LMAddrPred7,  A57VLDMOpsListUncond.Writes[0-13]>,
811   SchedVar<A57LMAddrPred8,  A57VLDMOpsListUncond.Writes[0-15]>,
812   SchedVar<NoSchedPred,     A57VLDMOpsListUncond.Writes[0-15]>
813 ]> { let Variadic=1; }
814
815 def A57VLDMOpsListCond : A57WriteLMOpsListType<
816                [A57Write_5cyc_1L, A57Write_6cyc_1L,
817                 A57Write_7cyc_1L, A57Write_8cyc_1L,
818                 A57Write_9cyc_1L, A57Write_10cyc_1L,
819                 A57Write_11cyc_1L, A57Write_12cyc_1L,
820                 A57Write_13cyc_1L, A57Write_14cyc_1L,
821                 A57Write_15cyc_1L, A57Write_16cyc_1L,
822                 A57Write_17cyc_1L, A57Write_18cyc_1L,
823                 A57Write_19cyc_1L, A57Write_20cyc_1L]>;
824 def A57WriteVLDMcond : SchedWriteVariant<[
825   SchedVar<A57LMAddrPred1,  A57VLDMOpsListCond.Writes[0-1]>,
826   SchedVar<A57LMAddrPred2,  A57VLDMOpsListCond.Writes[0-3]>,
827   SchedVar<A57LMAddrPred3,  A57VLDMOpsListCond.Writes[0-5]>,
828   SchedVar<A57LMAddrPred4,  A57VLDMOpsListCond.Writes[0-7]>,
829   SchedVar<A57LMAddrPred5,  A57VLDMOpsListCond.Writes[0-9]>,
830   SchedVar<A57LMAddrPred6,  A57VLDMOpsListCond.Writes[0-11]>,
831   SchedVar<A57LMAddrPred7,  A57VLDMOpsListCond.Writes[0-13]>,
832   SchedVar<A57LMAddrPred8,  A57VLDMOpsListCond.Writes[0-15]>,
833   SchedVar<NoSchedPred,     A57VLDMOpsListCond.Writes[0-15]>
834 ]> { let Variadic=1; }
835
836 def A57WriteVLDM : SchedWriteVariant<[  // VLDM(DIA|SIA) without writeback
837   SchedVar<IsPredicatedPred, [A57WriteVLDMcond]>,   // A57VLDMOpsListCond: 5..20cyc
838   SchedVar<NoSchedPred,      [A57WriteVLDMuncond]>  // A57VLDMOpsListUncond: 5..12cyc
839 ]> { let Variadic=1; }
840
841 def : InstRW<[A57WriteVLDM], (instregex "VLDM(DIA|SIA)$")>;
842
843 def A57VLDMOpsListUncond_Upd : A57WriteLMOpsListType<
844                [A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
845                 A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
846                 A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
847                 A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
848                 A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
849                 A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I,
850                 A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I,
851                 A57Write_12cyc_1L_1I, A57Write_12cyc_1L_1I]>;
852 def A57WriteVLDMuncond_UPD : SchedWriteVariant<[
853   SchedVar<A57LMAddrPred1,  A57VLDMOpsListUncond_Upd.Writes[0-1]>,
854   SchedVar<A57LMAddrPred2,  A57VLDMOpsListUncond_Upd.Writes[0-3]>,
855   SchedVar<A57LMAddrPred3,  A57VLDMOpsListUncond_Upd.Writes[0-5]>,
856   SchedVar<A57LMAddrPred4,  A57VLDMOpsListUncond_Upd.Writes[0-7]>,
857   SchedVar<A57LMAddrPred5,  A57VLDMOpsListUncond_Upd.Writes[0-9]>,
858   SchedVar<A57LMAddrPred6,  A57VLDMOpsListUncond_Upd.Writes[0-11]>,
859   SchedVar<A57LMAddrPred7,  A57VLDMOpsListUncond_Upd.Writes[0-13]>,
860   SchedVar<A57LMAddrPred8,  A57VLDMOpsListUncond_Upd.Writes[0-15]>,
861   SchedVar<NoSchedPred,     A57VLDMOpsListUncond_Upd.Writes[0-15]>
862 ]> { let Variadic=1; }
863
864 def A57VLDMOpsListCond_Upd : A57WriteLMOpsListType<
865                [A57Write_5cyc_1L_1I, A57Write_6cyc_1L_1I,
866                 A57Write_7cyc_1L_1I, A57Write_8cyc_1L_1I,
867                 A57Write_9cyc_1L_1I, A57Write_10cyc_1L_1I,
868                 A57Write_11cyc_1L_1I, A57Write_12cyc_1L_1I,
869                 A57Write_13cyc_1L_1I, A57Write_14cyc_1L_1I,
870                 A57Write_15cyc_1L_1I, A57Write_16cyc_1L_1I,
871                 A57Write_17cyc_1L_1I, A57Write_18cyc_1L_1I,
872                 A57Write_19cyc_1L_1I, A57Write_20cyc_1L_1I]>;
873 def A57WriteVLDMcond_UPD : SchedWriteVariant<[
874   SchedVar<A57LMAddrPred1,  A57VLDMOpsListCond_Upd.Writes[0-1]>,
875   SchedVar<A57LMAddrPred2,  A57VLDMOpsListCond_Upd.Writes[0-3]>,
876   SchedVar<A57LMAddrPred3,  A57VLDMOpsListCond_Upd.Writes[0-5]>,
877   SchedVar<A57LMAddrPred4,  A57VLDMOpsListCond_Upd.Writes[0-7]>,
878   SchedVar<A57LMAddrPred5,  A57VLDMOpsListCond_Upd.Writes[0-9]>,
879   SchedVar<A57LMAddrPred6,  A57VLDMOpsListCond_Upd.Writes[0-11]>,
880   SchedVar<A57LMAddrPred7,  A57VLDMOpsListCond_Upd.Writes[0-13]>,
881   SchedVar<A57LMAddrPred8,  A57VLDMOpsListCond_Upd.Writes[0-15]>,
882   SchedVar<NoSchedPred,     A57VLDMOpsListCond_Upd.Writes[0-15]>
883 ]> { let Variadic=1; }
884
885 def A57WriteVLDM_UPD : SchedWriteVariant<[  // VLDM *_UPD forms, paired with A57WrBackOne
886   SchedVar<IsPredicatedPred, [A57WriteVLDMcond_UPD]>,   // A57VLDMOpsListCond_Upd: 5..20cyc
887   SchedVar<NoSchedPred,      [A57WriteVLDMuncond_UPD]>  // A57VLDMOpsListUncond_Upd: 5..12cyc
888 ]> { let Variadic=1; }
889
890 def : InstRW<[A57WrBackOne, A57WriteVLDM_UPD],
891   (instregex "VLDM(DIA_UPD|DDB_UPD|SIA_UPD|SDB_UPD)")>;
892
893 // --- 3.13 FP Store Instructions ---
894 def : InstRW<[A57Write_1cyc_1S], (instregex "VSTR(D|S|H)")>;
895
896 def : InstRW<[A57Write_2cyc_1S], (instregex "VSTMQIA$")>;
897
898 def A57WriteVSTMs : SchedWriteVariant<[
899     SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>,
900     SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>,
901     SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>,
902     SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>,
903     SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>,
904     SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>,
905     SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>,
906     SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>,
907     SchedVar<NoSchedPred,    [A57Write_2cyc_1S]>
908 ]>;
909 def A57WriteVSTMd : SchedWriteVariant<[
910     SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S]>,
911     SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S]>,
912     SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S]>,
913     SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S]>,
914     SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S]>,
915     SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S]>,
916     SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S]>,
917     SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S]>,
918     SchedVar<NoSchedPred,    [A57Write_4cyc_1S]>
919 ]>;
920 def A57WriteVSTMs_Upd : SchedWriteVariant<[
921     SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>,
922     SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>,
923     SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>,
924     SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>,
925     SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>,
926     SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>,
927     SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>,
928     SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>,
929     SchedVar<NoSchedPred,    [A57Write_2cyc_1S_1I]>
930 ]>;
931 def A57WriteVSTMd_Upd : SchedWriteVariant<[  // VSTM(DIA_UPD|DDB_UPD): 2 cycles per step
932     SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S_1I]>,
933     SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S_1I]>,
934     SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S_1I]>,
935     SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S_1I]>,
936     SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S_1I]>,
937     SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S_1I]>,
938     SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S_1I]>,
939     SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S_1I]>,
940     SchedVar<NoSchedPred,    [A57Write_4cyc_1S_1I]>  // fallback mirrors A57WriteVSTMd (4cyc)
941 ]>;
942
943 def : InstRW<[A57WriteVSTMs], (instregex "VSTMSIA$")>;
944 def : InstRW<[A57WriteVSTMd], (instregex "VSTMDIA$")>;
945 def : InstRW<[A57WrBackOne, A57WriteVSTMs_Upd],
946   (instregex "VSTM(SIA_UPD|SDB_UPD)")>;
947 def : InstRW<[A57WrBackOne, A57WriteVSTMd_Upd],
948   (instregex "VSTM(DIA_UPD|DDB_UPD)")>;
949
950 // --- 3.14 ASIMD Integer Instructions ---
951
952 // ASIMD absolute diff, 3cyc F0/F1 for integer VABD
953 def : InstRW<[A57Write_3cyc_1V], (instregex "VABD(s|u)")>;
954
955 // ASIMD absolute diff accum: 4(1) F1 for D-form, 5(2) F1 for Q-form
956 def A57WriteVABAD : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
957 def A57ReadVABAD  : SchedReadAdvance<3, [A57WriteVABAD]>;
958 def : InstRW<[A57WriteVABAD, A57ReadVABAD],
959   (instregex "VABA(s|u)(v8i8|v4i16|v2i32)")>;
960 def A57WriteVABAQ : SchedWriteRes<[A57UnitX]> { let Latency = 5; }
961 def A57ReadVABAQ  : SchedReadAdvance<3, [A57WriteVABAQ]>;
962 def : InstRW<[A57WriteVABAQ, A57ReadVABAQ],
963   (instregex "VABA(s|u)(v16i8|v8i16|v4i32)")>;
964
965 // ASIMD absolute diff accum long: 4(1) F1 for VABAL
966 def A57WriteVABAL : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
967 def A57ReadVABAL  : SchedReadAdvance<3, [A57WriteVABAL]>;
968 def : InstRW<[A57WriteVABAL, A57ReadVABAL], (instregex "VABAL(s|u)")>;
969
970 // ASIMD absolute diff long: 3cyc F0/F1 for VABDL
971 def : InstRW<[A57Write_3cyc_1V], (instregex "VABDL(s|u)")>;
972
973 // ASIMD arith, basic
974 def : InstRW<[A57Write_3cyc_1V], (instregex "VADDv", "VADDL", "VADDW",
975   "VNEG(s8d|s16d|s32d|s8q|s16q|s32q|d|q)",
976   "VPADDi", "VPADDL", "VSUBv", "VSUBL", "VSUBW")>;
977
978 // ASIMD arith, complex
979 def : InstRW<[A57Write_3cyc_1V], (instregex "VABS", "VADDHN", "VHADD", "VHSUB",
980   "VQABS", "VQADD", "VQNEG", "VQSUB",
981   "VRADDHN", "VRHADD", "VRSUBHN", "VSUBHN")>;
982
983 // ASIMD compare
984 def : InstRW<[A57Write_3cyc_1V],
985   (instregex "VCEQ", "VCGE", "VCGT", "VCLE", "VTST", "VCLT")>;
986
987 // ASIMD logical
988 def : InstRW<[A57Write_3cyc_1V],
989   (instregex "VAND", "VBIC", "VMVN", "VORR", "VORN", "VEOR")>;
990
991 // ASIMD max/min
992 def : InstRW<[A57Write_3cyc_1V],
993   (instregex "(VMAX|VMIN)(s|u)", "(VPMAX|VPMIN)(s8|s16|s32|u8|u16|u32)")>;
994
995 // ASIMD multiply, D-form: 5cyc F0 for r0px, 4cyc F0 for r1p0 and later
996 // Cortex-A57 r1p0 and later reduce the latency of ASIMD multiply
997 // and multiply-with-accumulate instructions relative to r0pX.
998 def A57WriteVMULD_VecInt : SchedWriteVariant<[
999   SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1000   SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1001 def : InstRW<[A57WriteVMULD_VecInt], (instregex
1002   "VMUL(v8i8|v4i16|v2i32|pd)", "VMULsl(v4i16|v2i32)",
1003   "VQDMULH(sl)?(v4i16|v2i32)", "VQRDMULH(sl)?(v4i16|v2i32)")>;
1004
1005 // ASIMD multiply, Q-form: 6cyc F0 for r0px, 5cyc F0 for r1p0 and later
1006 def A57WriteVMULQ_VecInt : SchedWriteVariant<[
1007   SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>,
1008   SchedVar<NoSchedPred,        [A57Write_6cyc_1W]>]>;
1009 def : InstRW<[A57WriteVMULQ_VecInt], (instregex
1010   "VMUL(v16i8|v8i16|v4i32|pq)", "VMULsl(v8i16|v4i32)",
1011   "VQDMULH(sl)?(v8i16|v4i32)", "VQRDMULH(sl)?(v8i16|v4i32)")>;
1012
1013 // ASIMD multiply accumulate, D-form
1014 // 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence
1015 // (4 or 3 ReadAdvance)
1016 def A57WriteVMLAD_VecInt : SchedWriteVariant<[
1017   SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1018   SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1019 def A57ReadVMLAD_VecInt : SchedReadVariant<[
1020   SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAD_VecInt]>]>,
1021   SchedVar<NoSchedPred,        [SchedReadAdvance<4, [A57WriteVMLAD_VecInt]>]>
1022 ]>;
1023 def : InstRW<[A57WriteVMLAD_VecInt, A57ReadVMLAD_VecInt],
1024   (instregex "VMLA(sl)?(v8i8|v4i16|v2i32)", "VMLS(sl)?(v8i8|v4i16|v2i32)")>;
1025
1026 // ASIMD multiply accumulate, Q-form
1027 // 6cyc F0 for r0px, 5cyc F0 for r1p0 and later, 2cyc for accumulate sequence
1028 // (4 or 3 ReadAdvance)
1029 def A57WriteVMLAQ_VecInt : SchedWriteVariant<[
1030   SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>,
1031   SchedVar<NoSchedPred,        [A57Write_6cyc_1W]>]>;
1032 def A57ReadVMLAQ_VecInt : SchedReadVariant<[
1033   SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAQ_VecInt]>]>,
1034   SchedVar<NoSchedPred,        [SchedReadAdvance<4, [A57WriteVMLAQ_VecInt]>]>
1035 ]>;
1036 def : InstRW<[A57WriteVMLAQ_VecInt, A57ReadVMLAQ_VecInt],
1037   (instregex "VMLA(sl)?(v16i8|v8i16|v4i32)", "VMLS(sl)?(v16i8|v8i16|v4i32)")>;
1038
1039 // ASIMD multiply accumulate long
1040 // 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence
1041 // (4 or 3 ReadAdvance)
1042 def A57WriteVMLAL_VecInt : SchedWriteVariant<[
1043   SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1044   SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1045 def A57ReadVMLAL_VecInt : SchedReadVariant<[
1046   SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAL_VecInt]>]>,
1047   SchedVar<NoSchedPred,        [SchedReadAdvance<4, [A57WriteVMLAL_VecInt]>]>
1048 ]>;
1049 def : InstRW<[A57WriteVMLAL_VecInt, A57ReadVMLAL_VecInt],
1050   (instregex "VMLAL(s|u)", "VMLSL(s|u)")>;
1051
1052 // ASIMD multiply accumulate saturating long
1053 // 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 2cyc for accumulate sequence
1054 // (3 or 2 ReadAdvance)
1055 def A57WriteVQDMLAL_VecInt : SchedWriteVariant<[
1056   SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1057   SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1058 def A57ReadVQDMLAL_VecInt : SchedReadVariant<[
1059   SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<2, [A57WriteVQDMLAL_VecInt]>]>,
1060   SchedVar<NoSchedPred,        [SchedReadAdvance<3, [A57WriteVQDMLAL_VecInt]>]>
1061 ]>;
1062 def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt],
1063   (instregex "VQDMLAL", "VQDMLSL")>;
1064
1065 // ASIMD multiply long
1066 // 5cyc F0 for r0px, 4cyc F0 for r1p0 and later
1067 def A57WriteVMULL_VecInt : SchedWriteVariant<[
1068   SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1069   SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1070 def : InstRW<[A57WriteVMULL_VecInt],
1071   (instregex "VMULL(s|u|p8|sls|slu)", "VQDMULL")>;
1072
1073 // ASIMD pairwise add and accumulate
1074 // 4cyc F1, 1cyc for accumulate sequence (3cyc ReadAdvance)
1075 def A57WriteVPADAL : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
1076 def A57ReadVPADAL  : SchedReadAdvance<3, [A57WriteVPADAL]>;
1077 def : InstRW<[A57WriteVPADAL, A57ReadVPADAL], (instregex "VPADAL(s|u)")>;
1078
1079 // ASIMD shift accumulate
1080 // 4cyc F1, 1cyc for accumulate sequence (3cyc ReadAdvance)
1081 def A57WriteVSRA : SchedWriteRes<[A57UnitX]> { let Latency = 4;  }
1082 def A57ReadVSRA  : SchedReadAdvance<3, [A57WriteVSRA]>;
1083 def : InstRW<[A57WriteVSRA, A57ReadVSRA], (instregex "VSRA", "VRSRA")>;
1084
1085 // ASIMD shift by immed, basic
1086 def : InstRW<[A57Write_3cyc_1X],
1087   (instregex "VMOVL", "VSHLi", "VSHLL", "VSHR(s|u)", "VSHRN")>;
1088
1089 // ASIMD shift by immed, complex
1090 def : InstRW<[A57Write_4cyc_1X], (instregex
1091   "VQRSHRN", "VQRSHRUN", "VQSHL(si|ui|su)", "VQSHRN", "VQSHRUN", "VRSHR(s|u)",
1092   "VRSHRN")>;
1093
1094 // ASIMD shift by immed and insert, basic, D-form
1095 def : InstRW<[A57Write_4cyc_1X], (instregex
1096   "VSLI(v8i8|v4i16|v2i32|v1i64)", "VSRI(v8i8|v4i16|v2i32|v1i64)")>;
1097
1098 // ASIMD shift by immed and insert, basic, Q-form
1099 def : InstRW<[A57Write_5cyc_1X], (instregex
1100   "VSLI(v16i8|v8i16|v4i32|v2i64)", "VSRI(v16i8|v8i16|v4i32|v2i64)")>;
1101
1102 // ASIMD shift by register, basic, D-form
1103 def : InstRW<[A57Write_3cyc_1X], (instregex
1104   "VSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>;
1105
1106 // ASIMD shift by register, basic, Q-form
1107 def : InstRW<[A57Write_4cyc_1X], (instregex
1108   "VSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>;
1109
1110 // ASIMD shift by register, complex, D-form
1111 // VQRSHL, VQSHL, VRSHL
1112 def : InstRW<[A57Write_4cyc_1X], (instregex
1113   "VQRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)", "VQSHL(s|u)(v8i8|v4i16|v2i32|v1i64)",
1114   "VRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>;
1115
1116 // ASIMD shift by register, complex, Q-form
1117 def : InstRW<[A57Write_5cyc_1X], (instregex
1118   "VQRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)", "VQSHL(s|u)(v16i8|v8i16|v4i32|v2i64)",
1119   "VRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>;
1120
1121 // --- 3.15 ASIMD Floating-Point Instructions ---
1122 // ASIMD FP absolute value
1123 def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(fd|fq|hd|hq)")>;
1124
1125 // ASIMD FP arith
1126 def : InstRW<[A57Write_5cyc_1V], (instregex "VABD(fd|fq|hd|hq)",
1127   "VADD(fd|fq|hd|hq)", "VPADD(f|h)", "VSUB(fd|fq|hd|hq)")>;
1128
1129 // ASIMD FP compare
1130 def : InstRW<[A57Write_5cyc_1V], (instregex "VAC(GE|GT|LE|LT)",
1131   "VC(EQ|GE|GT|LE)(fd|fq|hd|hq)")>;
1132
1133 // ASIMD FP convert, integer
1134 def : InstRW<[A57Write_5cyc_1V], (instregex
1135   "VCVT(f2sd|f2ud|s2fd|u2fd|f2sq|f2uq|s2fq|u2fq|f2xsd|f2xud|xs2fd|xu2fd)",
1136   "VCVT(f2xsq|f2xuq|xs2fq|xu2fq)",
1137   "VCVT(AN|MN|NN|PN)(SDf|SQf|UDf|UQf|SDh|SQh|UDh|UQh)")>;
1138
1139 // ASIMD FP convert, half-precision: 8cyc F0/F1
1140 def : InstRW<[A57Write_8cyc_1V], (instregex
1141   "VCVT(h2sd|h2ud|s2hd|u2hd|h2sq|h2uq|s2hq|u2hq|h2xsd|h2xud|xs2hd|xu2hd)",
1142   "VCVT(h2xsq|h2xuq|xs2hq|xu2hq)",
1143   "VCVT(f2h|h2f)")>;
1144
1145 // ASIMD FP max/min
1146 def : InstRW<[A57Write_5cyc_1V], (instregex
1147   "(VMAX|VMIN)(fd|fq|hd|hq)", "(VPMAX|VPMIN)(f|h)", "VMAXNM", "VMINNM")>;
1148
1149 // ASIMD FP multiply
1150 def A57WriteVMUL_VecFP  : SchedWriteRes<[A57UnitV]> { let Latency = 5;  }
1151 def : InstRW<[A57WriteVMUL_VecFP], (instregex "VMUL(sl)?(fd|fq|hd|hq)")>;
1152
1153 // ASIMD FP multiply accumulate: 9cyc F0/F1, 4cyc for accumulate sequence
1154 def A57WriteVMLA_VecFP  : SchedWriteRes<[A57UnitV]> { let Latency = 9;  }
1155 def A57ReadVMLA_VecFP  :
1156   SchedReadAdvance<5, [A57WriteVMLA_VecFP, A57WriteVMUL_VecFP]>;
1157 def : InstRW<[A57WriteVMLA_VecFP, A57ReadVMLA_VecFP],
1158   (instregex "(VMLA|VMLS)(sl)?(fd|fq|hd|hq)", "(VFMA|VFMS)(fd|fq|hd|hq)")>;
1159
1160 // ASIMD FP negate
1161 def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG(fd|f32q|hd|hq)")>;
1162
1163 // ASIMD FP round to integral
1164 def : InstRW<[A57Write_5cyc_1V], (instregex
1165   "VRINT(AN|MN|NN|PN|XN|ZN)(Df|Qf|Dh|Qh)")>;
1166
1167 // --- 3.16 ASIMD Miscellaneous Instructions ---
1168
1169 // ASIMD bitwise insert
1170 def : InstRW<[A57Write_3cyc_1V], (instregex "VBIF", "VBIT", "VBSL")>;
1171
1172 // ASIMD count
1173 def : InstRW<[A57Write_3cyc_1V], (instregex "VCLS", "VCLZ", "VCNT")>;
1174
1175 // ASIMD duplicate, core reg: 8cyc "L, F0/F1"
1176 def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VDUP(8|16|32)(d|q)")>;
1177
1178 // ASIMD duplicate, scalar: 3cyc "F0/F1"
1179 def : InstRW<[A57Write_3cyc_1V], (instregex "VDUPLN(8|16|32)(d|q)")>;
1180
1181 // ASIMD extract
1182 def : InstRW<[A57Write_3cyc_1V], (instregex "VEXT(d|q)(8|16|32|64)")>;
1183
1184 // ASIMD move, immed
1185 def : InstRW<[A57Write_3cyc_1V], (instregex
1186   "VMOV(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v1i64|v2i64|v2f32|v4f32)",
1187   "VMOVQ0")>;
1188
1189 // ASIMD move, narrowing
1190 def : InstRW<[A57Write_3cyc_1V], (instregex "VMOVN")>;
1191
1192 // ASIMD move, saturating
1193 def : InstRW<[A57Write_4cyc_1X], (instregex "VQMOVN")>;
1194
1195 // ASIMD reciprocal estimate
1196 def : InstRW<[A57Write_5cyc_1V], (instregex "VRECPE", "VRSQRTE")>;
1197
1198 // ASIMD reciprocal step, FZ
1199 def : InstRW<[A57Write_9cyc_1V], (instregex "VRECPS", "VRSQRTS")>;
1200
1201 // ASIMD reverse, swap, table lookup (1-2 reg)
1202 def : InstRW<[A57Write_3cyc_1V], (instregex "VREV", "VSWP", "VTB(L|X)(1|2)")>;
1203
1204 // ASIMD table lookup (3-4 reg)
1205 def : InstRW<[A57Write_6cyc_1V], (instregex "VTBL(3|4)", "VTBX(3|4)")>;
1206
1207 // ASIMD transfer, scalar to core reg: 6cyc "L, I0/I1"
1208 def : InstRW<[A57Write_6cyc_1L_1I], (instregex "VGETLN")>;
1209
1210 // ASIMD transfer, core reg to scalar: 8cyc "L, F0/F1"
1211 def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VSETLN")>;
1212
1213 // ASIMD transpose
1214 def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V], (instregex "VTRN")>;
1215
1216 // ASIMD unzip/zip, D-form
1217 def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V],
1218   (instregex "VUZPd", "VZIPd")>;
1219
1220 // ASIMD unzip/zip, Q-form
1221 def : InstRW<[A57Write_6cyc_1V, A57Write_6cyc_1V],
1222   (instregex "VUZPq", "VZIPq")>;
1223
1224 // --- 3.17 ASIMD Load Instructions ---
1225
1226 // Overridden via InstRW for this processor.
1227 def : WriteRes<WriteVLD1, []>;
1228 def : WriteRes<WriteVLD2, []>;
1229 def : WriteRes<WriteVLD3, []>;
1230 def : WriteRes<WriteVLD4, []>;
1231 def : WriteRes<WriteVST1, []>;
1232 def : WriteRes<WriteVST2, []>;
1233 def : WriteRes<WriteVST3, []>;
1234 def : WriteRes<WriteVST4, []>;
1235
1236 // 1-2 reg: 5cyc L, +I for writeback, 1 cyc wb latency
1237 def : InstRW<[A57Write_5cyc_1L], (instregex "VLD1(d|q)(8|16|32|64)$")>;
1238 def : InstRW<[A57Write_5cyc_1L_1I, A57WrBackOne],
1239   (instregex "VLD1(d|q)(8|16|32|64)wb")>;
1240
1241 // 3-4 reg: 6cyc L, +I for writeback, 1 cyc wb latency
1242 def : InstRW<[A57Write_6cyc_1L],
1243   (instregex "VLD1(d|q)(8|16|32|64)(T|Q)$", "VLD1d64(T|Q)Pseudo")>;
1244
1245 def : InstRW<[A57Write_6cyc_1L_1I, A57WrBackOne],
1246   (instregex "VLD1(d|q)(8|16|32|64)(T|Q)wb")>;
1247
1248 // ASIMD load, 1 element, one lane and all lanes: 8cyc "L, F0/F1"
1249 def : InstRW<[A57Write_8cyc_1L_1V], (instregex
1250   "VLD1(LN|DUP)(d|q)(8|16|32)$", "VLD1(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
1251 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], (instregex
1252   "VLD1(LN|DUP)(d|q)(8|16|32)(wb|_UPD)", "VLD1LNq(8|16|32)Pseudo_UPD")>;
1253
1254 // ASIMD load, 2 element, multiple, 2 reg (VLD2d/VLD2b): 8cyc "L, F0/F1"
1255 def : InstRW<[A57Write_8cyc_1L_1V],
1256       (instregex "VLD2(d|b)(8|16|32)$")>;
1257 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1258       (instregex "VLD2(d|b)(8|16|32)wb")>;
1259
1260 // ASIMD load, 2 element, multiple, 4 reg (VLD2q): 9cyc "L, F0/F1"
1261 def : InstRW<[A57Write_9cyc_1L_1V], (instregex "VLD2q(8|16|32)(Pseudo)?$")>;
1262 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1263       (instregex "VLD2q(8|16|32)wb", "VLD2q(8|16|32)PseudoWB")>;
1264
1265 // ASIMD load, 2 element, one lane and all lanes: 8cyc "L, F0/F1"
1266 def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
1267       (instregex "VLD2(DUP|LN)(d|q)(8|16|32|8x2|16x2|32x2)$",
1268                  "VLD2LN(d|q)(8|16|32)Pseudo$")>;
1269 // 2 results + wb result
1270 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V, A57WrBackOne],
1271       (instregex "VLD2LN(d|q)(8|16|32)_UPD$")>;
1272 // 1 result + wb result
1273 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1274       (instregex "VLD2DUPd(8|16|32|8x2|16x2|32x2)wb",
1275                  "VLD2LN(d|q)(8|16|32)Pseudo_UPD")>;
1276
1277 // ASIMD load, 3 element, multiple, 3 reg: 9cyc "L, F0/F1"
1278 // 3 results
1279 def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V],
1280       (instregex "VLD3(d|q)(8|16|32)$")>;
1281 // 1 result
1282 def : InstRW<[A57Write_9cyc_1L_1V],
1283       (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo$")>;
1284 // 3 results + wb
1285 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1286               A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1287       (instregex "VLD3(d|q)(8|16|32)_UPD$")>;
1288 // 1 result + wb
1289 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1290       (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
1291
1292 // ASIMD load, 3 element, one lane, size 32: 8cyc "L, F0/F1"
1293 def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
1294       (instregex "VLD3LN(d|q)32$",
1295                  "VLD3LN(d|q)32Pseudo$")>;
1296 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1297               A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1298       (instregex "VLD3LN(d|q)32_UPD")>;
1299 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1300       (instregex "VLD3LN(d|q)32Pseudo_UPD")>;
1301
1302 // ASIMD load, 3 element, one lane, size 8/16: 9cyc "L, F0/F1"
1303 def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V],
1304       (instregex "VLD3LN(d|q)(8|16)$",
1305                  "VLD3LN(d|q)(8|16)Pseudo$")>;
1306 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1307               A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1308       (instregex "VLD3LN(d|q)(8|16)_UPD")>;
1309 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1310       (instregex "VLD3LN(d|q)(8|16)Pseudo_UPD")>;
1311
1312 // ASIMD load, 3 element, all lanes: 8cyc "L, F0/F1"
1313 def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
1314       (instregex "VLD3DUP(d|q)(8|16|32)$",
1315                  "VLD3DUP(d|q)(8|16|32)Pseudo$")>;
1316 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1317               A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1318       (instregex "VLD3DUP(d|q)(8|16|32)_UPD")>;
1319 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1320       (instregex "VLD3DUP(d|q)(8|16|32)Pseudo_UPD")>;
1321
1322 // ASIMD load, 4 element, multiple, 4 reg: 9cyc "L, F0/F1"
1323 def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V,
1324               A57Write_9cyc_1L_1V],
1325       (instregex "VLD4(d|q)(8|16|32)$")>;
1326 def : InstRW<[A57Write_9cyc_1L_1V],
1327       (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo$")>;
1328 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1329               A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1330       (instregex "VLD4(d|q)(8|16|32)_UPD")>;
1331 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1332       (instregex  "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
1333
1334 // ASIMD load, 4 element, one lane, size 32: 8cyc "L, F0/F1"
1335 def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V,
1336               A57Write_8cyc_1L_1V],
1337       (instregex "VLD4LN(d|q)32$",
1338                  "VLD4LN(d|q)32Pseudo$")>;
1339 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1340               A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1341               A57WrBackOne],
1342       (instregex "VLD4LN(d|q)32_UPD")>;
1343 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1344       (instregex "VLD4LN(d|q)32Pseudo_UPD")>;
1345
1346 // ASIMD load, 4 element, one lane, size 8/16: 9cyc "L, F0/F1"
1347 def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V,
1348               A57Write_9cyc_1L_1V],
1349       (instregex "VLD4LN(d|q)(8|16)$",
1350                  "VLD4LN(d|q)(8|16)Pseudo$")>;
1351 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1352               A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1353               A57WrBackOne],
1354       (instregex "VLD4LN(d|q)(8|16)_UPD")>;
1355 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1356       (instregex "VLD4LN(d|q)(8|16)Pseudo_UPD")>;
1357
1358 // ASIMD load, 4 element, all lanes: 8cyc "L, F0/F1"
1359 def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V,
1360               A57Write_8cyc_1L_1V],
1361       (instregex "VLD4DUP(d|q)(8|16|32)$",
1362                  "VLD4DUP(d|q)(8|16|32)Pseudo$")>;
1363 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1364               A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1365               A57WrBackOne],
1366       (instregex "VLD4DUP(d|q)(8|16|32)_UPD")>;
1367 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1368       (instregex "VLD4DUP(d|q)(8|16|32)Pseudo_UPD")>;
1369
1370 // --- 3.18 ASIMD Store Instructions ---
// NOTE(review): every writeback form in this section ("wb", "WB", "_UPD")
// lists A57WrBackOne first and then the *_1I variant of the store write,
// whereas the VLD4 _UPD load entries above list A57WrBackOne last.
// Presumably this follows the operand order of the instructions -- confirm.
1371
1372 // ASIMD store, 1 element, multiple, 1 reg: 1cyc S
1373 def : InstRW<[A57Write_1cyc_1S], (instregex "VST1d(8|16|32|64)$")>;
1374 def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I],
1375       (instregex "VST1d(8|16|32|64)wb")>;
1376 // ASIMD store, 1 element, multiple, 2 reg: 2cyc S
1377 def : InstRW<[A57Write_2cyc_1S], (instregex "VST1q(8|16|32|64)$")>;
1378 def : InstRW<[A57WrBackOne, A57Write_2cyc_1S_1I],
1379       (instregex "VST1q(8|16|32|64)wb")>;
1380 // ASIMD store, 1 element, multiple, 3 reg: 3cyc S
1381 def : InstRW<[A57Write_3cyc_1S],
1382       (instregex "VST1d(8|16|32|64)T$", "VST1d64TPseudo$")>;
1383 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1I],
1384       (instregex "VST1d(8|16|32|64)Twb", "VST1d64TPseudoWB")>;
1385 // ASIMD store, 1 element, multiple, 4 reg: 4cyc S
1386 def : InstRW<[A57Write_4cyc_1S],
1387       (instregex "VST1d(8|16|32|64)(Q|QPseudo)$")>;
1388 def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1I],
1389       (instregex "VST1d(8|16|32|64)(Qwb|QPseudoWB)")>;
1390 // ASIMD store, 1 element, one lane: 3cyc "F0/F1, S"
1391 def : InstRW<[A57Write_3cyc_1S_1V],
1392       (instregex "VST1LNd(8|16|32)$", "VST1LNq(8|16|32)Pseudo$")>;
1393 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1394       (instregex "VST1LNd(8|16|32)_UPD", "VST1LNq(8|16|32)Pseudo_UPD")>;
1395 // ASIMD store, 2 element, multiple, 2 reg: 3cyc "F0/F1, S"
1396 def : InstRW<[A57Write_3cyc_1S_1V],
1397       (instregex "VST2(d|b)(8|16|32)$")>;
1398 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1399       (instregex "VST2(b|d)(8|16|32)wb")>;
1400 // ASIMD store, 2 element, multiple, 4 reg: 4cyc "F0/F1, S"
1401 def : InstRW<[A57Write_4cyc_1S_1V],
1402       (instregex "VST2q(8|16|32)$", "VST2q(8|16|32)Pseudo$")>;
1403 def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I],
1404       (instregex "VST2q(8|16|32)wb", "VST2q(8|16|32)PseudoWB")>;
1405 // ASIMD store, 2 element, one lane: 3cyc "F0/F1, S"
1406 def : InstRW<[A57Write_3cyc_1S_1V],
1407       (instregex "VST2LN(d|q)(8|16|32)$", "VST2LN(d|q)(8|16|32)Pseudo$")>;
1408 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1409       (instregex "VST2LN(d|q)(8|16|32)_UPD",
1410                  "VST2LN(d|q)(8|16|32)Pseudo_UPD")>;
1411 // ASIMD store, 3 element, multiple, 3 reg (modeled as 3cyc "F0/F1, S")
1412 def : InstRW<[A57Write_3cyc_1S_1V],
1413       (instregex "VST3(d|q)(8|16|32)$", "VST3(d|q)(8|16|32)(oddP|P)seudo$")>;
1414 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1415       (instregex "VST3(d|q)(8|16|32)_UPD",
1416                  "VST3(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
1417 // ASIMD store, 3 element, one lane (modeled as 3cyc "F0/F1, S")
1418 def : InstRW<[A57Write_3cyc_1S_1V],
1419       (instregex "VST3LN(d|q)(8|16|32)$", "VST3LN(d|q)(8|16|32)Pseudo$")>;
1420 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1421       (instregex "VST3LN(d|q)(8|16|32)_UPD",
1422                  "VST3LN(d|q)(8|16|32)Pseudo_UPD")>;
1423 // ASIMD store, 4 element, multiple, 4 reg (modeled as 4cyc "F0/F1, S")
1424 def : InstRW<[A57Write_4cyc_1S_1V],
1425       (instregex "VST4(d|q)(8|16|32)$", "VST4(d|q)(8|16|32)(oddP|P)seudo$")>;
1426 def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I],
1427       (instregex "VST4(d|q)(8|16|32)_UPD",
1428                  "VST4(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
1429 // ASIMD store, 4 element, one lane (modeled as 3cyc "F0/F1, S")
1430 def : InstRW<[A57Write_3cyc_1S_1V],
1431       (instregex "VST4LN(d|q)(8|16|32)$", "VST4LN(d|q)(8|16|32)Pseudo$")>;
1432 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1433       (instregex "VST4LN(d|q)(8|16|32)_UPD",
1434                  "VST4LN(d|q)(8|16|32)Pseudo_UPD")>;
1435
1436 // --- 3.19 Cryptography Extensions ---
// NOTE(review): in this section every entry commented "F0" uses a *_1W or
// *_2W write, and the one entry commented "F0/F1" uses a *_2V write.
// The SHA1 and SHA256 regexes are mutually exclusive: SHA1SU0 and SHA256SU0
// are matched only by their own dedicated entries.
1437 // Crypto AES ops
1438 // AESD, AESE, AESIMC, AESMC: 3cyc F0
1439 def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>;
1440 // Crypto polynomial (64x64) multiply long (VMULL.P64): 3cyc F0
1441 def : InstRW<[A57Write_3cyc_1W], (instregex "^VMULLp64")>;
1442 // Crypto SHA1 xor ops: 6cyc F0/F1
1443 def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>;
1444 // Crypto SHA1 fast ops: 3cyc F0
1445 def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>;
1446 // Crypto SHA1 slow ops: 6cyc F0
1447 def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>;
1448 // Crypto SHA256 fast ops: 3cyc F0
1449 def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA256SU0")>;
1450 // Crypto SHA256 slow ops: 6cyc F0
1451 def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA256(H|H2|SU1)")>;
1452
1453 // --- 3.20 CRC ---
// All opcodes whose names start with CRC32, with or without a "t2" prefix,
// use the same write as the 3cyc crypto entries (A57Write_3cyc_1W).
1454 def : InstRW<[A57Write_3cyc_1W], (instregex "^(t2)?CRC32")>;
1455
1456 // -----------------------------------------------------------------------------
1457 // Common definitions
// WriteNoop: empty resource list, zero latency, zero micro-ops.
1458 def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
// Alias the generic ALU write to the 1-cycle, 1I A57 write.
1459 def : SchedAlias<WriteALU, A57Write_1cyc_1I>;
1460
// Branch writes: WriteBr uses the 1B write; WriteBrL and WriteBrTbl both use
// the 1B_1I write. WritePreLd shares A57Write_4cyc_1L with WriteLd below.
1461 def : SchedAlias<WriteBr, A57Write_1cyc_1B>;
1462 def : SchedAlias<WriteBrL, A57Write_1cyc_1B_1I>;
1463 def : SchedAlias<WriteBrTbl, A57Write_1cyc_1B_1I>;
1464 def : SchedAlias<WritePreLd, A57Write_4cyc_1L>;
1465
// Generic load and store writes; ReadALU is given a read advance of 0.
1466 def : SchedAlias<WriteLd, A57Write_4cyc_1L>;
1467 def : SchedAlias<WriteST, A57Write_1cyc_1S>;
1468 def : ReadAdvance<ReadALU, 0>;
1469
1470 } // SchedModel = CortexA57Model
1471