contrib/llvm/lib/Target/ARM/ARMScheduleA57.td

   1 //=- ARMScheduleA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the machine model for ARM Cortex-A57 to support
  11 // instruction scheduling and other instruction cost heuristics.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 //===----------------------------------------------------------------------===//
  16 // *** Common description and scheduling model parameters taken from AArch64 ***
  17 // The Cortex-A57 is a traditional superscalar microprocessor with a
  18 // conservative 3-wide in-order stage for decode and dispatch. Combined with the
  19 // much wider out-of-order issue stage, this produced a need to carefully
  20 // schedule micro-ops so that all three decoded each cycle are successfully
  21 // issued as the reservation station(s) simply don't stay occupied for long.
  22 // Therefore, IssueWidth is set to the narrower of the two at three, while still
  23 // modeling the machine as out-of-order.
  24
  25 def IsCPSRDefinedPred : SchedPredicate<[{TII->isCPSRDefined(*MI)}]>;
  26 def IsCPSRDefinedAndPredicatedPred :
  27   SchedPredicate<[{TII->isCPSRDefined(*MI) && TII->isPredicated(*MI)}]>;
  28
  29 // Cortex A57 rev. r1p0 or later (false = r0px)
  30 def IsR1P0AndLaterPred : SchedPredicate<[{false}]>;
  31
  32 // If Addrmode3 contains register offset (not immediate)
  33 def IsLdrAm3RegOffPred :
  34   SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 1)}]>;
  35 // The same predicate with operand offset 2 and 3:
  36 def IsLdrAm3RegOffPredX2 :
  37   SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 2)}]>;
  38 def IsLdrAm3RegOffPredX3 :
  39   SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 3)}]>;
  40
  41 // If Addrmode3 contains "minus register"
  42 def IsLdrAm3NegRegOffPred :
  43   SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 1)}]>;
  44 // The same predicate with operand offset 2 and 3:
  45 def IsLdrAm3NegRegOffPredX2 :
  46   SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 2)}]>;
  47 def IsLdrAm3NegRegOffPredX3 :
  48   SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 3)}]>;
  49
  50 // Load, scaled register offset, not plus LSL2
  51 def IsLdstsoScaledNotOptimalPredX0 :
  52   SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 0)}]>;
  53 def IsLdstsoScaledNotOptimalPred :
  54   SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 1)}]>;
  55 def IsLdstsoScaledNotOptimalPredX2 :
  56   SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 2)}]>;
  57
  58 // Load, scaled register offset
  59 def IsLdstsoScaledPred :
  60   SchedPredicate<[{TII->isLdstScaledReg(*MI, 1)}]>;
  61 def IsLdstsoScaledPredX2 :
  62   SchedPredicate<[{TII->isLdstScaledReg(*MI, 2)}]>;
  63
  64 def IsLdstsoMinusRegPredX0 :
  65   SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 0)}]>;
  66 def IsLdstsoMinusRegPred :
  67   SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 1)}]>;
  68 def IsLdstsoMinusRegPredX2 :
  69   SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 2)}]>;
  70
  71 // Load, scaled register offset
  72 def IsLdrAm2ScaledPred :
  73   SchedPredicate<[{TII->isAm2ScaledReg(*MI, 1)}]>;
  74
  75 // LDM, base reg in list
  76 def IsLdmBaseRegInList :
  77   SchedPredicate<[{TII->isLDMBaseRegInList(*MI)}]>;
  78
  79 class A57WriteLMOpsListType<list<SchedWriteRes> writes> {
  80   list <SchedWriteRes> Writes = writes;
  81   SchedMachineModel SchedModel = ?;
  82 }
  83
  84 // *** Common description and scheduling model parameters taken from AArch64 ***
  85 // (AArch64SchedA57.td)
  86 def CortexA57Model : SchedMachineModel {
  87   let IssueWidth        =   3; // 3-way decode and dispatch
  88   let MicroOpBufferSize = 128; // 128 micro-op re-order buffer
  89   let LoadLatency       =   4; // Optimistic load latency
  90   let MispredictPenalty =  16; // Fetch + Decode/Rename/Dispatch + Branch
  91
  92   // Enable partial & runtime unrolling.
  93   let LoopMicroOpBufferSize = 16;
  94   let CompleteModel = 1;
  95
  96   // FIXME: Remove when all errors have been fixed.
  97   let FullInstRWOverlapCheck = 0;
  98 }
  99
 100 //===----------------------------------------------------------------------===//
 101 // Define each kind of processor resource and number available on Cortex-A57.
 102 // Cortex A-57 has 8 pipelines that each has its own 8-entry queue where
 103 // micro-ops wait for their operands and then issue out-of-order.
 104
 105 def A57UnitB : ProcResource<1>;  // Type B micro-ops
 106 def A57UnitI : ProcResource<2>;  // Type I micro-ops
 107 def A57UnitM : ProcResource<1>;  // Type M micro-ops
 108 def A57UnitL : ProcResource<1>;  // Type L micro-ops
 109 def A57UnitS : ProcResource<1>;  // Type S micro-ops
 110
 111 def A57UnitX : ProcResource<1>;  // Type X micro-ops (F1)
 112 def A57UnitW : ProcResource<1>;  // Type W micro-ops (F0)
 113
 114 let SchedModel = CortexA57Model in {
 115   def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>;    // Type V micro-ops
 116 }
 117
 118 let SchedModel = CortexA57Model in {
 119
 120 //===----------------------------------------------------------------------===//
 121 // Define customized scheduler read/write types specific to the Cortex-A57.
 122
 123 include "ARMScheduleA57WriteRes.td"
 124
 125 // To have "CompleteModel = 1", support of pseudos and special instructions
 126 def : InstRW<[WriteNoop], (instregex "(t)?BKPT$", "(t2)?CDP(2)?$",
 127   "(t2)?CLREX$", "CONSTPOOL_ENTRY$", "COPY_STRUCT_BYVAL_I32$",
 128   "(t2)?CPS[123]p$", "(t2)?DBG$", "(t2)?DMB$", "(t2)?DSB$", "ERET$",
 129   "(t2|t)?HINT$", "(t)?HLT$", "(t2)?HVC$", "(t2)?ISB$", "ITasm$",
 130   "(t2)?RFE(DA|DB|IA|IB)", "(t)?SETEND", "(t2)?SETPAN", "(t2)?SMC", "SPACE",
 131   "(t2)?SRS(DA|DB|IA|IB)", "SWP(B)?", "t?TRAP", "(t2|t)?UDF$", "t2DCPS", "t2SG",
 132   "t2TT", "tCPS", "CMP_SWAP", "t?SVC", "t2IT", "CompilerBarrier",
 133   "t__brkdiv0")>;
 134
 135 def : InstRW<[WriteNoop], (instregex "VMRS", "VMSR", "FMSTAT")>;
 136
 137 // Specific memory instrs
 138 def : InstRW<[WriteNoop, WriteNoop], (instregex "(t2)?LDA", "(t2)?LDC", "(t2)?STC",
 139   "(t2)?STL", "(t2)?LDREX", "(t2)?STREX", "MEMCPY")>;
 140
 141 // coprocessor moves
 142 def : InstRW<[WriteNoop, WriteNoop], (instregex
 143   "(t2)?MCR(2|R|R2)?$", "(t2)?MRC(2)?$",
 144   "(t2)?MRRC(2)?$", "(t2)?MRS(banked|sys|_AR|_M|sys_AR)?$",
 145   "(t2)?MSR(banked|i|_AR|_M)?$")>;
 146
 147 // Deprecated instructions
 148 def : InstRW<[WriteNoop], (instregex "FLDM", "FSTM")>;
 149
 150 // Pseudos
 151 def : InstRW<[WriteNoop], (instregex "(t2)?ABS$",
 152   "(t)?ADJCALLSTACKDOWN$", "(t)?ADJCALLSTACKUP$", "(t2|t)?Int_eh_sjlj",
 153   "tLDRpci_pic", "(t2)?SUBS_PC_LR",
 154   "JUMPTABLE", "tInt_WIN_eh_sjlj_longjmp",
 155   "VLD(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
 156   "VLD(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
 157   "VST(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
 158   "VST(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
 159   "WIN__CHKSTK", "WIN__DBZCHK")>;
 160
 161 // Miscellaneous
 162 // -----------------------------------------------------------------------------
 163
 164 def : InstRW<[A57Write_1cyc_1I], (instrs COPY)>;
 165
 166 // --- 3.2 Branch Instructions ---
 167 // B, BX, BL, BLX (imm, reg != LR, reg == LR), CBZ, CBNZ
 168
 169 def : InstRW<[A57Write_1cyc_1B], (instregex "(t2|t)?B$", "t?BX", "(t2|t)?Bcc$",
 170   "t?TAILJMP(d|r)", "TCRETURN(d|r)i", "tBfar", "tCBN?Z")>;
 171 def : InstRW<[A57Write_1cyc_1B_1I],
 172   (instregex "t?BL$", "BL_pred$", "t?BLXi", "t?TPsoft")>;
 173 def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BLX", "tBLX(NS)?r")>;
 174 // Pseudos
 175 def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BCCi64", "BCCZi64")>;
 176 def : InstRW<[A57Write_3cyc_1B_1I], (instregex "BR_JTadd", "t?BR_JTr",
 177   "t2BR_JT", "t2BXJ", "(t2)?TB(B|H)(_JT)?$", "tBRIND")>;
 178 def : InstRW<[A57Write_6cyc_1B_1L], (instregex "BR_JTm")>;
 179
 180 // --- 3.3 Arithmetic and Logical Instructions ---
 181 // ADD{S}, ADC{S}, ADR, AND{S}, BIC{S}, CMN, CMP, EOR{S}, ORN{S}, ORR{S},
 182 // RSB{S}, RSC{S}, SUB{S}, SBC{S}, TEQ, TST
 183
 184 def : InstRW<[A57Write_1cyc_1I], (instregex "tADDframe")>;
 185
 186 // shift by register, conditional or unconditional
 187 // TODO: according to the doc, conditional uses I0/I1, unconditional uses M
 188 // Why more complex instruction uses more simple pipeline?
 189 // May be an error in doc.
 190 def A57WriteALUsi : SchedWriteVariant<[
 191   // lsl #2, lsl #1, or lsr #1.
 192   SchedVar<IsPredicatedPred, [A57Write_2cyc_1M]>,
 193   SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>
 194 ]>;
 195 def A57WriteALUsr : SchedWriteVariant<[
 196   SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
 197   SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>
 198 ]>;
 199 def A57WriteALUSsr : SchedWriteVariant<[
 200   SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
 201   SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>
 202 ]>;
 203 def A57ReadALUsr : SchedReadVariant<[
 204   SchedVar<IsPredicatedPred, [ReadDefault]>,
 205   SchedVar<NoSchedPred,      [ReadDefault]>
 206 ]>;
 207 def : SchedAlias<WriteALUsi,  A57WriteALUsi>;
 208 def : SchedAlias<WriteALUsr,  A57WriteALUsr>;
 209 def : SchedAlias<WriteALUSsr, A57WriteALUSsr>;
 210 def : SchedAlias<ReadALUsr,   A57ReadALUsr>;
 211
 212 def A57WriteCMPsr : SchedWriteVariant<[
 213   SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
 214   SchedVar<NoSchedPred,      [A57Write_2cyc_1M]>
 215 ]>;
 216 def : SchedAlias<WriteCMP,   A57Write_1cyc_1I>;
 217 def : SchedAlias<WriteCMPsi, A57Write_2cyc_1M>;
 218 def : SchedAlias<WriteCMPsr, A57WriteCMPsr>;
 219
 220 // --- 3.4 Move and Shift Instructions ---
 221 // Move, basic
 222 // MOV{S}, MOVW, MVN{S}
 223 def : InstRW<[A57Write_1cyc_1I], (instregex "MOV(r|i|i16|r_TC)",
 224   "(t2)?MVN(CC)?(r|i)", "BMOVPCB_CALL", "BMOVPCRX_CALL",
 225   "MOVCC(r|i|i16|i32imm)", "tMOV", "tMVN")>;
 226
 227 // Move, shift by immed, setflags/no setflags
 228 // (ASR, LSL, LSR, ROR, RRX)=MOVsi, MVN
 229 // setflags = isCPSRDefined
 230 def A57WriteMOVsi : SchedWriteVariant<[
 231   SchedVar<IsCPSRDefinedPred,              [A57Write_2cyc_1M]>,
 232   SchedVar<NoSchedPred,                    [A57Write_1cyc_1I]>
 233 ]>;
 234 def : InstRW<[A57WriteMOVsi], (instregex "MOV(CC)?si", "MVNsi",
 235   "ASRi", "(t2|t)ASRri", "LSRi", "(t2|t)LSRri", "LSLi", "(t2|t)LSLri", "RORi",
 236   "(t2|t)RORri", "(t2)?RRX", "t2MOV", "tROR")>;
 237
 238 // shift by register, conditional or unconditional, setflags/no setflags
 239 def A57WriteMOVsr : SchedWriteVariant<[
 240   SchedVar<IsCPSRDefinedAndPredicatedPred, [A57Write_2cyc_1I]>,
 241   SchedVar<IsCPSRDefinedPred,              [A57Write_2cyc_1M]>,
 242   SchedVar<IsPredicatedPred,               [A57Write_2cyc_1I]>,
 243   SchedVar<NoSchedPred,                    [A57Write_1cyc_1I]>
 244 ]>;
 245 def : InstRW<[A57WriteMOVsr], (instregex "MOV(CC)?sr", "MVNsr", "t2MVNs",
 246   "ASRr", "(t2|t)ASRrr", "LSRr", "(t2|t)LSRrr", "LSLr", "(t2|t)?LSLrr", "RORr",
 247   "(t2|t)RORrr")>;
 248
 249 // Move, top
 250 // MOVT - A57Write_2cyc_1M for r0px, A57Write_1cyc_1I for r1p0 and later
 251 def A57WriteMOVT : SchedWriteVariant<[
 252   SchedVar<IsR1P0AndLaterPred,             [A57Write_1cyc_1I]>,
 253   SchedVar<NoSchedPred,                    [A57Write_2cyc_1M]>
 254 ]>;
 255 def : InstRW<[A57WriteMOVT], (instregex "MOVTi16")>;
 256
 257 def A57WriteI2pc :
 258   WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_1cyc_1I]>;
 259 def A57WriteI2ld :
 260   WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_4cyc_1L]>;
 261 def : InstRW< [A57WriteI2pc], (instregex "MOV_ga_pcrel")>;
 262 def : InstRW< [A57WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>;
 263
 264 // +2cyc for branch forms
 265 def : InstRW<[A57Write_3cyc_1I], (instregex "MOVPC(LR|RX)")>;
 266
 267 // --- 3.5 Divide and Multiply Instructions ---
 268 // Divide: SDIV, UDIV
 269 // latency from documentration: 4 ‐ 20, maximum taken
 270 def : SchedAlias<WriteDIV, A57Write_20cyc_1M>;
 271 // Multiply: tMul not bound to common WriteRes types
 272 def : InstRW<[A57Write_3cyc_1M], (instregex "tMUL")>;
 273 def : SchedAlias<WriteMUL16, A57Write_3cyc_1M>;
 274 def : SchedAlias<WriteMUL32, A57Write_3cyc_1M>;
 275 def : ReadAdvance<ReadMUL, 0>;
 276
 277 // Multiply accumulate: MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB,
 278 // SMLAWT, SMLAD{X}, SMLSD{X}, SMMLA{R}, SMMLS{R}
 279 // Multiply-accumulate pipelines support late-forwarding of accumulate operands
 280 // from similar μops, allowing a typical sequence of multiply-accumulate μops
 281 // to issue one every 1 cycle (sched advance = 2).
 282 def A57WriteMLA : SchedWriteRes<[A57UnitM]> { let Latency = 3; }
 283 def A57WriteMLAL : SchedWriteRes<[A57UnitM]> { let Latency = 4; }
 284 def A57ReadMLA  : SchedReadAdvance<2, [A57WriteMLA, A57WriteMLAL]>;
 285
 286 def : InstRW<[A57WriteMLA],
 287   (instregex "t2SMLAD", "t2SMLADX", "t2SMLSD", "t2SMLSDX")>;
 288
 289 def : SchedAlias<WriteMAC16, A57WriteMLA>;
 290 def : SchedAlias<WriteMAC32, A57WriteMLA>;
 291 def : SchedAlias<ReadMAC,    A57ReadMLA>;
 292
 293 def : SchedAlias<WriteMAC64Lo, A57WriteMLAL>;
 294 def : SchedAlias<WriteMAC64Hi, A57WriteMLAL>;
 295
 296 // Multiply long: SMULL, UMULL
 297 def : SchedAlias<WriteMUL64Lo, A57Write_4cyc_1M>;
 298 def : SchedAlias<WriteMUL64Hi, A57Write_4cyc_1M>;
 299
 300 // --- 3.6 Saturating and Parallel Arithmetic Instructions ---
 301 // Parallel     arith
 302 // SADD16, SADD8, SSUB16, SSUB8, UADD16, UADD8, USUB16, USUB8
 303 // Conditional GE-setting instructions require three extra μops
 304 // and two additional cycles to conditionally update the GE field.
 305 def A57WriteParArith : SchedWriteVariant<[
 306   SchedVar<IsPredicatedPred, [A57Write_4cyc_1I_1M]>,
 307   SchedVar<NoSchedPred,      [A57Write_2cyc_1I_1M]>
 308 ]>;
 309 def : InstRW< [A57WriteParArith], (instregex
 310   "(t2)?SADD(16|8)", "(t2)?SSUB(16|8)",
 311   "(t2)?UADD(16|8)", "(t2)?USUB(16|8)")>;
 312
 313 // Parallel     arith with exchange: SASX, SSAX, UASX, USAX
 314 def A57WriteParArithExch : SchedWriteVariant<[
 315   SchedVar<IsPredicatedPred, [A57Write_5cyc_1I_1M]>,
 316   SchedVar<NoSchedPred,      [A57Write_3cyc_1I_1M]>
 317 ]>;
 318 def : InstRW<[A57WriteParArithExch],
 319   (instregex "(t2)?SASX", "(t2)?SSAX", "(t2)?UASX", "(t2)?USAX")>;
 320
 321 // Parallel     halving arith
 322 // SHADD16, SHADD8, SHSUB16, SHSUB8, UHADD16, UHADD8, UHSUB16,  UHSUB8
 323 def : InstRW<[A57Write_2cyc_1M], (instregex
 324   "(t2)?SHADD(16|8)", "(t2)?SHSUB(16|8)",
 325   "(t2)?UHADD(16|8)", "(t2)?UHSUB(16|8)")>;
 326
 327 // Parallel halving arith with exchange
 328 // SHASX, SHSAX, UHASX, UHSAX
 329 def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?SHASX", "(t2)?SHSAX",
 330   "(t2)?UHASX", "(t2)?UHSAX")>;
 331
 332 // Parallel     saturating arith
 333 // QADD16, QADD8, QSUB16, QSUB8, UQADD16, UQADD8, UQSUB16, UQSUB8
 334 def : InstRW<[A57Write_2cyc_1M], (instregex "QADD(16|8)", "QSUB(16|8)",
 335   "UQADD(16|8)", "UQSUB(16|8)", "t2(U?)QADD", "t2(U?)QSUB")>;
 336
 337 // Parallel     saturating arith with exchange
 338 // QASX, QSAX, UQASX, UQSAX
 339 def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QASX", "(t2)?QSAX",
 340   "(t2)?UQASX", "(t2)?UQSAX")>;
 341
 342 // Saturate: SSAT, SSAT16, USAT, USAT16
 343 def : InstRW<[A57Write_2cyc_1M],
 344   (instregex "(t2)?SSAT(16)?", "(t2)?USAT(16)?")>;
 345
 346 // Saturating arith: QADD, QSUB
 347 def : InstRW<[A57Write_2cyc_1M], (instregex "QADD$", "QSUB$")>;
 348
 349 // Saturating doubling arith: QDADD, QDSUB
 350 def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QDADD", "(t2)?QDSUB")>;
 351
 352 // --- 3.7 Miscellaneous Data-Processing Instructions ---
 353 // Bit field extract: SBFX, UBFX
 354 def : InstRW<[A57Write_1cyc_1I], (instregex "(t2)?SBFX", "(t2)?UBFX")>;
 355
 356 // Bit field insert/clear: BFI, BFC
 357 def : InstRW<[A57Write_2cyc_1M], (instregex "(t2)?BFI", "(t2)?BFC")>;
 358
 359 // Select bytes, conditional/unconditional
 360 def A57WriteSEL : SchedWriteVariant<[
 361   SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
 362   SchedVar<NoSchedPred,      [A57Write_1cyc_1I]>
 363 ]>;
 364 def : InstRW<[A57WriteSEL], (instregex "(t2)?SEL")>;
 365
 366 // Sign/zero extend, normal: SXTB, SXTH, UXTB, UXTH
 367 def : InstRW<[A57Write_1cyc_1I],
 368   (instregex "(t2|t)?SXT(B|H)$", "(t2|t)?UXT(B|H)$")>;
 369
 370 // Sign/zero extend and add, normal: SXTAB, SXTAH, UXTAB, UXTAH
 371 def : InstRW<[A57Write_2cyc_1M],
 372   (instregex "(t2)?SXTA(B|H)$", "(t2)?UXTA(B|H)$")>;
 373
 374 // Sign/zero extend and add, parallel: SXTAB16, UXTAB16
 375 def : InstRW<[A57Write_4cyc_1M], (instregex "(t2)?SXTAB16", "(t2)?UXTAB16")>;
 376
 377 // Sum of absolute differences: USAD8, USADA8
 378 def : InstRW<[A57Write_3cyc_1M], (instregex "(t2)?USAD8", "(t2)?USADA8")>;
 379
 380 // --- 3.8 Load Instructions ---
 381
 382 // Load, immed offset
 383 // LDR and LDRB have LDRi12 and LDRBi12 forms for immediate
 384 def : InstRW<[A57Write_4cyc_1L], (instregex "LDRi12", "LDRBi12",
 385   "LDRcp", "(t2|t)?LDRConstPool", "LDRLIT_ga_(pcrel|abs)",
 386   "PICLDR", "tLDR")>;
 387
 388 def : InstRW<[A57Write_4cyc_1L],
 389   (instregex "t2LDRS?(B|H)?(pcrel|T|i8|i12|pci|pci_pic|s)?$")>;
 390
 391 // For "Load, register offset, minus" we need +1cyc, +1I
 392 def A57WriteLdrAm3 : SchedWriteVariant<[
 393   SchedVar<IsLdrAm3NegRegOffPred, [A57Write_5cyc_1I_1L]>,
 394   SchedVar<NoSchedPred,           [A57Write_4cyc_1L]>
 395 ]>;
 396 def : InstRW<[A57WriteLdrAm3], (instregex "LDR(H|SH|SB)$")>;
 397 def A57WriteLdrAm3X2 : SchedWriteVariant<[
 398   SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_5cyc_1I_1L]>,
 399   SchedVar<NoSchedPred,             [A57Write_4cyc_1L]>
 400 ]>;
 401 def : InstRW<[A57WriteLdrAm3X2, A57WriteLdrAm3X2], (instregex "LDRD$")>;
 402 def : InstRW<[A57Write_4cyc_1L, A57Write_4cyc_1L], (instregex "t2LDRDi8")>;
 403
 404 def A57WriteLdrAmLDSTSO : SchedWriteVariant<[
 405   SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_5cyc_1I_1L]>,
 406   SchedVar<IsLdstsoMinusRegPred,         [A57Write_5cyc_1I_1L]>,
 407   SchedVar<NoSchedPred,                  [A57Write_4cyc_1L]>
 408 ]>;
 409 def : InstRW<[A57WriteLdrAmLDSTSO], (instregex "LDRrs", "LDRBrs")>;
 410
 411 def A57WrBackOne : SchedWriteRes<[]> {
 412   let Latency = 1;
 413   let NumMicroOps = 0;
 414 }
 415 def A57WrBackTwo : SchedWriteRes<[]> {
 416   let Latency = 2;
 417   let NumMicroOps = 0;
 418 }
 419 def A57WrBackThree : SchedWriteRes<[]> {
 420   let Latency = 3;
 421   let NumMicroOps = 0;
 422 }
 423
 424 // --- LDR pre-indexed ---
 425 // Load, immed pre-indexed (4 cyc for load result, 1 cyc for Base update)
 426 def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR_PRE_IMM",
 427   "LDRB_PRE_IMM", "t2LDRB_PRE")>;
 428
 429 // Load, register pre-indexed (4 cyc for load result, 2 cyc for Base update)
 430 // (5 cyc load result for not-lsl2 scaled)
 431 def A57WriteLdrAmLDSTSOPre : SchedWriteVariant<[
 432   SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_5cyc_1I_1L]>,
 433   SchedVar<NoSchedPred,                    [A57Write_4cyc_1L_1I]>
 434 ]>;
 435 def : InstRW<[A57WriteLdrAmLDSTSOPre, A57WrBackTwo],
 436   (instregex "LDR_PRE_REG", "LDRB_PRE_REG")>;
 437
 438 def A57WriteLdrAm3PreWrBack : SchedWriteVariant<[
 439   SchedVar<IsLdrAm3RegOffPredX2, [A57WrBackTwo]>,
 440   SchedVar<NoSchedPred,          [A57WrBackOne]>
 441 ]>;
 442 def : InstRW<[A57Write_4cyc_1L, A57WriteLdrAm3PreWrBack],
 443   (instregex "LDR(H|SH|SB)_PRE")>;
 444 def : InstRW<[A57Write_4cyc_1L, A57WrBackOne],
 445   (instregex "t2LDR(H|SH|SB)?_PRE")>;
 446
 447 // LDRD pre-indexed: 5(2) cyc for reg, 4(1) cyc for imm.
 448 def A57WriteLdrDAm3Pre : SchedWriteVariant<[
 449   SchedVar<IsLdrAm3RegOffPredX3, [A57Write_5cyc_1I_1L]>,
 450   SchedVar<NoSchedPred,          [A57Write_4cyc_1L_1I]>
 451 ]>;
 452 def A57WriteLdrDAm3PreWrBack : SchedWriteVariant<[
 453   SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>,
 454   SchedVar<NoSchedPred,          [A57WrBackOne]>
 455 ]>;
 456 def : InstRW<[A57WriteLdrDAm3Pre, A57WriteLdrDAm3Pre, A57WriteLdrDAm3PreWrBack],
 457   (instregex "LDRD_PRE")>;
 458 def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne],
 459   (instregex "t2LDRD_PRE")>;
 460
 461 // --- LDR post-indexed ---
 462 def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR(T?)_POST_IMM",
 463   "LDRB(T?)_POST_IMM", "LDR(SB|H|SH)Ti", "t2LDRB_POST")>;
 464
 465 def A57WriteLdrAm3PostWrBack : SchedWriteVariant<[
 466   SchedVar<IsLdrAm3RegOffPred, [A57WrBackTwo]>,
 467   SchedVar<NoSchedPred,        [A57WrBackOne]>
 468 ]>;
 469 def : InstRW<[A57Write_4cyc_1L_1I, A57WriteLdrAm3PostWrBack],
 470   (instregex "LDR(H|SH|SB)_POST")>;
 471 def : InstRW<[A57Write_4cyc_1L, A57WrBackOne],
 472   (instregex "t2LDR(H|SH|SB)?_POST")>;
 473
 474 def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR_POST_REG",
 475   "LDRB_POST_REG", "LDR(B?)T_POST$")>;
 476
 477 def A57WriteLdrTRegPost : SchedWriteVariant<[
 478   SchedVar<IsLdrAm2ScaledPred, [A57Write_4cyc_1I_1L_1M]>,
 479   SchedVar<NoSchedPred,        [A57Write_4cyc_1L_1I]>
 480 ]>;
 481 def A57WriteLdrTRegPostWrBack : SchedWriteVariant<[
 482   SchedVar<IsLdrAm2ScaledPred, [A57WrBackThree]>,
 483   SchedVar<NoSchedPred,        [A57WrBackTwo]>
 484 ]>;
 485 // 4(3) "I0/I1,L,M" for scaled register, otherwise 4(2) "I0/I1,L"
 486 def : InstRW<[A57WriteLdrTRegPost, A57WriteLdrTRegPostWrBack],
 487   (instregex "LDRT_POST_REG", "LDRBT_POST_REG")>;
 488
 489 def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR(SB|H|SH)Tr")>;
 490
 491 def A57WriteLdrAm3PostWrBackX3 : SchedWriteVariant<[
 492   SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>,
 493   SchedVar<NoSchedPred,          [A57WrBackOne]>
 494 ]>;
 495 // LDRD post-indexed: 4(2) cyc for reg, 4(1) cyc for imm.
 496 def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
 497   A57WriteLdrAm3PostWrBackX3], (instregex "LDRD_POST")>;
 498 def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne],
 499   (instregex "t2LDRD_POST")>;
 500
 501 // --- Preload instructions ---
 502 // Preload, immed offset
 503 def : InstRW<[A57Write_4cyc_1L], (instregex "(t2)?PLDi12", "(t2)?PLDWi12",
 504   "t2PLDW?(i8|pci|s)", "(t2)?PLI")>;
 505
 506 // Preload, register offset,
 507 // 5cyc "I0/I1,L" for minus reg or scaled not plus lsl2
 508 // otherwise 4cyc "L"
 509 def A57WritePLD : SchedWriteVariant<[
 510   SchedVar<IsLdstsoScaledNotOptimalPredX0, [A57Write_5cyc_1I_1L]>,
 511   SchedVar<IsLdstsoMinusRegPredX0,         [A57Write_5cyc_1I_1L]>,
 512   SchedVar<NoSchedPred,                    [A57Write_4cyc_1L]>
 513 ]>;
 514 def : InstRW<[A57WritePLD], (instregex "PLDrs", "PLDWrs")>;
 515
 516 // --- Load multiple instructions ---
 517 foreach NumAddr = 1-8 in {
 518   def A57LMAddrPred#NumAddr :
 519     SchedPredicate<"(TII->getLDMVariableDefsSize(*MI)+1)/2 == "#NumAddr>;
 520 }
 521
 522 def A57LDMOpsListNoregin : A57WriteLMOpsListType<
 523                 [A57Write_3cyc_1L, A57Write_3cyc_1L,
 524                  A57Write_4cyc_1L, A57Write_4cyc_1L,
 525                  A57Write_5cyc_1L, A57Write_5cyc_1L,
 526                  A57Write_6cyc_1L, A57Write_6cyc_1L,
 527                  A57Write_7cyc_1L, A57Write_7cyc_1L,
 528                  A57Write_8cyc_1L, A57Write_8cyc_1L,
 529                  A57Write_9cyc_1L, A57Write_9cyc_1L,
 530                  A57Write_10cyc_1L, A57Write_10cyc_1L]>;
 531 def A57WriteLDMnoreginlist : SchedWriteVariant<[
 532   SchedVar<A57LMAddrPred1,     A57LDMOpsListNoregin.Writes[0-1]>,
 533   SchedVar<A57LMAddrPred2,     A57LDMOpsListNoregin.Writes[0-3]>,
 534   SchedVar<A57LMAddrPred3,     A57LDMOpsListNoregin.Writes[0-5]>,
 535   SchedVar<A57LMAddrPred4,     A57LDMOpsListNoregin.Writes[0-7]>,
 536   SchedVar<A57LMAddrPred5,     A57LDMOpsListNoregin.Writes[0-9]>,
 537   SchedVar<A57LMAddrPred6,     A57LDMOpsListNoregin.Writes[0-11]>,
 538   SchedVar<A57LMAddrPred7,     A57LDMOpsListNoregin.Writes[0-13]>,
 539   SchedVar<A57LMAddrPred8,     A57LDMOpsListNoregin.Writes[0-15]>,
 540   SchedVar<NoSchedPred,        A57LDMOpsListNoregin.Writes[0-15]>
 541 ]> { let Variadic=1; }
 542
 543 def A57LDMOpsListRegin : A57WriteLMOpsListType<
 544                 [A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
 545                  A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
 546                  A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
 547                  A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
 548                  A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
 549                  A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
 550                  A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I,
 551                  A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I]>;
 552 def A57WriteLDMreginlist : SchedWriteVariant<[
 553   SchedVar<A57LMAddrPred1,     A57LDMOpsListRegin.Writes[0-1]>,
 554   SchedVar<A57LMAddrPred2,     A57LDMOpsListRegin.Writes[0-3]>,
 555   SchedVar<A57LMAddrPred3,     A57LDMOpsListRegin.Writes[0-5]>,
 556   SchedVar<A57LMAddrPred4,     A57LDMOpsListRegin.Writes[0-7]>,
 557   SchedVar<A57LMAddrPred5,     A57LDMOpsListRegin.Writes[0-9]>,
 558   SchedVar<A57LMAddrPred6,     A57LDMOpsListRegin.Writes[0-11]>,
 559   SchedVar<A57LMAddrPred7,     A57LDMOpsListRegin.Writes[0-13]>,
 560   SchedVar<A57LMAddrPred8,     A57LDMOpsListRegin.Writes[0-15]>,
 561   SchedVar<NoSchedPred,        A57LDMOpsListRegin.Writes[0-15]>
 562 ]> { let Variadic=1; }
 563
 564 def A57LDMOpsList_Upd : A57WriteLMOpsListType<
 565               [A57WrBackOne,
 566                A57Write_3cyc_1L_1I, A57Write_3cyc_1L_1I,
 567                A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
 568                A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
 569                A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
 570                A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
 571                A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
 572                A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
 573                A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I]>;
 574 def A57WriteLDM_Upd : SchedWriteVariant<[
 575   SchedVar<A57LMAddrPred1,     A57LDMOpsList_Upd.Writes[0-2]>,
 576   SchedVar<A57LMAddrPred2,     A57LDMOpsList_Upd.Writes[0-4]>,
 577   SchedVar<A57LMAddrPred3,     A57LDMOpsList_Upd.Writes[0-6]>,
 578   SchedVar<A57LMAddrPred4,     A57LDMOpsList_Upd.Writes[0-8]>,
 579   SchedVar<A57LMAddrPred5,     A57LDMOpsList_Upd.Writes[0-10]>,
 580   SchedVar<A57LMAddrPred6,     A57LDMOpsList_Upd.Writes[0-12]>,
 581   SchedVar<A57LMAddrPred7,     A57LDMOpsList_Upd.Writes[0-14]>,
 582   SchedVar<A57LMAddrPred8,     A57LDMOpsList_Upd.Writes[0-16]>,
 583   SchedVar<NoSchedPred,        A57LDMOpsList_Upd.Writes[0-16]>
 584 ]> { let Variadic=1; }
 585
 586 def A57WriteLDM : SchedWriteVariant<[
 587   SchedVar<IsLdmBaseRegInList, [A57WriteLDMreginlist]>,
 588   SchedVar<NoSchedPred,        [A57WriteLDMnoreginlist]>
 589 ]> { let Variadic=1; }
 590
 591 def : InstRW<[A57WriteLDM], (instregex "(t|t2|sys)?LDM(IA|DA|DB|IB)$")>;
 592
 593 // TODO: no writeback latency defined in documentation (implemented as 1 cyc)
 594 def : InstRW<[A57WriteLDM_Upd],
 595   (instregex "(t|t2|sys)?LDM(IA_UPD|DA_UPD|DB_UPD|IB_UPD|IA_RET)", "tPOP")>;
 596
 597 def : InstRW<[A57Write_5cyc_1L], (instregex "VLLDM")>;
 598
 599 // --- 3.9 Store Instructions ---
 600
 601 // Store, immed offset
 602 def : InstRW<[A57Write_1cyc_1S], (instregex "STRi12", "STRBi12", "PICSTR",
 603   "t2STR(B?)(T|i12|i8|s)", "t2STRDi8", "t2STRH(i12|i8|s)", "tSTR")>;
 604
 605 // Store, register offset
 606 // For minus or for not plus lsl2 scaled we need 3cyc "I0/I1, S",
 607 // otherwise 1cyc S.
 608 def A57WriteStrAmLDSTSO : SchedWriteVariant<[
 609   SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_3cyc_1I_1S]>,
 610   SchedVar<IsLdstsoMinusRegPred,         [A57Write_3cyc_1I_1S]>,
 611   SchedVar<NoSchedPred,                  [A57Write_1cyc_1S]>
 612 ]>;
 613 def : InstRW<[A57WriteStrAmLDSTSO], (instregex "STRrs", "STRBrs")>;
 614
 615 // STRH,STRD: 3cyc "I0/I1, S" for minus reg, 1cyc S for imm or for plus reg.
 616 def A57WriteStrAm3 : SchedWriteVariant<[
 617   SchedVar<IsLdrAm3NegRegOffPred, [A57Write_3cyc_1I_1S]>,
 618   SchedVar<NoSchedPred,           [A57Write_1cyc_1S]>
 619 ]>;
 620 def : InstRW<[A57WriteStrAm3], (instregex "STRH$")>;
 621 def A57WriteStrAm3X2 : SchedWriteVariant<[
 622   SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>,
 623   SchedVar<NoSchedPred,             [A57Write_1cyc_1S]>
 624 ]>;
 625 def : InstRW<[A57WriteStrAm3X2], (instregex "STRD$")>;
 626
 627 // Store, immed pre-indexed (1cyc "S, I0/I1", 1cyc writeback)
 628 def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR_PRE_IMM",
 629   "STRB_PRE_IMM", "STR(B)?(r|i)_preidx", "(t2)?STRH_(preidx|PRE)",
 630   "t2STR(B?)_(PRE|preidx)", "t2STRD_PRE")>;
 631
 632 // Store, register pre-indexed:
 633 // 1(1) "S, I0/I1" for plus reg
 634 // 3(2) "I0/I1, S" for minus reg
 635 // 1(2) "S, M" for scaled plus lsl2
 636 // 3(2) "I0/I1, S" for other scaled
 637 def A57WriteStrAmLDSTSOPre : SchedWriteVariant<[
 638   SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_3cyc_1I_1S]>,
 639   SchedVar<IsLdstsoMinusRegPredX2,         [A57Write_3cyc_1I_1S]>,
 640   SchedVar<IsLdstsoScaledPredX2,           [A57Write_1cyc_1S_1M]>,
 641   SchedVar<NoSchedPred,                    [A57Write_1cyc_1S_1I]>
 642 ]>;
 643 def A57WriteStrAmLDSTSOPreWrBack : SchedWriteVariant<[
 644   SchedVar<IsLdstsoScaledPredX2,           [A57WrBackTwo]>,
 645   SchedVar<IsLdstsoMinusRegPredX2,         [A57WrBackTwo]>,
 646   SchedVar<NoSchedPred,                    [A57WrBackOne]>
 647 ]>;
 648 def : InstRW<[A57WriteStrAmLDSTSOPreWrBack, A57WriteStrAmLDSTSOPre],
 649   (instregex "STR_PRE_REG", "STRB_PRE_REG")>;
 650
 651 // pre-indexed STRH/STRD (STRH_PRE, STRD_PRE)
 652 // 1(1) "S, I0/I1" for imm or reg plus
 653 // 3(2) "I0/I1, S" for reg minus
 654 def A57WriteStrAm3PreX2 : SchedWriteVariant<[
 655   SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>,
 656   SchedVar<NoSchedPred,             [A57Write_1cyc_1S_1I]>
 657 ]>;
 658 def A57WriteStrAm3PreWrBackX2 : SchedWriteVariant<[
 659   SchedVar<IsLdrAm3NegRegOffPredX2, [A57WrBackTwo]>,
 660   SchedVar<NoSchedPred,             [A57WrBackOne]>
 661 ]>;
 662 def : InstRW<[A57WriteStrAm3PreWrBackX2, A57WriteStrAm3PreX2],
 663   (instregex "STRH_PRE")>;
 664
 665 def A57WriteStrAm3PreX3 : SchedWriteVariant<[
 666   SchedVar<IsLdrAm3NegRegOffPredX3, [A57Write_3cyc_1I_1S]>,
 667   SchedVar<NoSchedPred,             [A57Write_1cyc_1S_1I]>
 668 ]>;
 669 def A57WriteStrAm3PreWrBackX3 : SchedWriteVariant<[
 670   SchedVar<IsLdrAm3NegRegOffPredX3, [A57WrBackTwo]>,
 671   SchedVar<NoSchedPred,             [A57WrBackOne]>
 672 ]>;
 673 def : InstRW<[A57WriteStrAm3PreWrBackX3, A57WriteStrAm3PreX3],
 674   (instregex "STRD_PRE")>;
 675
 676 def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR(T?)_POST_IMM",
 677   "STRB(T?)_POST_IMM", "t2STR(B?)_POST")>;
 678
 679 // 1(2) "S, M" for STR/STRB register post-indexed (both scaled or not)
 680 def : InstRW<[A57WrBackTwo, A57Write_1cyc_1S_1M], (instregex "STR(T?)_POST_REG",
 681   "STRB(T?)_POST_REG", "STR(B?)T_POST$")>;
 682
 683 // post-indexed STRH/STRD(STRH_POST, STRD_POST), STRHTi, STRHTr
 684 // 1(1) "S, I0/I1" both for reg or imm
 685 def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I],
 686   (instregex "(t2)?STR(H|D)_POST", "STRHT(i|r)", "t2STRHT")>;
 687
 688 // --- Store multiple instructions ---
 689 // TODO: no writeback latency defined in documentation
 690 def A57WriteSTM : SchedWriteVariant<[
 691     SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>,
 692     SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>,
 693     SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>,
 694     SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>,
 695     SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>,
 696     SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>,
 697     SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>,
 698     SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>,
 699     SchedVar<NoSchedPred,    [A57Write_2cyc_1S]>
 700 ]>;
 701 def A57WriteSTM_Upd : SchedWriteVariant<[
 702     SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>,
 703     SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>,
 704     SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>,
 705     SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>,
 706     SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>,
 707     SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>,
 708     SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>,
 709     SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>,
 710     SchedVar<NoSchedPred,    [A57Write_2cyc_1S_1I]>
 711 ]>;
 712
 713 def : InstRW<[A57WriteSTM], (instregex "(t2|sys|t)?STM(IA|DA|DB|IB)$")>;
 714 def : InstRW<[A57WrBackOne, A57WriteSTM_Upd],
 715   (instregex "(t2|sys|t)?STM(IA_UPD|DA_UPD|DB_UPD|IB_UPD)", "tPUSH")>;
 716
 717 def : InstRW<[A57Write_5cyc_1S], (instregex "VLSTM")>;
 718
 719 // --- 3.10 FP Data Processing Instructions ---
 720 def : SchedAlias<WriteFPALU32, A57Write_5cyc_1V>;
 721 def : SchedAlias<WriteFPALU64, A57Write_5cyc_1V>;
 722
 723 def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(S|D|H)")>;
 724
 725 // fp compare - 3cyc F1 for unconditional, 6cyc "F0/F1, F1" for conditional
 726 def A57WriteVcmp : SchedWriteVariant<[
 727   SchedVar<IsPredicatedPred, [A57Write_6cyc_1V_1X]>,
 728   SchedVar<NoSchedPred,      [A57Write_3cyc_1X]>
 729 ]>;
 730 def : InstRW<[A57WriteVcmp],
 731   (instregex "VCMP(D|S|H|ZD|ZS|ZH)$", "VCMPE(D|S|H|ZD|ZS|ZH)")>;
 732
 733 // fp convert
 734 def : InstRW<[A57Write_5cyc_1V], (instregex
 735   "VCVT(A|N|P|M)(SH|UH|SS|US|SD|UD)", "VCVT(BDH|THD|TDH)")>;
 736 def : InstRW<[A57Write_5cyc_1V], (instregex "VTOSLS", "VTOUHS", "VTOULS")>;
 737 def : SchedAlias<WriteFPCVT, A57Write_5cyc_1V>;
 738
 739 def : InstRW<[A57Write_5cyc_1V], (instregex "VJCVT")>;
 740
 741 // FP round to integral
 742 def : InstRW<[A57Write_5cyc_1V], (instregex "VRINT(A|N|P|M|Z|R|X)(H|S|D)$")>;
 743
 744 // FP divide, FP square root
 745 def : SchedAlias<WriteFPDIV32, A57Write_17cyc_1W>;
 746 def : SchedAlias<WriteFPDIV64, A57Write_32cyc_1W>;
 747 def : SchedAlias<WriteFPSQRT32, A57Write_17cyc_1W>;
 748 def : SchedAlias<WriteFPSQRT64, A57Write_32cyc_1W>;
 749
 750 def : InstRW<[A57Write_17cyc_1W], (instregex "VSQRTH")>;
 751
 752 // FP max/min
 753 def : InstRW<[A57Write_5cyc_1V], (instregex "VMAX", "VMIN")>;
 754
 755 // FP multiply-accumulate pipelines support late forwarding of the result
 756 // from FP multiply μops to the accumulate operands of an
 757 // FP multiply-accumulate μop. The latter can potentially be issued 1 cycle
 758 // after the FP multiply μop has been issued
 759 // FP multiply, FZ
 760 def A57WriteVMUL : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
 761
 762 def : SchedAlias<WriteFPMUL32, A57WriteVMUL>;
 763 def : SchedAlias<WriteFPMUL64, A57WriteVMUL>;
 764 def : ReadAdvance<ReadFPMUL, 0>;
 765
 766 // FP multiply accumulate, FZ: 9cyc "F0/F1" or 4 cyc for sequenced accumulate
 767 // VFMA, VFMS, VFNMA, VFNMS, VMLA, VMLS, VNMLA, VNMLS
 768 def A57WriteVFMA : SchedWriteRes<[A57UnitV]> { let Latency = 9;  }
 769
 770 // VFMA takes 9 cyc for common case and 4 cyc for VFMA->VFMA chain (5 read adv.)
 771 // VMUL takes 5 cyc for common case and 1 cyc for VMUL->VFMA chain (4 read adv.)
 772 // Currently, there is no way to define different read advances for VFMA operand
 773 // from VFMA or from VMUL, so there will be 5 read advance.
 774 // Zero latency (instead of one) for VMUL->VFMA shouldn't break something.
 775 // The same situation with ASIMD VMUL/VFMA instructions
 776 // def A57ReadVFMA : SchedRead;
 777 // def : ReadAdvance<A57ReadVFMA, 5, [A57WriteVFMA]>;
 778 // def : ReadAdvance<A57ReadVFMA, 4, [A57WriteVMUL]>;
 779 def A57ReadVFMA5 : SchedReadAdvance<5, [A57WriteVFMA, A57WriteVMUL]>;
 780
 781 def : SchedAlias<WriteFPMAC32, A57WriteVFMA>;
 782 def : SchedAlias<WriteFPMAC64, A57WriteVFMA>;
 783 def : SchedAlias<ReadFPMAC, A57ReadVFMA5>;
 784
 785 // VMLAH/VMLSH are not binded to scheduling classes by default, so here custom:
 786 def : InstRW<[A57WriteVFMA, A57ReadVFMA5, ReadFPMUL, ReadFPMUL],
 787   (instregex "VMLAH", "VMLSH", "VNMLAH", "VNMLSH")>;
 788
 789 def : InstRW<[A57WriteVMUL],
 790   (instregex "VUDOTD", "VSDOTD", "VUDOTQ", "VSDOTQ")>;
 791
 792 def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG")>;
 793 def : InstRW<[A57Write_3cyc_1V], (instregex "VSEL")>;
 794
 795 // --- 3.11 FP Miscellaneous Instructions ---
 796 // VMOV: 3cyc "F0/F1" for imm/reg
 797 def : InstRW<[A57Write_3cyc_1V], (instregex "FCONST(D|S|H)")>;
 798 def : InstRW<[A57Write_3cyc_1V], (instregex "VMOV(D|S|H)(cc)?$")>;
 799
 800 def : InstRW<[A57Write_3cyc_1V], (instregex "VINSH")>;
 801
 802 // 5cyc L for FP transfer, vfp to core reg,
 803 // 5cyc L for FP transfer, core reg to vfp
 804 def : SchedAlias<WriteFPMOV, A57Write_5cyc_1L>;
 805 // VMOVRRS/VMOVRRD in common code declared with one WriteFPMOV (instead of 2).
 806 def : InstRW<[A57Write_5cyc_1L, A57Write_5cyc_1L], (instregex "VMOV(RRS|RRD)")>;
 807
 808 // 8cyc "L,F0/F1" for FP transfer, core reg to upper or lower half of vfp D-reg
 809 def : InstRW<[A57Write_8cyc_1L_1I], (instregex "VMOVDRR")>;
 810
 811 // --- 3.12 FP Load Instructions ---
 812 def : InstRW<[A57Write_5cyc_1L], (instregex "VLDR(D|S|H)")>;
 813
 814 def : InstRW<[A57Write_5cyc_1L], (instregex "VLDMQIA$")>;
 815
 816 // FP load multiple (VLDM)
 817
 818 def A57VLDMOpsListUncond : A57WriteLMOpsListType<
 819                [A57Write_5cyc_1L, A57Write_5cyc_1L,
 820                 A57Write_6cyc_1L, A57Write_6cyc_1L,
 821                 A57Write_7cyc_1L, A57Write_7cyc_1L,
 822                 A57Write_8cyc_1L, A57Write_8cyc_1L,
 823                 A57Write_9cyc_1L, A57Write_9cyc_1L,
 824                 A57Write_10cyc_1L, A57Write_10cyc_1L,
 825                 A57Write_11cyc_1L, A57Write_11cyc_1L,
 826                 A57Write_12cyc_1L, A57Write_12cyc_1L]>;
 827 def A57WriteVLDMuncond : SchedWriteVariant<[
 828   SchedVar<A57LMAddrPred1,  A57VLDMOpsListUncond.Writes[0-1]>,
 829   SchedVar<A57LMAddrPred2,  A57VLDMOpsListUncond.Writes[0-3]>,
 830   SchedVar<A57LMAddrPred3,  A57VLDMOpsListUncond.Writes[0-5]>,
 831   SchedVar<A57LMAddrPred4,  A57VLDMOpsListUncond.Writes[0-7]>,
 832   SchedVar<A57LMAddrPred5,  A57VLDMOpsListUncond.Writes[0-9]>,
 833   SchedVar<A57LMAddrPred6,  A57VLDMOpsListUncond.Writes[0-11]>,
 834   SchedVar<A57LMAddrPred7,  A57VLDMOpsListUncond.Writes[0-13]>,
 835   SchedVar<A57LMAddrPred8,  A57VLDMOpsListUncond.Writes[0-15]>,
 836   SchedVar<NoSchedPred,     A57VLDMOpsListUncond.Writes[0-15]>
 837 ]> { let Variadic=1; }
 838
 839 def A57VLDMOpsListCond : A57WriteLMOpsListType<
 840                [A57Write_5cyc_1L, A57Write_6cyc_1L,
 841                 A57Write_7cyc_1L, A57Write_8cyc_1L,
 842                 A57Write_9cyc_1L, A57Write_10cyc_1L,
 843                 A57Write_11cyc_1L, A57Write_12cyc_1L,
 844                 A57Write_13cyc_1L, A57Write_14cyc_1L,
 845                 A57Write_15cyc_1L, A57Write_16cyc_1L,
 846                 A57Write_17cyc_1L, A57Write_18cyc_1L,
 847                 A57Write_19cyc_1L, A57Write_20cyc_1L]>;
 848 def A57WriteVLDMcond : SchedWriteVariant<[
 849   SchedVar<A57LMAddrPred1,  A57VLDMOpsListCond.Writes[0-1]>,
 850   SchedVar<A57LMAddrPred2,  A57VLDMOpsListCond.Writes[0-3]>,
 851   SchedVar<A57LMAddrPred3,  A57VLDMOpsListCond.Writes[0-5]>,
 852   SchedVar<A57LMAddrPred4,  A57VLDMOpsListCond.Writes[0-7]>,
 853   SchedVar<A57LMAddrPred5,  A57VLDMOpsListCond.Writes[0-9]>,
 854   SchedVar<A57LMAddrPred6,  A57VLDMOpsListCond.Writes[0-11]>,
 855   SchedVar<A57LMAddrPred7,  A57VLDMOpsListCond.Writes[0-13]>,
 856   SchedVar<A57LMAddrPred8,  A57VLDMOpsListCond.Writes[0-15]>,
 857   SchedVar<NoSchedPred,     A57VLDMOpsListCond.Writes[0-15]>
 858 ]> { let Variadic=1; }
 859
 860 def A57WriteVLDM : SchedWriteVariant<[
 861   SchedVar<IsPredicatedPred, [A57WriteVLDMcond]>,
 862   SchedVar<NoSchedPred,      [A57WriteVLDMuncond]>
 863 ]> { let Variadic=1; }
 864
 865 def : InstRW<[A57WriteVLDM], (instregex "VLDM(DIA|SIA)$")>;
 866
 867 def A57VLDMOpsListUncond_Upd : A57WriteLMOpsListType<
 868                [A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
 869                 A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
 870                 A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
 871                 A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
 872                 A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
 873                 A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I,
 874                 A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I,
 875                 A57Write_12cyc_1L_1I, A57Write_12cyc_1L_1I]>;
 876 def A57WriteVLDMuncond_UPD : SchedWriteVariant<[
 877   SchedVar<A57LMAddrPred1,  A57VLDMOpsListUncond_Upd.Writes[0-1]>,
 878   SchedVar<A57LMAddrPred2,  A57VLDMOpsListUncond_Upd.Writes[0-3]>,
 879   SchedVar<A57LMAddrPred3,  A57VLDMOpsListUncond_Upd.Writes[0-5]>,
 880   SchedVar<A57LMAddrPred4,  A57VLDMOpsListUncond_Upd.Writes[0-7]>,
 881   SchedVar<A57LMAddrPred5,  A57VLDMOpsListUncond_Upd.Writes[0-9]>,
 882   SchedVar<A57LMAddrPred6,  A57VLDMOpsListUncond_Upd.Writes[0-11]>,
 883   SchedVar<A57LMAddrPred7,  A57VLDMOpsListUncond_Upd.Writes[0-13]>,
 884   SchedVar<A57LMAddrPred8,  A57VLDMOpsListUncond_Upd.Writes[0-15]>,
 885   SchedVar<NoSchedPred,     A57VLDMOpsListUncond_Upd.Writes[0-15]>
 886 ]> { let Variadic=1; }
 887
 888 def A57VLDMOpsListCond_Upd : A57WriteLMOpsListType<
 889                [A57Write_5cyc_1L_1I, A57Write_6cyc_1L_1I,
 890                 A57Write_7cyc_1L_1I, A57Write_8cyc_1L_1I,
 891                 A57Write_9cyc_1L_1I, A57Write_10cyc_1L_1I,
 892                 A57Write_11cyc_1L_1I, A57Write_12cyc_1L_1I,
 893                 A57Write_13cyc_1L_1I, A57Write_14cyc_1L_1I,
 894                 A57Write_15cyc_1L_1I, A57Write_16cyc_1L_1I,
 895                 A57Write_17cyc_1L_1I, A57Write_18cyc_1L_1I,
 896                 A57Write_19cyc_1L_1I, A57Write_20cyc_1L_1I]>;
 897 def A57WriteVLDMcond_UPD : SchedWriteVariant<[
 898   SchedVar<A57LMAddrPred1,  A57VLDMOpsListCond_Upd.Writes[0-1]>,
 899   SchedVar<A57LMAddrPred2,  A57VLDMOpsListCond_Upd.Writes[0-3]>,
 900   SchedVar<A57LMAddrPred3,  A57VLDMOpsListCond_Upd.Writes[0-5]>,
 901   SchedVar<A57LMAddrPred4,  A57VLDMOpsListCond_Upd.Writes[0-7]>,
 902   SchedVar<A57LMAddrPred5,  A57VLDMOpsListCond_Upd.Writes[0-9]>,
 903   SchedVar<A57LMAddrPred6,  A57VLDMOpsListCond_Upd.Writes[0-11]>,
 904   SchedVar<A57LMAddrPred7,  A57VLDMOpsListCond_Upd.Writes[0-13]>,
 905   SchedVar<A57LMAddrPred8,  A57VLDMOpsListCond_Upd.Writes[0-15]>,
 906   SchedVar<NoSchedPred,     A57VLDMOpsListCond_Upd.Writes[0-15]>
 907 ]> { let Variadic=1; }
 908
 909 def A57WriteVLDM_UPD : SchedWriteVariant<[
 910   SchedVar<IsPredicatedPred, [A57WriteVLDMcond_UPD]>,
 911   SchedVar<NoSchedPred,      [A57WriteVLDMuncond_UPD]>
 912 ]> { let Variadic=1; }
 913
 914 def : InstRW<[A57WrBackOne, A57WriteVLDM_UPD],
 915   (instregex "VLDM(DIA_UPD|DDB_UPD|SIA_UPD|SDB_UPD)")>;
 916
 917 // --- 3.13 FP Store Instructions ---
 918 def : InstRW<[A57Write_1cyc_1S], (instregex "VSTR(D|S|H)")>;
 919
 920 def : InstRW<[A57Write_2cyc_1S], (instregex "VSTMQIA$")>;
 921
 922 def A57WriteVSTMs : SchedWriteVariant<[
 923     SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>,
 924     SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>,
 925     SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>,
 926     SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>,
 927     SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>,
 928     SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>,
 929     SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>,
 930     SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>,
 931     SchedVar<NoSchedPred,    [A57Write_2cyc_1S]>
 932 ]>;
 933 def A57WriteVSTMd : SchedWriteVariant<[
 934     SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S]>,
 935     SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S]>,
 936     SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S]>,
 937     SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S]>,
 938     SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S]>,
 939     SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S]>,
 940     SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S]>,
 941     SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S]>,
 942     SchedVar<NoSchedPred,    [A57Write_4cyc_1S]>
 943 ]>;
 944 def A57WriteVSTMs_Upd : SchedWriteVariant<[
 945     SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>,
 946     SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>,
 947     SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>,
 948     SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>,
 949     SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>,
 950     SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>,
 951     SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>,
 952     SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>,
 953     SchedVar<NoSchedPred,    [A57Write_2cyc_1S_1I]>
 954 ]>;
 955 def A57WriteVSTMd_Upd : SchedWriteVariant<[
 956     SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S_1I]>,
 957     SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S_1I]>,
 958     SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S_1I]>,
 959     SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S_1I]>,
 960     SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S_1I]>,
 961     SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S_1I]>,
 962     SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S_1I]>,
 963     SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S_1I]>,
 964     SchedVar<NoSchedPred,    [A57Write_2cyc_1S_1I]>
 965 ]>;
 966
 967 def : InstRW<[A57WriteVSTMs], (instregex "VSTMSIA$")>;
 968 def : InstRW<[A57WriteVSTMd], (instregex "VSTMDIA$")>;
 969 def : InstRW<[A57WrBackOne, A57WriteVSTMs_Upd],
 970   (instregex "VSTM(SIA_UPD|SDB_UPD)")>;
 971 def : InstRW<[A57WrBackOne, A57WriteVSTMd_Upd],
 972   (instregex "VSTM(DIA_UPD|DDB_UPD)")>;
 973
 974 // --- 3.14 ASIMD Integer Instructions ---
 975
 976 // ASIMD absolute diff, 3cyc F0/F1 for integer VABD
 977 def : InstRW<[A57Write_3cyc_1V], (instregex "VABD(s|u)")>;
 978
 979 // ASIMD absolute diff accum: 4(1) F1 for D-form, 5(2) F1 for Q-form
 980 def A57WriteVABAD : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
 981 def A57ReadVABAD  : SchedReadAdvance<3, [A57WriteVABAD]>;
 982 def : InstRW<[A57WriteVABAD, A57ReadVABAD],
 983   (instregex "VABA(s|u)(v8i8|v4i16|v2i32)")>;
 984 def A57WriteVABAQ : SchedWriteRes<[A57UnitX]> { let Latency = 5; }
 985 def A57ReadVABAQ  : SchedReadAdvance<3, [A57WriteVABAQ]>;
 986 def : InstRW<[A57WriteVABAQ, A57ReadVABAQ],
 987   (instregex "VABA(s|u)(v16i8|v8i16|v4i32)")>;
 988
 989 // ASIMD absolute diff accum long: 4(1) F1 for VABAL
 990 def A57WriteVABAL : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
 991 def A57ReadVABAL  : SchedReadAdvance<3, [A57WriteVABAL]>;
 992 def : InstRW<[A57WriteVABAL, A57ReadVABAL], (instregex "VABAL(s|u)")>;
 993
 994 // ASIMD absolute diff long: 3cyc F0/F1 for VABDL
 995 def : InstRW<[A57Write_3cyc_1V], (instregex "VABDL(s|u)")>;
 996
 997 // ASIMD arith, basic
 998 def : InstRW<[A57Write_3cyc_1V], (instregex "VADDv", "VADDL", "VADDW",
 999   "VNEG(s8d|s16d|s32d|s8q|s16q|s32q|d|q)",
1000   "VPADDi", "VPADDL", "VSUBv", "VSUBL", "VSUBW")>;
1001
1002 // ASIMD arith, complex
1003 def : InstRW<[A57Write_3cyc_1V], (instregex "VABS", "VADDHN", "VHADD", "VHSUB",
1004   "VQABS", "VQADD", "VQNEG", "VQSUB",
1005   "VRADDHN", "VRHADD", "VRSUBHN", "VSUBHN")>;
1006
1007 // ASIMD compare
1008 def : InstRW<[A57Write_3cyc_1V],
1009   (instregex "VCEQ", "VCGE", "VCGT", "VCLE", "VTST", "VCLT")>;
1010
1011 // ASIMD logical
1012 def : InstRW<[A57Write_3cyc_1V],
1013   (instregex "VAND", "VBIC", "VMVN", "VORR", "VORN", "VEOR")>;
1014
1015 // ASIMD max/min
1016 def : InstRW<[A57Write_3cyc_1V],
1017   (instregex "(VMAX|VMIN)(s|u)", "(VPMAX|VPMIN)(s8|s16|s32|u8|u16|u32)")>;
1018
1019 // ASIMD multiply, D-form: 5cyc F0 for r0px, 4cyc F0 for r1p0 and later
1020 // Cortex-A57 r1p0 and later reduce the latency of ASIMD multiply
1021 // and multiply-with-accumulate instructions relative to r0pX.
1022 def A57WriteVMULD_VecInt : SchedWriteVariant<[
1023   SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1024   SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1025 def : InstRW<[A57WriteVMULD_VecInt], (instregex
1026   "VMUL(v8i8|v4i16|v2i32|pd)", "VMULsl(v4i16|v2i32)",
1027   "VQDMULH(sl)?(v4i16|v2i32)", "VQRDMULH(sl)?(v4i16|v2i32)")>;
1028
1029 // ASIMD multiply, Q-form: 6cyc F0 for r0px, 5cyc F0 for r1p0 and later
1030 def A57WriteVMULQ_VecInt : SchedWriteVariant<[
1031   SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>,
1032   SchedVar<NoSchedPred,        [A57Write_6cyc_1W]>]>;
1033 def : InstRW<[A57WriteVMULQ_VecInt], (instregex
1034   "VMUL(v16i8|v8i16|v4i32|pq)", "VMULsl(v8i16|v4i32)",
1035   "VQDMULH(sl)?(v8i16|v4i32)", "VQRDMULH(sl)?(v8i16|v4i32)")>;
1036
1037 // ASIMD multiply accumulate, D-form
1038 // 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence
1039 // (4 or 3 ReadAdvance)
1040 def A57WriteVMLAD_VecInt : SchedWriteVariant<[
1041   SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1042   SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1043 def A57ReadVMLAD_VecInt : SchedReadVariant<[
1044   SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAD_VecInt]>]>,
1045   SchedVar<NoSchedPred,        [SchedReadAdvance<4, [A57WriteVMLAD_VecInt]>]>
1046 ]>;
1047 def : InstRW<[A57WriteVMLAD_VecInt, A57ReadVMLAD_VecInt],
1048   (instregex "VMLA(sl)?(v8i8|v4i16|v2i32)", "VMLS(sl)?(v8i8|v4i16|v2i32)")>;
1049
1050 // ASIMD multiply accumulate, Q-form
1051 // 6cyc F0 for r0px, 5cyc F0 for r1p0 and later, 2cyc for accumulate sequence
1052 // (4 or 3 ReadAdvance)
1053 def A57WriteVMLAQ_VecInt : SchedWriteVariant<[
1054   SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>,
1055   SchedVar<NoSchedPred,        [A57Write_6cyc_1W]>]>;
1056 def A57ReadVMLAQ_VecInt : SchedReadVariant<[
1057   SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAQ_VecInt]>]>,
1058   SchedVar<NoSchedPred,        [SchedReadAdvance<4, [A57WriteVMLAQ_VecInt]>]>
1059 ]>;
1060 def : InstRW<[A57WriteVMLAQ_VecInt, A57ReadVMLAQ_VecInt],
1061   (instregex "VMLA(sl)?(v16i8|v8i16|v4i32)", "VMLS(sl)?(v16i8|v8i16|v4i32)")>;
1062
1063 // ASIMD multiply accumulate long
1064 // 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence
1065 // (4 or 3 ReadAdvance)
1066 def A57WriteVMLAL_VecInt : SchedWriteVariant<[
1067   SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1068   SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1069 def A57ReadVMLAL_VecInt : SchedReadVariant<[
1070   SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAL_VecInt]>]>,
1071   SchedVar<NoSchedPred,        [SchedReadAdvance<4, [A57WriteVMLAL_VecInt]>]>
1072 ]>;
1073 def : InstRW<[A57WriteVMLAL_VecInt, A57ReadVMLAL_VecInt],
1074   (instregex "VMLAL(s|u)", "VMLSL(s|u)")>;
1075
1076 // ASIMD multiply accumulate saturating long
1077 // 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 2cyc for accumulate sequence
1078 // (3 or 2 ReadAdvance)
1079 def A57WriteVQDMLAL_VecInt : SchedWriteVariant<[
1080   SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1081   SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1082 def A57ReadVQDMLAL_VecInt : SchedReadVariant<[
1083   SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<2, [A57WriteVQDMLAL_VecInt]>]>,
1084   SchedVar<NoSchedPred,        [SchedReadAdvance<3, [A57WriteVQDMLAL_VecInt]>]>
1085 ]>;
1086 def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt],
1087   (instregex "VQDMLAL", "VQDMLSL")>;
1088
1089 // Vector Saturating Rounding Doubling Multiply Accumulate/Subtract Long
1090 // Scheduling info from VQDMLAL/VQDMLSL
1091 def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt],
1092   (instregex "VQRDMLAH", "VQRDMLSH")>;
1093
1094 // ASIMD multiply long
1095 // 5cyc F0 for r0px, 4cyc F0 for r1p0 and later
1096 def A57WriteVMULL_VecInt : SchedWriteVariant<[
1097   SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
1098   SchedVar<NoSchedPred,        [A57Write_5cyc_1W]>]>;
1099 def : InstRW<[A57WriteVMULL_VecInt],
1100   (instregex "VMULL(s|u|p8|sls|slu)", "VQDMULL")>;
1101
1102 // ASIMD pairwise add and accumulate
1103 // 4cyc F1, 1cyc for accumulate sequence (3cyc ReadAdvance)
1104 def A57WriteVPADAL : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
1105 def A57ReadVPADAL  : SchedReadAdvance<3, [A57WriteVPADAL]>;
1106 def : InstRW<[A57WriteVPADAL, A57ReadVPADAL], (instregex "VPADAL(s|u)")>;
1107
1108 // ASIMD shift accumulate
1109 // 4cyc F1, 1cyc for accumulate sequence (3cyc ReadAdvance)
1110 def A57WriteVSRA : SchedWriteRes<[A57UnitX]> { let Latency = 4;  }
1111 def A57ReadVSRA  : SchedReadAdvance<3, [A57WriteVSRA]>;
1112 def : InstRW<[A57WriteVSRA, A57ReadVSRA], (instregex "VSRA", "VRSRA")>;
1113
1114 // ASIMD shift by immed, basic
1115 def : InstRW<[A57Write_3cyc_1X],
1116   (instregex "VMOVL", "VSHLi", "VSHLL", "VSHR(s|u)", "VSHRN")>;
1117
1118 // ASIMD shift by immed, complex
1119 def : InstRW<[A57Write_4cyc_1X], (instregex
1120   "VQRSHRN", "VQRSHRUN", "VQSHL(si|ui|su)", "VQSHRN", "VQSHRUN", "VRSHR(s|u)",
1121   "VRSHRN")>;
1122
1123 // ASIMD shift by immed and insert, basic, D-form
1124 def : InstRW<[A57Write_4cyc_1X], (instregex
1125   "VSLI(v8i8|v4i16|v2i32|v1i64)", "VSRI(v8i8|v4i16|v2i32|v1i64)")>;
1126
1127 // ASIMD shift by immed and insert, basic, Q-form
1128 def : InstRW<[A57Write_5cyc_1X], (instregex
1129   "VSLI(v16i8|v8i16|v4i32|v2i64)", "VSRI(v16i8|v8i16|v4i32|v2i64)")>;
1130
1131 // ASIMD shift by register, basic, D-form
1132 def : InstRW<[A57Write_3cyc_1X], (instregex
1133   "VSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>;
1134
1135 // ASIMD shift by register, basic, Q-form
1136 def : InstRW<[A57Write_4cyc_1X], (instregex
1137   "VSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>;
1138
1139 // ASIMD shift by register, complex, D-form
1140 // VQRSHL, VQSHL, VRSHL
1141 def : InstRW<[A57Write_4cyc_1X], (instregex
1142   "VQRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)", "VQSHL(s|u)(v8i8|v4i16|v2i32|v1i64)",
1143   "VRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>;
1144
1145 // ASIMD shift by register, complex, Q-form
1146 def : InstRW<[A57Write_5cyc_1X], (instregex
1147   "VQRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)", "VQSHL(s|u)(v16i8|v8i16|v4i32|v2i64)",
1148   "VRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>;
1149
1150 // --- 3.15 ASIMD Floating-Point Instructions ---
1151 // ASIMD FP absolute value
1152 def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(fd|fq|hd|hq)")>;
1153
1154 // ASIMD FP arith
1155 def : InstRW<[A57Write_5cyc_1V], (instregex "VABD(fd|fq|hd|hq)",
1156   "VADD(fd|fq|hd|hq)", "VPADD(f|h)", "VSUB(fd|fq|hd|hq)")>;
1157
1158 def : InstRW<[A57Write_5cyc_1V], (instregex "VCADD", "VCMLA")>;
1159
1160 // ASIMD FP compare
1161 def : InstRW<[A57Write_5cyc_1V], (instregex "VAC(GE|GT|LE|LT)",
1162   "VC(EQ|GE|GT|LE)(fd|fq|hd|hq)")>;
1163
1164 // ASIMD FP convert, integer
1165 def : InstRW<[A57Write_5cyc_1V], (instregex
1166   "VCVT(f2sd|f2ud|s2fd|u2fd|f2sq|f2uq|s2fq|u2fq|f2xsd|f2xud|xs2fd|xu2fd)",
1167   "VCVT(f2xsq|f2xuq|xs2fq|xu2fq)",
1168   "VCVT(AN|MN|NN|PN)(SDf|SQf|UDf|UQf|SDh|SQh|UDh|UQh)")>;
1169
1170 // ASIMD FP convert, half-precision: 8cyc F0/F1
1171 def : InstRW<[A57Write_8cyc_1V], (instregex
1172   "VCVT(h2sd|h2ud|s2hd|u2hd|h2sq|h2uq|s2hq|u2hq|h2xsd|h2xud|xs2hd|xu2hd)",
1173   "VCVT(h2xsq|h2xuq|xs2hq|xu2hq)",
1174   "VCVT(f2h|h2f)")>;
1175
1176 // ASIMD FP max/min
1177 def : InstRW<[A57Write_5cyc_1V], (instregex
1178   "(VMAX|VMIN)(fd|fq|hd|hq)", "(VPMAX|VPMIN)(f|h)", "VMAXNM", "VMINNM")>;
1179
1180 // ASIMD FP multiply
1181 def A57WriteVMUL_VecFP  : SchedWriteRes<[A57UnitV]> { let Latency = 5;  }
1182 def : InstRW<[A57WriteVMUL_VecFP], (instregex "VMUL(sl)?(fd|fq|hd|hq)")>;
1183
1184 // ASIMD FP multiply accumulate: 9cyc F0/F1, 4cyc for accumulate sequence
1185 def A57WriteVMLA_VecFP  : SchedWriteRes<[A57UnitV]> { let Latency = 9;  }
1186 def A57ReadVMLA_VecFP  :
1187   SchedReadAdvance<5, [A57WriteVMLA_VecFP, A57WriteVMUL_VecFP]>;
1188 def : InstRW<[A57WriteVMLA_VecFP, A57ReadVMLA_VecFP],
1189   (instregex "(VMLA|VMLS)(sl)?(fd|fq|hd|hq)", "(VFMA|VFMS)(fd|fq|hd|hq)")>;
1190
1191 // ASIMD FP negate
1192 def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG(fd|f32q|hd|hq)")>;
1193
1194 // ASIMD FP round to integral
1195 def : InstRW<[A57Write_5cyc_1V], (instregex
1196   "VRINT(AN|MN|NN|PN|XN|ZN)(Df|Qf|Dh|Qh)")>;
1197
1198 // --- 3.16 ASIMD Miscellaneous Instructions ---
1199
1200 // ASIMD bitwise insert
1201 def : InstRW<[A57Write_3cyc_1V], (instregex "VBIF", "VBIT", "VBSL")>;
1202
1203 // ASIMD count
1204 def : InstRW<[A57Write_3cyc_1V], (instregex "VCLS", "VCLZ", "VCNT")>;
1205
1206 // ASIMD duplicate, core reg: 8cyc "L, F0/F1"
1207 def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VDUP(8|16|32)(d|q)")>;
1208
1209 // ASIMD duplicate, scalar: 3cyc "F0/F1"
1210 def : InstRW<[A57Write_3cyc_1V], (instregex "VDUPLN(8|16|32)(d|q)")>;
1211
1212 // ASIMD extract
1213 def : InstRW<[A57Write_3cyc_1V], (instregex "VEXT(d|q)(8|16|32|64)")>;
1214
1215 // ASIMD move, immed
1216 def : InstRW<[A57Write_3cyc_1V], (instregex
1217   "VMOV(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v1i64|v2i64|v2f32|v4f32)",
1218   "VMOVD0", "VMOVQ0")>;
1219
1220 // ASIMD move, narrowing
1221 def : InstRW<[A57Write_3cyc_1V], (instregex "VMOVN")>;
1222
1223 // ASIMD move, saturating
1224 def : InstRW<[A57Write_4cyc_1X], (instregex "VQMOVN")>;
1225
1226 // ASIMD reciprocal estimate
1227 def : InstRW<[A57Write_5cyc_1V], (instregex "VRECPE", "VRSQRTE")>;
1228
1229 // ASIMD reciprocal step, FZ
1230 def : InstRW<[A57Write_9cyc_1V], (instregex "VRECPS", "VRSQRTS")>;
1231
1232 // ASIMD reverse, swap, table lookup (1-2 reg)
1233 def : InstRW<[A57Write_3cyc_1V], (instregex "VREV", "VSWP", "VTB(L|X)(1|2)")>;
1234
1235 // ASIMD table lookup (3-4 reg)
1236 def : InstRW<[A57Write_6cyc_1V], (instregex "VTBL(3|4)", "VTBX(3|4)")>;
1237
1238 // ASIMD transfer, scalar to core reg: 6cyc "L, I0/I1"
1239 def : InstRW<[A57Write_6cyc_1L_1I], (instregex "VGETLN")>;
1240
1241 // ASIMD transfer, core reg to scalar: 8cyc "L, F0/F1"
1242 def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VSETLN")>;
1243
1244 // ASIMD transpose
1245 def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V], (instregex "VTRN")>;
1246
1247 // ASIMD unzip/zip, D-form
1248 def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V],
1249   (instregex "VUZPd", "VZIPd")>;
1250
1251 // ASIMD unzip/zip, Q-form
1252 def : InstRW<[A57Write_6cyc_1V, A57Write_6cyc_1V],
1253   (instregex "VUZPq", "VZIPq")>;
1254
1255 // --- 3.17 ASIMD Load Instructions ---
1256
1257 // Overriden via InstRW for this processor.
1258 def : WriteRes<WriteVLD1, []>;
1259 def : WriteRes<WriteVLD2, []>;
1260 def : WriteRes<WriteVLD3, []>;
1261 def : WriteRes<WriteVLD4, []>;
1262 def : WriteRes<WriteVST1, []>;
1263 def : WriteRes<WriteVST2, []>;
1264 def : WriteRes<WriteVST3, []>;
1265 def : WriteRes<WriteVST4, []>;
1266
1267 // 1-2 reg: 5cyc L, +I for writeback, 1 cyc wb latency
1268 def : InstRW<[A57Write_5cyc_1L], (instregex "VLD1(d|q)(8|16|32|64)$")>;
1269 def : InstRW<[A57Write_5cyc_1L_1I, A57WrBackOne],
1270   (instregex "VLD1(d|q)(8|16|32|64)wb")>;
1271
1272 // 3-4 reg: 6cyc L, +I for writeback, 1 cyc wb latency
1273 def : InstRW<[A57Write_6cyc_1L],
1274   (instregex "VLD1(d|q)(8|16|32|64)(T|Q)$", "VLD1d64(T|Q)Pseudo")>;
1275
1276 def : InstRW<[A57Write_6cyc_1L_1I, A57WrBackOne],
1277   (instregex "VLD1(d|q)(8|16|32|64)(T|Q)wb")>;
1278
1279 // ASIMD load, 1 element, one lane and all lanes: 8cyc "L, F0/F1"
1280 def : InstRW<[A57Write_8cyc_1L_1V], (instregex
1281   "VLD1(LN|DUP)(d|q)(8|16|32)$", "VLD1(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
1282 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], (instregex
1283   "VLD1(LN|DUP)(d|q)(8|16|32)(wb|_UPD)", "VLD1LNq(8|16|32)Pseudo_UPD")>;
1284
1285 // ASIMD load, 2 element, multiple, 2 reg: 8cyc "L, F0/F1"
1286 def : InstRW<[A57Write_8cyc_1L_1V],
1287       (instregex "VLD2(d|q)(8|16|32)$", "VLD2q(8|16|32)Pseudo$")>;
1288 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1289       (instregex "VLD2(d|q)(8|16|32)wb", "VLD2q(8|16|32)PseudoWB")>;
1290
1291 // ASIMD load, 2 element, multiple, 4 reg: 9cyc "L, F0/F1"
1292 def : InstRW<[A57Write_9cyc_1L_1V], (instregex "VLD2b(8|16|32)$")>;
1293 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1294       (instregex "VLD2b(8|16|32)wb")>;
1295
1296 // ASIMD load, 2 element, one lane and all lanes: 8cyc "L, F0/F1"
1297 def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
1298       (instregex "VLD2(DUP|LN)(d|q)(8|16|32|8x2|16x2|32x2)$",
1299                  "VLD2LN(d|q)(8|16|32)Pseudo$")>;
1300 // 2 results + wb result
1301 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V, A57WrBackOne],
1302       (instregex "VLD2LN(d|q)(8|16|32)_UPD$")>;
1303 // 1 result + wb result
1304 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1305       (instregex "VLD2DUPd(8|16|32|8x2|16x2|32x2)wb",
1306                  "VLD2LN(d|q)(8|16|32)Pseudo_UPD")>;
1307
1308 // ASIMD load, 3 element, multiple, 3 reg: 9cyc "L, F0/F1"
1309 // 3 results
1310 def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V],
1311       (instregex "VLD3(d|q)(8|16|32)$")>;
1312 // 1 result
1313 def : InstRW<[A57Write_9cyc_1L_1V],
1314       (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo$")>;
1315 // 3 results + wb
1316 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1317               A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1318       (instregex "VLD3(d|q)(8|16|32)_UPD$")>;
1319 // 1 result + wb
1320 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1321       (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
1322
1323 // ASIMD load, 3 element, one lane, size 32: 8cyc "L, F0/F1"
1324 def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
1325       (instregex "VLD3LN(d|q)32$",
1326                  "VLD3LN(d|q)32Pseudo$")>;
1327 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1328               A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1329       (instregex "VLD3LN(d|q)32_UPD")>;
1330 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1331       (instregex "VLD3LN(d|q)32Pseudo_UPD")>;
1332
1333 // ASIMD load, 3 element, one lane, size 8/16: 9cyc "L, F0/F1"
1334 def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V],
1335       (instregex "VLD3LN(d|q)(8|16)$",
1336                  "VLD3LN(d|q)(8|16)Pseudo$")>;
1337 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1338               A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1339       (instregex "VLD3LN(d|q)(8|16)_UPD")>;
1340 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1341       (instregex "VLD3LN(d|q)(8|16)Pseudo_UPD")>;
1342
1343 // ASIMD load, 3 element, all lanes: 8cyc "L, F0/F1"
1344 def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
1345       (instregex "VLD3DUP(d|q)(8|16|32)$",
1346                  "VLD3DUP(d|q)(8|16|32)Pseudo$")>;
1347 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1348               A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1349       (instregex "VLD3DUP(d|q)(8|16|32)_UPD")>;
1350 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1351       (instregex "VLD3DUP(d|q)(8|16|32)Pseudo_UPD")>;
1352
1353 // ASIMD load, 4 element, multiple, 4 reg: 9cyc "L, F0/F1"
1354 def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V,
1355               A57Write_9cyc_1L_1V],
1356       (instregex "VLD4(d|q)(8|16|32)$")>;
1357 def : InstRW<[A57Write_9cyc_1L_1V],
1358       (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo$")>;
1359 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1360               A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1361       (instregex "VLD4(d|q)(8|16|32)_UPD")>;
1362 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1363       (instregex  "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
1364
1365 // ASIMD load, 4 element, one lane, size 32: 8cyc "L, F0/F1"
1366 def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V,
1367               A57Write_8cyc_1L_1V],
1368       (instregex "VLD4LN(d|q)32$",
1369                  "VLD4LN(d|q)32Pseudo$")>;
1370 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1371               A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1372               A57WrBackOne],
1373       (instregex "VLD4LN(d|q)32_UPD")>;
1374 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1375       (instregex "VLD4LN(d|q)32Pseudo_UPD")>;
1376
1377 // ASIMD load, 4 element, one lane, size 8/16: 9cyc "L, F0/F1"
1378 def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V,
1379               A57Write_9cyc_1L_1V],
1380       (instregex "VLD4LN(d|q)(8|16)$",
1381                  "VLD4LN(d|q)(8|16)Pseudo$")>;
1382 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1383               A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
1384               A57WrBackOne],
1385       (instregex "VLD4LN(d|q)(8|16)_UPD")>;
1386 def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
1387       (instregex "VLD4LN(d|q)(8|16)Pseudo_UPD")>;
1388
1389 // ASIMD load, 4 element, all lanes: 8cyc "L, F0/F1"
1390 def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V,
1391               A57Write_8cyc_1L_1V],
1392       (instregex "VLD4DUP(d|q)(8|16|32)$",
1393                  "VLD4DUP(d|q)(8|16|32)Pseudo$")>;
1394 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1395               A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
1396               A57WrBackOne],
1397       (instregex "VLD4DUP(d|q)(8|16|32)_UPD")>;
1398 def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
1399       (instregex "VLD4DUP(d|q)(8|16|32)Pseudo_UPD")>;
1400
1401 // --- 3.18 ASIMD Store Instructions ---
1402
1403 // ASIMD store, 1 element, multiple, 1 reg: 1cyc S
1404 def : InstRW<[A57Write_1cyc_1S], (instregex "VST1d(8|16|32|64)$")>;
1405 def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I],
1406       (instregex "VST1d(8|16|32|64)wb")>;
1407 // ASIMD store, 1 element, multiple, 2 reg: 2cyc S
1408 def : InstRW<[A57Write_2cyc_1S], (instregex "VST1q(8|16|32|64)$")>;
1409 def : InstRW<[A57WrBackOne, A57Write_2cyc_1S_1I],
1410       (instregex "VST1q(8|16|32|64)wb")>;
1411 // ASIMD store, 1 element, multiple, 3 reg: 3cyc S
1412 def : InstRW<[A57Write_3cyc_1S],
1413       (instregex "VST1d(8|16|32|64)T$", "VST1d64TPseudo$")>;
1414 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1I],
1415       (instregex "VST1d(8|16|32|64)Twb", "VST1d64TPseudoWB")>;
1416 // ASIMD store, 1 element, multiple, 4 reg: 4cyc S
1417 def : InstRW<[A57Write_4cyc_1S],
1418       (instregex "VST1d(8|16|32|64)(Q|QPseudo)$")>;
1419 def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1I],
1420       (instregex "VST1d(8|16|32|64)(Qwb|QPseudoWB)")>;
1421 // ASIMD store, 1 element, one lane: 3cyc "F0/F1, S"
1422 def : InstRW<[A57Write_3cyc_1S_1V],
1423       (instregex "VST1LNd(8|16|32)$", "VST1LNq(8|16|32)Pseudo$")>;
1424 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1425       (instregex "VST1LNd(8|16|32)_UPD", "VST1LNq(8|16|32)Pseudo_UPD")>;
1426 // ASIMD store, 2 element, multiple, 2 reg: 3cyc "F0/F1, S"
1427 def : InstRW<[A57Write_3cyc_1S_1V],
1428       (instregex "VST2(d|b)(8|16|32)$")>;
1429 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1430       (instregex "VST2(b|d)(8|16|32)wb")>;
1431 // ASIMD store, 2 element, multiple, 4 reg: 4cyc "F0/F1, S"
1432 def : InstRW<[A57Write_4cyc_1S_1V],
1433       (instregex "VST2q(8|16|32)$", "VST2q(8|16|32)Pseudo$")>;
1434 def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I],
1435       (instregex "VST2q(8|16|32)wb", "VST2q(8|16|32)PseudoWB")>;
1436 // ASIMD store, 2 element, one lane: 3cyc "F0/F1, S"
1437 def : InstRW<[A57Write_3cyc_1S_1V],
1438       (instregex "VST2LN(d|q)(8|16|32)$", "VST2LN(d|q)(8|16|32)Pseudo$")>;
1439 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1440       (instregex "VST2LN(d|q)(8|16|32)_UPD",
1441                  "VST2LN(d|q)(8|16|32)Pseudo_UPD")>;
1442 // ASIMD store, 3 element, multiple, 3 reg
1443 def : InstRW<[A57Write_3cyc_1S_1V],
1444       (instregex "VST3(d|q)(8|16|32)$", "VST3(d|q)(8|16|32)(oddP|P)seudo$")>;
1445 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1446       (instregex "VST3(d|q)(8|16|32)_UPD",
1447                  "VST3(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
1448 // ASIMD store, 3 element, one lane
1449 def : InstRW<[A57Write_3cyc_1S_1V],
1450       (instregex "VST3LN(d|q)(8|16|32)$", "VST3LN(d|q)(8|16|32)Pseudo$")>;
1451 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1452       (instregex "VST3LN(d|q)(8|16|32)_UPD",
1453                  "VST3LN(d|q)(8|16|32)Pseudo_UPD")>;
1454 // ASIMD store, 4 element, multiple, 4 reg
1455 def : InstRW<[A57Write_4cyc_1S_1V],
1456       (instregex "VST4(d|q)(8|16|32)$", "VST4(d|q)(8|16|32)(oddP|P)seudo$")>;
1457 def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I],
1458       (instregex "VST4(d|q)(8|16|32)_UPD",
1459                  "VST4(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
1460 // ASIMD store, 4 element, one lane
1461 def : InstRW<[A57Write_3cyc_1S_1V],
1462       (instregex "VST4LN(d|q)(8|16|32)$", "VST4LN(d|q)(8|16|32)Pseudo$")>;
1463 def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
1464       (instregex "VST4LN(d|q)(8|16|32)_UPD",
1465                  "VST4LN(d|q)(8|16|32)Pseudo_UPD")>;
1466
1467 // --- 3.19 Cryptography Extensions ---
1468 // Crypto AES ops
1469 // AESD, AESE, AESIMC, AESMC: 3cyc F0
1470 def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>;
1471 // Crypto polynomial (64x64) multiply long (VMULL.P64): 3cyc F0
1472 def : InstRW<[A57Write_3cyc_1W], (instregex "^VMULLp64")>;
1473 // Crypto SHA1 xor ops: 6cyc F0/F1
1474 def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>;
1475 // Crypto SHA1 fast ops: 3cyc F0
1476 def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>;
1477 // Crypto SHA1 slow ops: 6cyc F0
1478 def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>;
1479 // Crypto SHA256 fast ops: 3cyc F0
1480 def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA256SU0")>;
1481 // Crypto SHA256 slow ops: 6cyc F0
1482 def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA256(H|H2|SU1)")>;
1483
1484 // --- 3.20 CRC ---
1485 def : InstRW<[A57Write_3cyc_1W], (instregex "^(t2)?CRC32")>;
1486
1487 // -----------------------------------------------------------------------------
1488 // Common definitions
1489 def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
1490 def : SchedAlias<WriteALU, A57Write_1cyc_1I>;
1491
1492 def : SchedAlias<WriteBr, A57Write_1cyc_1B>;
1493 def : SchedAlias<WriteBrL, A57Write_1cyc_1B_1I>;
1494 def : SchedAlias<WriteBrTbl, A57Write_1cyc_1B_1I>;
1495 def : SchedAlias<WritePreLd, A57Write_4cyc_1L>;
1496
1497 def : SchedAlias<WriteLd, A57Write_4cyc_1L>;
1498 def : SchedAlias<WriteST, A57Write_1cyc_1S>;
1499 def : ReadAdvance<ReadALU, 0>;
1500
1501 } // SchedModel = CortexA57Model
1502