contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SchedCyclone.td

   1 //=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file defines the machine model for AArch64 Cyclone to support
  10 // instruction scheduling and other instruction cost heuristics.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 def CycloneModel : SchedMachineModel {
  15   let IssueWidth = 6; // 6 micro-ops are dispatched per cycle.
  16   let MicroOpBufferSize = 192; // Based on the reorder buffer.
  17   let LoadLatency = 4; // Optimistic load latency.
  18   let MispredictPenalty = 16; // 14-19 cycles are typical.
  19   let CompleteModel = 1;
  20
  21   list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
  22                                                     PAUnsupported.F);
  23 }
  24
  25 //===----------------------------------------------------------------------===//
  26 // Define each kind of processor resource and number available on Cyclone.
  27
  28 // 4 integer pipes
  29 def CyUnitI : ProcResource<4> {
  30   let BufferSize = 48;
  31 }
  32
  33 // 2 branch units: I[0..1]
  34 def CyUnitB : ProcResource<2> {
  35   let Super  = CyUnitI;
  36   let BufferSize = 24;
  37 }
  38
  39 // 1 indirect-branch unit: I[0]
  40 def CyUnitBR : ProcResource<1> {
  41   let Super  = CyUnitB;
  42 }
  43
  44 // 2 shifter pipes: I[2..3]
  45 // When an instruction consumes a CyUnitIS, it also consumes a CyUnitI
  46 def CyUnitIS : ProcResource<2> {
  47   let Super = CyUnitI;
  48   let BufferSize = 24;
  49 }
  50
  51 // 1 mul pipe: I[0]
  52 def CyUnitIM : ProcResource<1> {
  53   let Super = CyUnitBR;
  54   let BufferSize = 32;
  55 }
  56
  57 // 1 div pipe: I[1]
  58 def CyUnitID : ProcResource<1> {
  59   let Super = CyUnitB;
  60   let BufferSize = 16;
  61 }
  62
  63 // 1 integer division unit. This is driven by the ID pipe, but only
  64 // consumes the pipe for one cycle at issue and another cycle at writeback.
  65 def CyUnitIntDiv : ProcResource<1>;
  66
  67 // 2 ld/st pipes.
  68 def CyUnitLS : ProcResource<2> {
  69   let BufferSize = 28;
  70 }
  71
  72 // 3 fp/vector pipes.
  73 def CyUnitV : ProcResource<3> {
  74   let BufferSize = 48;
  75 }
  76 // 2 fp/vector arithmetic and multiply pipes: V[0-1]
  77 def CyUnitVM : ProcResource<2> {
  78   let Super = CyUnitV;
  79   let BufferSize = 32;
  80 }
  81 // 1 fp/vector division/sqrt pipe: V[2]
  82 def CyUnitVD : ProcResource<1> {
  83   let Super = CyUnitV;
  84   let BufferSize = 16;
  85 }
  86 // 1 fp compare pipe: V[0]
  87 def CyUnitVC : ProcResource<1> {
  88   let Super = CyUnitVM;
  89   let BufferSize = 16;
  90 }
  91
  92 // 2 fp division/square-root units.  These are driven by the VD pipe,
  93 // but only consume the pipe for one cycle at issue and a cycle at writeback.
  94 def CyUnitFloatDiv : ProcResource<2>;
  95
  96 //===----------------------------------------------------------------------===//
  97 // Define scheduler read/write resources and latency on Cyclone.
  98 // This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1.
  99
 100 let SchedModel = CycloneModel in {
 101
 102 //---
 103 // 7.8.1. Moves
 104 //---
 105
 106 // A single nop micro-op (uX).
 107 def WriteX : SchedWriteRes<[]> { let Latency = 0; }
 108
 109 // Move zero is a register rename (to machine register zero).
 110 // The move is replaced by a single nop micro-op.
 111 // MOVZ Rd, #0
 112 // AND Rd, Rzr, #imm
 113 def WriteZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
 114 def WriteImmZ  : SchedWriteVariant<[
 115                    SchedVar<WriteZPred, [WriteX]>,
 116                    SchedVar<NoSchedPred, [WriteImm]>]>;
 117 def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>;
 118
 119 // Move GPR is a register rename and single nop micro-op.
 120 // ORR Xd, XZR, Xm
 121 // ADD Xd, Xn, #0
 122 def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(*MI)}]>;
 123 def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(*MI)}]>;
 124 def WriteMov      : SchedWriteVariant<[
 125                       SchedVar<WriteIMovPred, [WriteX]>,
 126                       SchedVar<WriteVMovPred, [WriteX]>,
 127                       SchedVar<NoSchedPred,   [WriteI]>]>;
 128 def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>;
 129
 130 // Move non-zero immediate is an integer ALU op.
 131 // MOVN,MOVZ,MOVK
 132 def : WriteRes<WriteImm, [CyUnitI]>;
 133
 134 //---
 135 // 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional,
 136 //              Shifts and Bitfield Operations
 137 //---
 138
 139 // ADR,ADRP
 140 // ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri
 141 // ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr
 142 // ADC(S),SBC(S)
 143 // Aliases: CMN, CMP, TST
 144 //
 145 // Conditional operations.
 146 // CCMNi,CCMPi,CCMNr,CCMPr,
 147 // CSEL,CSINC,CSINV,CSNEG
 148 //
 149 // Bit counting and reversal operations.
 150 // CLS,CLZ,RBIT,REV,REV16,REV32
 151 def : WriteRes<WriteI, [CyUnitI]>;
 152
 153 // ADD with shifted register operand is a single micro-op that
 154 // consumes a shift pipeline for two cycles.
 155 // ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs
 156 // EXAMPLE: ADDrs Xn, Xm LSL #imm
 157 def : WriteRes<WriteISReg, [CyUnitIS]> {
 158   let Latency = 2;
 159   let ResourceCycles = [2];
 160 }
 161
 162 // ADD with extended register operand is the same as shifted reg operand.
 163 // ADD(S)re,SUB(S)re
 164 // EXAMPLE: ADDXre Xn, Xm, UXTB #1
 165 def : WriteRes<WriteIEReg, [CyUnitIS]> {
 166   let Latency = 2;
 167   let ResourceCycles = [2];
 168 }
 169
 170 // Variable shift and bitfield operations.
 171 // ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM
 172 def : WriteRes<WriteIS, [CyUnitIS]>;
 173
 174 // EXTR Shifts a pair of registers and requires two micro-ops.
 175 // The second micro-op is delayed, as modeled by ReadExtrHi.
 176 // EXTR Xn, Xm, #imm
 177 def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]> {
 178   let Latency = 2;
 179   let NumMicroOps = 2;
 180 }
 181
 182 // EXTR's first register read is delayed by one cycle, effectively
 183 // shortening its writer's latency.
 184 // EXTR Xn, Xm, #imm
 185 def : ReadAdvance<ReadExtrHi, 1>;
 186
 187 //---
 188 // 7.8.6. Multiplies
 189 //---
 190
 191 // MUL/MNEG are aliases for MADD/MSUB.
 192 // MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL
 193 def : WriteRes<WriteIM32, [CyUnitIM]> {
 194   let Latency = 4;
 195 }
 196 // MADDX,MSUBX,SMULH,UMULH
 197 def : WriteRes<WriteIM64, [CyUnitIM]> {
 198   let Latency = 5;
 199 }
 200
 201 //---
 202 // 7.8.7. Divide
 203 //---
 204
 205 // 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient.
 206 // The ID pipe is consumed for 2 cycles: issue and writeback.
 207 // SDIVW,UDIVW
 208 def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> {
 209   let Latency = 10;
 210   let ResourceCycles = [2, 10]; // Cycles held on [CyUnitID, CyUnitIntDiv].
 211 }
 212 // 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient.
 213 // The ID pipe is consumed for 2 cycles: issue and writeback.
 214 // SDIVX,UDIVX
 215 def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]> {
 216   let Latency = 13;
 217   let ResourceCycles = [2, 13]; // Cycles held on [CyUnitID, CyUnitIntDiv].
 218 }
 219
 220 //---
 221 // 7.8.8,7.8.10. Load/Store, single element
 222 //---
 223
 224 // Integer loads take 4 cycles and use one LS unit for one cycle.
 225 def : WriteRes<WriteLD, [CyUnitLS]> {
 226   let Latency = 4;
 227 }
 228
 229 // Store-load forwarding is 4 cycles.
 230 //
 231 // Note: The store-exclusive sequence incorporates this
 232 // latency. However, general heuristics should not model the
 233 // dependence between a store and subsequent may-alias load because
 234 // hardware speculation works.
 235 def : WriteRes<WriteST, [CyUnitLS]> {
 236   let Latency = 4;
 237 }
 238
 239 // Load from base address plus an optionally scaled register offset.
 240 // Rt latency is latency WriteIS + WriteLD.
 241 // EXAMPLE: LDR Xn, Xm [, lsl 3]
 242 def CyWriteLDIdx : SchedWriteVariant<[
 243   SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>, // Load from scaled register.
 244   SchedVar<NoSchedPred,   [WriteLD]>]>;        // Load from register offset.
 245 def : SchedAlias<WriteLDIdx, CyWriteLDIdx>;    // Map AArch64->Cyclone type.
 246
 247 // EXAMPLE: STR Xn, Xm [, lsl 3]
 248 def CyWriteSTIdx : SchedWriteVariant<[
 249   SchedVar<ScaledIdxPred, [WriteIS, WriteST]>, // Store to scaled register.
 250   SchedVar<NoSchedPred,   [WriteST]>]>;        // Store to register offset.
 251 def : SchedAlias<WriteSTIdx, CyWriteSTIdx>;    // Map AArch64->Cyclone type.
 252
 253 // Read the (unshifted) base register Xn in the second micro-op one cycle later.
 254 // EXAMPLE: LDR Xn, Xm [, lsl 3]
 255 def ReadBaseRS : SchedReadAdvance<1>;
 256 def CyReadAdrBase : SchedReadVariant<[
 257   SchedVar<ScaledIdxPred, [ReadBaseRS]>, // Read base reg after shifting offset.
 258   SchedVar<NoSchedPred,   [ReadDefault]>]>;   // Read base reg with no shift.
 259 def : SchedAlias<ReadAdrBase, CyReadAdrBase>; // Map AArch64->Cyclone type.
 260
 261 //---
 262 // 7.8.9,7.8.11. Load/Store, paired
 263 //---
 264
 265 // Address pre/post increment is a simple ALU op with one cycle latency.
 266 def : WriteRes<WriteAdr, [CyUnitI]>;
 267
 268 // LDP high register write is fused with the load, but a nop micro-op remains.
 269 def : WriteRes<WriteLDHi, []> {
 270   let Latency = 4;
 271 }
 272
 273 // STP is a vector op and store, except for QQ, which is just two stores.
 274 def : SchedAlias<WriteSTP, WriteVSTShuffle>;
 275 def : InstRW<[WriteST, WriteST], (instrs STPQi)>;
 276
 277 //---
 278 // 7.8.13. Branches
 279 //---
 280
 281 // Branches take a single micro-op.
 282 // The misprediction penalty is defined as a SchedMachineModel property.
 283 def : WriteRes<WriteBr,    [CyUnitB]>  {let Latency = 0;}
 284 def : WriteRes<WriteBrReg, [CyUnitBR]> {let Latency = 0;}
 285
 286 //---
 287 // 7.8.14. Never-issued Instructions, Barrier and Hint Operations
 288 //---
 289
 290 // NOP,SEV,SEVL,WFE,WFI,YIELD
 291 def : WriteRes<WriteHint, []> {let Latency = 0;}
 292 // ISB
 293 def : InstRW<[WriteI], (instrs ISB)>;
 294 // CLREX,DMB,DSB
 295 def : WriteRes<WriteBarrier, [CyUnitLS]>;
 296
 297 // System instructions get an invalid latency because the latency of
 298 // other operations across them is meaningless.
 299 def : WriteRes<WriteSys, []> {let Latency = -1;}
 300
 301 //===----------------------------------------------------------------------===//
 302 // 7.9 Vector Unit Instructions
 303
 304 // Simple vector operations take 2 cycles.
 305 def : WriteRes<WriteV, [CyUnitV]> {let Latency = 2;}
 306
 307 // Define some longer latency vector op types for Cyclone.
 308 def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
 309 def CyWriteV4 : SchedWriteRes<[CyUnitV]> {let Latency = 4;}
 310 def CyWriteV5 : SchedWriteRes<[CyUnitV]> {let Latency = 5;}
 311 def CyWriteV6 : SchedWriteRes<[CyUnitV]> {let Latency = 6;}
 312
 313 // Simple floating-point operations take 2 cycles.
 314 def : WriteRes<WriteF, [CyUnitV]> {let Latency = 2;}
 315
 316 //---
 317 // 7.9.1 Vector Moves
 318 //---
 319
 320 // TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently
 321 // generates expensive int-float conversion instead:
 322 // FMOVDi Dd, #0.0
 323 // FMOVv2f64ns Vd.2d, #0.0
 324
 325 // FMOVSi,FMOVDi
 326 def : WriteRes<WriteFImm, [CyUnitV]> {let Latency = 2;}
 327
 328 // MOVI,MVNI are WriteV
 329 // FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV
 330
 331 // Move FPR is a register rename and single nop micro-op.
 332 // ORR.16b Vd,Vn,Vn
 333 // COPY is handled above in the WriteMov Variant.
 334 def WriteVMov    : SchedWriteVariant<[
 335                      SchedVar<WriteVMovPred, [WriteX]>,
 336                      SchedVar<NoSchedPred,   [WriteV]>]>;
 337 def : InstRW<[WriteVMov], (instrs ORRv16i8)>;
 338
 339 // FMOVSr,FMOVDr are WriteF.
 340
 341 // MOV V,V is a WriteV.
 342
 343 // CPY D,V[x] is a WriteV
 344
 345 // INS V[x],V[y] is a WriteV.
 346
 347 // FMOVWSr,FMOVXDr,FMOVXDHighr
 348 def : WriteRes<WriteFCopy, [CyUnitLS]> {
 349   let Latency = 5;
 350 }
 351
 352 // FMOVSWr,FMOVDXr,FMOVDXHighr
 353 def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;
 354
 355 // INS V[x],R
 356 def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
 357 def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;
 358
 359 // SMOV,UMOV R,V[x]
 360 def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>;
 361 def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>;
 362
 363 // DUP V,R
 364 def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>;
 365
 366 // DUP V,V[x] is a WriteV.
 367
 368 //---
 369 // 7.9.2 Integer Arithmetic, Logical, and Comparisons
 370 //---
 371
 372 // BIC,ORR V,#imm are WriteV
 373
 374 def : InstRW<[CyWriteV3], (instregex "ABSv")>;
 375
 376 // MVN,NEG,NOT are WriteV
 377
 378 def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>;
 379
 380 // ADDP is a WriteV.
 381 def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
 382 def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>;
 383
 384 def : InstRW<[CyWriteV3],
 385              (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>;
 386
 387 def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>;
 388
 389 // ADD,SUB are WriteV
 390
 391 // Forward declare.
 392 def CyWriteVABD : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
 393
 394 // Add/Diff and accumulate uses the vector multiply unit.
 395 def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
 396 def CyReadVAccum  : SchedReadAdvance<1,
 397                     [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>;
 398
 399 def : InstRW<[CyWriteVAccum, CyReadVAccum],
 400              (instregex "SADALP","UADALP")>;
 401
 402 def : InstRW<[CyWriteVAccum, CyReadVAccum],
 403              (instregex "SABAv","UABAv","SABALv","UABALv")>;
 404
 405 def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>;
 406
 407 def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>;
 408
 409 def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>;
 410
 411 // WriteV includes:
 412 // AND,BIC,CMTST,EOR,ORN,ORR
 413 // ADDP
 414 // SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD
 415 // SADDL,SSUBL,UADDL,USUBL
 416 // SADDW,SSUBW,UADDW,USUBW
 417
 418 def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv",
 419                                      "CMLEv","CMLTv",
 420                                      "CMHIv","CMHSv")>;
 421
 422 def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv",
 423                                      "SMAXPv","SMINPv","UMAXPv","UMINPv")>;
 424
 425 def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv",
 426                                        "SABDLv","UABDLv")>;
 427
 428 //---
 429 // 7.9.3 Floating Point Arithmetic and Comparisons
 430 //---
 431
 432 // FABS,FNEG are WriteF
 433
 434 def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>;
 435 def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>;
 436
 437 def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i",
 438                                      "FMINPv2i","FMINNMPv2i")>;
 439
 440 def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>;
 441
 442 def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32,
 443                                   FSUBSrr,FSUBv2f32,FSUBv4f32,
 444                                   FADDPv2f32,FADDPv4f32,
 445                                   FABD32,FABDv2f32,FABDv4f32)>;
 446 def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64,
 447                                   FSUBDrr,FSUBv2f64,
 448                                   FADDPv2f64,
 449                                   FABD64,FABDv2f64)>;
 450
 451 def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>;
 452
 453 def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT",
 454                                      "FMAXS","FMAXD","FMAXv",
 455                                      "FMINS","FMIND","FMINv",
 456                                      "FMAXNMS","FMAXNMD","FMAXNMv",
 457                                      "FMINNMS","FMINNMD","FMINNMv",
 458                                      "FMAXPv2f","FMAXPv4f",
 459                                      "FMINPv2f","FMINPv4f",
 460                                      "FMAXNMPv2f","FMAXNMPv4f",
 461                                      "FMINNMPv2f","FMINNMPv4f")>;
 462
 463 // FCMP,FCMPE,FCCMP,FCCMPE
 464 def : WriteRes<WriteFCmp, [CyUnitVC]> {let Latency = 4;}
 465
 466 // FCSEL is a WriteF.
 467
 468 //---
 469 // 7.9.4 Shifts and Bitfield Operations
 470 //---
 471
 472 // SHL is a WriteV
 473
 474 def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
 475 def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>;
 476
 477 def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
 478 def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>;
 479
 480 // Shift and accumulate uses the vector multiply unit.
 481 def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
 482 def CyReadVShiftAcc  : SchedReadAdvance<1,
 483                         [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>;
 484 def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc],
 485              (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;
 486
 487 // SSHL,USHL are WriteV.
 488
 489 def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>;
 490
 491 // SQSHL,SQSHLU,UQSHL are WriteV.
 492
 493 def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>;
 494
 495 // WriteV includes:
 496 // SHLL,SSHLL,USHLL
 497 // SLI,SRI
 498 // BIF,BIT,BSL
 499 // EXT
 500 // CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
 501 // XTN2
 502
 503 def : InstRW<[CyWriteV4],
 504              (instregex "RSHRNv","SHRNv",
 505                         "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv",
 506                         "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;
 507
 508 //---
 509 // 7.9.5 Multiplication
 510 //---
 511
 512 def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4;}
 513 def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv",
 514                              "SQDMULLv","SQDMULHv","SQRDMULHv")>;
 515
 516 // FMUL,FMULX,FNMUL default to WriteFMul.
 517 def : WriteRes<WriteFMul, [CyUnitVM]> { let Latency = 4;}
 518
 519 def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5;}
 520 def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed,
 521                                FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>;
 522
 523 def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
 524 def : InstRW<[CyWriteVMul, CyReadVMulAcc],
 525              (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL",
 526               "SQDMLAL","SQDMLSL")>;
 527
 528 def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8;}
 529 def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10;}
 530 def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>;
 531 def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>;
 532 // Single-precision FMAs use CyWriteSMul; double-precision use CyWriteDMul.
 533 def : InstRW<[CyWriteSMul, CyReadSMul],
 534              (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr,
 535               FMLAv2f32,FMLAv4f32,
 536               FMLAv1i32_indexed,FMLAv2i32_indexed)>;
 537 def : InstRW<[CyWriteDMul, CyReadDMul],
 538              (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr,
 539               FMLAv2f64,FMLAv1i64_indexed,FMLAv2i64_indexed,
 540               FMLSv2f64,FMLSv2i64_indexed)>;
 541
 542 def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; }
 543 def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>;
 544
 545 //---
 546 // 7.9.6 Divide and Square Root
 547 //---
 548
 549 // FDIV,FSQRT
 550 // TODO: Add 64-bit variant with 19 cycle latency.
 551 // TODO: Specialize FSQRT for longer latency.
 552 def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> {
 553   let Latency = 17;
 554   let ResourceCycles = [2, 17]; // Cycles held on [CyUnitVD, CyUnitFloatDiv].
 555 }
 556
 557 def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>;
 558
 559 def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
 560 def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>;
 561
 562 def WriteFRECPS : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
 563 def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
 564 def : InstRW<[WriteFRECPS],  (instregex "FRECPSv")>;
 565 def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;
 566
 567 //---
 568 // 7.9.7 Integer-FP Conversions
 569 //---
 570
 571 // FCVT lengthen from f16/f32
 572 def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;
 573
 574 // FCVT,FCVTN,FCVTXN
 575 // SCVTF,UCVTF V,V
 576 // FRINT(AIMNPXZ) V,V
 577 def : WriteRes<WriteFCvt, [CyUnitV]> {let Latency = 4;}
 578
 579 // SCVTF/UCVTF S/D, Rn (GPR to FPR) = VLD5+V4: 9 cycles.
 580 def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>;
 581 def : InstRW<[CyWriteCvtToFPR], (instregex "[SU]CVTF[SU][WX][SD]r")>;
 582
 583 // FCVT(AMNPZ)(SU) Rd, S/D (FPR to GPR) = V6+LD4: 10 cycles.
 584 def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>;
 585 def : InstRW<[CyWriteCvtToGPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>;
 586
 587 // FCVTL is a WriteV
 588
 589 //---
 590 // 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup
 591 //---
 592
 593 def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;}
 594 def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr,
 595                                        AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr,
 596                                        SHA1SU0rrr)>;
 597
 598 def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;}
 599 def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>;
 600
 601 def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;}
 602 def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr,
 603                                        SHA256Hrrr,SHA256H2rrr)>;
 604
 605 // TRN,UZP,ZIP are WriteV.
 606
 607 // TBL,TBX are WriteV.
 608
 609 //---
 610 // 7.9.11-7.9.14 Load/Store, single element and paired
 611 //---
 612
 613 // Loading into the vector unit takes 5 cycles vs 4 for integer loads.
 614 def : WriteRes<WriteVLD, [CyUnitLS]> {
 615   let Latency = 5;
 616 }
 617
 618 // Store-load forwarding is 4 cycles.
 619 def : WriteRes<WriteVST, [CyUnitLS]> {
 620   let Latency = 4;
 621 }
 622
 623 // WriteVLDPair/VSTPair sequences are expanded by the target description.
 624
 625 //---
 626 // 7.9.15 Load, element operations
 627 //---
 628
 629 // Only the first WriteVLD and WriteAdr for writeback matches def operands.
 630 // Subsequent WriteVLDs consume resources. Since all loaded values have the
 631 // same latency, this is acceptable.
 632
 633 // Vd is read 5 cycles after issuing the vector load.
 634 def : ReadAdvance<ReadVLD, 5>;
 635
 636 def : InstRW<[WriteVLD],
 637              (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 638 def : InstRW<[WriteVLD, WriteAdr],
 639              (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
 640
 641 // Register writes from the load's high half are fused micro-ops.
 642 def : InstRW<[WriteVLD],
 643              (instregex "LD1Twov(8b|4h|2s|1d)$")>;
 644 def : InstRW<[WriteVLD, WriteAdr],
 645              (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
 646 def : InstRW<[WriteVLD, WriteVLD],
 647              (instregex "LD1Twov(16b|8h|4s|2d)$")>;
 648 def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
 649              (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;
 650
 651 def : InstRW<[WriteVLD, WriteVLD],
 652              (instregex "LD1Threev(8b|4h|2s|1d)$")>;
 653 def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
 654              (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
 655 def : InstRW<[WriteVLD, WriteVLD, WriteVLD],
 656              (instregex "LD1Threev(16b|8h|4s|2d)$")>;
 657 def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD],
 658              (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;
 659
 660 def : InstRW<[WriteVLD, WriteVLD],
 661              (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
 662 def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
 663              (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
 664 def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD],
 665              (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
 666 def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
 667              (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;
 668
 669 def : InstRW<[WriteVLDShuffle, ReadVLD],
 670              (instregex "LD1i(8|16|32)$")>;
 671 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],
 672              (instregex "LD1i(8|16|32)_POST")>;
 673
 674 def : InstRW<[WriteVLDShuffle, ReadVLD],          (instrs LD1i64)>;
 675 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>;
 676
 677 def : InstRW<[WriteVLDShuffle],
 678              (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 679 def : InstRW<[WriteVLDShuffle, WriteAdr],
 680              (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 681
 682 def : InstRW<[WriteVLDShuffle, WriteV],
 683              (instregex "LD2Twov(8b|4h|2s)$")>;
 684 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
 685              (instregex "LD2Twov(8b|4h|2s)_POST$")>;
 686 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle],
 687              (instregex "LD2Twov(16b|8h|4s|2d)$")>;
 688 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
 689              (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;
 690
 691 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
 692              (instregex "LD2i(8|16|32)$")>;
 693 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
 694              (instregex "LD2i(8|16|32)_POST")>;
 695 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
 696              (instregex "LD2i64$")>;
 697 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
 698              (instregex "LD2i64_POST")>;
 699
 700 def : InstRW<[WriteVLDShuffle, WriteV],
 701              (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 702 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
 703              (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
 704
 705 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
 706              (instregex "LD3Threev(8b|4h|2s)$")>;
 707 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
 708              (instregex "LD3Threev(8b|4h|2s)_POST")>;
 709 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle],
 710              (instregex "LD3Threev(16b|8h|4s|2d)$")>;
 711 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
 712              (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;
 713
 714 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV],
 715              (instregex "LD3i(8|16|32)$")>;
 716 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
 717              (instregex "LD3i(8|16|32)_POST")>;
 718
 719 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
 720              (instregex "LD3i64$")>;
 721 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
 722              (instregex "LD3i64_POST")>;
 723
 724 def : InstRW<[WriteVLDShuffle, WriteV, WriteV],
 725              (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
 726 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
 727              (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;
 728
 729 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
 730              (instrs LD3Rv1d,LD3Rv2d)>;
 731 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
 732              (instrs LD3Rv1d_POST,LD3Rv2d_POST)>;
 733
 734 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
 735              (instregex "LD4Fourv(8b|4h|2s)$")>;
 736 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
 737              (instregex "LD4Fourv(8b|4h|2s)_POST")>;
 738 def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
 739               WriteVLDPairShuffle, WriteVLDPairShuffle],
 740              (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
 741 def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
 742               WriteVLDPairShuffle, WriteVLDPairShuffle],
 743              (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;
 744
 745 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
 746              (instregex "LD4i(8|16|32)$")>;
 747 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
 748              (instregex "LD4i(8|16|32)_POST")>;
 749
 750 // LD4i64_POST mirrors LD4i64's writes, adding WriteAdr for the writeback.
 751 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
 752              (instrs LD4i64)>;
 753 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV,
 754               WriteV], (instrs LD4i64_POST)>;
 755
 756 def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
 757              (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
 758 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
 759              (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;
 760
 761 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
 762              (instrs LD4Rv1d,LD4Rv2d)>;
 763 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
 764              (instrs LD4Rv1d_POST,LD4Rv2d_POST)>;
 765
 766 //---
 767 // 7.9.16 Store, element operations
 768 //---
 769
 770 // Only the WriteAdr for writeback matches a def operand.
 771 // The WriteVSTs only consume resources.
 772
 773 def : InstRW<[WriteVST],
 774              (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 775 def : InstRW<[WriteAdr, WriteVST],
 776              (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
 777
 778 def : InstRW<[WriteVSTShuffle],
 779              (instregex "ST1Twov(8b|4h|2s|1d)$")>;
 780 def : InstRW<[WriteAdr, WriteVSTShuffle],
 781              (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
 782 def : InstRW<[WriteVST, WriteVST],
 783              (instregex "ST1Twov(16b|8h|4s|2d)$")>;
 784 def : InstRW<[WriteAdr, WriteVST, WriteVST],
 785              (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;
 786
 787 def : InstRW<[WriteVSTShuffle, WriteVST],
 788              (instregex "ST1Threev(8b|4h|2s|1d)$")>;
 789 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST],
 790              (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
 791 def : InstRW<[WriteVST, WriteVST, WriteVST],
 792              (instregex "ST1Threev(16b|8h|4s|2d)$")>;
 793 def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST],
 794              (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;
 795
 796 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
 797              (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
 798 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
 799              (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
 800 def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST],
 801              (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
 802 def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST],
 803              (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;
 804
 805 def : InstRW<[WriteVSTShuffle],           (instregex "ST1i(8|16|32)$")>;
 806 def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>;
 807
 808 def : InstRW<[WriteVSTShuffle],           (instrs ST1i64)>;
 809 def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>;
 810
 811 def : InstRW<[WriteVSTShuffle],
 812              (instregex "ST2Twov(8b|4h|2s)$")>;
 813 def : InstRW<[WriteAdr, WriteVSTShuffle],
 814              (instregex "ST2Twov(8b|4h|2s)_POST")>;
 815 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
 816              (instregex "ST2Twov(16b|8h|4s|2d)$")>;
 817 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
 818              (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;
 819
 820 def : InstRW<[WriteVSTShuffle],           (instregex "ST2i(8|16|32)$")>;
 821 def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>;
 822 def : InstRW<[WriteVSTShuffle],           (instrs ST2i64)>;
 823 def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>;
 824
 825 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
 826              (instregex "ST3Threev(8b|4h|2s)$")>;
 827 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
 828              (instregex "ST3Threev(8b|4h|2s)_POST")>;
 829 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
 830              (instregex "ST3Threev(16b|8h|4s|2d)$")>;
 831 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
 832              (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;
 833
 834 def : InstRW<[WriteVSTShuffle],           (instregex "ST3i(8|16|32)$")>;
 835 def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>;
 836
 837 def :InstRW<[WriteVSTShuffle, WriteVSTShuffle],           (instrs ST3i64)>;
 838 def :InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>;
 839
 840 def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle],
 841             (instregex "ST4Fourv(8b|4h|2s|1d)$")>;
 842 def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle],
 843             (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>;
 844 def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle,
 845               WriteVSTPairShuffle, WriteVSTPairShuffle],
 846              (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
 847 def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle,
 848               WriteVSTPairShuffle, WriteVSTPairShuffle],
 849              (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;
 850
 851 def : InstRW<[WriteVSTPairShuffle],           (instregex "ST4i(8|16|32)$")>;
 852 def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>;
 853
 854 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],          (instrs ST4i64)>;
 855 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>;
 856
 857 // Atomic operations are not supported.
 858 def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
 859
 860 //---
 861 // Unused SchedRead types
 862 //---
 863
 864 def : ReadAdvance<ReadI, 0>;
 865 def : ReadAdvance<ReadISReg, 0>;
 866 def : ReadAdvance<ReadIEReg, 0>;
 867 def : ReadAdvance<ReadIM, 0>;
 868 def : ReadAdvance<ReadIMA, 0>;
 869 def : ReadAdvance<ReadID, 0>;
 870
 871 } // SchedModel = CycloneModel