contrib/llvm/lib/Target/AArch64/AArch64SchedCyclone.td

   1 //=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the machine model for AArch64 Cyclone to support
  11 // instruction scheduling and other instruction cost heuristics.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 def CycloneModel : SchedMachineModel {
       // Core-wide scheduling parameters for Cyclone. The read/write and InstRW
       // definitions further down are bound to this model by the enclosing
       // 'let SchedModel = CycloneModel in { ... }'.
  16   let IssueWidth = 6; // 6 micro-ops are dispatched per cycle.
  17   let MicroOpBufferSize = 192; // Based on the reorder buffer.
  18   let LoadLatency = 4; // Optimistic load latency.
  19   let MispredictPenalty = 16; // 14-19 cycles are typical.
       // NOTE(review): presumably declares that every instruction is covered by
       // this model -- confirm against SchedMachineModel in TargetSchedule.td.
  20   let CompleteModel = 1;
  21 }
  22
  23 //===----------------------------------------------------------------------===//
  24 // Define each kind of processor resource and number available on Cyclone.
  25
  26 // 4 integer pipes
     // Root of the integer resource hierarchy. CyUnitB and CyUnitIS name it
     // directly as Super; CyUnitBR and CyUnitID hang off CyUnitB, and CyUnitIM
     // hangs off CyUnitBR.
  27 def CyUnitI : ProcResource<4> {
  28   let BufferSize = 48;
  29 }
  30
  31 // 2 branch units: I[0..1]
  32 def CyUnitB : ProcResource<2> {
  33   let Super  = CyUnitI;
  34   let BufferSize = 24;
  35 }
  36
  37 // 1 indirect-branch unit: I[0]
     // No BufferSize override is given for this unit.
  38 def CyUnitBR : ProcResource<1> {
  39   let Super  = CyUnitB;
  40 }
  41
  42 // 2 shifter pipes: I[2..3]
  43 // When an instruction consumes a CyUnitIS, it also consumes a CyUnitI
  44 def CyUnitIS : ProcResource<2> {
  45   let Super = CyUnitI;
  46   let BufferSize = 24;
  47 }
  48
  49 // 1 mul pipe: I[0]
     // Shares I[0] with the indirect-branch unit, hence Super = CyUnitBR.
  50 def CyUnitIM : ProcResource<1> {
  51   let Super = CyUnitBR;
  52   let BufferSize = 32;
  53 }
  54
  55 // 1 div pipe: I[1]
  56 def CyUnitID : ProcResource<1> {
  57   let Super = CyUnitB;
  58   let BufferSize = 16;
  59 }
  60
  61 // 1 integer division unit. This is driven by the ID pipe, but only
  62 // consumes the pipe for one cycle at issue and another cycle at writeback.
     // Declared without Super or BufferSize. WriteID32/WriteID64 list it next to
     // CyUnitID and reserve it for as many cycles as their latency.
  63 def CyUnitIntDiv : ProcResource<1>;
  64
  65 // 2 ld/st pipes.
     // In this file the LS pipes back integer and vector loads/stores
     // (WriteLD, WriteST, WriteVLD, WriteVST), WriteBarrier and WriteFCopy.
  66 def CyUnitLS : ProcResource<2> {
  67   let BufferSize = 28;
  68 }
  69
  70 // 3 fp/vector pipes.
     // CyUnitVM and CyUnitVD are sub-groups of this resource (Super = CyUnitV);
     // CyUnitVC is in turn a sub-group of CyUnitVM.
  71 def CyUnitV : ProcResource<3> {
  72   let BufferSize = 48;
  73 }
  74 // 2 fp/vector arithmetic and multiply pipes: V[0-1]
  75 def CyUnitVM : ProcResource<2> {
  76   let Super = CyUnitV;
  77   let BufferSize = 32;
  78 }
  79 // 1 fp/vector division/sqrt pipe: V[2]
     // Also used below by the PMUL and crypto write types.
  80 def CyUnitVD : ProcResource<1> {
  81   let Super = CyUnitV;
  82   let BufferSize = 16;
  83 }
  84 // 1 fp compare pipe: V[0]
  85 def CyUnitVC : ProcResource<1> {
  86   let Super = CyUnitVM;
  87   let BufferSize = 16;
  88 }
  89
  90 // 2 fp division/square-root units.  These are driven by the VD pipe,
  91 // but only consume the pipe for one cycle at issue and a cycle at writeback.
     // WriteFDiv reserves CyUnitVD for 2 cycles and this resource for 17.
  92 def CyUnitFloatDiv : ProcResource<2>;
  93
  94 //===----------------------------------------------------------------------===//
  95 // Define scheduler read/write resources and latency on Cyclone.
  96 // This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1.
  97
  98 let SchedModel = CycloneModel in {
  99
 100 //---
 101 // 7.8.1. Moves
 102 //---
 103
 104 // A single nop micro-op (uX).
     // Empty resource list and zero latency. Used as the "renamed away"
     // alternative in the WriteImmZ, WriteMov and WriteVMov variants.
 105 def WriteX : SchedWriteRes<[]> { let Latency = 0; }
 106
 107 // Move zero is a register rename (to machine register zero).
 108 // The move is replaced by a single nop micro-op.
 109 // MOVZ Rd, #0
 110 // AND Rd, Rzr, #imm
     // WriteImmZ resolves to WriteX when TII->isGPRZero(*MI) holds and to the
     // ordinary WriteImm otherwise.
 111 def WriteZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
 112 def WriteImmZ  : SchedWriteVariant<[
 113                    SchedVar<WriteZPred, [WriteX]>,
 114                    SchedVar<NoSchedPred, [WriteImm]>]>;
 115 def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>;
 116
 117 // Move GPR is a register rename and single nop micro-op.
 118 // ORR Xd, XZR, Xm
 119 // ADD Xd, Xn, #0
     // WriteMov resolves to WriteX when either copy predicate holds and to
     // WriteI otherwise. WriteVMovPred is reused by WriteVMov in section 7.9.1.
     // NOTE(review): the example above is an immediate-form ADD, but the InstRW
     // lists ADDXrr -- confirm the opcode list matches what isGPRCopy accepts.
 120 def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(*MI)}]>;
 121 def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(*MI)}]>;
 122 def WriteMov      : SchedWriteVariant<[
 123                       SchedVar<WriteIMovPred, [WriteX]>,
 124                       SchedVar<WriteVMovPred, [WriteX]>,
 125                       SchedVar<NoSchedPred,   [WriteI]>]>;
 126 def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>;
 127
 128 // Move non-zero immediate is an integer ALU op.
 129 // MOVN,MOVZ,MOVK
     // Any integer pipe; no Latency or ResourceCycles override is given.
 130 def : WriteRes<WriteImm, [CyUnitI]>;
 131
 132 //---
 133 // 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional,
 134 //              Shifts and Bitfield Operations
 135 //---
 136
 137 // ADR,ADRP
 138 // ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri
 139 // ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr
 140 // ADC(S),SBC(S)
 141 // Aliases: CMN, CMP, TST
 142 //
 143 // Conditional operations.
 144 // CCMNi,CCMPi,CCMNr,CCMPr,
 145 // CSEL,CSINC,CSINV,CSNEG
 146 //
 147 // Bit counting and reversal operations.
 148 // CLS,CLZ,RBIT,REV,REV16,REV32
     // All of the above use any one integer pipe with no Latency or
     // ResourceCycles override.
 149 def : WriteRes<WriteI, [CyUnitI]>;
 150
 151 // ADD with shifted register operand is a single micro-op that
 152 // consumes a shift pipeline for two cycles.
 153 // ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs
 154 // EXAMPLE: ADDrs Xn, Xm LSL #imm
     // Latency and the CyUnitIS reservation are both 2 cycles.
 155 def : WriteRes<WriteISReg, [CyUnitIS]> {
 156   let Latency = 2;
 157   let ResourceCycles = [2];
 158 }
 159
 160 // ADD with extended register operand is the same as shifted reg operand.
 161 // ADD(S)re,SUB(S)re
 162 // EXAMPLE: ADDXre Xn, Xm, UXTB #1
 163 def : WriteRes<WriteIEReg, [CyUnitIS]> {
 164   let Latency = 2;
 165   let ResourceCycles = [2];
 166 }
 167
 168 // Variable shift and bitfield operations.
 169 // ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM
     // Unlike WriteISReg/WriteIEReg, no Latency or ResourceCycles override is
     // given for the shifter pipe here.
 170 def : WriteRes<WriteIS, [CyUnitIS]>;
 171
 172 // EXTR shifts a pair of registers and requires two micro-ops.
 173 // The second micro-op is delayed, as modeled by ReadExtrHi.
 174 // EXTR Xn, Xm, #imm
     // The resource list names CyUnitIS twice and NumMicroOps is 2.
 175 def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]> {
 176   let Latency = 2;
 177   let NumMicroOps = 2;
 178 }
 179
 180 // EXTR's first register read is delayed by one cycle, effectively
 181 // shortening its writer's latency.
 182 // EXTR Xn, Xm, #imm
 183 def : ReadAdvance<ReadExtrHi, 1>;
 184
 185 //---
 186 // 7.8.6. Multiplies
 187 //---
 188
 189 // MUL/MNEG are aliases for MADD/MSUB.
     // Both widths use the single CyUnitIM pipe; only the latency differs
     // (4 cycles for WriteIM32, 5 for WriteIM64).
 190 // MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL
 191 def : WriteRes<WriteIM32, [CyUnitIM]> {
 192   let Latency = 4;
 193 }
 194 // MADDX,MSUBX,SMULH,UMULH
 195 def : WriteRes<WriteIM64, [CyUnitIM]> {
 196   let Latency = 5;
 197 }
 198
 199 //---
 200 // 7.8.7. Divide
 201 //---
 202
 203 // 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient.
 204 // The ID pipe is consumed for 2 cycles: issue and writeback.
     // ResourceCycles is listed in the same order as the resources: 2 cycles of
     // CyUnitID, then the divider (CyUnitIntDiv) for the full latency.
 205 // SDIVW,UDIVW
 206 def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> {
 207   let Latency = 10;
 208   let ResourceCycles = [2, 10];
 209 }
 210 // 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient.
 211 // The ID pipe is consumed for 2 cycles: issue and writeback.
 212 // SDIVX,UDIVX
 213 def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]> {
 214   let Latency = 13;
 215   let ResourceCycles = [2, 13];
 216 }
 217
 218 //---
 219 // 7.8.8,7.8.10. Load/Store, single element
 220 //---
 221
 222 // Integer loads take 4 cycles and use one LS unit for one cycle.
     // The 4-cycle latency matches LoadLatency in CycloneModel.
 223 def : WriteRes<WriteLD, [CyUnitLS]> {
 224   let Latency = 4;
 225 }
 226
 227 // Store-load forwarding is 4 cycles.
 228 //
 229 // Note: The store-exclusive sequence incorporates this
 230 // latency. However, general heuristics should not model the
 231 // dependence between a store and subsequent may-alias load because
 232 // hardware speculation works.
 233 def : WriteRes<WriteST, [CyUnitLS]> {
 234   let Latency = 4;
 235 }
 236
 237 // Load from base address plus an optionally scaled register offset.
 238 // Rt latency is latency WriteIS + WriteLD.
 239 // EXAMPLE: LDR Xn, Xm [, lsl 3]
     // ScaledIdxPred is not defined in the visible part of this file
     // (presumably in the shared AArch64 scheduling definitions -- confirm).
 240 def CyWriteLDIdx : SchedWriteVariant<[
 241   SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>, // Load from scaled register.
 242   SchedVar<NoSchedPred,   [WriteLD]>]>;        // Load from register offset.
 243 def : SchedAlias<WriteLDIdx, CyWriteLDIdx>;    // Map AArch64->Cyclone type.
 244
     // Store counterpart of CyWriteLDIdx: a scaled index adds a WriteIS in front
     // of the store.
 245 // EXAMPLE: STR Xn, Xm [, lsl 3]
 246 def CyWriteSTIdx : SchedWriteVariant<[
 247   SchedVar<ScaledIdxPred, [WriteIS, WriteST]>, // Store to scaled register.
 248   SchedVar<NoSchedPred,   [WriteST]>]>;        // Store to register offset.
 249 def : SchedAlias<WriteSTIdx, CyWriteSTIdx>;    // Map AArch64->Cyclone type.
 250
 251 // Read the (unshifted) base register Xn in the second micro-op one cycle later.
 252 // EXAMPLE: LDR Xn, Xm [, lsl 3]
     // With a scaled index the base read gets a 1-cycle advance (ReadBaseRS);
     // otherwise ReadDefault is used.
 253 def ReadBaseRS : SchedReadAdvance<1>;
 254 def CyReadAdrBase : SchedReadVariant<[
 255   SchedVar<ScaledIdxPred, [ReadBaseRS]>, // Read base reg after shifting offset.
 256   SchedVar<NoSchedPred,   [ReadDefault]>]>;   // Read base reg with no shift.
 257 def : SchedAlias<ReadAdrBase, CyReadAdrBase>; // Map AArch64->Cyclone type.
 258
 259 //---
 260 // 7.8.9,7.8.11. Load/Store, paired
 261 //---
 262
 263 // Address pre/post increment is a simple ALU op with one cycle latency.
 264 def : WriteRes<WriteAdr, [CyUnitI]>;
 265
 266 // LDP high register write is fused with the load, but a nop micro-op remains.
     // No resources are consumed; only the 4-cycle load latency is modeled.
 267 def : WriteRes<WriteLDHi, []> {
 268   let Latency = 4;
 269 }
 270
 271 // STP is a vector op and store, except for QQ, which is just two stores.
     // WriteVSTShuffle is not defined in the visible part of this file.
 272 def : SchedAlias<WriteSTP, WriteVSTShuffle>;
 273 def : InstRW<[WriteST, WriteST], (instrs STPQi)>;
 274
 275 //---
 276 // 7.8.13. Branches
 277 //---
 278
 279 // Branches take a single micro-op.
 280 // The misprediction penalty is defined as a SchedMachineModel property.
     // Direct branches may use either branch unit (CyUnitB); register branches
     // are restricted to the indirect-branch unit (CyUnitBR). Both have zero
     // latency.
 281 def : WriteRes<WriteBr,    [CyUnitB]>  {let Latency = 0;}
 282 def : WriteRes<WriteBrReg, [CyUnitBR]> {let Latency = 0;}
 283
 284 //---
 285 // 7.8.14. Never-issued Instructions, Barrier and Hint Operations
 286 //---
 287
 288 // NOP,SEV,SEVL,WFE,WFI,YIELD
     // Hints consume no resources and have zero latency.
 289 def : WriteRes<WriteHint, []> {let Latency = 0;}
 290 // ISB
     // ISB is modeled as a plain integer ALU op.
 291 def : InstRW<[WriteI], (instrs ISB)>;
 292 // CLREX,DMB,DSB
     // Barriers occupy a load/store pipe.
 293 def : WriteRes<WriteBarrier, [CyUnitLS]>;
 294
 295 // System instructions get an invalid latency because the latency of
 296 // other operations across them is meaningless.
 297 def : WriteRes<WriteSys, []> {let Latency = -1;}
 298
 299 //===----------------------------------------------------------------------===//
 300 // 7.9 Vector Unit Instructions
 301
 302 // Simple vector operations take 2 cycles.
 303 def : WriteRes<WriteV, [CyUnitV]> {let Latency = 2;}
 304
 305 // Define some longer latency vector op types for Cyclone.
     // Each uses any one CyUnitV pipe; the numeric suffix is the latency.
 306 def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
 307 def CyWriteV4 : SchedWriteRes<[CyUnitV]> {let Latency = 4;}
 308 def CyWriteV5 : SchedWriteRes<[CyUnitV]> {let Latency = 5;}
 309 def CyWriteV6 : SchedWriteRes<[CyUnitV]> {let Latency = 6;}
 310
 311 // Simple floating-point operations take 2 cycles.
 312 def : WriteRes<WriteF, [CyUnitV]> {let Latency = 2;}
 313
 314 //---
 315 // 7.9.1 Vector Moves
 316 //---
 317
 318 // TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently
 319 // generates expensive int-float conversion instead:
 320 // FMOVDi Dd, #0.0
 321 // FMOVv2f64ns Vd.2d, #0.0
 322
 323 // FMOVSi,FMOVDi
     // FP immediate moves cost the same as a simple vector op: one CyUnitV
     // pipe, 2 cycles.
 324 def : WriteRes<WriteFImm, [CyUnitV]> {let Latency = 2;}
 325
 326 // MOVI,MVNI are WriteV
 327 // FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV
 328
 329 // Move FPR is a register rename and single nop micro-op.
 330 // ORR.16b Vd,Vn,Vn
 331 // COPY is handled above in the WriteMov Variant.
     // Reuses WriteVMovPred (TII->isFPRCopy) from section 7.8.1; when it does
     // not hold the instruction is an ordinary WriteV.
 332 def WriteVMov    : SchedWriteVariant<[
 333                      SchedVar<WriteVMovPred, [WriteX]>,
 334                      SchedVar<NoSchedPred,   [WriteV]>]>;
 335 def : InstRW<[WriteVMov], (instrs ORRv16i8)>;
 336
 337 // FMOVSr,FMOVDr are WriteF.
 338
 339 // MOV V,V is a WriteV.
 340
 341 // CPY D,V[x] is a WriteV
 342
 343 // INS V[x],V[y] is a WriteV.
 344
 345 // FMOVWSr,FMOVXDr,FMOVXDHighr
     // Modeled on a load/store pipe with the 5-cycle vector-load latency.
 346 def : WriteRes<WriteFCopy, [CyUnitLS]> {
 347   let Latency = 5;
 348 }
 349
 350 // FMOVSWr,FMOVDXr,FMOVDXHighr
     // Overrides the default for these opcodes with the integer-load type.
 351 def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;
 352
 353 // INS V[x],R
     // Sequence of a vector-load-like transfer followed by a simple vector op.
 354 def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
 355 def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;
 356
 357 // SMOV,UMOV R,V[x]
     // Sequence of an integer-load-like transfer followed by an integer ALU op.
 358 def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>;
 359 def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>;
 360
 361 // DUP V,R
 362 def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>;
 363
 364 // DUP V,V[x] is a WriteV.
 365
 366 //---
 367 // 7.9.2 Integer Arithmetic, Logical, and Comparisons
 368 //---
 369
 370 // BIC,ORR V,#imm are WriteV
     // "X is/are WriteV" notes in this section mean no InstRW override is
     // given here for those instructions.
 371
 372 def : InstRW<[CyWriteV3], (instregex "ABSv")>;
 373
 374 // MVN,NEG,NOT are WriteV
 375
 376 def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>;
 377
 378 // ADDP is a WriteV.
     // CyWriteVADDLP has the same pipe and latency as WriteV but is a distinct
     // type so that CyReadVAccum can name it as a forwarding producer.
 379 def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
 380 def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>;
 381
     // Across-lane reductions.
 382 def : InstRW<[CyWriteV3],
 383              (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>;
 384
 385 def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>;
 386
 387 // ADD,SUB are WriteV
 388
 389 // Forward declare.
     // CyWriteVABD must exist before CyReadVAccum references it; the InstRW that
     // assigns it to SABD/UABD appears at the end of this section.
 390 def CyWriteVABD : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
 391
 392 // Add/Diff and accumulate uses the vector multiply unit.
 393 def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
     // 1-cycle read advance, applied only when the producer is one of the three
     // listed write types.
 394 def CyReadVAccum  : SchedReadAdvance<1,
 395                     [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>;
 396
 397 def : InstRW<[CyWriteVAccum, CyReadVAccum],
 398              (instregex "SADALP","UADALP")>;
 399
 400 def : InstRW<[CyWriteVAccum, CyReadVAccum],
 401              (instregex "SABAv","UABAv","SABALv","UABALv")>;
 402
     // Saturating add/subtract: 3 cycles on any vector pipe.
 403 def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>;
 404
 405 def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>;
 406
     // Narrowing high-half add/subtract: 4 cycles.
 407 def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>;
 408
 409 // WriteV includes:
 410 // AND,BIC,CMTST,EOR,ORN,ORR
 411 // ADDP
 412 // SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD
 413 // SADDL,SSUBL,UADDL,USUBL
 414 // SADDW,SSUBW,UADDW,USUBW
 415
     // Integer compares.
 416 def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv",
 417                                      "CMLEv","CMLTv",
 418                                      "CMHIv","CMHSv")>;
 419
     // Integer min/max, element-wise and pairwise.
 420 def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv",
 421                                      "SMAXPv","SMINPv","UMAXPv","UMINPv")>;
 422
     // Absolute difference; CyWriteVABD is one of the producers that
     // CyReadVAccum forwards from.
 423 def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv",
 424                                        "SABDLv","UABDLv")>;
 425
 426 //---
 427 // 7.9.3 Floating Point Arithmetic and Comparisons
 428 //---
 429
 430 // FABS,FNEG are WriteF
 431
     // Scalar pairwise FADDP: 4 cycles for the v2i32p form, 5 for v2i64p.
 432 def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>;
 433 def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>;
 434
 435 def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i",
 436                                      "FMINPv2i","FMINNMPv2i")>;
 437
 438 def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>;
 439
     // Add, subtract and absolute difference: the 32-bit-element forms listed
     // first take 4 cycles, the 64-bit-element forms take 5.
 440 def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32,
 441                                   FSUBSrr,FSUBv2f32,FSUBv4f32,
 442                                   FADDPv2f32,FADDPv4f32,
 443                                   FABD32,FABDv2f32,FABDv4f32)>;
 444 def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64,
 445                                   FSUBDrr,FSUBv2f64,
 446                                   FADDPv2f64,
 447                                   FABD64,FABDv2f64)>;
 448
     // Compares that write a vector/FP register result.
 449 def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>;
 450
 451 def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT",
 452                                      "FMAXS","FMAXD","FMAXv",
 453                                      "FMINS","FMIND","FMINv",
 454                                      "FMAXNMS","FMAXNMD","FMAXNMv",
 455                                      "FMINNMS","FMINNMD","FMINNMv",
 456                                      "FMAXPv2f","FMAXPv4f",
 457                                      "FMINPv2f","FMINPv4f",
 458                                      "FMAXNMPv2f","FMAXNMPv4f",
 459                                      "FMINNMPv2f","FMINNMPv4f")>;
 460
 461 // FCMP,FCMPE,FCCMP,FCCMPE
     // These compares are restricted to the single compare pipe (CyUnitVC).
 462 def : WriteRes<WriteFCmp, [CyUnitVC]> {let Latency = 4;}
 463
 464 // FCSEL is a WriteF.
 465
 466 //---
 467 // 7.9.4 Shifts and Bitfield Operations
 468 //---
 469
 470 // SHL is a WriteV
 471
     // CyWriteVSHR and CyWriteVSRSHR are separate types so that CyReadVShiftAcc
     // can name them as forwarding producers.
 472 def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {let Latency = 2;}
 473 def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>;
 474
 475 def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {let Latency = 3;}
 476 def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>;
 477
 478 // Shift and accumulate uses the vector multiply unit.
 479 def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {let Latency = 3;}
     // 1-cycle read advance when the producer is one of the three listed
     // shift write types.
 480 def CyReadVShiftAcc  : SchedReadAdvance<1,
 481                         [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>;
 482 def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc],
 483              (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;
 484
 485 // SSHL,USHL are WriteV.
 486
 487 def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>;
 488
 489 // SQSHL,SQSHLU,UQSHL are WriteV.
 490
 491 def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>;
 492
 493 // WriteV includes:
 494 // SHLL,SSHLL,USHLL
 495 // SLI,SRI
 496 // BIF,BIT,BSL
 497 // EXT
 498 // CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
 499 // XTN2
 500
     // Narrowing shifts and saturating narrows: 4 cycles on any vector pipe.
 501 def : InstRW<[CyWriteV4],
 502              (instregex "RSHRNv","SHRNv",
 503                         "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv",
 504                         "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;
 505
 506 //---
 507 // 7.9.5 Multiplication
 508 //---
 509
     // Integer vector multiplies: 4 cycles on a multiply pipe (CyUnitVM).
 510 def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4;}
 511 def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv",
 512                              "SQDMULLv","SQDMULHv","SQRDMULHv")>;
 513
 514 // FMUL,FMULX,FNMUL default to WriteFMul.
 515 def : WriteRes<WriteFMul, [CyUnitVM]> { let Latency = 4;}
 516
     // The 64-bit-element FP multiplies listed here are overridden to 5 cycles.
 517 def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5;}
 518 def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed,
 519                                FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>;
 520
     // Integer multiply-accumulate: the read gets a 1-cycle advance when the
     // producer is CyWriteVMul or CyWriteV64Mul.
 521 def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
 522 def : InstRW<[CyWriteVMul, CyReadVMulAcc],
 523              (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL",
 524               "SQDMLAL","SQDMLSL")>;
 525
     // Fused multiply-add. Single-precision forms take 8 cycles and
     // double-precision forms 10. The read advance (4 and 5 cycles) applies
     // only when the producer is the same FMA write type, i.e. for chained
     // accumulations.
 526 def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8;}
 527 def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10;}
 528 def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>;
 529 def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>;
 530
     // 32-bit-element forms. FMLS is listed alongside FMLA, mirroring the
     // double-precision group below, and the v4i32 indexed form is included.
 531 def : InstRW<[CyWriteSMul, CyReadSMul],
 532              (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr,
 533               FMLAv2f32,FMLAv4f32,
                   FMLSv2f32,FMLSv4f32,
 534               FMLAv1i32_indexed,FMLAv2i32_indexed,FMLAv4i32_indexed,
                   FMLSv1i32_indexed,FMLSv2i32_indexed,FMLSv4i32_indexed)>;
     // 64-bit-element forms, including the scalar v1i64 indexed variants.
 535 def : InstRW<[CyWriteDMul, CyReadDMul],
 536              (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr,
 537               FMLAv2f64,FMLAv1i64_indexed,FMLAv2i64_indexed,
 538               FMLSv2f64,FMLSv1i64_indexed,FMLSv2i64_indexed)>;
 539
     // Polynomial multiply: 3 cycles.
     // NOTE(review): this is placed on CyUnitVD (the division/sqrt pipe) rather
     // than the multiply pipes (CyUnitVM) -- confirm against the tuning guide.
 540 def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; }
 541 def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>;
 542
 543 //---
 544 // 7.9.6 Divide and Square Root
 545 //---
 546
 547 // FDIV,FSQRT
 548 // TODO: Add 64-bit variant with 19 cycle latency.
 549 // TODO: Specialize FSQRT for longer latency.
     // ResourceCycles follows the resource order: 2 cycles of the VD pipe, then
     // a divide unit (CyUnitFloatDiv) for the full 17-cycle latency.
 550 def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> {
 551   let Latency = 17;
 552   let ResourceCycles = [2, 17];
 553 }
 554
     // Reciprocal and reciprocal-square-root estimates: 4 cycles. FRSQRTE is
     // restricted to the multiply pipes; the others may use any vector pipe.
 555 def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>;
 556
 557 def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
 558 def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>;
 559
     // Step instructions on the multiply pipes: 8 cycles for FRECPS and 10 for
     // FRSQRTS.
 560 def WriteFRECPS : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
 561 def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
 562 def : InstRW<[WriteFRECPS],  (instregex "FRECPSv")>;
 563 def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;
 564
 565 //---
 566 // 7.9.7 Integer-FP Conversions
 567 //---
 568
 569 // FCVT lengthen f16/s32
     // These three scalar conversions are overridden to the 2-cycle WriteV.
 570 def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;
 571
 572 // FCVT,FCVTN,FCVTXN
 573 // SCVTF,UCVTF V,V
 574 // FRINT(AIMNPXZ) V,V
     // Default for the remaining conversions: any vector pipe, 4 cycles.
 575 def : WriteRes<WriteFCvt, [CyUnitV]> {let Latency = 4;}
 576
 577 // SCVTF/UCVTF S/D, Rn = VLD5+V4: 9 cycles.
     // Integer-to-FP conversion from a GPR: the value is first moved to the
     // vector unit (WriteVLD), then converted (CyWriteV4).
 578 def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>;
 579 def : InstRW<[CyWriteCvtToFPR], (instregex "[SU]CVTF[SU][WX][SD]r")>;
 580
 581 // FCVT Rd, S/D = V6+LD4: 10 cycles
     // FP-to-integer conversion into a GPR: convert in the vector unit
     // (CyWriteV6), then move the result to the integer side (WriteLD).
 582 def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>;
 583 def : InstRW<[CyWriteCvtToGPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>;
 584
 585 // FCVTL is a WriteV
 586
 587 //---
 588 // 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup
 589 //---
 590
     // All crypto write types below use the CyUnitVD pipe; the numeric suffix
     // is the latency in cycles.
 591 def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;}
 592 def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr,
 593                                        AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr,
 594                                        SHA1SU0rrr)>;
 595
 596 def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;}
 597 def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>;
 598
 599 def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;}
 600 def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr,
 601                                        SHA256Hrrr,SHA256H2rrr)>;
 602
 603 // TRN,UZP,ZIP are WriteV.
 604
 605 // TBL,TBX are WriteV.
 606
 607 //---
 608 // 7.9.11-7.9.14 Load/Store, single element and paired
 609 //---
 610
 611 // Loading into the vector unit takes 5 cycles vs 4 for integer loads.
     // Same load/store pipes as WriteLD; only the latency differs.
 612 def : WriteRes<WriteVLD, [CyUnitLS]> {
 613   let Latency = 5;
 614 }
 615
 616 // Store-load forwarding is 4 cycles.
     // Same pipe and latency as the integer WriteST.
 617 def : WriteRes<WriteVST, [CyUnitLS]> {
 618   let Latency = 4;
 619 }
 620
 621 // WriteVLDPair/VSTPair sequences are expanded by the target description.
 622
 623 //---
 624 // 7.9.15 Load, element operations
 625 //---
 626
 627 // Only the first WriteVLD and WriteAdr for writeback matches def operands.
 628 // Subsequent WriteVLDs consume resources. Since all loaded values have the
 629 // same latency, this is acceptable.
 630
 631 // Vd is read 5 cycles after issuing the vector load.
     // The 5-cycle advance equals the WriteVLD latency. ReadVLD is used by the
     // lane loads below.
 632 def : ReadAdvance<ReadVLD, 5>;
 633
     // LD1 (multiple structures). The (8b|4h|2s|1d) forms list one WriteVLD per
     // pair of registers (rounded up); the (16b|8h|4s|2d) forms list one per
     // register. _POST forms insert WriteAdr as the second entry.
 634 def : InstRW<[WriteVLD],
 635              (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 636 def : InstRW<[WriteVLD, WriteAdr],
 637              (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
 638
 639 // Register writes from the load's high half are fused micro-ops.
 640 def : InstRW<[WriteVLD],
 641              (instregex "LD1Twov(8b|4h|2s|1d)$")>;
 642 def : InstRW<[WriteVLD, WriteAdr],
 643              (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
 644 def : InstRW<[WriteVLD, WriteVLD],
 645              (instregex "LD1Twov(16b|8h|4s|2d)$")>;
 646 def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
 647              (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;
 648
 649 def : InstRW<[WriteVLD, WriteVLD],
 650              (instregex "LD1Threev(8b|4h|2s|1d)$")>;
 651 def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
 652              (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
 653 def : InstRW<[WriteVLD, WriteVLD, WriteVLD],
 654              (instregex "LD1Threev(16b|8h|4s|2d)$")>;
 655 def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD],
 656              (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;
 657
 658 def : InstRW<[WriteVLD, WriteVLD],
 659              (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
 660 def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
 661              (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
 662 def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD],
 663              (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
 664 def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
 665              (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;
 666
     // LD1 to a single lane and LD1R (replicate). Lane forms also read the
     // destination vector, hence the ReadVLD entry; LD1R forms have no ReadVLD.
     // WriteVLDShuffle is not defined in the visible part of this file.
 667 def : InstRW<[WriteVLDShuffle, ReadVLD],
 668              (instregex "LD1i(8|16|32)$")>;
 669 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],
 670              (instregex "LD1i(8|16|32)_POST")>;
 671
 672 def : InstRW<[WriteVLDShuffle, ReadVLD],          (instrs LD1i64)>;
 673 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>;
 674
 675 def : InstRW<[WriteVLDShuffle],
 676              (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 677 def : InstRW<[WriteVLDShuffle, WriteAdr],
 678              (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 679
     // LD2 family. Each base list is repeated for the _POST form with WriteAdr
     // inserted after the first write (and after ReadVLD for lane forms).
 680 def : InstRW<[WriteVLDShuffle, WriteV],
 681              (instregex "LD2Twov(8b|4h|2s)$")>;
 682 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
 683              (instregex "LD2Twov(8b|4h|2s)_POST$")>;
 684 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle],
 685              (instregex "LD2Twov(16b|8h|4s|2d)$")>;
 686 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
 687              (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;
 688
     // Single-lane forms.
 689 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
 690              (instregex "LD2i(8|16|32)$")>;
 691 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
 692              (instregex "LD2i(8|16|32)_POST")>;
 693 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
 694              (instregex "LD2i64$")>;
 695 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
 696              (instregex "LD2i64_POST")>;
 697
     // Replicate forms.
 698 def : InstRW<[WriteVLDShuffle, WriteV],
 699              (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 700 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
 701              (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
 702
     // LD3 family. Each base list is repeated for the _POST form with WriteAdr
     // inserted after the first write (and after ReadVLD for lane forms).
 703 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
 704              (instregex "LD3Threev(8b|4h|2s)$")>;
 705 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
 706              (instregex "LD3Threev(8b|4h|2s)_POST")>;
 707 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle],
 708              (instregex "LD3Threev(16b|8h|4s|2d)$")>;
 709 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
 710              (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;
 711
     // Single-lane forms; the 64-bit lane form has a second WriteVLDShuffle in
     // place of one WriteV.
 712 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV],
 713              (instregex "LD3i(8|16|32)$")>;
 714 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
 715              (instregex "LD3i(8|16|32)_POST")>;
 716
 717 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
 718              (instregex "LD3i64$")>;
 719 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
 720              (instregex "LD3i64_POST")>;
 721
     // Replicate forms; the 1d/2d forms have a second WriteVLDShuffle in place
     // of one WriteV.
 722 def : InstRW<[WriteVLDShuffle, WriteV, WriteV],
 723              (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
 724 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
 725              (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;
 726
 727 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
 728              (instrs LD3Rv1d,LD3Rv2d)>;
 729 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
 730              (instrs LD3Rv1d_POST,LD3Rv2d_POST)>;
 731
     // LD4 (multiple structures) and LD4 to an 8/16/32-bit lane. The
     // (16b|8h|4s|2d) forms use WriteVLDPairShuffle, which is not defined in
     // the visible part of this file. _POST forms add WriteAdr.
 732 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
 733              (instregex "LD4Fourv(8b|4h|2s)$")>;
 734 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
 735              (instregex "LD4Fourv(8b|4h|2s)_POST")>;
 736 def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
 737               WriteVLDPairShuffle, WriteVLDPairShuffle],
 738              (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
 739 def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
 740               WriteVLDPairShuffle, WriteVLDPairShuffle],
 741              (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;
 742
 743 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
 744              (instregex "LD4i(8|16|32)$")>;
 745 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
 746              (instregex "LD4i(8|16|32)_POST")>;
 747
 748
 749 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
 750              (instrs LD4i64)>;
     // Post-index form: the same list as LD4i64 with WriteAdr inserted after
     // ReadVLD, matching how every other _POST entry in this file is derived
     // from its base form. Both trailing WriteV entries are kept.
 751 def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV,
                   WriteV],
 752              (instrs LD4i64_POST)>;
 753
     // LD4R (replicate). The 1d/2d forms have a second WriteVLDShuffle in place
     // of one WriteV; _POST forms insert WriteAdr as the second entry.
 754 def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
 755              (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
 756 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
 757              (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;
 758
 759 def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
 760              (instrs LD4Rv1d,LD4Rv2d)>;
 761 def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
 762              (instrs LD4Rv1d_POST,LD4Rv2d_POST)>;
 763
 764 //---
 765 // 7.9.16 Store, element operations
 766 //---
 767
 768 // Only the WriteAdr for writeback matches a def operand.
 769 // Subsequent WriteVSTs only consume resources.
 770
     // ST1 family. _POST forms put WriteAdr first, followed by the same list as
     // the base form. The (16b|8h|4s|2d) multi-register forms list one WriteVST
     // per register; the (8b|4h|2s|1d) forms use WriteVSTShuffle for each pair.
 771 def : InstRW<[WriteVST],
 772              (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 773 def : InstRW<[WriteAdr, WriteVST],
 774              (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;
 775
 776 def : InstRW<[WriteVSTShuffle],
 777              (instregex "ST1Twov(8b|4h|2s|1d)$")>;
 778 def : InstRW<[WriteAdr, WriteVSTShuffle],
 779              (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
 780 def : InstRW<[WriteVST, WriteVST],
 781              (instregex "ST1Twov(16b|8h|4s|2d)$")>;
 782 def : InstRW<[WriteAdr, WriteVST, WriteVST],
 783              (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;
 784
 785 def : InstRW<[WriteVSTShuffle, WriteVST],
 786              (instregex "ST1Threev(8b|4h|2s|1d)$")>;
 787 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST],
 788              (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
 789 def : InstRW<[WriteVST, WriteVST, WriteVST],
 790              (instregex "ST1Threev(16b|8h|4s|2d)$")>;
 791 def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST],
 792              (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;
 793
 794 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
 795              (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
 796 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
 797              (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
 798 def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST],
 799              (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
 800 def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST],
 801              (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;
 802
     // Single-lane stores.
 803 def : InstRW<[WriteVSTShuffle],           (instregex "ST1i(8|16|32)$")>;
 804 def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>;
 805
 806 def : InstRW<[WriteVSTShuffle],           (instrs ST1i64)>;
 807 def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>;
 808
     // ST2 family. _POST forms put WriteAdr first. The (16b|8h|4s|2d) forms use
     // two WriteVSTShuffle entries; every other form uses one.
 809 def : InstRW<[WriteVSTShuffle],
 810              (instregex "ST2Twov(8b|4h|2s)$")>;
 811 def : InstRW<[WriteAdr, WriteVSTShuffle],
 812              (instregex "ST2Twov(8b|4h|2s)_POST")>;
 813 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
 814              (instregex "ST2Twov(16b|8h|4s|2d)$")>;
 815 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
 816              (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;
 817
 818 def : InstRW<[WriteVSTShuffle],           (instregex "ST2i(8|16|32)$")>;
 819 def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>;
 820 def : InstRW<[WriteVSTShuffle],           (instrs ST2i64)>;
 821 def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>;
 822
     // ST3 family. _POST forms put WriteAdr first. The 64-bit lane form uses
     // two WriteVSTShuffle entries, the narrower lane forms one.
 823 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
 824              (instregex "ST3Threev(8b|4h|2s)$")>;
 825 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
 826              (instregex "ST3Threev(8b|4h|2s)_POST")>;
 827 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
 828              (instregex "ST3Threev(16b|8h|4s|2d)$")>;
 829 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
 830              (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;
 831
 832 def : InstRW<[WriteVSTShuffle],           (instregex "ST3i(8|16|32)$")>;
 833 def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>;
 834
 835 def :InstRW<[WriteVSTShuffle, WriteVSTShuffle],           (instrs ST3i64)>;
 836 def :InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>;
 837
     // ST4 family. _POST forms put WriteAdr first. WriteVSTPairShuffle is not
     // defined in the visible part of this file. The 64-bit lane form uses two
     // WriteVSTShuffle entries instead of WriteVSTPairShuffle.
 838 def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle],
 839             (instregex "ST4Fourv(8b|4h|2s|1d)$")>;
 840 def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle],
 841             (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>;
 842 def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle,
 843               WriteVSTPairShuffle, WriteVSTPairShuffle],
 844              (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
 845 def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle,
 846               WriteVSTPairShuffle, WriteVSTPairShuffle],
 847              (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;
 848
 849 def : InstRW<[WriteVSTPairShuffle],           (instregex "ST4i(8|16|32)$")>;
 850 def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>;
 851
 852 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],          (instrs ST4i64)>;
 853 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>;
 854
 855 // Atomic operations are not supported.
     // WriteAtomic is given an empty resource list and flagged Unsupported
     // instead of being assigned a latency.
 856 def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
 857
 858 //---
 859 // Unused SchedRead types
 860 //---
 861
     // These read types get a zero-cycle advance, i.e. no forwarding is modeled
     // for them in this file.
 862 def : ReadAdvance<ReadI, 0>;
 863 def : ReadAdvance<ReadISReg, 0>;
 864 def : ReadAdvance<ReadIEReg, 0>;
 865 def : ReadAdvance<ReadIM, 0>;
 866 def : ReadAdvance<ReadIMA, 0>;
 867 def : ReadAdvance<ReadID, 0>;
 868
 869 } // SchedModel = CycloneModel