contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.h

   1 //===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 /// \file
  10 /// SI Machine Scheduler interface
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
  15 #define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
  16
  17 #include "SIInstrInfo.h"
  18 #include "llvm/CodeGen/MachineBasicBlock.h"
  19 #include "llvm/CodeGen/MachineScheduler.h"
  20 #include "llvm/CodeGen/RegisterPressure.h"
  21 #include "llvm/CodeGen/ScheduleDAG.h"
  22 #include <cassert>
  23 #include <cstdint>
  24 #include <map>
  25 #include <memory>
  26 #include <set>
  27 #include <vector>
  28
  29 namespace llvm {
  30
  31 enum SIScheduleCandReason {
  32   NoCand,
  33   RegUsage,
  34   Latency,
  35   Successor,
  36   Depth,
  37   NodeOrder
  38 };
  39
  40 struct SISchedulerCandidate {
  41   // The reason for this candidate.
  42   SIScheduleCandReason Reason = NoCand;
  43
  44   // Set of reasons that apply to multiple candidates.
  45   uint32_t RepeatReasonSet = 0;
  46
  47   SISchedulerCandidate() = default;
  48
  49   bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); }
  50   void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); }
  51 };
  52
  53 class SIScheduleDAGMI;
  54 class SIScheduleBlockCreator;
  55
  56 enum SIScheduleBlockLinkKind {
  57   NoData,
  58   Data
  59 };
  60
  61 class SIScheduleBlock {
  62   SIScheduleDAGMI *DAG;
  63   SIScheduleBlockCreator *BC;
  64
  65   std::vector<SUnit*> SUnits;
  66   std::map<unsigned, unsigned> NodeNum2Index;
  67   std::vector<SUnit*> TopReadySUs;
  68   std::vector<SUnit*> ScheduledSUnits;
  69
  70   /// The top of the unscheduled zone.
  71   IntervalPressure TopPressure;
  72   RegPressureTracker TopRPTracker;
  73
  74   // Pressure: number of said class of registers needed to
  75   // store the live virtual and real registers.
  76   // We do care only of SGPR32 and VGPR32 and do track only virtual registers.
  77   // Pressure of additional registers required inside the block.
  78   std::vector<unsigned> InternalAdditionnalPressure;
  79   // Pressure of input and output registers
  80   std::vector<unsigned> LiveInPressure;
  81   std::vector<unsigned> LiveOutPressure;
  82   // Registers required by the block, and outputs.
  83   // We do track only virtual registers.
  84   // Note that some registers are not 32 bits,
  85   // and thus the pressure is not equal
  86   // to the number of live registers.
  87   std::set<unsigned> LiveInRegs;
  88   std::set<unsigned> LiveOutRegs;
  89
  90   bool Scheduled = false;
  91   bool HighLatencyBlock = false;
  92
  93   std::vector<unsigned> HasLowLatencyNonWaitedParent;
  94
  95   // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
  96   unsigned ID;
  97
  98   std::vector<SIScheduleBlock*> Preds;  // All blocks predecessors.
  99   // All blocks successors, and the kind of link
 100   std::vector<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> Succs;
 101   unsigned NumHighLatencySuccessors = 0;
 102
 103 public:
 104   SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
 105                   unsigned ID):
 106     DAG(DAG), BC(BC), TopRPTracker(TopPressure), ID(ID) {}
 107
 108   ~SIScheduleBlock() = default;
 109
 110   unsigned getID() const { return ID; }
 111
 112   /// Functions for Block construction.
 113   void addUnit(SUnit *SU);
 114
 115   // When all SUs have been added.
 116   void finalizeUnits();
 117
 118   // Add block pred, which has instruction predecessor of SU.
 119   void addPred(SIScheduleBlock *Pred);
 120   void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind);
 121
 122   const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
 123   ArrayRef<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>>
 124     getSuccs() const { return Succs; }
 125
 126   unsigned Height;  // Maximum topdown path length to block without outputs
 127   unsigned Depth;   // Maximum bottomup path length to block without inputs
 128
 129   unsigned getNumHighLatencySuccessors() const {
 130     return NumHighLatencySuccessors;
 131   }
 132
 133   bool isHighLatencyBlock() { return HighLatencyBlock; }
 134
 135   // This is approximative.
 136   // Ideally should take into accounts some instructions (rcp, etc)
 137   // are 4 times slower.
 138   int getCost() { return SUnits.size(); }
 139
 140   // The block Predecessors and Successors must be all registered
 141   // before fastSchedule().
 142   // Fast schedule with no particular requirement.
 143   void fastSchedule();
 144
 145   std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }
 146
 147   // Complete schedule that will try to minimize reg pressure and
 148   // low latencies, and will fill liveins and liveouts.
 149   // Needs all MIs to be grouped between BeginBlock and EndBlock.
 150   // The MIs can be moved after the scheduling,
 151   // it is just used to allow correct track of live registers.
 152   void schedule(MachineBasicBlock::iterator BeginBlock,
 153                 MachineBasicBlock::iterator EndBlock);
 154
 155   bool isScheduled() { return Scheduled; }
 156
 157   // Needs the block to be scheduled inside
 158   // TODO: find a way to compute it.
 159   std::vector<unsigned> &getInternalAdditionnalRegUsage() {
 160     return InternalAdditionnalPressure;
 161   }
 162
 163   std::set<unsigned> &getInRegs() { return LiveInRegs; }
 164   std::set<unsigned> &getOutRegs() { return LiveOutRegs; }
 165
 166   void printDebug(bool Full);
 167
 168 private:
 169   struct SISchedCandidate : SISchedulerCandidate {
 170     // The best SUnit candidate.
 171     SUnit *SU = nullptr;
 172
 173     unsigned SGPRUsage;
 174     unsigned VGPRUsage;
 175     bool IsLowLatency;
 176     unsigned LowLatencyOffset;
 177     bool HasLowLatencyNonWaitedParent;
 178
 179     SISchedCandidate() = default;
 180
 181     bool isValid() const { return SU; }
 182
 183     // Copy the status of another candidate without changing policy.
 184     void setBest(SISchedCandidate &Best) {
 185       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
 186       SU = Best.SU;
 187       Reason = Best.Reason;
 188       SGPRUsage = Best.SGPRUsage;
 189       VGPRUsage = Best.VGPRUsage;
 190       IsLowLatency = Best.IsLowLatency;
 191       LowLatencyOffset = Best.LowLatencyOffset;
 192       HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent;
 193     }
 194   };
 195
 196   void undoSchedule();
 197
 198   void undoReleaseSucc(SUnit *SU, SDep *SuccEdge);
 199   void releaseSucc(SUnit *SU, SDep *SuccEdge);
 200   // InOrOutBlock: restrict to links pointing inside the block (true),
 201   // or restrict to links pointing outside the block (false).
 202   void releaseSuccessors(SUnit *SU, bool InOrOutBlock);
 203
 204   void nodeScheduled(SUnit *SU);
 205   void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand);
 206   void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand);
 207   SUnit* pickNode();
 208   void traceCandidate(const SISchedCandidate &Cand);
 209   void initRegPressure(MachineBasicBlock::iterator BeginBlock,
 210                        MachineBasicBlock::iterator EndBlock);
 211 };
 212
 213 struct SIScheduleBlocks {
 214   std::vector<SIScheduleBlock*> Blocks;
 215   std::vector<int> TopDownIndex2Block;
 216   std::vector<int> TopDownBlock2Index;
 217 };
 218
 219 enum SISchedulerBlockCreatorVariant {
 220   LatenciesAlone,
 221   LatenciesGrouped,
 222   LatenciesAlonePlusConsecutive
 223 };
 224
 225 class SIScheduleBlockCreator {
 226   SIScheduleDAGMI *DAG;
 227   // unique_ptr handles freeing memory for us.
 228   std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs;
 229   std::map<SISchedulerBlockCreatorVariant,
 230            SIScheduleBlocks> Blocks;
 231   std::vector<SIScheduleBlock*> CurrentBlocks;
 232   std::vector<int> Node2CurrentBlock;
 233
 234   // Topological sort
 235   // Maps topological index to the node number.
 236   std::vector<int> TopDownIndex2Block;
 237   std::vector<int> TopDownBlock2Index;
 238   std::vector<int> BottomUpIndex2Block;
 239
 240   // 0 -> Color not given.
 241   // 1 to SUnits.size() -> Reserved group (you should only add elements to them).
 242   // Above -> Other groups.
 243   int NextReservedID;
 244   int NextNonReservedID;
 245   std::vector<int> CurrentColoring;
 246   std::vector<int> CurrentTopDownReservedDependencyColoring;
 247   std::vector<int> CurrentBottomUpReservedDependencyColoring;
 248
 249 public:
 250   SIScheduleBlockCreator(SIScheduleDAGMI *DAG);
 251   ~SIScheduleBlockCreator();
 252
 253   SIScheduleBlocks
 254   getBlocks(SISchedulerBlockCreatorVariant BlockVariant);
 255
 256   bool isSUInBlock(SUnit *SU, unsigned ID);
 257
 258 private:
 259   // Give a Reserved color to every high latency.
 260   void colorHighLatenciesAlone();
 261
 262   // Create groups of high latencies with a Reserved color.
 263   void colorHighLatenciesGroups();
 264
 265   // Compute coloring for topdown and bottom traversals with
 266   // different colors depending on dependencies on Reserved colors.
 267   void colorComputeReservedDependencies();
 268
 269   // Give color to all non-colored SUs according to Reserved groups dependencies.
 270   void colorAccordingToReservedDependencies();
 271
 272   // Divides Blocks having no bottom up or top down dependencies on Reserved groups.
 273   // The new colors are computed according to the dependencies on the other blocks
 274   // formed with colorAccordingToReservedDependencies.
 275   void colorEndsAccordingToDependencies();
 276
 277   // Cut groups into groups with SUs in consecutive order (except for Reserved groups).
 278   void colorForceConsecutiveOrderInGroup();
 279
 280   // Merge Constant loads that have all their users into another group to the group.
 281   // (TODO: else if all their users depend on the same group, put them there)
 282   void colorMergeConstantLoadsNextGroup();
 283
 284   // Merge SUs that have all their users into another group to the group
 285   void colorMergeIfPossibleNextGroup();
 286
 287   // Merge SUs that have all their users into another group to the group,
 288   // but only for Reserved groups.
 289   void colorMergeIfPossibleNextGroupOnlyForReserved();
 290
 291   // Merge SUs that have all their users into another group to the group,
 292   // but only if the group is no more than a few SUs.
 293   void colorMergeIfPossibleSmallGroupsToNextGroup();
 294
 295   // Divides Blocks with important size.
 296   // Idea of implementation: attribute new colors depending on topdown and
 297   // bottom up links to other blocks.
 298   void cutHugeBlocks();
 299
 300   // Put in one group all instructions with no users in this scheduling region
 301   // (we'd want these groups be at the end).
 302   void regroupNoUserInstructions();
 303
 304   // Give Reserved color to export instructions
 305   void colorExports();
 306
 307   void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);
 308
 309   void topologicalSort();
 310
 311   void scheduleInsideBlocks();
 312
 313   void fillStats();
 314 };
 315
 316 enum SISchedulerBlockSchedulerVariant {
 317   BlockLatencyRegUsage,
 318   BlockRegUsageLatency,
 319   BlockRegUsage
 320 };
 321
 322 class SIScheduleBlockScheduler {
 323   SIScheduleDAGMI *DAG;
 324   SISchedulerBlockSchedulerVariant Variant;
 325   std::vector<SIScheduleBlock*> Blocks;
 326
 327   std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
 328   std::set<unsigned> LiveRegs;
 329   // Num of schedulable unscheduled blocks reading the register.
 330   std::map<unsigned, unsigned> LiveRegsConsumers;
 331
 332   std::vector<unsigned> LastPosHighLatencyParentScheduled;
 333   int LastPosWaitedHighLatency;
 334
 335   std::vector<SIScheduleBlock*> BlocksScheduled;
 336   unsigned NumBlockScheduled;
 337   std::vector<SIScheduleBlock*> ReadyBlocks;
 338
 339   unsigned VregCurrentUsage;
 340   unsigned SregCurrentUsage;
 341
 342   // Currently is only approximation.
 343   unsigned maxVregUsage;
 344   unsigned maxSregUsage;
 345
 346   std::vector<unsigned> BlockNumPredsLeft;
 347   std::vector<unsigned> BlockNumSuccsLeft;
 348
 349 public:
 350   SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
 351                            SISchedulerBlockSchedulerVariant Variant,
 352                            SIScheduleBlocks BlocksStruct);
 353   ~SIScheduleBlockScheduler() = default;
 354
 355   std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }
 356
 357   unsigned getVGPRUsage() { return maxVregUsage; }
 358   unsigned getSGPRUsage() { return maxSregUsage; }
 359
 360 private:
 361   struct SIBlockSchedCandidate : SISchedulerCandidate {
 362     // The best Block candidate.
 363     SIScheduleBlock *Block = nullptr;
 364
 365     bool IsHighLatency;
 366     int VGPRUsageDiff;
 367     unsigned NumSuccessors;
 368     unsigned NumHighLatencySuccessors;
 369     unsigned LastPosHighLatParentScheduled;
 370     unsigned Height;
 371
 372     SIBlockSchedCandidate() = default;
 373
 374     bool isValid() const { return Block; }
 375
 376     // Copy the status of another candidate without changing policy.
 377     void setBest(SIBlockSchedCandidate &Best) {
 378       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
 379       Block = Best.Block;
 380       Reason = Best.Reason;
 381       IsHighLatency = Best.IsHighLatency;
 382       VGPRUsageDiff = Best.VGPRUsageDiff;
 383       NumSuccessors = Best.NumSuccessors;
 384       NumHighLatencySuccessors = Best.NumHighLatencySuccessors;
 385       LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled;
 386       Height = Best.Height;
 387     }
 388   };
 389
 390   bool tryCandidateLatency(SIBlockSchedCandidate &Cand,
 391                            SIBlockSchedCandidate &TryCand);
 392   bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
 393                             SIBlockSchedCandidate &TryCand);
 394   SIScheduleBlock *pickBlock();
 395
 396   void addLiveRegs(std::set<unsigned> &Regs);
 397   void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs);
 398   void releaseBlockSuccs(SIScheduleBlock *Parent);
 399   void blockScheduled(SIScheduleBlock *Block);
 400
 401   // Check register pressure change
 402   // by scheduling a block with these LiveIn and LiveOut.
 403   std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs,
 404                                        std::set<unsigned> &OutRegs);
 405
 406   void schedule();
 407 };
 408
 409 struct SIScheduleBlockResult {
 410   std::vector<unsigned> SUs;
 411   unsigned MaxSGPRUsage;
 412   unsigned MaxVGPRUsage;
 413 };
 414
 415 class SIScheduler {
 416   SIScheduleDAGMI *DAG;
 417   SIScheduleBlockCreator BlockCreator;
 418
 419 public:
 420   SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}
 421
 422   ~SIScheduler() = default;
 423
 424   struct SIScheduleBlockResult
 425   scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
 426                   SISchedulerBlockSchedulerVariant ScheduleVariant);
 427 };
 428
 429 class SIScheduleDAGMI final : public ScheduleDAGMILive {
 430   const SIInstrInfo *SITII;
 431   const SIRegisterInfo *SITRI;
 432
 433   std::vector<SUnit> SUnitsLinksBackup;
 434
 435   // For moveLowLatencies. After all Scheduling variants are tested.
 436   std::vector<unsigned> ScheduledSUnits;
 437   std::vector<unsigned> ScheduledSUnitsInv;
 438
 439   unsigned VGPRSetID;
 440   unsigned SGPRSetID;
 441
 442 public:
 443   SIScheduleDAGMI(MachineSchedContext *C);
 444
 445   ~SIScheduleDAGMI() override;
 446
 447   // Entry point for the schedule.
 448   void schedule() override;
 449
 450   // To init Block's RPTracker.
 451   void initRPTracker(RegPressureTracker &RPTracker) {
 452     RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false);
 453   }
 454
 455   MachineBasicBlock *getBB() { return BB; }
 456   MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }
 457   MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }
 458   LiveIntervals *getLIS() { return LIS; }
 459   MachineRegisterInfo *getMRI() { return &MRI; }
 460   const TargetRegisterInfo *getTRI() { return TRI; }
 461   ScheduleDAGTopologicalSort *GetTopo() { return &Topo; }
 462   SUnit& getEntrySU() { return EntrySU; }
 463   SUnit& getExitSU() { return ExitSU; }
 464
 465   void restoreSULinksLeft();
 466
 467   template<typename _Iterator> void fillVgprSgprCost(_Iterator First,
 468                                                      _Iterator End,
 469                                                      unsigned &VgprUsage,
 470                                                      unsigned &SgprUsage);
 471
 472   std::set<unsigned> getInRegs() {
 473     std::set<unsigned> InRegs;
 474     for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
 475       InRegs.insert(RegMaskPair.RegUnit);
 476     }
 477     return InRegs;
 478   }
 479
 480   std::set<unsigned> getOutRegs() {
 481     std::set<unsigned> OutRegs;
 482     for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
 483       OutRegs.insert(RegMaskPair.RegUnit);
 484     }
 485     return OutRegs;
 486   };
 487
 488   unsigned getVGPRSetID() const { return VGPRSetID; }
 489   unsigned getSGPRSetID() const { return SGPRSetID; }
 490
 491 private:
 492   void topologicalSort();
 493   // After scheduling is done, improve low latency placements.
 494   void moveLowLatencies();
 495
 496 public:
 497   // Some stats for scheduling inside blocks.
 498   std::vector<unsigned> IsLowLatencySU;
 499   std::vector<unsigned> LowLatencyOffset;
 500   std::vector<unsigned> IsHighLatencySU;
 501   // Topological sort
 502   // Maps topological index to the node number.
 503   std::vector<int> TopDownIndex2SU;
 504   std::vector<int> BottomUpIndex2SU;
 505 };
 506
 507 } // end namespace llvm
 508
 509 #endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H