contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h

   1 //===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief SI Machine Scheduler interface
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
  16 #define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
  17
  18 #include "SIInstrInfo.h"
  19 #include "llvm/CodeGen/MachineBasicBlock.h"
  20 #include "llvm/CodeGen/MachineScheduler.h"
  21 #include "llvm/CodeGen/RegisterPressure.h"
  22 #include "llvm/CodeGen/ScheduleDAG.h"
  23 #include <cassert>
  24 #include <cstdint>
  25 #include <map>
  26 #include <memory>
  27 #include <set>
  28 #include <vector>
  29
  30 namespace llvm {
  31
  32 enum SIScheduleCandReason {
  33   NoCand,
  34   RegUsage,
  35   Latency,
  36   Successor,
  37   Depth,
  38   NodeOrder
  39 };
  40
  41 struct SISchedulerCandidate {
  42   // The reason for this candidate.
  43   SIScheduleCandReason Reason;
  44
  45   // Set of reasons that apply to multiple candidates.
  46   uint32_t RepeatReasonSet;
  47
  48   SISchedulerCandidate()
  49     :  Reason(NoCand), RepeatReasonSet(0) {}
  50
  51   bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); }
  52   void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); }
  53 };
  54
  55 class SIScheduleDAGMI;
  56 class SIScheduleBlockCreator;
  57
  58 class SIScheduleBlock {
  59   SIScheduleDAGMI *DAG;
  60   SIScheduleBlockCreator *BC;
  61
  62   std::vector<SUnit*> SUnits;
  63   std::map<unsigned, unsigned> NodeNum2Index;
  64   std::vector<SUnit*> TopReadySUs;
  65   std::vector<SUnit*> ScheduledSUnits;
  66
  67   /// The top of the unscheduled zone.
  68   IntervalPressure TopPressure;
  69   RegPressureTracker TopRPTracker;
  70
  71   // Pressure: number of said class of registers needed to
  72   // store the live virtual and real registers.
  73   // We do care only of SGPR32 and VGPR32 and do track only virtual registers.
  74   // Pressure of additional registers required inside the block.
  75   std::vector<unsigned> InternalAdditionnalPressure;
  76   // Pressure of input and output registers
  77   std::vector<unsigned> LiveInPressure;
  78   std::vector<unsigned> LiveOutPressure;
  79   // Registers required by the block, and outputs.
  80   // We do track only virtual registers.
  81   // Note that some registers are not 32 bits,
  82   // and thus the pressure is not equal
  83   // to the number of live registers.
  84   std::set<unsigned> LiveInRegs;
  85   std::set<unsigned> LiveOutRegs;
  86
  87   bool Scheduled;
  88   bool HighLatencyBlock;
  89
  90   std::vector<unsigned> HasLowLatencyNonWaitedParent;
  91
  92   // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
  93   unsigned ID;
  94
  95   std::vector<SIScheduleBlock*> Preds;  // All blocks predecessors.
  96   std::vector<SIScheduleBlock*> Succs;  // All blocks successors.
  97   unsigned NumHighLatencySuccessors;
  98
  99 public:
 100   SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
 101                   unsigned ID):
 102     DAG(DAG), BC(BC), TopRPTracker(TopPressure), Scheduled(false),
 103     HighLatencyBlock(false), ID(ID), NumHighLatencySuccessors(0) {}
 104
 105   ~SIScheduleBlock() = default;
 106
 107   unsigned getID() const { return ID; }
 108
 109   /// Functions for Block construction.
 110   void addUnit(SUnit *SU);
 111
 112   // When all SUs have been added.
 113   void finalizeUnits();
 114
 115   // Add block pred, which has instruction predecessor of SU.
 116   void addPred(SIScheduleBlock *Pred);
 117   void addSucc(SIScheduleBlock *Succ);
 118
 119   const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
 120   const std::vector<SIScheduleBlock*>& getSuccs() const { return Succs; }
 121
 122   unsigned Height;  // Maximum topdown path length to block without outputs
 123   unsigned Depth;   // Maximum bottomup path length to block without inputs
 124
 125   unsigned getNumHighLatencySuccessors() const {
 126     return NumHighLatencySuccessors;
 127   }
 128
 129   bool isHighLatencyBlock() { return HighLatencyBlock; }
 130
 131   // This is approximative.
 132   // Ideally should take into accounts some instructions (rcp, etc)
 133   // are 4 times slower.
 134   int getCost() { return SUnits.size(); }
 135
 136   // The block Predecessors and Successors must be all registered
 137   // before fastSchedule().
 138   // Fast schedule with no particular requirement.
 139   void fastSchedule();
 140
 141   std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }
 142
 143   // Complete schedule that will try to minimize reg pressure and
 144   // low latencies, and will fill liveins and liveouts.
 145   // Needs all MIs to be grouped between BeginBlock and EndBlock.
 146   // The MIs can be moved after the scheduling,
 147   // it is just used to allow correct track of live registers.
 148   void schedule(MachineBasicBlock::iterator BeginBlock,
 149                 MachineBasicBlock::iterator EndBlock);
 150
 151   bool isScheduled() { return Scheduled; }
 152
 153   // Needs the block to be scheduled inside
 154   // TODO: find a way to compute it.
 155   std::vector<unsigned> &getInternalAdditionnalRegUsage() {
 156     return InternalAdditionnalPressure;
 157   }
 158
 159   std::set<unsigned> &getInRegs() { return LiveInRegs; }
 160   std::set<unsigned> &getOutRegs() { return LiveOutRegs; }
 161
 162   void printDebug(bool Full);
 163
 164 private:
 165   struct SISchedCandidate : SISchedulerCandidate {
 166     // The best SUnit candidate.
 167     SUnit *SU = nullptr;
 168
 169     unsigned SGPRUsage;
 170     unsigned VGPRUsage;
 171     bool IsLowLatency;
 172     unsigned LowLatencyOffset;
 173     bool HasLowLatencyNonWaitedParent;
 174
 175     SISchedCandidate() = default;
 176
 177     bool isValid() const { return SU; }
 178
 179     // Copy the status of another candidate without changing policy.
 180     void setBest(SISchedCandidate &Best) {
 181       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
 182       SU = Best.SU;
 183       Reason = Best.Reason;
 184       SGPRUsage = Best.SGPRUsage;
 185       VGPRUsage = Best.VGPRUsage;
 186       IsLowLatency = Best.IsLowLatency;
 187       LowLatencyOffset = Best.LowLatencyOffset;
 188       HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent;
 189     }
 190   };
 191
 192   void undoSchedule();
 193
 194   void undoReleaseSucc(SUnit *SU, SDep *SuccEdge);
 195   void releaseSucc(SUnit *SU, SDep *SuccEdge);
 196   // InOrOutBlock: restrict to links pointing inside the block (true),
 197   // or restrict to links pointing outside the block (false).
 198   void releaseSuccessors(SUnit *SU, bool InOrOutBlock);
 199
 200   void nodeScheduled(SUnit *SU);
 201   void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand);
 202   void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand);
 203   SUnit* pickNode();
 204   void traceCandidate(const SISchedCandidate &Cand);
 205   void initRegPressure(MachineBasicBlock::iterator BeginBlock,
 206                        MachineBasicBlock::iterator EndBlock);
 207 };
 208
 209 struct SIScheduleBlocks {
 210   std::vector<SIScheduleBlock*> Blocks;
 211   std::vector<int> TopDownIndex2Block;
 212   std::vector<int> TopDownBlock2Index;
 213 };
 214
 215 enum SISchedulerBlockCreatorVariant {
 216     LatenciesAlone,
 217     LatenciesGrouped,
 218     LatenciesAlonePlusConsecutive
 219 };
 220
 221 class SIScheduleBlockCreator {
 222   SIScheduleDAGMI *DAG;
 223   // unique_ptr handles freeing memory for us.
 224   std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs;
 225   std::map<SISchedulerBlockCreatorVariant,
 226            SIScheduleBlocks> Blocks;
 227   std::vector<SIScheduleBlock*> CurrentBlocks;
 228   std::vector<int> Node2CurrentBlock;
 229
 230   // Topological sort
 231   // Maps topological index to the node number.
 232   std::vector<int> TopDownIndex2Block;
 233   std::vector<int> TopDownBlock2Index;
 234   std::vector<int> BottomUpIndex2Block;
 235
 236   // 0 -> Color not given.
 237   // 1 to SUnits.size() -> Reserved group (you should only add elements to them).
 238   // Above -> Other groups.
 239   int NextReservedID;
 240   int NextNonReservedID;
 241   std::vector<int> CurrentColoring;
 242   std::vector<int> CurrentTopDownReservedDependencyColoring;
 243   std::vector<int> CurrentBottomUpReservedDependencyColoring;
 244
 245 public:
 246   SIScheduleBlockCreator(SIScheduleDAGMI *DAG);
 247   ~SIScheduleBlockCreator();
 248
 249   SIScheduleBlocks
 250   getBlocks(SISchedulerBlockCreatorVariant BlockVariant);
 251
 252   bool isSUInBlock(SUnit *SU, unsigned ID);
 253
 254 private:
 255   // Give a Reserved color to every high latency.
 256   void colorHighLatenciesAlone();
 257
 258   // Create groups of high latencies with a Reserved color.
 259   void colorHighLatenciesGroups();
 260
 261   // Compute coloring for topdown and bottom traversals with
 262   // different colors depending on dependencies on Reserved colors.
 263   void colorComputeReservedDependencies();
 264
 265   // Give color to all non-colored SUs according to Reserved groups dependencies.
 266   void colorAccordingToReservedDependencies();
 267
 268   // Divides Blocks having no bottom up or top down dependencies on Reserved groups.
 269   // The new colors are computed according to the dependencies on the other blocks
 270   // formed with colorAccordingToReservedDependencies.
 271   void colorEndsAccordingToDependencies();
 272
 273   // Cut groups into groups with SUs in consecutive order (except for Reserved groups).
 274   void colorForceConsecutiveOrderInGroup();
 275
 276   // Merge Constant loads that have all their users into another group to the group.
 277   // (TODO: else if all their users depend on the same group, put them there)
 278   void colorMergeConstantLoadsNextGroup();
 279
 280   // Merge SUs that have all their users into another group to the group
 281   void colorMergeIfPossibleNextGroup();
 282
 283   // Merge SUs that have all their users into another group to the group,
 284   // but only for Reserved groups.
 285   void colorMergeIfPossibleNextGroupOnlyForReserved();
 286
 287   // Merge SUs that have all their users into another group to the group,
 288   // but only if the group is no more than a few SUs.
 289   void colorMergeIfPossibleSmallGroupsToNextGroup();
 290
 291   // Divides Blocks with important size.
 292   // Idea of implementation: attribute new colors depending on topdown and
 293   // bottom up links to other blocks.
 294   void cutHugeBlocks();
 295
 296   // Put in one group all instructions with no users in this scheduling region
 297   // (we'd want these groups be at the end).
 298   void regroupNoUserInstructions();
 299
 300   void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);
 301
 302   void topologicalSort();
 303
 304   void scheduleInsideBlocks();
 305
 306   void fillStats();
 307 };
 308
 309 enum SISchedulerBlockSchedulerVariant {
 310   BlockLatencyRegUsage,
 311   BlockRegUsageLatency,
 312   BlockRegUsage
 313 };
 314
 315 class SIScheduleBlockScheduler {
 316   SIScheduleDAGMI *DAG;
 317   SISchedulerBlockSchedulerVariant Variant;
 318   std::vector<SIScheduleBlock*> Blocks;
 319
 320   std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
 321   std::set<unsigned> LiveRegs;
 322   // Num of schedulable unscheduled blocks reading the register.
 323   std::map<unsigned, unsigned> LiveRegsConsumers;
 324
 325   std::vector<unsigned> LastPosHighLatencyParentScheduled;
 326   int LastPosWaitedHighLatency;
 327
 328   std::vector<SIScheduleBlock*> BlocksScheduled;
 329   unsigned NumBlockScheduled;
 330   std::vector<SIScheduleBlock*> ReadyBlocks;
 331
 332   unsigned VregCurrentUsage;
 333   unsigned SregCurrentUsage;
 334
 335   // Currently is only approximation.
 336   unsigned maxVregUsage;
 337   unsigned maxSregUsage;
 338
 339   std::vector<unsigned> BlockNumPredsLeft;
 340   std::vector<unsigned> BlockNumSuccsLeft;
 341
 342 public:
 343   SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
 344                            SISchedulerBlockSchedulerVariant Variant,
 345                            SIScheduleBlocks BlocksStruct);
 346   ~SIScheduleBlockScheduler() = default;
 347
 348   std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }
 349
 350   unsigned getVGPRUsage() { return maxVregUsage; }
 351   unsigned getSGPRUsage() { return maxSregUsage; }
 352
 353 private:
 354   struct SIBlockSchedCandidate : SISchedulerCandidate {
 355     // The best Block candidate.
 356     SIScheduleBlock *Block = nullptr;
 357
 358     bool IsHighLatency;
 359     int VGPRUsageDiff;
 360     unsigned NumSuccessors;
 361     unsigned NumHighLatencySuccessors;
 362     unsigned LastPosHighLatParentScheduled;
 363     unsigned Height;
 364
 365     SIBlockSchedCandidate() = default;
 366
 367     bool isValid() const { return Block; }
 368
 369     // Copy the status of another candidate without changing policy.
 370     void setBest(SIBlockSchedCandidate &Best) {
 371       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
 372       Block = Best.Block;
 373       Reason = Best.Reason;
 374       IsHighLatency = Best.IsHighLatency;
 375       VGPRUsageDiff = Best.VGPRUsageDiff;
 376       NumSuccessors = Best.NumSuccessors;
 377       NumHighLatencySuccessors = Best.NumHighLatencySuccessors;
 378       LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled;
 379       Height = Best.Height;
 380     }
 381   };
 382
 383   bool tryCandidateLatency(SIBlockSchedCandidate &Cand,
 384                            SIBlockSchedCandidate &TryCand);
 385   bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
 386                             SIBlockSchedCandidate &TryCand);
 387   SIScheduleBlock *pickBlock();
 388
 389   void addLiveRegs(std::set<unsigned> &Regs);
 390   void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs);
 391   void releaseBlockSuccs(SIScheduleBlock *Parent);
 392   void blockScheduled(SIScheduleBlock *Block);
 393
 394   // Check register pressure change
 395   // by scheduling a block with these LiveIn and LiveOut.
 396   std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs,
 397                                        std::set<unsigned> &OutRegs);
 398
 399   void schedule();
 400 };
 401
 402 struct SIScheduleBlockResult {
 403   std::vector<unsigned> SUs;
 404   unsigned MaxSGPRUsage;
 405   unsigned MaxVGPRUsage;
 406 };
 407
 408 class SIScheduler {
 409   SIScheduleDAGMI *DAG;
 410   SIScheduleBlockCreator BlockCreator;
 411
 412 public:
 413   SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}
 414
 415   ~SIScheduler() = default;
 416
 417   struct SIScheduleBlockResult
 418   scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
 419                   SISchedulerBlockSchedulerVariant ScheduleVariant);
 420 };
 421
 422 class SIScheduleDAGMI final : public ScheduleDAGMILive {
 423   const SIInstrInfo *SITII;
 424   const SIRegisterInfo *SITRI;
 425
 426   std::vector<SUnit> SUnitsLinksBackup;
 427
 428   // For moveLowLatencies. After all Scheduling variants are tested.
 429   std::vector<unsigned> ScheduledSUnits;
 430   std::vector<unsigned> ScheduledSUnitsInv;
 431
 432   unsigned VGPRSetID;
 433   unsigned SGPRSetID;
 434
 435 public:
 436   SIScheduleDAGMI(MachineSchedContext *C);
 437
 438   ~SIScheduleDAGMI() override;
 439
 440   // Entry point for the schedule.
 441   void schedule() override;
 442
 443   // To init Block's RPTracker.
 444   void initRPTracker(RegPressureTracker &RPTracker) {
 445     RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false);
 446   }
 447
 448   MachineBasicBlock *getBB() { return BB; }
 449   MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }
 450   MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }
 451   LiveIntervals *getLIS() { return LIS; }
 452   MachineRegisterInfo *getMRI() { return &MRI; }
 453   const TargetRegisterInfo *getTRI() { return TRI; }
 454   SUnit& getEntrySU() { return EntrySU; }
 455   SUnit& getExitSU() { return ExitSU; }
 456
 457   void restoreSULinksLeft();
 458
 459   template<typename _Iterator> void fillVgprSgprCost(_Iterator First,
 460                                                      _Iterator End,
 461                                                      unsigned &VgprUsage,
 462                                                      unsigned &SgprUsage);
 463
 464   std::set<unsigned> getInRegs() {
 465     std::set<unsigned> InRegs;
 466     for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
 467       InRegs.insert(RegMaskPair.RegUnit);
 468     }
 469     return InRegs;
 470   }
 471
 472   unsigned getVGPRSetID() const { return VGPRSetID; }
 473   unsigned getSGPRSetID() const { return SGPRSetID; }
 474
 475 private:
 476   void topologicalSort();
 477   // After scheduling is done, improve low latency placements.
 478   void moveLowLatencies();
 479
 480 public:
 481   // Some stats for scheduling inside blocks.
 482   std::vector<unsigned> IsLowLatencySU;
 483   std::vector<unsigned> LowLatencyOffset;
 484   std::vector<unsigned> IsHighLatencySU;
 485   // Topological sort
 486   // Maps topological index to the node number.
 487   std::vector<int> TopDownIndex2SU;
 488   std::vector<int> BottomUpIndex2SU;
 489 };
 490
 491 } // end namespace llvm
 492
 493 #endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H