contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h

   1 //===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// SI Machine Scheduler interface
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
  16 #define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
  17
  18 #include "SIInstrInfo.h"
  19 #include "llvm/CodeGen/MachineBasicBlock.h"
  20 #include "llvm/CodeGen/MachineScheduler.h"
  21 #include "llvm/CodeGen/RegisterPressure.h"
  22 #include "llvm/CodeGen/ScheduleDAG.h"
  23 #include <cassert>
  24 #include <cstdint>
  25 #include <map>
  26 #include <memory>
  27 #include <set>
  28 #include <vector>
  29
  30 namespace llvm {
  31
  32 enum SIScheduleCandReason {
  33   NoCand,
  34   RegUsage,
  35   Latency,
  36   Successor,
  37   Depth,
  38   NodeOrder
  39 };
  40
  41 struct SISchedulerCandidate {
  42   // The reason for this candidate.
  43   SIScheduleCandReason Reason = NoCand;
  44
  45   // Set of reasons that apply to multiple candidates.
  46   uint32_t RepeatReasonSet = 0;
  47
  48   SISchedulerCandidate() = default;
  49
  50   bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); }
  51   void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); }
  52 };
  53
  54 class SIScheduleDAGMI;
  55 class SIScheduleBlockCreator;
  56
  57 enum SIScheduleBlockLinkKind {
  58   NoData,
  59   Data
  60 };
  61
  62 class SIScheduleBlock {
  63   SIScheduleDAGMI *DAG;
  64   SIScheduleBlockCreator *BC;
  65
  66   std::vector<SUnit*> SUnits;
  67   std::map<unsigned, unsigned> NodeNum2Index;
  68   std::vector<SUnit*> TopReadySUs;
  69   std::vector<SUnit*> ScheduledSUnits;
  70
  71   /// The top of the unscheduled zone.
  72   IntervalPressure TopPressure;
  73   RegPressureTracker TopRPTracker;
  74
  75   // Pressure: number of said class of registers needed to
  76   // store the live virtual and real registers.
  77   // We do care only of SGPR32 and VGPR32 and do track only virtual registers.
  78   // Pressure of additional registers required inside the block.
  79   std::vector<unsigned> InternalAdditionnalPressure;
  80   // Pressure of input and output registers
  81   std::vector<unsigned> LiveInPressure;
  82   std::vector<unsigned> LiveOutPressure;
  83   // Registers required by the block, and outputs.
  84   // We do track only virtual registers.
  85   // Note that some registers are not 32 bits,
  86   // and thus the pressure is not equal
  87   // to the number of live registers.
  88   std::set<unsigned> LiveInRegs;
  89   std::set<unsigned> LiveOutRegs;
  90
  91   bool Scheduled = false;
  92   bool HighLatencyBlock = false;
  93
  94   std::vector<unsigned> HasLowLatencyNonWaitedParent;
  95
  96   // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
  97   unsigned ID;
  98
  99   std::vector<SIScheduleBlock*> Preds;  // All blocks predecessors.
 100   // All blocks successors, and the kind of link
 101   std::vector<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> Succs;
 102   unsigned NumHighLatencySuccessors = 0;
 103
 104 public:
 105   SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
 106                   unsigned ID):
 107     DAG(DAG), BC(BC), TopRPTracker(TopPressure), ID(ID) {}
 108
 109   ~SIScheduleBlock() = default;
 110
 111   unsigned getID() const { return ID; }
 112
 113   /// Functions for Block construction.
 114   void addUnit(SUnit *SU);
 115
 116   // When all SUs have been added.
 117   void finalizeUnits();
 118
 119   // Add block pred, which has instruction predecessor of SU.
 120   void addPred(SIScheduleBlock *Pred);
 121   void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind);
 122
 123   const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
 124   ArrayRef<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>>
 125     getSuccs() const { return Succs; }
 126
 127   unsigned Height;  // Maximum topdown path length to block without outputs
 128   unsigned Depth;   // Maximum bottomup path length to block without inputs
 129
 130   unsigned getNumHighLatencySuccessors() const {
 131     return NumHighLatencySuccessors;
 132   }
 133
 134   bool isHighLatencyBlock() { return HighLatencyBlock; }
 135
 136   // This is approximative.
 137   // Ideally should take into accounts some instructions (rcp, etc)
 138   // are 4 times slower.
 139   int getCost() { return SUnits.size(); }
 140
 141   // The block Predecessors and Successors must be all registered
 142   // before fastSchedule().
 143   // Fast schedule with no particular requirement.
 144   void fastSchedule();
 145
 146   std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }
 147
 148   // Complete schedule that will try to minimize reg pressure and
 149   // low latencies, and will fill liveins and liveouts.
 150   // Needs all MIs to be grouped between BeginBlock and EndBlock.
 151   // The MIs can be moved after the scheduling,
 152   // it is just used to allow correct track of live registers.
 153   void schedule(MachineBasicBlock::iterator BeginBlock,
 154                 MachineBasicBlock::iterator EndBlock);
 155
 156   bool isScheduled() { return Scheduled; }
 157
 158   // Needs the block to be scheduled inside
 159   // TODO: find a way to compute it.
 160   std::vector<unsigned> &getInternalAdditionnalRegUsage() {
 161     return InternalAdditionnalPressure;
 162   }
 163
 164   std::set<unsigned> &getInRegs() { return LiveInRegs; }
 165   std::set<unsigned> &getOutRegs() { return LiveOutRegs; }
 166
 167   void printDebug(bool Full);
 168
 169 private:
 170   struct SISchedCandidate : SISchedulerCandidate {
 171     // The best SUnit candidate.
 172     SUnit *SU = nullptr;
 173
 174     unsigned SGPRUsage;
 175     unsigned VGPRUsage;
 176     bool IsLowLatency;
 177     unsigned LowLatencyOffset;
 178     bool HasLowLatencyNonWaitedParent;
 179
 180     SISchedCandidate() = default;
 181
 182     bool isValid() const { return SU; }
 183
 184     // Copy the status of another candidate without changing policy.
 185     void setBest(SISchedCandidate &Best) {
 186       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
 187       SU = Best.SU;
 188       Reason = Best.Reason;
 189       SGPRUsage = Best.SGPRUsage;
 190       VGPRUsage = Best.VGPRUsage;
 191       IsLowLatency = Best.IsLowLatency;
 192       LowLatencyOffset = Best.LowLatencyOffset;
 193       HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent;
 194     }
 195   };
 196
 197   void undoSchedule();
 198
 199   void undoReleaseSucc(SUnit *SU, SDep *SuccEdge);
 200   void releaseSucc(SUnit *SU, SDep *SuccEdge);
 201   // InOrOutBlock: restrict to links pointing inside the block (true),
 202   // or restrict to links pointing outside the block (false).
 203   void releaseSuccessors(SUnit *SU, bool InOrOutBlock);
 204
 205   void nodeScheduled(SUnit *SU);
 206   void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand);
 207   void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand);
 208   SUnit* pickNode();
 209   void traceCandidate(const SISchedCandidate &Cand);
 210   void initRegPressure(MachineBasicBlock::iterator BeginBlock,
 211                        MachineBasicBlock::iterator EndBlock);
 212 };
 213
 214 struct SIScheduleBlocks {
 215   std::vector<SIScheduleBlock*> Blocks;
 216   std::vector<int> TopDownIndex2Block;
 217   std::vector<int> TopDownBlock2Index;
 218 };
 219
 220 enum SISchedulerBlockCreatorVariant {
 221   LatenciesAlone,
 222   LatenciesGrouped,
 223   LatenciesAlonePlusConsecutive
 224 };
 225
 226 class SIScheduleBlockCreator {
 227   SIScheduleDAGMI *DAG;
 228   // unique_ptr handles freeing memory for us.
 229   std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs;
 230   std::map<SISchedulerBlockCreatorVariant,
 231            SIScheduleBlocks> Blocks;
 232   std::vector<SIScheduleBlock*> CurrentBlocks;
 233   std::vector<int> Node2CurrentBlock;
 234
 235   // Topological sort
 236   // Maps topological index to the node number.
 237   std::vector<int> TopDownIndex2Block;
 238   std::vector<int> TopDownBlock2Index;
 239   std::vector<int> BottomUpIndex2Block;
 240
 241   // 0 -> Color not given.
 242   // 1 to SUnits.size() -> Reserved group (you should only add elements to them).
 243   // Above -> Other groups.
 244   int NextReservedID;
 245   int NextNonReservedID;
 246   std::vector<int> CurrentColoring;
 247   std::vector<int> CurrentTopDownReservedDependencyColoring;
 248   std::vector<int> CurrentBottomUpReservedDependencyColoring;
 249
 250 public:
 251   SIScheduleBlockCreator(SIScheduleDAGMI *DAG);
 252   ~SIScheduleBlockCreator();
 253
 254   SIScheduleBlocks
 255   getBlocks(SISchedulerBlockCreatorVariant BlockVariant);
 256
 257   bool isSUInBlock(SUnit *SU, unsigned ID);
 258
 259 private:
 260   // Give a Reserved color to every high latency.
 261   void colorHighLatenciesAlone();
 262
 263   // Create groups of high latencies with a Reserved color.
 264   void colorHighLatenciesGroups();
 265
 266   // Compute coloring for topdown and bottom traversals with
 267   // different colors depending on dependencies on Reserved colors.
 268   void colorComputeReservedDependencies();
 269
 270   // Give color to all non-colored SUs according to Reserved groups dependencies.
 271   void colorAccordingToReservedDependencies();
 272
 273   // Divides Blocks having no bottom up or top down dependencies on Reserved groups.
 274   // The new colors are computed according to the dependencies on the other blocks
 275   // formed with colorAccordingToReservedDependencies.
 276   void colorEndsAccordingToDependencies();
 277
 278   // Cut groups into groups with SUs in consecutive order (except for Reserved groups).
 279   void colorForceConsecutiveOrderInGroup();
 280
 281   // Merge Constant loads that have all their users into another group to the group.
 282   // (TODO: else if all their users depend on the same group, put them there)
 283   void colorMergeConstantLoadsNextGroup();
 284
 285   // Merge SUs that have all their users into another group to the group
 286   void colorMergeIfPossibleNextGroup();
 287
 288   // Merge SUs that have all their users into another group to the group,
 289   // but only for Reserved groups.
 290   void colorMergeIfPossibleNextGroupOnlyForReserved();
 291
 292   // Merge SUs that have all their users into another group to the group,
 293   // but only if the group is no more than a few SUs.
 294   void colorMergeIfPossibleSmallGroupsToNextGroup();
 295
 296   // Divides Blocks with important size.
 297   // Idea of implementation: attribute new colors depending on topdown and
 298   // bottom up links to other blocks.
 299   void cutHugeBlocks();
 300
 301   // Put in one group all instructions with no users in this scheduling region
 302   // (we'd want these groups be at the end).
 303   void regroupNoUserInstructions();
 304
 305   // Give Reserved color to export instructions
 306   void colorExports();
 307
 308   void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);
 309
 310   void topologicalSort();
 311
 312   void scheduleInsideBlocks();
 313
 314   void fillStats();
 315 };
 316
 317 enum SISchedulerBlockSchedulerVariant {
 318   BlockLatencyRegUsage,
 319   BlockRegUsageLatency,
 320   BlockRegUsage
 321 };
 322
 323 class SIScheduleBlockScheduler {
 324   SIScheduleDAGMI *DAG;
 325   SISchedulerBlockSchedulerVariant Variant;
 326   std::vector<SIScheduleBlock*> Blocks;
 327
 328   std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
 329   std::set<unsigned> LiveRegs;
 330   // Num of schedulable unscheduled blocks reading the register.
 331   std::map<unsigned, unsigned> LiveRegsConsumers;
 332
 333   std::vector<unsigned> LastPosHighLatencyParentScheduled;
 334   int LastPosWaitedHighLatency;
 335
 336   std::vector<SIScheduleBlock*> BlocksScheduled;
 337   unsigned NumBlockScheduled;
 338   std::vector<SIScheduleBlock*> ReadyBlocks;
 339
 340   unsigned VregCurrentUsage;
 341   unsigned SregCurrentUsage;
 342
 343   // Currently is only approximation.
 344   unsigned maxVregUsage;
 345   unsigned maxSregUsage;
 346
 347   std::vector<unsigned> BlockNumPredsLeft;
 348   std::vector<unsigned> BlockNumSuccsLeft;
 349
 350 public:
 351   SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
 352                            SISchedulerBlockSchedulerVariant Variant,
 353                            SIScheduleBlocks BlocksStruct);
 354   ~SIScheduleBlockScheduler() = default;
 355
 356   std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }
 357
 358   unsigned getVGPRUsage() { return maxVregUsage; }
 359   unsigned getSGPRUsage() { return maxSregUsage; }
 360
 361 private:
 362   struct SIBlockSchedCandidate : SISchedulerCandidate {
 363     // The best Block candidate.
 364     SIScheduleBlock *Block = nullptr;
 365
 366     bool IsHighLatency;
 367     int VGPRUsageDiff;
 368     unsigned NumSuccessors;
 369     unsigned NumHighLatencySuccessors;
 370     unsigned LastPosHighLatParentScheduled;
 371     unsigned Height;
 372
 373     SIBlockSchedCandidate() = default;
 374
 375     bool isValid() const { return Block; }
 376
 377     // Copy the status of another candidate without changing policy.
 378     void setBest(SIBlockSchedCandidate &Best) {
 379       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
 380       Block = Best.Block;
 381       Reason = Best.Reason;
 382       IsHighLatency = Best.IsHighLatency;
 383       VGPRUsageDiff = Best.VGPRUsageDiff;
 384       NumSuccessors = Best.NumSuccessors;
 385       NumHighLatencySuccessors = Best.NumHighLatencySuccessors;
 386       LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled;
 387       Height = Best.Height;
 388     }
 389   };
 390
 391   bool tryCandidateLatency(SIBlockSchedCandidate &Cand,
 392                            SIBlockSchedCandidate &TryCand);
 393   bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
 394                             SIBlockSchedCandidate &TryCand);
 395   SIScheduleBlock *pickBlock();
 396
 397   void addLiveRegs(std::set<unsigned> &Regs);
 398   void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs);
 399   void releaseBlockSuccs(SIScheduleBlock *Parent);
 400   void blockScheduled(SIScheduleBlock *Block);
 401
 402   // Check register pressure change
 403   // by scheduling a block with these LiveIn and LiveOut.
 404   std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs,
 405                                        std::set<unsigned> &OutRegs);
 406
 407   void schedule();
 408 };
 409
 410 struct SIScheduleBlockResult {
 411   std::vector<unsigned> SUs;
 412   unsigned MaxSGPRUsage;
 413   unsigned MaxVGPRUsage;
 414 };
 415
 416 class SIScheduler {
 417   SIScheduleDAGMI *DAG;
 418   SIScheduleBlockCreator BlockCreator;
 419
 420 public:
 421   SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {}
 422
 423   ~SIScheduler() = default;
 424
 425   struct SIScheduleBlockResult
 426   scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
 427                   SISchedulerBlockSchedulerVariant ScheduleVariant);
 428 };
 429
 430 class SIScheduleDAGMI final : public ScheduleDAGMILive {
 431   const SIInstrInfo *SITII;
 432   const SIRegisterInfo *SITRI;
 433
 434   std::vector<SUnit> SUnitsLinksBackup;
 435
 436   // For moveLowLatencies. After all Scheduling variants are tested.
 437   std::vector<unsigned> ScheduledSUnits;
 438   std::vector<unsigned> ScheduledSUnitsInv;
 439
 440   unsigned VGPRSetID;
 441   unsigned SGPRSetID;
 442
 443 public:
 444   SIScheduleDAGMI(MachineSchedContext *C);
 445
 446   ~SIScheduleDAGMI() override;
 447
 448   // Entry point for the schedule.
 449   void schedule() override;
 450
 451   // To init Block's RPTracker.
 452   void initRPTracker(RegPressureTracker &RPTracker) {
 453     RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false);
 454   }
 455
 456   MachineBasicBlock *getBB() { return BB; }
 457   MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }
 458   MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }
 459   LiveIntervals *getLIS() { return LIS; }
 460   MachineRegisterInfo *getMRI() { return &MRI; }
 461   const TargetRegisterInfo *getTRI() { return TRI; }
 462   ScheduleDAGTopologicalSort *GetTopo() { return &Topo; }
 463   SUnit& getEntrySU() { return EntrySU; }
 464   SUnit& getExitSU() { return ExitSU; }
 465
 466   void restoreSULinksLeft();
 467
 468   template<typename _Iterator> void fillVgprSgprCost(_Iterator First,
 469                                                      _Iterator End,
 470                                                      unsigned &VgprUsage,
 471                                                      unsigned &SgprUsage);
 472
 473   std::set<unsigned> getInRegs() {
 474     std::set<unsigned> InRegs;
 475     for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
 476       InRegs.insert(RegMaskPair.RegUnit);
 477     }
 478     return InRegs;
 479   }
 480
 481   std::set<unsigned> getOutRegs() {
 482     std::set<unsigned> OutRegs;
 483     for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
 484       OutRegs.insert(RegMaskPair.RegUnit);
 485     }
 486     return OutRegs;
 487   };
 488
 489   unsigned getVGPRSetID() const { return VGPRSetID; }
 490   unsigned getSGPRSetID() const { return SGPRSetID; }
 491
 492 private:
 493   void topologicalSort();
 494   // After scheduling is done, improve low latency placements.
 495   void moveLowLatencies();
 496
 497 public:
 498   // Some stats for scheduling inside blocks.
 499   std::vector<unsigned> IsLowLatencySU;
 500   std::vector<unsigned> LowLatencyOffset;
 501   std::vector<unsigned> IsHighLatencySU;
 502   // Topological sort
 503   // Maps topological index to the node number.
 504   std::vector<int> TopDownIndex2SU;
 505   std::vector<int> BottomUpIndex2SU;
 506 };
 507
 508 } // end namespace llvm
 509
 510 #endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H