1 //===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// This pass inserts branches on the 0 exec mask over divergent branches
11 /// when it's expected that jumping over the untaken control flow will
12 /// be cheaper than having every workitem no-op through it.
14 //===----------------------------------------------------------------------===//
17 #include "AMDGPUSubtarget.h"
18 #include "SIInstrInfo.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "llvm/ADT/DepthFirstIterator.h"
22 #include "llvm/ADT/SmallVector.h"
23 #include "llvm/ADT/StringRef.h"
24 #include "llvm/CodeGen/MachineBasicBlock.h"
25 #include "llvm/CodeGen/MachineDominators.h"
26 #include "llvm/CodeGen/MachineFunction.h"
27 #include "llvm/CodeGen/MachineFunctionPass.h"
28 #include "llvm/CodeGen/MachineInstr.h"
29 #include "llvm/CodeGen/MachineInstrBuilder.h"
30 #include "llvm/CodeGen/MachineOperand.h"
31 #include "llvm/IR/CallingConv.h"
32 #include "llvm/IR/DebugLoc.h"
33 #include "llvm/InitializePasses.h"
34 #include "llvm/MC/MCAsmInfo.h"
35 #include "llvm/Pass.h"
36 #include "llvm/Support/CommandLine.h"
37 #include "llvm/Target/TargetMachine.h"
44 #define DEBUG_TYPE "si-insert-skips"
// Command-line knob: how many instructions a divergent region may contain
// before this pass considers it worth inserting a branch to skip it.
// Read once per function into SIInsertSkips::SkipThreshold.
46 static cl::opt<unsigned> SkipThresholdFlag(
47 "amdgpu-skip-threshold-legacy",
48 cl::desc("Number of instructions before jumping over divergent control flow"),
49 cl::init(12), cl::Hidden);
53 class SIInsertSkips : public MachineFunctionPass {
// Cached target info; (re)initialized in runOnMachineFunction.
55 const SIRegisterInfo *TRI = nullptr;
56 const SIInstrInfo *TII = nullptr;
// Instruction-count budget taken from the SkipThresholdFlag cl::opt.
57 unsigned SkipThreshold = 0;
58 MachineDominatorTree *MDT = nullptr;
// Lazily created block containing "null export; s_endpgm"; shared by all
// early-exit branches inserted in the current function.
60 MachineBasicBlock *EarlyExitBlock = nullptr;
// Returns true if the code between From and To is expensive enough that
// branching over it when EXEC == 0 beats no-op'ing through it.
62 bool shouldSkip(const MachineBasicBlock &From,
63 const MachineBasicBlock &To) const;
// True if MBB dominates every block reachable from it; used to decide
// whether an early-exit can be inserted safely after a kill.
65 bool dominatesAllReachable(MachineBasicBlock &MBB);
66 void createEarlyExitBlock(MachineBasicBlock &MBB);
67 void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
// Lower a SI_KILL_*_TERMINATOR pseudo into EXEC-manipulating instructions.
70 bool kill(MachineInstr &MI);
// Insert an S_CBRANCH_EXECZ over the region guarded by a SI_MASK_BRANCH.
72 bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
77 SIInsertSkips() : MachineFunctionPass(ID) {}
79 bool runOnMachineFunction(MachineFunction &MF) override;
81 StringRef getPassName() const override {
82 return "SI insert s_cbranch_execz instructions";
// The dominator tree is both required and preserved: block splits made by
// skipIfDead update it in place rather than invalidating it.
85 void getAnalysisUsage(AnalysisUsage &AU) const override {
86 AU.addRequired<MachineDominatorTree>();
87 AU.addPreserved<MachineDominatorTree>();
88 MachineFunctionPass::getAnalysisUsage(AU);
92 } // end anonymous namespace
// Static pass identity; its address is what the pass manager keys on.
94 char SIInsertSkips::ID = 0;
// Register the pass and declare its dependency on MachineDominatorTree.
96 INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE,
97 "SI insert s_cbranch_execz instructions", false, false)
98 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
99 INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE,
100 "SI insert s_cbranch_execz instructions", false, false)
// Exported handle used elsewhere (e.g. the target pass pipeline) to refer
// to this pass by identity.
102 char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
// Returns true for instructions that produce no machine code, so they cost
// nothing to "execute" and should not count toward the skip threshold.
104 static bool opcodeEmitsNoInsts(const MachineInstr &MI) {
// Generic meta instructions (debug info, CFI, etc.) emit nothing.
105 if (MI.isMetaInstruction())
108 // Handle target specific opcodes.
109 switch (MI.getOpcode()) {
// SI_MASK_BRANCH is a pseudo that is lowered away by this pass.
110 case AMDGPU::SI_MASK_BRANCH:
// Decide whether to insert a skip branch over the blocks in layout order
// from \p From up to (but not including) \p To: returns true when the span
// contains enough real instructions (>= SkipThreshold), or any instruction
// that is incorrect or expensive to execute with EXEC == 0.
117 bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
118 const MachineBasicBlock &To) const {
119 unsigned NumInstr = 0;
120 const MachineFunction *MF = From.getParent();
// Walk blocks in function layout order; stop at To or the function end.
122 for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
123 MBBI != End && MBBI != ToI; ++MBBI) {
124 const MachineBasicBlock &MBB = *MBBI;
126 for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
127 NumInstr < SkipThreshold && I != E; ++I) {
// Zero-size instructions are free; don't count them.
128 if (opcodeEmitsNoInsts(*I))
131 // FIXME: Since this is required for correctness, this should be inserted
132 // during SILowerControlFlow.
134 // When a uniform loop is inside non-uniform control flow, the branch
135 // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
136 // when EXEC = 0. We should skip the loop lest it becomes infinite.
137 if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
138 I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
// Instructions with side effects under EXEC == 0 must be skipped.
141 if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
144 // These instructions are potentially expensive even if EXEC = 0.
145 if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
146 I->getOpcode() == AMDGPU::S_WAITCNT)
// Threshold reached: the region is big enough to be worth skipping.
150 if (NumInstr >= SkipThreshold)
158 /// Check whether \p MBB dominates all blocks that are reachable from it.
// Walks the CFG depth-first from MBB and queries the dominator tree for
// each visited block; any non-dominated reachable block disqualifies MBB.
159 bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
160 for (MachineBasicBlock *Other : depth_first(&MBB)) {
161 if (!MDT->dominates(&MBB, Other))
// Emit the pixel-shader termination sequence at \p I: a "done" null export
// (required so the hardware knows the shader produced no outputs) followed
// by s_endpgm.
167 static void generatePsEndPgm(MachineBasicBlock &MBB,
168 MachineBasicBlock::iterator I, DebugLoc DL,
169 const SIInstrInfo *TII) {
170 // Generate "null export; s_endpgm".
171 BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
172 .addImm(0x09) // V_008DFC_SQ_EXP_NULL
// The exported VGPR values are irrelevant for a null export, so reuse
// VGPR0 marked undef for all four components.
173 .addReg(AMDGPU::VGPR0, RegState::Undef)
174 .addReg(AMDGPU::VGPR0, RegState::Undef)
175 .addReg(AMDGPU::VGPR0, RegState::Undef)
176 .addReg(AMDGPU::VGPR0, RegState::Undef)
180 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
// Create the shared early-exit block at the end of the function, containing
// the "null export; s_endpgm" sequence. Called at most once per function
// (guarded by the assert); skipIfDead branches here when EXEC == 0.
183 void SIInsertSkips::createEarlyExitBlock(MachineBasicBlock &MBB) {
184 MachineFunction *MF = MBB.getParent();
187 assert(!EarlyExitBlock);
188 EarlyExitBlock = MF->CreateMachineBasicBlock();
189 MF->insert(MF->end(), EarlyExitBlock);
191 generatePsEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII);
194 /// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
195 /// iterator. Only applies to pixel shaders.
196 void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
197 MachineBasicBlock::iterator I, DebugLoc DL) {
198 MachineFunction *MF = MBB.getParent();
// Early exit via null export is only meaningful for pixel shaders.
199 assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);
201 // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
202 // basic block that has no further successors (e.g., there was an
203 // `unreachable` there in IR). This can happen with original source of the
206 // if (uniform_condition) {
207 //   write_to_memory();
211 // In this case, we write the "null_export; s_endpgm" skip code in the
212 // already-existing basic block.
213 auto NextBBI = std::next(MBB.getIterator());
// "No successor": I is at the block end and the layout-next block is not
// a CFG successor, so the end-of-program sequence can be emitted inline.
214 bool NoSuccessor = I == MBB.end() &&
215 llvm::find(MBB.successors(), &*NextBBI) == MBB.succ_end();
218 generatePsEndPgm(MBB, I, DL, TII);
// Otherwise branch to the shared early-exit block, creating it on demand.
220 if (!EarlyExitBlock) {
221 createEarlyExitBlock(MBB);
222 // Update next block pointer to reflect any new blocks
223 NextBBI = std::next(MBB.getIterator());
226 auto BranchMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
227 .addMBB(EarlyExitBlock);
229 // Split the block if the branch will not come at the end.
230 auto Next = std::next(BranchMI->getIterator());
231 if (Next != MBB.end() && !Next->isTerminator()) {
232 MachineBasicBlock *SplitBB =
233 MF->CreateMachineBasicBlock(MBB.getBasicBlock());
234 MF->insert(NextBBI, SplitBB);
// Move the remainder of MBB (from I onward) into the new block and hand
// over MBB's successors/PHI references to it.
235 SplitBB->splice(SplitBB->begin(), &MBB, I, MBB.end());
236 SplitBB->transferSuccessorsAndUpdatePHIs(&MBB);
237 // FIXME: the expectation is that this will be used near the beginning
238 // of a block so just assume all registers are still live.
239 for (auto LiveIn : MBB.liveins())
240 SplitBB->addLiveIn(LiveIn);
241 MBB.addSuccessor(SplitBB);
243 // Update dominator tree
244 using DomTreeT = DomTreeBase<MachineBasicBlock>;
245 SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
// Every edge that used to leave MBB now leaves SplitBB instead.
246 for (MachineBasicBlock *Succ : SplitBB->successors()) {
247 DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
248 DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
250 DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
251 MDT->getBase().applyUpdates(DTUpdates);
// Record the new conditional edge into the early-exit block.
254 MBB.addSuccessor(EarlyExitBlock);
255 MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
259 /// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
260 /// Return true unless the terminator is a no-op.
261 bool SIInsertSkips::kill(MachineInstr &MI) {
262 MachineBasicBlock &MBB = *MI.getParent();
263 DebugLoc DL = MI.getDebugLoc();
265 switch (MI.getOpcode()) {
// Float-compare kill: lowered to a V_CMPX_* that writes EXEC directly.
266 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
269 // The opcodes are inverted because the inline immediate has to be
270 // the first operand, e.g. from "x < imm" to "imm > x"
// Operand 2 holds the ISD::CondCode selecting which V_CMPX_* to emit.
271 switch (MI.getOperand(2).getImm()) {
274 Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
278 Opcode = AMDGPU::V_CMPX_LT_F32_e64;
282 Opcode = AMDGPU::V_CMPX_LE_F32_e64;
286 Opcode = AMDGPU::V_CMPX_GT_F32_e64;
290 Opcode = AMDGPU::V_CMPX_GE_F32_e64;
294 Opcode = AMDGPU::V_CMPX_LG_F32_e64;
297 Opcode = AMDGPU::V_CMPX_O_F32_e64;
300 Opcode = AMDGPU::V_CMPX_U_F32_e64;
303 Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
306 Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
309 Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
312 Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
315 Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
318 Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
321 llvm_unreachable("invalid ISD:SET cond code");
// Subtargets without an SGPR destination on V_CMPX use dedicated
// no-sdst opcode variants.
324 const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
325 if (ST.hasNoSdstCMPX())
326 Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode);
328 assert(MI.getOperand(0).isReg());
// If the compared value is in a VGPR, the shorter 32-bit encoding works;
// otherwise fall through to the e64 form with explicit modifiers below.
330 if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
331 MI.getOperand(0).getReg())) {
332 Opcode = AMDGPU::getVOPe32(Opcode);
333 BuildMI(MBB, &MI, DL, TII->get(Opcode))
334 .add(MI.getOperand(1))
335 .add(MI.getOperand(0));
337 auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode));
// Only targets that still have an SDST write VCC here.
338 if (!ST.hasNoSdstCMPX())
339 I.addReg(AMDGPU::VCC, RegState::Define);
341 I.addImm(0)  // src0 modifiers
342 .add(MI.getOperand(1))
343 .addImm(0)  // src1 modifiers
344 .add(MI.getOperand(0));
// Boolean kill: AND (or ANDN2) the i1 mask into EXEC.
350 case AMDGPU::SI_KILL_I1_TERMINATOR: {
351 const MachineFunction *MF = MI.getParent()->getParent();
352 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
// Wave32 subtargets use the 32-bit EXEC_LO register and *_B32 opcodes.
353 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
354 const MachineOperand &Op = MI.getOperand(0);
// Operand 1 selects the polarity: kill lanes where Op == KillVal.
355 int64_t KillVal = MI.getOperand(1).getImm();
356 assert(KillVal == 0 || KillVal == -1);
358 // Kill all threads if Op0 is an immediate and equal to the Kill value.
360 int64_t Imm = Op.getImm();
361 assert(Imm == 0 || Imm == -1);
// Constant mask matching the kill polarity: clear EXEC entirely.
363 if (Imm == KillVal) {
364 BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
365 : AMDGPU::S_MOV_B64), Exec)
// Non-constant mask: ANDN2 removes lanes where the mask is set (KillVal
// != 0), plain AND keeps only lanes where it is set (KillVal == 0).
372 unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
374 Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32;
375 BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
381 llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
385 // Returns true if a branch over the block was inserted.
// Lowers a SI_MASK_BRANCH pseudo: if the guarded region (from SrcMBB's
// first successor to the mask branch's destination) is worth skipping,
// insert an S_CBRANCH_EXECZ immediately after the pseudo.
386 bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
387 MachineBasicBlock &SrcMBB) {
// Operand 0 of SI_MASK_BRANCH is the block the skip would jump to.
388 MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();
// Cheap regions are left alone; no-op'ing through them is faster than
// the branch.
390 if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
393 const DebugLoc &DL = MI.getDebugLoc();
394 MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());
396 BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
402 bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
403 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
404 TII = ST.getInstrInfo();
405 TRI = &TII->getRegisterInfo();
406 MDT = &getAnalysis<MachineDominatorTree>();
407 SkipThreshold = SkipThresholdFlag;
409 SmallVector<MachineInstr *, 4> KillInstrs;
410 bool MadeChange = false;
412 for (MachineBasicBlock &MBB : MF) {
413 MachineBasicBlock::iterator I, Next;
414 for (I = MBB.begin(); I != MBB.end(); I = Next) {
416 MachineInstr &MI = *I;
418 switch (MI.getOpcode()) {
419 case AMDGPU::SI_MASK_BRANCH:
420 MadeChange |= skipMaskBranch(MI, MBB);
423 case AMDGPU::S_BRANCH:
424 // Optimize out branches to the next block.
425 // FIXME: Shouldn't this be handled by BranchFolding?
426 if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
427 assert(&MI == &MBB.back());
428 MI.eraseFromParent();
433 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
434 case AMDGPU::SI_KILL_I1_TERMINATOR: {
436 bool CanKill = kill(MI);
438 // Check if we can add an early "if exec=0 { end shader }".
440 // Note that we _always_ do this if it is correct, even if the kill
441 // happens fairly late in the shader, because the null export should
442 // generally still be cheaper than normal export(s).
444 // TODO: The dominatesAllReachable check is conservative: if the
445 // dominance is only missing due to _uniform_ branches, we could
446 // in fact insert the early-exit as well.
448 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
449 dominatesAllReachable(MBB)) {
450 // Mark the instruction for kill-if-dead insertion. We delay this
451 // change because it modifies the CFG.
452 KillInstrs.push_back(&MI);
454 MI.eraseFromParent();
459 case AMDGPU::SI_KILL_CLEANUP:
460 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
461 dominatesAllReachable(MBB)) {
462 KillInstrs.push_back(&MI);
464 MI.eraseFromParent();
474 for (MachineInstr *Kill : KillInstrs) {
475 skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
476 Kill->getDebugLoc());
477 Kill->eraseFromParent();
480 EarlyExitBlock = nullptr;