contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

   1 //===- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies ---------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 /// \file
  10 /// Copies from VGPR to SGPR registers are illegal and the register coalescer
  11 /// will sometimes generate these illegal copies in situations like this:
  12 ///
  13 ///  Register Class <vsrc> is the union of <vgpr> and <sgpr>
  14 ///
  15 /// BB0:
  16 ///   %0 <sgpr> = SCALAR_INST
  17 ///   %1 <vsrc> = COPY %0 <sgpr>
  18 ///    ...
  19 ///    BRANCH %cond BB1, BB2
  20 ///  BB1:
  21 ///    %2 <vgpr> = VECTOR_INST
  22 ///    %3 <vsrc> = COPY %2 <vgpr>
  23 ///  BB2:
  24 ///    %4 <vsrc> = PHI %1 <vsrc>, <%bb.0>, %3 <vsrc>, <%bb.1>
  25 ///    %5 <vgpr> = VECTOR_INST %4 <vsrc>
  26 ///
  27 ///
  28 /// The coalescer will begin at BB0 and eliminate its copy, then the resulting
  29 /// code will look like this:
  30 ///
  31 /// BB0:
  32 ///   %0 <sgpr> = SCALAR_INST
  33 ///    ...
  34 ///    BRANCH %cond BB1, BB2
  35 /// BB1:
  36 ///   %2 <vgpr> = VECTOR_INST
  37 ///   %3 <vsrc> = COPY %2 <vgpr>
  38 /// BB2:
  39 ///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <vsrc>, <%bb.1>
  40 ///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
  41 ///
  42 /// Now that the result of the PHI instruction is an SGPR, the register
  43 /// allocator is now forced to constrain the register class of %3 to
  44 /// <sgpr> so we end up with final code like this:
  45 ///
  46 /// BB0:
  47 ///   %0 <sgpr> = SCALAR_INST
  48 ///    ...
  49 ///    BRANCH %cond BB1, BB2
  50 /// BB1:
  51 ///   %2 <vgpr> = VECTOR_INST
  52 ///   %3 <sgpr> = COPY %2 <vgpr>
  53 /// BB2:
  54 ///   %4 <sgpr> = PHI %0 <sgpr>, <%bb.0>, %3 <sgpr>, <%bb.1>
  55 ///   %5 <vgpr> = VECTOR_INST %4 <sgpr>
  56 ///
  57 /// Now this code contains an illegal copy from a VGPR to an SGPR.
  58 ///
  59 /// In order to avoid this problem, this pass searches for PHI instructions
  60 /// which define a <vsrc> register and constrains its definition class to
  61 /// <vgpr> if the user of the PHI's definition register is a vector instruction.
  62 /// If the PHI's definition class is constrained to <vgpr> then the coalescer
  63 /// will be unable to perform the COPY removal from the above example which
  64 /// ultimately led to the creation of an illegal COPY.
  65 //===----------------------------------------------------------------------===//
  66
  67 #include "AMDGPU.h"
  68 #include "AMDGPUSubtarget.h"
  69 #include "SIInstrInfo.h"
  70 #include "SIRegisterInfo.h"
  71 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
  72 #include "llvm/ADT/DenseSet.h"
  73 #include "llvm/ADT/STLExtras.h"
  74 #include "llvm/ADT/SmallSet.h"
  75 #include "llvm/ADT/SmallVector.h"
  76 #include "llvm/CodeGen/MachineBasicBlock.h"
  77 #include "llvm/CodeGen/MachineDominators.h"
  78 #include "llvm/CodeGen/MachineFunction.h"
  79 #include "llvm/CodeGen/MachineFunctionPass.h"
  80 #include "llvm/CodeGen/MachineInstr.h"
  81 #include "llvm/CodeGen/MachineInstrBuilder.h"
  82 #include "llvm/CodeGen/MachineOperand.h"
  83 #include "llvm/CodeGen/MachineRegisterInfo.h"
  84 #include "llvm/CodeGen/TargetRegisterInfo.h"
  85 #include "llvm/Pass.h"
  86 #include "llvm/Support/CodeGen.h"
  87 #include "llvm/Support/CommandLine.h"
  88 #include "llvm/Support/Debug.h"
  89 #include "llvm/Support/raw_ostream.h"
  90 #include "llvm/Target/TargetMachine.h"
  91 #include <cassert>
  92 #include <cstdint>
  93 #include <iterator>
  94 #include <list>
  95 #include <map>
  96 #include <tuple>
  97 #include <utility>
  98
  99 using namespace llvm;
 100
 101 #define DEBUG_TYPE "si-fix-sgpr-copies"
 102
 103 static cl::opt<bool> EnableM0Merge(
 104   "amdgpu-enable-merge-m0",
 105   cl::desc("Merge and hoist M0 initializations"),
 106   cl::init(true));
 107
 108 namespace {
 109
 110 class SIFixSGPRCopies : public MachineFunctionPass {
 111   MachineDominatorTree *MDT;
 112
 113 public:
 114   static char ID;
 115
 116   MachineRegisterInfo *MRI;
 117   const SIRegisterInfo *TRI;
 118   const SIInstrInfo *TII;
 119
 120   SIFixSGPRCopies() : MachineFunctionPass(ID) {}
 121
 122   bool runOnMachineFunction(MachineFunction &MF) override;
 123
 124   void processPHINode(MachineInstr &MI);
 125
 126   StringRef getPassName() const override { return "SI Fix SGPR copies"; }
 127
 128   void getAnalysisUsage(AnalysisUsage &AU) const override {
 129     AU.addRequired<MachineDominatorTree>();
 130     AU.addPreserved<MachineDominatorTree>();
 131     AU.setPreservesCFG();
 132     MachineFunctionPass::getAnalysisUsage(AU);
 133   }
 134 };
 135
 136 } // end anonymous namespace
 137
 138 INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
 139                      "SI Fix SGPR copies", false, false)
 140 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
 141 INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
 142                      "SI Fix SGPR copies", false, false)
 143
 144 char SIFixSGPRCopies::ID = 0;
 145
 146 char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID;
 147
 148 FunctionPass *llvm::createSIFixSGPRCopiesPass() {
 149   return new SIFixSGPRCopies();
 150 }
 151
 152 static bool hasVectorOperands(const MachineInstr &MI,
 153                               const SIRegisterInfo *TRI) {
 154   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
 155   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
 156     if (!MI.getOperand(i).isReg() ||
 157         !Register::isVirtualRegister(MI.getOperand(i).getReg()))
 158       continue;
 159
 160     if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg())))
 161       return true;
 162   }
 163   return false;
 164 }
 165
 166 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
 167 getCopyRegClasses(const MachineInstr &Copy,
 168                   const SIRegisterInfo &TRI,
 169                   const MachineRegisterInfo &MRI) {
 170   Register DstReg = Copy.getOperand(0).getReg();
 171   Register SrcReg = Copy.getOperand(1).getReg();
 172
 173   const TargetRegisterClass *SrcRC = Register::isVirtualRegister(SrcReg)
 174                                          ? MRI.getRegClass(SrcReg)
 175                                          : TRI.getPhysRegClass(SrcReg);
 176
 177   // We don't really care about the subregister here.
 178   // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());
 179
 180   const TargetRegisterClass *DstRC = Register::isVirtualRegister(DstReg)
 181                                          ? MRI.getRegClass(DstReg)
 182                                          : TRI.getPhysRegClass(DstReg);
 183
 184   return std::make_pair(SrcRC, DstRC);
 185 }
 186
 187 static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
 188                              const TargetRegisterClass *DstRC,
 189                              const SIRegisterInfo &TRI) {
 190   return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
 191          TRI.hasVectorRegisters(SrcRC);
 192 }
 193
 194 static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
 195                              const TargetRegisterClass *DstRC,
 196                              const SIRegisterInfo &TRI) {
 197   return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
 198          TRI.hasVectorRegisters(DstRC);
 199 }
 200
 201 static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
 202                                       const SIRegisterInfo *TRI,
 203                                       const SIInstrInfo *TII) {
 204   MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
 205   auto &Src = MI.getOperand(1);
 206   Register DstReg = MI.getOperand(0).getReg();
 207   Register SrcReg = Src.getReg();
 208   if (!Register::isVirtualRegister(SrcReg) ||
 209       !Register::isVirtualRegister(DstReg))
 210     return false;
 211
 212   for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) {
 213     const auto *UseMI = MO.getParent();
 214     if (UseMI == &MI)
 215       continue;
 216     if (MO.isDef() || UseMI->getParent() != MI.getParent() ||
 217         UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END ||
 218         !TII->isOperandLegal(*UseMI, UseMI->getOperandNo(&MO), &Src))
 219       return false;
 220   }
 221   // Change VGPR to SGPR destination.
 222   MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));
 223   return true;
 224 }
 225
 226 // Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
 227 //
 228 // SGPRx = ...
 229 // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
 230 // VGPRz = COPY SGPRy
 231 //
 232 // ==>
 233 //
 234 // VGPRx = COPY SGPRx
 235 // VGPRz = REG_SEQUENCE VGPRx, sub0
 236 //
 237 // This exposes immediate folding opportunities when materializing 64-bit
 238 // immediates.
 239 static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
 240                                         const SIRegisterInfo *TRI,
 241                                         const SIInstrInfo *TII,
 242                                         MachineRegisterInfo &MRI) {
 243   assert(MI.isRegSequence());
 244
 245   Register DstReg = MI.getOperand(0).getReg();
 246   if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
 247     return false;
 248
 249   if (!MRI.hasOneUse(DstReg))
 250     return false;
 251
 252   MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg);
 253   if (!CopyUse.isCopy())
 254     return false;
 255
 256   // It is illegal to have vreg inputs to a physreg defining reg_sequence.
 257   if (Register::isPhysicalRegister(CopyUse.getOperand(0).getReg()))
 258     return false;
 259
 260   const TargetRegisterClass *SrcRC, *DstRC;
 261   std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);
 262
 263   if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
 264     return false;
 265
 266   if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII))
 267     return true;
 268
 269   // TODO: Could have multiple extracts?
 270   unsigned SubReg = CopyUse.getOperand(1).getSubReg();
 271   if (SubReg != AMDGPU::NoSubRegister)
 272     return false;
 273
 274   MRI.setRegClass(DstReg, DstRC);
 275
 276   // SGPRx = ...
 277   // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
 278   // VGPRz = COPY SGPRy
 279
 280   // =>
 281   // VGPRx = COPY SGPRx
 282   // VGPRz = REG_SEQUENCE VGPRx, sub0
 283
 284   MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
 285   bool IsAGPR = TRI->hasAGPRs(DstRC);
 286
 287   for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
 288     Register SrcReg = MI.getOperand(I).getReg();
 289     unsigned SrcSubReg = MI.getOperand(I).getSubReg();
 290
 291     const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
 292     assert(TRI->isSGPRClass(SrcRC) &&
 293            "Expected SGPR REG_SEQUENCE to only have SGPR inputs");
 294
 295     SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
 296     const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);
 297
 298     Register TmpReg = MRI.createVirtualRegister(NewSrcRC);
 299
 300     BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
 301             TmpReg)
 302         .add(MI.getOperand(I));
 303
 304     if (IsAGPR) {
 305       const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
 306       Register TmpAReg = MRI.createVirtualRegister(NewSrcRC);
 307       unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
 308         AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
 309       BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
 310             TmpAReg)
 311         .addReg(TmpReg, RegState::Kill);
 312       TmpReg = TmpAReg;
 313     }
 314
 315     MI.getOperand(I).setReg(TmpReg);
 316   }
 317
 318   CopyUse.eraseFromParent();
 319   return true;
 320 }
 321
 322 static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
 323                                     const MachineInstr *MoveImm,
 324                                     const SIInstrInfo *TII,
 325                                     unsigned &SMovOp,
 326                                     int64_t &Imm) {
 327   if (Copy->getOpcode() != AMDGPU::COPY)
 328     return false;
 329
 330   if (!MoveImm->isMoveImmediate())
 331     return false;
 332
 333   const MachineOperand *ImmOp =
 334       TII->getNamedOperand(*MoveImm, AMDGPU::OpName::src0);
 335   if (!ImmOp->isImm())
 336     return false;
 337
 338   // FIXME: Handle copies with sub-regs.
 339   if (Copy->getOperand(0).getSubReg())
 340     return false;
 341
 342   switch (MoveImm->getOpcode()) {
 343   default:
 344     return false;
 345   case AMDGPU::V_MOV_B32_e32:
 346     SMovOp = AMDGPU::S_MOV_B32;
 347     break;
 348   case AMDGPU::V_MOV_B64_PSEUDO:
 349     SMovOp = AMDGPU::S_MOV_B64;
 350     break;
 351   }
 352   Imm = ImmOp->getImm();
 353   return true;
 354 }
 355
 356 template <class UnaryPredicate>
 357 bool searchPredecessors(const MachineBasicBlock *MBB,
 358                         const MachineBasicBlock *CutOff,
 359                         UnaryPredicate Predicate) {
 360   if (MBB == CutOff)
 361     return false;
 362
 363   DenseSet<const MachineBasicBlock *> Visited;
 364   SmallVector<MachineBasicBlock *, 4> Worklist(MBB->pred_begin(),
 365                                                MBB->pred_end());
 366
 367   while (!Worklist.empty()) {
 368     MachineBasicBlock *MBB = Worklist.pop_back_val();
 369
 370     if (!Visited.insert(MBB).second)
 371       continue;
 372     if (MBB == CutOff)
 373       continue;
 374     if (Predicate(MBB))
 375       return true;
 376
 377     Worklist.append(MBB->pred_begin(), MBB->pred_end());
 378   }
 379
 380   return false;
 381 }
 382
 383 // Checks if there is potential path From instruction To instruction.
 384 // If CutOff is specified and it sits in between of that path we ignore
 385 // a higher portion of the path and report it is not reachable.
 386 static bool isReachable(const MachineInstr *From,
 387                         const MachineInstr *To,
 388                         const MachineBasicBlock *CutOff,
 389                         MachineDominatorTree &MDT) {
 390   // If either From block dominates To block or instructions are in the same
 391   // block and From is higher.
 392   if (MDT.dominates(From, To))
 393     return true;
 394
 395   const MachineBasicBlock *MBBFrom = From->getParent();
 396   const MachineBasicBlock *MBBTo = To->getParent();
 397   if (MBBFrom == MBBTo)
 398     return false;
 399
 400   // Instructions are in different blocks, do predecessor search.
 401   // We should almost never get here since we do not usually produce M0 stores
 402   // other than -1.
 403   return searchPredecessors(MBBTo, CutOff, [MBBFrom]
 404            (const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
 405 }
 406
 407 // Return the first non-prologue instruction in the block.
 408 static MachineBasicBlock::iterator
 409 getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
 410   MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
 411   while (I != MBB->end() && TII->isBasicBlockPrologue(*I))
 412     ++I;
 413
 414   return I;
 415 }
 416
 417 // Hoist and merge identical SGPR initializations into a common predecessor.
 418 // This is intended to combine M0 initializations, but can work with any
 419 // SGPR. A VGPR cannot be processed since we cannot guarantee vector
 420 // executioon.
 421 static bool hoistAndMergeSGPRInits(unsigned Reg,
 422                                    const MachineRegisterInfo &MRI,
 423                                    const TargetRegisterInfo *TRI,
 424                                    MachineDominatorTree &MDT,
 425                                    const TargetInstrInfo *TII) {
 426   // List of inits by immediate value.
 427   using InitListMap = std::map<unsigned, std::list<MachineInstr *>>;
 428   InitListMap Inits;
 429   // List of clobbering instructions.
 430   SmallVector<MachineInstr*, 8> Clobbers;
 431   // List of instructions marked for deletion.
 432   SmallSet<MachineInstr*, 8> MergedInstrs;
 433
 434   bool Changed = false;
 435
 436   for (auto &MI : MRI.def_instructions(Reg)) {
 437     MachineOperand *Imm = nullptr;
 438     for (auto &MO : MI.operands()) {
 439       if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
 440           (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
 441         Imm = nullptr;
 442         break;
 443       } else if (MO.isImm())
 444         Imm = &MO;
 445     }
 446     if (Imm)
 447       Inits[Imm->getImm()].push_front(&MI);
 448     else
 449       Clobbers.push_back(&MI);
 450   }
 451
 452   for (auto &Init : Inits) {
 453     auto &Defs = Init.second;
 454
 455     for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) {
 456       MachineInstr *MI1 = *I1;
 457
 458       for (auto I2 = std::next(I1); I2 != E; ) {
 459         MachineInstr *MI2 = *I2;
 460
 461         // Check any possible interference
 462         auto interferes = [&](MachineBasicBlock::iterator From,
 463                               MachineBasicBlock::iterator To) -> bool {
 464
 465           assert(MDT.dominates(&*To, &*From));
 466
 467           auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool {
 468             const MachineBasicBlock *MBBFrom = From->getParent();
 469             const MachineBasicBlock *MBBTo = To->getParent();
 470             bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT);
 471             bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT);
 472             if (!MayClobberFrom && !MayClobberTo)
 473               return false;
 474             if ((MayClobberFrom && !MayClobberTo) ||
 475                 (!MayClobberFrom && MayClobberTo))
 476               return true;
 477             // Both can clobber, this is not an interference only if both are
 478             // dominated by Clobber and belong to the same block or if Clobber
 479             // properly dominates To, given that To >> From, so it dominates
 480             // both and located in a common dominator.
 481             return !((MBBFrom == MBBTo &&
 482                       MDT.dominates(Clobber, &*From) &&
 483                       MDT.dominates(Clobber, &*To)) ||
 484                      MDT.properlyDominates(Clobber->getParent(), MBBTo));
 485           };
 486
 487           return (llvm::any_of(Clobbers, interferes)) ||
 488                  (llvm::any_of(Inits, [&](InitListMap::value_type &C) {
 489                     return C.first != Init.first &&
 490                            llvm::any_of(C.second, interferes);
 491                   }));
 492         };
 493
 494         if (MDT.dominates(MI1, MI2)) {
 495           if (!interferes(MI2, MI1)) {
 496             LLVM_DEBUG(dbgs()
 497                        << "Erasing from "
 498                        << printMBBReference(*MI2->getParent()) << " " << *MI2);
 499             MergedInstrs.insert(MI2);
 500             Changed = true;
 501             ++I2;
 502             continue;
 503           }
 504         } else if (MDT.dominates(MI2, MI1)) {
 505           if (!interferes(MI1, MI2)) {
 506             LLVM_DEBUG(dbgs()
 507                        << "Erasing from "
 508                        << printMBBReference(*MI1->getParent()) << " " << *MI1);
 509             MergedInstrs.insert(MI1);
 510             Changed = true;
 511             ++I1;
 512             break;
 513           }
 514         } else {
 515           auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(),
 516                                                      MI2->getParent());
 517           if (!MBB) {
 518             ++I2;
 519             continue;
 520           }
 521
 522           MachineBasicBlock::iterator I = getFirstNonPrologue(MBB, TII);
 523           if (!interferes(MI1, I) && !interferes(MI2, I)) {
 524             LLVM_DEBUG(dbgs()
 525                        << "Erasing from "
 526                        << printMBBReference(*MI1->getParent()) << " " << *MI1
 527                        << "and moving from "
 528                        << printMBBReference(*MI2->getParent()) << " to "
 529                        << printMBBReference(*I->getParent()) << " " << *MI2);
 530             I->getParent()->splice(I, MI2->getParent(), MI2);
 531             MergedInstrs.insert(MI1);
 532             Changed = true;
 533             ++I1;
 534             break;
 535           }
 536         }
 537         ++I2;
 538       }
 539       ++I1;
 540     }
 541   }
 542
 543   // Remove initializations that were merged into another.
 544   for (auto &Init : Inits) {
 545     auto &Defs = Init.second;
 546     auto I = Defs.begin();
 547     while (I != Defs.end()) {
 548       if (MergedInstrs.count(*I)) {
 549         (*I)->eraseFromParent();
 550         I = Defs.erase(I);
 551       } else
 552         ++I;
 553     }
 554   }
 555
 556   // Try to schedule SGPR initializations as early as possible in the MBB.
 557   for (auto &Init : Inits) {
 558     auto &Defs = Init.second;
 559     for (auto MI : Defs) {
 560       auto MBB = MI->getParent();
 561       MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII);
 562       MachineBasicBlock::reverse_iterator B(BoundaryMI);
 563       // Check if B should actually be a boundary. If not set the previous
 564       // instruction as the boundary instead.
 565       if (!TII->isBasicBlockPrologue(*B))
 566         B++;
 567
 568       auto R = std::next(MI->getReverseIterator());
 569       const unsigned Threshold = 50;
 570       // Search until B or Threshold for a place to insert the initialization.
 571       for (unsigned I = 0; R != B && I < Threshold; ++R, ++I)
 572         if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) ||
 573             TII->isSchedulingBoundary(*R, MBB, *MBB->getParent()))
 574           break;
 575
 576       // Move to directly after R.
 577       if (&*--R != MI)
 578         MBB->splice(*R, MBB, MI);
 579     }
 580   }
 581
 582   if (Changed)
 583     MRI.clearKillFlags(Reg);
 584
 585   return Changed;
 586 }
 587
 588 bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
 589   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 590   MRI = &MF.getRegInfo();
 591   TRI = ST.getRegisterInfo();
 592   TII = ST.getInstrInfo();
 593   MDT = &getAnalysis<MachineDominatorTree>();
 594
 595   SmallVector<MachineInstr *, 16> Worklist;
 596
 597   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
 598                                                   BI != BE; ++BI) {
 599     MachineBasicBlock &MBB = *BI;
 600     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
 601          I != E; ++I) {
 602       MachineInstr &MI = *I;
 603
 604       switch (MI.getOpcode()) {
 605       default:
 606         continue;
 607       case AMDGPU::COPY:
 608       case AMDGPU::WQM:
 609       case AMDGPU::SOFT_WQM:
 610       case AMDGPU::WWM: {
 611         Register DstReg = MI.getOperand(0).getReg();
 612
 613         const TargetRegisterClass *SrcRC, *DstRC;
 614         std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
 615
 616         if (!Register::isVirtualRegister(DstReg)) {
 617           // If the destination register is a physical register there isn't
 618           // really much we can do to fix this.
 619           // Some special instructions use M0 as an input. Some even only use
 620           // the first lane. Insert a readfirstlane and hope for the best.
 621           if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
 622             Register TmpReg
 623               = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
 624
 625             BuildMI(MBB, MI, MI.getDebugLoc(),
 626                     TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
 627               .add(MI.getOperand(1));
 628             MI.getOperand(1).setReg(TmpReg);
 629           }
 630
 631           continue;
 632         }
 633
 634         if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
 635           Register SrcReg = MI.getOperand(1).getReg();
 636           if (!Register::isVirtualRegister(SrcReg)) {
 637             TII->moveToVALU(MI, MDT);
 638             break;
 639           }
 640
 641           MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
 642           unsigned SMovOp;
 643           int64_t Imm;
 644           // If we are just copying an immediate, we can replace the copy with
 645           // s_mov_b32.
 646           if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
 647             MI.getOperand(1).ChangeToImmediate(Imm);
 648             MI.addImplicitDefUseOperands(MF);
 649             MI.setDesc(TII->get(SMovOp));
 650             break;
 651           }
 652           TII->moveToVALU(MI, MDT);
 653         } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
 654           tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
 655         }
 656
 657         break;
 658       }
 659       case AMDGPU::PHI: {
 660         processPHINode(MI);
 661         break;
 662       }
 663       case AMDGPU::REG_SEQUENCE:
 664         if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
 665             !hasVectorOperands(MI, TRI)) {
 666           foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI);
 667           continue;
 668         }
 669
 670         LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
 671
 672         TII->moveToVALU(MI, MDT);
 673         break;
 674       case AMDGPU::INSERT_SUBREG: {
 675         const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
 676         DstRC = MRI->getRegClass(MI.getOperand(0).getReg());
 677         Src0RC = MRI->getRegClass(MI.getOperand(1).getReg());
 678         Src1RC = MRI->getRegClass(MI.getOperand(2).getReg());
 679         if (TRI->isSGPRClass(DstRC) &&
 680             (TRI->hasVectorRegisters(Src0RC) ||
 681              TRI->hasVectorRegisters(Src1RC))) {
 682           LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
 683           TII->moveToVALU(MI, MDT);
 684         }
 685         break;
 686       }
 687       case AMDGPU::V_WRITELANE_B32: {
 688         // Some architectures allow more than one constant bus access without
 689         // SGPR restriction
 690         if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
 691           break;
 692
 693         // Writelane is special in that it can use SGPR and M0 (which would
 694         // normally count as using the constant bus twice - but in this case it
 695         // is allowed since the lane selector doesn't count as a use of the
 696         // constant bus). However, it is still required to abide by the 1 SGPR
 697         // rule. Apply a fix here as we might have multiple SGPRs after
 698         // legalizing VGPRs to SGPRs
 699         int Src0Idx =
 700             AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
 701         int Src1Idx =
 702             AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
 703         MachineOperand &Src0 = MI.getOperand(Src0Idx);
 704         MachineOperand &Src1 = MI.getOperand(Src1Idx);
 705
 706         // Check to see if the instruction violates the 1 SGPR rule
 707         if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) &&
 708              Src0.getReg() != AMDGPU::M0) &&
 709             (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) &&
 710              Src1.getReg() != AMDGPU::M0)) {
 711
 712           // Check for trivially easy constant prop into one of the operands
 713           // If this is the case then perform the operation now to resolve SGPR
 714           // issue. If we don't do that here we will always insert a mov to m0
 715           // that can't be resolved in later operand folding pass
 716           bool Resolved = false;
 717           for (MachineOperand *MO : {&Src0, &Src1}) {
 718             if (Register::isVirtualRegister(MO->getReg())) {
 719               MachineInstr *DefMI = MRI->getVRegDef(MO->getReg());
 720               if (DefMI && TII->isFoldableCopy(*DefMI)) {
 721                 const MachineOperand &Def = DefMI->getOperand(0);
 722                 if (Def.isReg() &&
 723                     MO->getReg() == Def.getReg() &&
 724                     MO->getSubReg() == Def.getSubReg()) {
 725                   const MachineOperand &Copied = DefMI->getOperand(1);
 726                   if (Copied.isImm() &&
 727                       TII->isInlineConstant(APInt(64, Copied.getImm(), true))) {
 728                     MO->ChangeToImmediate(Copied.getImm());
 729                     Resolved = true;
 730                     break;
 731                   }
 732                 }
 733               }
 734             }
 735           }
 736
 737           if (!Resolved) {
 738             // Haven't managed to resolve by replacing an SGPR with an immediate
 739             // Move src1 to be in M0
 740             BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
 741                     TII->get(AMDGPU::COPY), AMDGPU::M0)
 742                 .add(Src1);
 743             Src1.ChangeToRegister(AMDGPU::M0, false);
 744           }
 745         }
 746         break;
 747       }
 748       }
 749     }
 750   }
 751
 752   if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
 753     hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);
 754
 755   return true;
 756 }
 757
 758 void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
 759   unsigned numVGPRUses = 0;
 760   bool AllAGPRUses = true;
 761   SetVector<const MachineInstr *> worklist;
 762   SmallSet<const MachineInstr *, 4> Visited;
 763   worklist.insert(&MI);
 764   Visited.insert(&MI);
 765   while (!worklist.empty()) {
 766     const MachineInstr *Instr = worklist.pop_back_val();
 767     unsigned Reg = Instr->getOperand(0).getReg();
 768     for (const auto &Use : MRI->use_operands(Reg)) {
 769       const MachineInstr *UseMI = Use.getParent();
 770       AllAGPRUses &= (UseMI->isCopy() &&
 771                       TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) ||
 772                      TRI->isAGPR(*MRI, Use.getReg());
 773       if (UseMI->isCopy() || UseMI->isRegSequence()) {
 774         if (UseMI->isCopy() &&
 775           UseMI->getOperand(0).getReg().isPhysical() &&
 776           !TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) {
 777           numVGPRUses++;
 778         }
 779         if (Visited.insert(UseMI).second)
 780           worklist.insert(UseMI);
 781
 782         continue;
 783       }
 784
 785       if (UseMI->isPHI()) {
 786         const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg());
 787         if (!TRI->isSGPRReg(*MRI, Use.getReg()) &&
 788           UseRC != &AMDGPU::VReg_1RegClass)
 789           numVGPRUses++;
 790         continue;
 791       }
 792
 793       const TargetRegisterClass *OpRC =
 794         TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use));
 795       if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass &&
 796         OpRC != &AMDGPU::VS_64RegClass) {
 797         numVGPRUses++;
 798       }
 799     }
 800   }
 801
 802   Register PHIRes = MI.getOperand(0).getReg();
 803   const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
 804   if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) {
 805     LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
 806     MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
 807   }
 808
 809   bool hasVGPRInput = false;
 810   for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
 811     unsigned InputReg = MI.getOperand(i).getReg();
 812     MachineInstr *Def = MRI->getVRegDef(InputReg);
 813     if (TRI->isVectorRegister(*MRI, InputReg)) {
 814       if (Def->isCopy()) {
 815         unsigned SrcReg = Def->getOperand(1).getReg();
 816         const TargetRegisterClass *RC =
 817           TRI->getRegClassForReg(*MRI, SrcReg);
 818         if (TRI->isSGPRClass(RC))
 819           continue;
 820       }
 821       hasVGPRInput = true;
 822       break;
 823     }
 824     else if (Def->isCopy() &&
 825       TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) {
 826       hasVGPRInput = true;
 827       break;
 828     }
 829   }
 830
 831   if ((!TRI->isVectorRegister(*MRI, PHIRes) &&
 832        RC0 != &AMDGPU::VReg_1RegClass) &&
 833     (hasVGPRInput || numVGPRUses > 1)) {
 834     LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
 835     TII->moveToVALU(MI);
 836   }
 837   else {
 838     LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
 839     TII->legalizeOperands(MI, MDT);
 840   }
 841
 842 }