//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
13 #include "AMDGPUMCInstLower.h"
14 #include "AMDGPUSubtarget.h"
15 #include "SIInstrInfo.h"
16 #include "llvm/ADT/Statistic.h"
17 #include "llvm/CodeGen/MachineFunctionPass.h"
18 #include "llvm/CodeGen/MachineInstrBuilder.h"
19 #include "llvm/CodeGen/MachineRegisterInfo.h"
20 #include "llvm/IR/Constants.h"
21 #include "llvm/IR/Function.h"
22 #include "llvm/IR/LLVMContext.h"
23 #include "llvm/Support/Debug.h"
24 #include "llvm/Support/raw_ostream.h"
25 #include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
public:
  static char ID;

  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI,
                   const MachineRegisterInfo &MRI) {
  if (!MO->isReg())
    return false;

  if (TargetRegisterInfo::isVirtualRegister(MO->getReg()))
    return TRI.hasVGPRs(MRI.getRegClass(MO->getReg()));

  return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg()));
}
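
/// \returns true if \p MI can be rewritten to its 32-bit (e32) encoding given
/// its operands: no unsupported src2 use, a VGPR (or absent) src1, and no
/// source or output modifiers set.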
static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
                      const SIRegisterInfo &TRI,
                      const MachineRegisterInfo &MRI) {
  const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  // Can't shrink instruction with three operands.
  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
  // a special case for it. It can only be shrunk if the third operand
  // is vcc. We should handle this the same way we handle vopc, by adding
  // a register allocation hint pre-regalloc and then do the shrinking
  // post-regalloc.
  if (Src2) {
    switch (MI.getOpcode()) {
    default: return false;

    case AMDGPU::V_ADDC_U32_e64:
    case AMDGPU::V_SUBB_U32_e64:
      if (TII->getNamedOperand(MI, AMDGPU::OpName::src1)->isImm())
        return false;
      // Additional verification is needed for sdst/src2.
      return true;

    case AMDGPU::V_MAC_F32_e64:
    case AMDGPU::V_MAC_F16_e64:
      if (!isVGPR(Src2, TRI, MRI) ||
          TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
        return false;
      break;

    case AMDGPU::V_CNDMASK_B32_e64:
      break;
    }
  }

  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1 && (!isVGPR(Src1, TRI, MRI) ||
               TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
    return false;

  // We don't need to check src0, all input types are legal, so just make sure
  // src0 isn't using any modifiers.
  if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
    return false;

  // Check output modifiers.
  return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) &&
         !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
}

/// \brief This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
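/// For example (a sketch), given a literal with a single use:
///   %vgpr1 = V_MOV_B32_e32 0x11213141
///   %vgpr0 = V_ADD_F32_e32 %vgpr1, %vgpr2
/// the literal is folded into src0 and the mov erased:
///   %vgpr0 = V_ADD_F32_e32 0x11213141, %vgpr2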
static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
                           MachineRegisterInfo &MRI, bool TryToCommute = true) {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::src0);

  // Try to fold Src0.
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    unsigned Reg = Src0.getReg();
    if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
      MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
                               isUInt<32>(MovSrc.getImm()))) {
          // It's possible to have only one component of a super-reg defined by
          // a single mov, so we need to clear any subregister flag.
          Src0.setSubReg(0);
          Src0.ChangeToImmediate(MovSrc.getImm());
          ConstantFolded = true;
        } else if (MovSrc.isFI()) {
          Src0.setSubReg(0);
          Src0.ChangeToFrameIndex(MovSrc.getIndex());
          ConstantFolded = true;
        }

        if (ConstantFolded) {
          assert(MRI.use_empty(Reg));
          Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, TII, MRI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

// Copy MachineOperand with all flags except setting it as implicit.
static void copyFlagsToImplicitVCC(MachineInstr &MI,
                                   const MachineOperand &Orig) {
  for (MachineOperand &Use : MI.implicit_operands()) {
    if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
      Use.setIsUndef(Orig.isUndef());
      Use.setIsKill(Orig.isKill());
      return;
    }
  }
}
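
// SOPK-style "K" immediates are 16-bit fields carried directly in the
// instruction encoding; using them only pays off when the value is not
// already free as an inline constant.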
static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isInt<16>(Src.getImm()) &&
    !TII->isInlineConstant(*Src.getParent(),
                           Src.getParent()->getOperandNo(&Src));
}

static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isUInt<16>(Src.getImm()) &&
    !TII->isInlineConstant(*Src.getParent(),
                           Src.getParent()->getOperandNo(&Src));
}

static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
                                 const MachineOperand &Src,
                                 bool &IsUnsigned) {
  if (isInt<16>(Src.getImm())) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
static bool isReverseInlineImm(const SIInstrInfo *TII,
                               const MachineOperand &Src,
                               int32_t &ReverseImm) {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
                                 const MachineInstr &MI) {
  for (unsigned i = MI.getDesc().getNumOperands() +
         MI.getDesc().getNumImplicitUses() +
         MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}
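
// Shrink a 32-bit scalar compare into its SOPK form, which folds the literal
// into an imm16 field. For example (a sketch):
//   s_cmp_lg_u32 s0, 0x1234   ; 32-bit literal follows the instruction
// becomes
//   s_cmpk_lg_u32 s0, 0x1234  ; imm16 carried in the encoding, 4 bytes shorter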
static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  const MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
    MI.setDesc(NewDesc);
  }
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of
        // materializing sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
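        // For example (a sketch): the sign-bit mask 0x80000000 is a 32-bit
        // literal, but its bit reversal is 1, an inline immediate, so
        //   v_mov_b32_e32 v0, 0x80000000  ->  v_bfrev_b32_e32 v0, 1
        // produces the same value and saves 4 bytes.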
        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
          int32_t ReverseImm;
          if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      // Combine adjacent s_nops to use the immediate operand encoding how
      // long to wait.
      //
      // s_nop N
      // s_nop M
      //  =>
      // s_nop (N + M)
      if (MI.getOpcode() == AMDGPU::S_NOP &&
          Next != MBB.end() &&
          (*Next).getOpcode() == AMDGPU::S_NOP) {

        MachineInstr &NextMI = *Next;
        // The instruction encodes the amount to wait with an offset of 1,
        // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
        // to use the encoded value.
        uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
        uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;

        // Make sure we don't overflow the bounds.
        if (Nop0 + Nop1 <= 8) {
          NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
          MI.eraseFromParent();
        }

        continue;
      }

      // FIXME: We also need to consider movs of constant operands since
      // immediate operands are not folded if they have more than one use, and
      // the operand folding pass is unaware if the immediate will be free since
      // it won't know if the src == dest constraint will end up being
      // satisfied.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
            Src0->isReg()) {
          MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }
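
        // For example (a sketch): when dst and src0 are the same register,
        //   s_add_i32 s0, s0, 0x1234  ->  s_addk_i32 s0, 0x1234
        // folds the literal into the imm16 field of the SOPK encoding.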
        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*.
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(TII, MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) {
          int32_t ReverseImm;
          if (isKImmOperand(TII, Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!canShrink(MI, TII, TRI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !canShrink(MI, TII, TRI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        unsigned DstReg = MI.getOperand(0).getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because this is only one register and
          // cannot deal with sequences which would require multiple copies of
          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
          continue;
        }
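
        // Post-RA, for example (a sketch):
        //   v_cmp_lt_f32_e64 vcc, v0, v1  ->  v_cmp_lt_f32_e32 v0, v1
        // where the e32 form defines vcc implicitly; a compare writing any
        // other SGPR pair cannot be shrunk.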
        if (DstReg != AMDGPU::VCC)
          continue;
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        unsigned SReg = Src2->getReg();
        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
          MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
          continue;
        }
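
        // Once src2 has landed in vcc, for example (a sketch):
        //   v_cndmask_b32_e64 v0, v1, v2, vcc -> v_cndmask_b32_e32 v0, v1, v2
        // with the vcc read implicit in the e32 encoding.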
        if (SReg != AMDGPU::VCC)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      // Check the carry-in operand for v_addc_u32_e64.
      const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::src2);

      if (SDst) {
        if (SDst->getReg() != AMDGPU::VCC) {
          if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
            MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC);
          continue;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        if (Src2 && Src2->getReg() != AMDGPU::VCC) {
          if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
            MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC);
          continue;
        }
      }
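
      // At this point any carry-out (sdst) and carry-in (src2) operands have
      // been confirmed to be vcc, so e.g. v_addc_u32_e64 can become
      // v_addc_u32_e32, whose reads and writes of vcc are implicit in the
      // encoding (a sketch of the intent; the checks above guarantee it).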

      // We can shrink this instruction.
      DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstrBuilder Inst32 =
          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));

      // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
      // For VOPC instructions, this is replaced by an implicit def of vcc.
      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
      if (Op32DstIdx != -1) {
        // dst
        Inst32.add(MI.getOperand(0));
      } else {
        assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
               "Unexpected case");
      }

      Inst32.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));

      const MachineOperand *Src1 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      if (Src1)
        Inst32.add(*Src1);

      if (Src2) {
        int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
        if (Op32Src2Idx != -1) {
          Inst32.add(*Src2);
        } else {
          // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
          // replaced with an implicit read of vcc. This was already added
          // during the initial BuildMI, so find it to preserve the flags.
          copyFlagsToImplicitVCC(*Inst32, *Src2);
        }
      }

      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MF, MI);

      MI.eraseFromParent();
      foldImmediates(*Inst32, TII, MRI);

      DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }

  return false;
}