//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//
#include "AMDGPU.h"
#include "AMDGPUMCInstLower.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
public:
  static char ID;

  SIShrinkInstructions() : MachineFunctionPass(ID) {
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

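/// \returns true if \p MO is a register operand whose register class contains
/// VGPRs.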
static bool isVGPR(const MachineOperand *MO, const SIRegisterInfo &TRI,
                   const MachineRegisterInfo &MRI) {
  if (!MO->isReg())
    return false;

  if (TargetRegisterInfo::isVirtualRegister(MO->getReg()))
    return TRI.hasVGPRs(MRI.getRegClass(MO->getReg()));

  return TRI.hasVGPRs(TRI.getPhysRegClass(MO->getReg()));
}

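/// \returns true if the source operands and modifiers of \p MI are compatible
/// with the 32-bit encoding.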
static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
                      const SIRegisterInfo &TRI,
                      const MachineRegisterInfo &MRI) {

  const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  // Can't shrink instruction with three operands.
  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
  // a special case for it. It can only be shrunk if the third operand
  // is vcc. We should handle this the same way we handle vopc, by adding
  // a register allocation hint pre-regalloc and then do the shrinking
  // post-regalloc.
  if (Src2) {
    switch (MI.getOpcode()) {
      default: return false;

      case AMDGPU::V_ADDC_U32_e64:
      case AMDGPU::V_SUBB_U32_e64:
        // Additional verification is needed for sdst/src2.
        return true;

      case AMDGPU::V_MAC_F32_e64:
      case AMDGPU::V_MAC_F16_e64:
        if (!isVGPR(Src2, TRI, MRI) ||
            TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
          return false;
        break;

      case AMDGPU::V_CNDMASK_B32_e64:
        break;
    }
  }

  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  const MachineOperand *Src1Mod =
      TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

  if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0)))
    return false;

  // We don't need to check src0, all input types are legal, so just make sure
  // src0 isn't using any modifiers.
  if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
    return false;

  // Check output modifiers
  if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  return !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
}

/// \brief This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction
/// and will only fold literal constants if we are still in SSA.
static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
                           MachineRegisterInfo &MRI, bool TryToCommute = true) {

  if (!MRI.isSSA())
    return;

  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Only one literal constant is allowed per instruction, so if src0 is a
  // literal constant then we can't do any folding.
  if (TII->isLiteralConstant(MI, Src0Idx))
    return;

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {
    unsigned Reg = Src0.getReg();
    MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
    if (Def && Def->isMoveImmediate()) {
      MachineOperand &MovSrc = Def->getOperand(1);
      bool ConstantFolded = false;

      if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
                             isUInt<32>(MovSrc.getImm()))) {
        Src0.ChangeToImmediate(MovSrc.getImm());
        ConstantFolded = true;
      }

      if (ConstantFolded) {
        if (MRI.use_empty(Reg))
          Def->eraseFromParent();
        ++NumLiteralConstantsFolded;
        return;
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(MI))
    foldImmediates(MI, TII, MRI, false);
}

// Copy the kill/undef flags from \p Orig onto the implicit vcc use of \p MI,
// since the explicit operand has been replaced by an implicit one.
static void copyFlagsToImplicitVCC(MachineInstr &MI,
                                   const MachineOperand &Orig) {

  for (MachineOperand &Use : MI.implicit_operands()) {
    if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
      Use.setIsUndef(Orig.isUndef());
      Use.setIsKill(Orig.isKill());
      return;
    }
  }
}

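/// \returns true if \p Src is a 16-bit signed immediate that is not already
/// encodable as an inline constant.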
static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isInt<16>(Src.getImm()) &&
    !TII->isInlineConstant(*Src.getParent(),
                           Src.getParent()->getOperandNo(&Src));
}

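/// \returns true if \p Src is a 16-bit unsigned immediate that is not already
/// encodable as an inline constant.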
static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isUInt<16>(Src.getImm()) &&
    !TII->isInlineConstant(*Src.getParent(),
                           Src.getParent()->getOperandNo(&Src));
}

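/// \returns true if \p Src fits in 16 bits as either a signed or an unsigned
/// immediate and is not an inline constant. \p IsUnsigned reports which
/// interpretation applies.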
static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
                                 const MachineOperand &Src,
                                 bool &IsUnsigned) {
  if (isInt<16>(Src.getImm())) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
static bool isReverseInlineImm(const SIInstrInfo *TII,
                               const MachineOperand &Src,
                               int32_t &ReverseImm) {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from specified instruction to this
/// instruction that are not part of the instruction definition.
static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
                                 const MachineInstr &MI) {
  for (unsigned i = MI.getDesc().getNumOperands() +
         MI.getDesc().getNumImplicitUses() +
         MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

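/// Rewrite an SOPC compare to the corresponding SOPK form (s_cmpk_*) when its
/// constant operand fits in 16 bits.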
static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  const MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
      }
      MI.setDesc(TII->get(SOPKOpc));
    }
    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
    MI.setDesc(NewDesc);
  }
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
          int32_t ReverseImm;
          if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      // Combine adjacent s_nops to use the immediate operand encoding how
      // long to wait.
      //
      // s_nop N + s_nop M => s_nop (N + M + 1), if N + M + 1 < 8.
      if (MI.getOpcode() == AMDGPU::S_NOP &&
          Next != MBB.end() &&
          (*Next).getOpcode() == AMDGPU::S_NOP) {

        MachineInstr &NextMI = *Next;
        // The instruction encodes the amount to wait with an offset of 1,
        // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back
        // after adding.
        uint8_t Nop0 = MI.getOperand(0).getImm() + 1;
        uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1;

        // Make sure we don't overflow the bounds.
        if (Nop0 + Nop1 <= 8) {
          NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1);
          MI.eraseFromParent();
        }

        continue;
      }

      // FIXME: We also need to consider movs of constant operands since
      // immediate operands are not folded if they have more than one use, and
      // the operand folding pass is unaware if the immediate will be free since
      // it won't know if the src == dest constraint will end up being
      // satisfied.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
            Src0->isReg()) {
          MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(TII, MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() &&
            TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) {
          int32_t ReverseImm;
          if (isKImmOperand(TII, Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

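      // Everything from here on deals with shrinking 64-bit VALU encodings
      // to their 32-bit (e32) forms.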
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!canShrink(MI, TII, TRI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !canShrink(MI, TII, TRI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        unsigned DstReg = MI.getOperand(0).getReg();
        if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because this is only one register and
          // cannot deal with sequences which would require multiple copies of
          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, AMDGPU::VCC);
          continue;
        }
        if (DstReg != AMDGPU::VCC)
          continue;
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        unsigned SReg = Src2->getReg();
        if (TargetRegisterInfo::isVirtualRegister(SReg)) {
          MRI.setRegAllocationHint(SReg, 0, AMDGPU::VCC);
          continue;
        }
        if (SReg != AMDGPU::VCC)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      // Check the carry-in operand for v_addc_u32_e64.
      const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::src2);

      if (SDst) {
        if (SDst->getReg() != AMDGPU::VCC) {
          if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
            MRI.setRegAllocationHint(SDst->getReg(), 0, AMDGPU::VCC);
          continue;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        if (Src2 && Src2->getReg() != AMDGPU::VCC) {
          if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
            MRI.setRegAllocationHint(Src2->getReg(), 0, AMDGPU::VCC);

          continue;
        }
      }

      // We can shrink this instruction
      DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstrBuilder Inst32 =
          BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32));

      // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
      // For VOPC instructions, this is replaced by an implicit def of vcc.
      int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
      if (Op32DstIdx != -1) {
        // dst
        Inst32.add(MI.getOperand(0));
      } else {
        assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
               "Unexpected case");
      }

      Inst32.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));

      const MachineOperand *Src1 =
          TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      if (Src1)
        Inst32.add(*Src1);

      if (Src2) {
        int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
        if (Op32Src2Idx != -1) {
          Inst32.add(*Src2);
        } else {
          // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
          // replaced with an implicit read of vcc. This was already added
          // during the initial BuildMI, so find it to preserve the flags.
          copyFlagsToImplicitVCC(*Inst32, *Src2);
        }
      }

      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MF, MI);

      MI.eraseFromParent();
      foldImmediates(*Inst32, TII, MRI);

      DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}