1 //=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
9 // The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
10 // operand. If any of the use instructions cannot be combined with the mov, the
11 // whole sequence is reverted.
14 // $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
15 // dpp_controls..., $bound_ctrl
16 // $res = VALU $dpp_value, ...
20 // $res = VALU_DPP $folded_old, $vgpr_to_be_read_from_other_lane, ...,
21 // dpp_controls..., $folded_bound_ctrl
25 // $bound_ctrl is DPP_BOUND_ZERO, $old is any
26 // $bound_ctrl is DPP_BOUND_OFF, $old is 0
28 // -> $folded_old = undef, $folded_bound_ctrl = DPP_BOUND_ZERO
29 // $bound_ctrl is DPP_BOUND_OFF, $old is undef
31 // -> $folded_old = undef, $folded_bound_ctrl = DPP_BOUND_OFF
32 // $bound_ctrl is DPP_BOUND_OFF, $old is foldable
34 // -> $folded_old = folded value, $folded_bound_ctrl = DPP_BOUND_OFF
35 //===----------------------------------------------------------------------===//
38 #include "AMDGPUSubtarget.h"
39 #include "SIInstrInfo.h"
40 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
41 #include "llvm/ADT/SmallVector.h"
42 #include "llvm/ADT/Statistic.h"
43 #include "llvm/CodeGen/MachineBasicBlock.h"
44 #include "llvm/CodeGen/MachineFunction.h"
45 #include "llvm/CodeGen/MachineFunctionPass.h"
46 #include "llvm/CodeGen/MachineInstr.h"
47 #include "llvm/CodeGen/MachineInstrBuilder.h"
48 #include "llvm/CodeGen/MachineOperand.h"
49 #include "llvm/CodeGen/MachineRegisterInfo.h"
50 #include "llvm/CodeGen/TargetRegisterInfo.h"
51 #include "llvm/Pass.h"
// Debug type string; used by LLVM_DEBUG output below and as the pass's
// command-line name in INITIALIZE_PASS.
56 #define DEBUG_TYPE "gcn-dpp-combine"
// Counts V_MOV_B32_dpp moves successfully folded into their VALU users.
58 STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");
// Machine-function pass that folds V_MOV_B32_dpp instructions into their
// VALU users as a DPP src0 operand (see the file header for the patterns).
// NOTE(review): the original line numbers embedded in this listing are
// non-contiguous -- some declaration lines are not visible here.
62 class GCNDPPCombine : public MachineFunctionPass {
// Per-function cached pointers, set up in runOnMachineFunction.
63 MachineRegisterInfo *MRI;
64 const SIInstrInfo *TII;
66 using RegSubRegPair = TargetInstrInfo::RegSubRegPair;
// Resolves the mov's 'old' operand to an immediate, nullptr (undef), or the
// operand itself; see the definition below for the exact contract.
68 MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;
// Tries to fold an immediate 'old' value for OrigMI; an empty pair
// (Reg == 0) signals that the immediate cannot be folded.
70 RegSubRegPair foldOldOpnd(MachineInstr &OrigMI,
71 RegSubRegPair OldOpndVGPR,
72 MachineOperand &OldOpndValue) const;
// Overload that first folds the 'old' immediate (when bound_ctrl is off)
// and then delegates to the builder overload below.
74 MachineInstr *createDPPInst(MachineInstr &OrigMI,
76 RegSubRegPair OldOpndVGPR,
77 MachineOperand *OldOpnd,
78 bool BoundCtrlZero) const;
// Builds the DPP form of OrigMI; callers treat a nullptr result as failure.
80 MachineInstr *createDPPInst(MachineInstr &OrigMI,
82 RegSubRegPair OldOpndVGPR,
83 bool BoundCtrlZero) const;
// True if MI lacks the named immediate operand, or its masked value matches.
85 bool hasNoImmOrEqual(MachineInstr &MI,
88 int64_t Mask = -1) const;
// Attempts the whole combine for one V_MOV_B32_dpp; true on success.
90 bool combineDPPMov(MachineInstr &MI) const;
95 GCNDPPCombine() : MachineFunctionPass(ID) {
96 initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry());
99 bool runOnMachineFunction(MachineFunction &MF) override;
101 StringRef getPassName() const override { return "GCN DPP Combine"; }
// The pass only rewrites instructions in place; the CFG is untouched.
103 void getAnalysisUsage(AnalysisUsage &AU) const override {
104 AU.setPreservesCFG();
105 MachineFunctionPass::getAnalysisUsage(AU);
109 } // end anonymous namespace
111 INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false)
113 char GCNDPPCombine::ID = 0;
115 char &llvm::GCNDPPCombineID = GCNDPPCombine::ID;
117 FunctionPass *llvm::createGCNDPPCombinePass() {
118 return new GCNDPPCombine();
121 static int getDPPOp(unsigned Op) {
122 auto DPP32 = AMDGPU::getDPPOp32(Op);
126 auto E32 = AMDGPU::getVOPe32(Op);
127 return E32 != -1 ? AMDGPU::getDPPOp32(E32) : -1;
130 // tracks the register operand definition and returns:
131 // 1. immediate operand used to initialize the register if found
132 // 2. nullptr if the register operand is undef
133 // 3. the operand itself otherwise
134 MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
// Walk to the SSA defining instruction of the operand's virtual register.
135 auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
// NOTE(review): lines are missing from this listing between the def lookup
// and the switch (numbering jumps 135 -> 139); any null-Def handling and the
// case bodies/returns are not visible here.
139 switch(Def->getOpcode()) {
// IMPLICIT_DEF means the register is undef (case 2 above).
141 case AMDGPU::IMPLICIT_DEF:
// A plain 32-bit move: inspect its source operand (case 1 when immediate).
144 case AMDGPU::V_MOV_B32_e32: {
145 auto &Op1 = Def->getOperand(1);
// Builds the DPP-encoded form of OrigMI in place (right before OrigMI),
// taking the DPP control operands (dpp_ctrl/row_mask/bank_mask) from MovMI
// and the 'old' operand from OldOpndVGPR.
// NOTE(review): this listing is missing lines (numbering is non-contiguous);
// in particular the MovMI parameter line, several early returns, and closing
// braces are not visible. Callers treat a nullptr result as failure.
154 MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
156 RegSubRegPair OldOpndVGPR,
157 bool BoundCtrlZero) const {
// MovMI must be the DPP mov whose result feeds OrigMI's src0.
158 assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
159 assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
160 TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());
// Map OrigMI's opcode to its DPP variant (bails out if there is none).
162 auto OrigOp = OrigMI.getOpcode();
163 auto DPPOp = getDPPOp(OrigOp);
165 LLVM_DEBUG(dbgs() << "  failed: no DPP opcode\n");
// Start building the replacement instruction.
169 auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
170 OrigMI.getDebugLoc(), TII->get(DPPOp));
173 auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
// The 'old' operand must come next in the DPP encoding and be a VGPR_32.
178 const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
180 assert(OldIdx == NumOperands);
181 assert(isOfRegClass(OldOpndVGPR, AMDGPU::VGPR_32RegClass, *MRI));
182 DPPInst.addReg(OldOpndVGPR.Reg, 0, OldOpndVGPR.SubReg);
// Copy src0 modifiers from OrigMI; the assert shows only abs|neg may be set.
186 if (auto *Mod0 = TII->getNamedOperand(OrigMI,
187 AMDGPU::OpName::src0_modifiers)) {
188 assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
189 AMDGPU::OpName::src0_modifiers));
190 assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
191 DPPInst.addImm(Mod0->getImm());
// src0 of the new instruction is the mov's source (the cross-lane value).
194 auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
196 if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
197 LLVM_DEBUG(dbgs() << "  failed: src0 is illegal\n");
// Same treatment for src1 and its modifiers, taken from OrigMI.
204 if (auto *Mod1 = TII->getNamedOperand(OrigMI,
205 AMDGPU::OpName::src1_modifiers)) {
206 assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
207 AMDGPU::OpName::src1_modifiers));
208 assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
209 DPPInst.addImm(Mod1->getImm());
212 if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
213 if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
214 LLVM_DEBUG(dbgs() << "  failed: src1 is illegal\n");
// src2, if OrigMI has one, is copied when legal for the DPP encoding.
222 if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
223 if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
224 LLVM_DEBUG(dbgs() << "  failed: src2 is illegal\n");
// Append the DPP control operands from the mov, plus the caller-computed
// bound_ctrl immediate.
231 DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
232 DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
233 DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
234 DPPInst.addImm(BoundCtrlZero ? 1 : 0);
// Failure path: destroy the partially-built instruction.
238 DPPInst.getInstr()->eraseFromParent();
// Success: report and return the new DPP instruction.
241 LLVM_DEBUG(dbgs() << "  combined:  " << *DPPInst.getInstr());
242 return DPPInst.getInstr();
// Decides whether an immediate 'old' value can be folded away for OrigMI's
// opcode. A default-constructed (empty) RegSubRegPair signals "cannot fold";
// the caller checks Reg for that.
// NOTE(review): several case bodies/returns fall on lines not visible in
// this listing (original numbering is non-contiguous).
245 GCNDPPCombine::RegSubRegPair
246 GCNDPPCombine::foldOldOpnd(MachineInstr &OrigMI,
247 RegSubRegPair OldOpndVGPR,
248 MachineOperand &OldOpndValue) const {
// Only immediate 'old' values reach this point (enforced by the caller).
249 assert(OldOpndValue.isImm());
250 switch (OrigMI.getOpcode()) {
// For min/max the immediate is compared against the type's extreme value --
// the value such an operation would propagate unchanged.
252 case AMDGPU::V_MAX_U32_e32:
253 if (OldOpndValue.getImm() == std::numeric_limits<uint32_t>::max())
256 case AMDGPU::V_MAX_I32_e32:
257 if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::max())
260 case AMDGPU::V_MIN_I32_e32:
261 if (OldOpndValue.getImm() == std::numeric_limits<int32_t>::min())
265 case AMDGPU::V_MUL_I32_I24_e32:
266 case AMDGPU::V_MUL_U32_U24_e32:
// old == 1: in masked-off lanes the original sequence computed 1 * src1, so
// src1 itself becomes the folded 'old' register.
267 if (OldOpndValue.getImm() == 1) {
268 auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
269 assert(Src1 && Src1->isReg());
270 return getRegSubRegPair(*Src1);
// No folding rule matched: report failure to the caller.
274 return RegSubRegPair();
// Normalizes the 'old' operand for the cases enumerated below, then
// delegates to the createDPPInst overload that actually builds the
// instruction.
278 // $bound_ctrl is DPP_BOUND_ZERO, $old is any
279 // $bound_ctrl is DPP_BOUND_OFF, $old is 0
280 // -> $old = undef, $bound_ctrl = DPP_BOUND_ZERO
282 // $bound_ctrl is DPP_BOUND_OFF, $old is undef
283 // -> $old = undef, $bound_ctrl = DPP_BOUND_OFF
285 // $bound_ctrl is DPP_BOUND_OFF, $old is foldable
286 // -> $old = folded value, $bound_ctrl = DPP_BOUND_OFF
288 MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
290 RegSubRegPair OldOpndVGPR,
291 MachineOperand *OldOpndValue,
292 bool BoundCtrlZero) const {
// The caller guarantees a concrete 'old' register (combineDPPMov
// materializes an IMPLICIT_DEF for the undef case first).
293 assert(OldOpndVGPR.Reg);
// With bound_ctrl off and a known immediate 'old', try to fold it away.
294 if (!BoundCtrlZero && OldOpndValue) {
295 assert(OldOpndValue->isImm());
296 OldOpndVGPR = foldOldOpnd(OrigMI, OldOpndVGPR, *OldOpndValue);
// An empty pair from foldOldOpnd means the immediate is not foldable.
297 if (!OldOpndVGPR.Reg) {
298 LLVM_DEBUG(dbgs() << "  failed: old immediate cannot be folded\n");
302 return createDPPInst(OrigMI, MovMI, OldOpndVGPR, BoundCtrlZero);
305 // returns true if MI doesn't have OpndName immediate operand or the
307 bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
308 int64_t Value, int64_t Mask) const {
309 auto *Imm = TII->getNamedOperand(MI, OpndName);
313 assert(Imm->isImm());
314 return (Imm->getImm() & Mask) == Value;
// Top-level transform for one V_MOV_B32_dpp: tries to fold the mov into
// every VALU use as a DPP src0. The transform is all-or-nothing -- if any
// use cannot be combined, the newly built instructions are rolled back;
// otherwise the originals are erased.
// NOTE(review): this listing is missing lines (original numbering is
// non-contiguous), so several early returns, else-arms and closing braces
// are not visible here.
317 bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
318 assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
// Read the mov's bound_ctrl immediate (nonzero == DPP_BOUND_ZERO).
319 auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
320 assert(BCZOpnd && BCZOpnd->isImm());
321 bool BoundCtrlZero = 0 != BCZOpnd->getImm();
323 LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
// Classify the 'old' operand: undef, constant immediate, or arbitrary reg.
325 auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
326 assert(OldOpnd && OldOpnd->isReg());
327 auto OldOpndVGPR = getRegSubRegPair(*OldOpnd);
328 auto *OldOpndValue = getOldOpndValue(*OldOpnd);
// getOldOpndValue yields nullptr (undef), an immediate, or OldOpnd itself.
329 assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);
// Undef 'old': ignore it (the guarding condition is on a non-visible line).
332 OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef, ignore old opnd
333 OldOpndValue = nullptr;
// A non-immediate, non-undef 'old' value cannot be combined.
335 if (!OldOpndValue->isImm()) {
336 LLVM_DEBUG(dbgs() << "  failed: old operand isn't an imm or undef\n");
// old == 0 is equivalent to undef with bound_ctrl:0 (see file header,
// "$bound_ctrl is DPP_BOUND_OFF, $old is 0" case).
339 if (OldOpndValue->getImm() == 0) {
340 OldOpndVGPR.Reg = AMDGPU::NoRegister; // should be undef
341 OldOpndValue = nullptr;
342 BoundCtrlZero = true;
347 LLVM_DEBUG(dbgs() << "  old=";
351 dbgs() << OldOpndValue->getImm();
352 dbgs() << ", bound_ctrl=" << BoundCtrlZero << '\n');
// DPPMIs collects newly created instructions (erased on rollback);
// OrigMIs collects the replaced originals (erased on success).
354 std::vector<MachineInstr*> OrigMIs, DPPMIs;
// The DPP encoding always needs an 'old' register: materialize an undef one.
355 if (!OldOpndVGPR.Reg) { // OldOpndVGPR = undef
356 OldOpndVGPR = RegSubRegPair(
357 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
358 auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
359 TII->get(AMDGPU::IMPLICIT_DEF), OldOpndVGPR.Reg);
360 DPPMIs.push_back(UndefInst.getInstr());
363 OrigMIs.push_back(&MovMI);
364 bool Rollback = true;
// Visit every non-debug use of the mov's result register.
365 for (auto &Use : MRI->use_nodbg_operands(
366 TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg())) {
369 auto &OrigMI = *Use.getParent();
370 auto OrigOp = OrigMI.getOpcode();
// A VOP3 user is only eligible if it can shrink to an e32 encoding and
// carries no modifiers beyond abs/neg (opsel, clamp, omod disqualify it).
371 if (TII->isVOP3(OrigOp)) {
372 if (!TII->hasVALU32BitEncoding(OrigOp)) {
373 LLVM_DEBUG(dbgs() << "  failed: VOP3 hasn't e32 equivalent\n");
376 // check if other than abs|neg modifiers are set (opsel for example)
377 const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
378 if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
379 !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
380 !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) ||
381 !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) {
382 LLVM_DEBUG(dbgs() << "  failed: VOP3 has non-default modifiers\n");
385 } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) {
386 LLVM_DEBUG(dbgs() << "  failed: not VOP1/2/3\n");
390 LLVM_DEBUG(dbgs() << "  combining: " << OrigMI);
// The mov result feeding src0 is the directly combinable case.
391 if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
392 if (auto *DPPInst = createDPPInst(OrigMI, MovMI, OldOpndVGPR,
393 OldOpndValue, BoundCtrlZero)) {
394 DPPMIs.push_back(DPPInst);
// If it feeds src1 instead, clone the user, commute the clone, and retry;
// the clone is discarded if commuting fails.
397 } else if (OrigMI.isCommutable() &&
398 &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
399 auto *BB = OrigMI.getParent();
400 auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
401 BB->insert(OrigMI, NewMI);
402 if (TII->commuteInstruction(*NewMI)) {
403 LLVM_DEBUG(dbgs() << "  commuted:  " << *NewMI);
404 if (auto *DPPInst = createDPPInst(*NewMI, MovMI, OldOpndVGPR,
405 OldOpndValue, BoundCtrlZero)) {
406 DPPMIs.push_back(DPPInst);
410 LLVM_DEBUG(dbgs() << "  failed: cannot be commuted\n");
411 NewMI->eraseFromParent();
413 LLVM_DEBUG(dbgs() << "  failed: no suitable operands\n");
416 OrigMIs.push_back(&OrigMI);
// All-or-nothing cleanup: erase either the new DPP instructions (rollback)
// or the now-dead originals.
419 for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs))
420 MI->eraseFromParent();
425 bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
426 auto &ST = MF.getSubtarget<GCNSubtarget>();
427 if (!ST.hasDPP() || skipFunction(MF.getFunction()))
430 MRI = &MF.getRegInfo();
431 TII = ST.getInstrInfo();
433 assert(MRI->isSSA() && "Must be run on SSA");
435 bool Changed = false;
436 for (auto &MBB : MF) {
437 for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) {
439 if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
441 ++NumDPPMovsCombined;