1 //=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
8 // The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
9 // operand. If any of the use instructions cannot be combined with the mov, the
10 // whole sequence is reverted.
13 // $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
14 // dpp_controls..., $row_mask, $bank_mask, $bound_ctrl
15 // $res = VALU $dpp_value [, src1]
19 // $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,]
20 // dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl
24 // if $row_mask and $bank_mask are fully enabled (0xF) and
25 // $bound_ctrl==DPP_BOUND_ZERO or $old==0
26 // -> $combined_old = undef,
27 // $combined_bound_ctrl = DPP_BOUND_ZERO
29 // if the VALU op is binary and
30 // $bound_ctrl==DPP_BOUND_OFF and
31 // $old==identity value (immediate) for the VALU op
32 // -> $combined_old = src1,
33 // $combined_bound_ctrl = DPP_BOUND_OFF
37 // The mov_dpp instruction should reside in the same BB as all its uses
38 //===----------------------------------------------------------------------===//
41 #include "AMDGPUSubtarget.h"
42 #include "SIInstrInfo.h"
43 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
44 #include "llvm/ADT/DenseMap.h"
45 #include "llvm/ADT/SmallVector.h"
46 #include "llvm/ADT/Statistic.h"
47 #include "llvm/CodeGen/MachineBasicBlock.h"
48 #include "llvm/CodeGen/MachineFunction.h"
49 #include "llvm/CodeGen/MachineFunctionPass.h"
50 #include "llvm/CodeGen/MachineInstr.h"
51 #include "llvm/CodeGen/MachineInstrBuilder.h"
52 #include "llvm/CodeGen/MachineOperand.h"
53 #include "llvm/CodeGen/MachineRegisterInfo.h"
54 #include "llvm/CodeGen/TargetRegisterInfo.h"
55 #include "llvm/Pass.h"
60 #define DEBUG_TYPE "gcn-dpp-combine"
62 STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");
// Machine function pass that tries to fold each V_MOV_B32_dpp into the VALU
// instructions using its result (see the description at the top of the file).
66 class GCNDPPCombine : public MachineFunctionPass {
// Both pointers are assigned at the start of runOnMachineFunction, before any
// of the helper methods below is called.
67 MachineRegisterInfo *MRI;
68 const SIInstrInfo *TII;
70 using RegSubRegPair = TargetInstrInfo::RegSubRegPair;
// Classifies the value held by the DPP mov's 'old' operand (immediate, undef
// or anything else); see the comment on the definition.
72 MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;
// Two overloads: one takes the tracked 'old' value and validates it before
// delegating to the other, which actually builds the DPP instruction.
// NOTE(review): some parameter lines of these declarations are not visible in
// this excerpt.
74 MachineInstr *createDPPInst(MachineInstr &OrigMI,
76 RegSubRegPair CombOldVGPR,
77 MachineOperand *OldOpnd,
80 MachineInstr *createDPPInst(MachineInstr &OrigMI,
82 RegSubRegPair CombOldVGPR,
// Mask defaults to -1, i.e. every bit of the immediate takes part in the
// comparison unless the caller narrows it.
85 bool hasNoImmOrEqual(MachineInstr &MI,
88 int64_t Mask = -1) const;
// Attempts the combine for one V_MOV_B32_dpp instruction.
90 bool combineDPPMov(MachineInstr &MI) const;
// Registers the pass with the global PassRegistry on construction.
95 GCNDPPCombine() : MachineFunctionPass(ID) {
96 initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry());
99 bool runOnMachineFunction(MachineFunction &MF) override;
101 StringRef getPassName() const override { return "GCN DPP Combine"; }
// Declares that the CFG is preserved, then defers to the base class.
103 void getAnalysisUsage(AnalysisUsage &AU) const override {
104 AU.setPreservesCFG();
105 MachineFunctionPass::getAnalysisUsage(AU);
// Maps an opcode to its DPP counterpart, or -1 if there is none.
109 int getDPPOp(unsigned Op) const;
112 } // end anonymous namespace
// Pass registration. The command-line name is DEBUG_TYPE ("gcn-dpp-combine")
// and the description string matches getPassName().
114 INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false)
// Pass identifier handed to the MachineFunctionPass constructor.
116 char GCNDPPCombine::ID = 0;
// Reference to the identifier above, exported in the llvm namespace.
118 char &llvm::GCNDPPCombineID = GCNDPPCombine::ID;
// Factory function: returns a newly allocated GCNDPPCombine pass instance.
120 FunctionPass *llvm::createGCNDPPCombinePass() {
121 return new GCNDPPCombine();
// Returns the DPP opcode corresponding to Op, or -1 when no usable one exists.
124 int GCNDPPCombine::getDPPOp(unsigned Op) const {
// Direct lookup first.
125 auto DPP32 = AMDGPU::getDPPOp32(Op);
// Fallback: convert Op to its e32 form and look up the DPP opcode of that.
// NOTE(review): the condition guarding this fallback is not visible in this
// excerpt - presumably it only runs when the direct lookup returned -1.
127 auto E32 = AMDGPU::getVOPe32(Op);
128 DPP32 = (E32 == -1)? -1 : AMDGPU::getDPPOp32(E32);
// A DPP pseudo for which pseudoToMCOpcode() yields -1 is also reported as -1.
130 return (DPP32 == -1 || TII->pseudoToMCOpcode(DPP32) == -1) ? -1 : DPP32;
133 // tracks the register operand definition and returns:
134 // 1. immediate operand used to initialize the register if found
135 // 2. nullptr if the register operand is undef
136 // 3. the operand itself otherwise
137 MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
// Find the instruction defining the (register, subregister) pair of OldOpnd.
138 auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
// NOTE(review): Def is dereferenced below; a null check, if any, would be on
// lines not visible in this excerpt.
142 switch(Def->getOpcode()) {
// Defined by IMPLICIT_DEF: the register is undef (result 2 above).
144 case AMDGPU::IMPLICIT_DEF:
// Defined by a plain 32-bit move: operand 1 is the moved source.
// NOTE(review): presumably it is returned only when it is an immediate
// (result 1 above); the rest of this case is not visible here.
147 case AMDGPU::V_MOV_B32_e32: {
148 auto &Op1 = Def->getOperand(1);
// Builds the DPP form of OrigMI immediately before OrigMI. The 'old' operand
// comes from CombOldVGPR; src0 and the dpp_ctrl/row_mask/bank_mask controls
// come from MovMI; source modifiers, src1 and src2 come from OrigMI; the last
// immediate added is 1 or 0 depending on CombBCZ.
// Returns the new instruction. Callers test the result for null, so null
// signals failure.
// NOTE(review): several lines of this function (the MovMI parameter, the
// NumOperands bookkeeping, the failure bookkeeping) are not visible in this
// excerpt; comments below are limited to what the visible lines show.
157 MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
159 RegSubRegPair CombOldVGPR,
160 bool CombBCZ) const {
161 assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
// Look up the DPP opcode for the original instruction; the debug message
// below belongs to the no-opcode failure path.
163 auto OrigOp = OrigMI.getOpcode();
164 auto DPPOp = getDPPOp(OrigOp);
166 LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n");
// Create the (initially operand-less) DPP instruction in OrigMI's block, in
// front of OrigMI, with OrigMI's debug location.
170 auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
171 OrigMI.getDebugLoc(), TII->get(DPPOp));
// NOTE(review): Dst is fetched here; the line adding it to DPPInst is not
// visible in this excerpt.
174 auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
// 'old' operand: must land at the index the DPP opcode expects and must be a
// 32-bit VGPR. It is added with the Undef flag when no defining instruction
// is found for it.
179 const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
181 assert(OldIdx == NumOperands);
182 assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI));
183 auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI);
184 DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef,
188 // TODO: this discards MAC/FMA instructions for now, let's add it later
189 LLVM_DEBUG(dbgs() << " failed: no old operand in DPP instruction,"
// src0 modifiers: copied from OrigMI when it has them; only the ABS and NEG
// bits are expected to be set. The else-branch covers a DPP opcode that has
// the operand although OrigMI does not (its body is not visible here).
195 if (auto *Mod0 = TII->getNamedOperand(OrigMI,
196 AMDGPU::OpName::src0_modifiers)) {
197 assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
198 AMDGPU::OpName::src0_modifiers));
199 assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
200 DPPInst.addImm(Mod0->getImm());
202 } else if (AMDGPU::getNamedOperandIdx(DPPOp,
203 AMDGPU::OpName::src0_modifiers) != -1) {
// src0 of the new instruction is the DPP mov's src0, provided it is a legal
// operand at this position.
207 auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
209 if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
210 LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n");
// Clear the kill flag on the operand at index NumOperands.
// NOTE(review): presumably this is the src0 operand just added; the adding
// line is not visible here.
215 DPPInst->getOperand(NumOperands).setIsKill(false);
// src1 modifiers: same handling as for src0 modifiers above.
218 if (auto *Mod1 = TII->getNamedOperand(OrigMI,
219 AMDGPU::OpName::src1_modifiers)) {
220 assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
221 AMDGPU::OpName::src1_modifiers));
222 assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
223 DPPInst.addImm(Mod1->getImm());
225 } else if (AMDGPU::getNamedOperandIdx(DPPOp,
226 AMDGPU::OpName::src1_modifiers) != -1) {
// src1, if OrigMI has one, must be legal at this position.
230 if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
231 if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
232 LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n");
// src2, if OrigMI has one, requires the DPP instruction to have a src2 operand
// as well and the operand to be legal at this position.
240 if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
241 if (!TII->getNamedOperand(*DPPInst.getInstr(), AMDGPU::OpName::src2) ||
242 !TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
243 LLVM_DEBUG(dbgs() << " failed: src2 is illegal\n");
// DPP controls are copied from the mov; the final immediate is 1 when CombBCZ
// is set and 0 otherwise.
250 DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
251 DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
252 DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
253 DPPInst.addImm(CombBCZ ? 1 : 0);
// Removes the partially built instruction.
// NOTE(review): presumably reached only on failure; the guarding condition and
// the failure return are not visible in this excerpt.
257 DPPInst.getInstr()->eraseFromParent();
260 LLVM_DEBUG(dbgs() << " combined: " << *DPPInst.getInstr());
261 return DPPInst.getInstr();
// Checks whether the immediate in OldOpnd is the identity value of the binary
// operation OrigMIOp. The caller reports "old immediate isn't an identity"
// when this returns false.
// NOTE(review): the switch header and the return statements are not visible in
// this excerpt; presumably each matching test below returns true and every
// other path returns false.
264 static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
265 assert(OldOpnd->isImm());
// Add, or, reversed subtract, unsigned max, xor: compared against 0.
268 case AMDGPU::V_ADD_U32_e32:
269 case AMDGPU::V_ADD_U32_e64:
270 case AMDGPU::V_ADD_I32_e32:
271 case AMDGPU::V_ADD_I32_e64:
272 case AMDGPU::V_OR_B32_e32:
273 case AMDGPU::V_OR_B32_e64:
274 case AMDGPU::V_SUBREV_U32_e32:
275 case AMDGPU::V_SUBREV_U32_e64:
276 case AMDGPU::V_SUBREV_I32_e32:
277 case AMDGPU::V_SUBREV_I32_e64:
278 case AMDGPU::V_MAX_U32_e32:
279 case AMDGPU::V_MAX_U32_e64:
280 case AMDGPU::V_XOR_B32_e32:
281 case AMDGPU::V_XOR_B32_e64:
282 if (OldOpnd->getImm() == 0)
// And, unsigned min: compared against all ones (UINT32_MAX) after truncating
// the immediate to 32 bits.
285 case AMDGPU::V_AND_B32_e32:
286 case AMDGPU::V_AND_B32_e64:
287 case AMDGPU::V_MIN_U32_e32:
288 case AMDGPU::V_MIN_U32_e64:
289 if (static_cast<uint32_t>(OldOpnd->getImm()) ==
290 std::numeric_limits<uint32_t>::max())
// Signed min: compared against INT32_MAX.
293 case AMDGPU::V_MIN_I32_e32:
294 case AMDGPU::V_MIN_I32_e64:
295 if (static_cast<int32_t>(OldOpnd->getImm()) ==
296 std::numeric_limits<int32_t>::max())
// Signed max: compared against INT32_MIN.
299 case AMDGPU::V_MAX_I32_e32:
300 case AMDGPU::V_MAX_I32_e64:
301 if (static_cast<int32_t>(OldOpnd->getImm()) ==
302 std::numeric_limits<int32_t>::min())
// 24-bit multiplies: compared against 1.
305 case AMDGPU::V_MUL_I32_I24_e32:
306 case AMDGPU::V_MUL_I32_I24_e64:
307 case AMDGPU::V_MUL_U32_U24_e32:
308 case AMDGPU::V_MUL_U32_U24_e64:
309 if (OldOpnd->getImm() == 1)
// Overload that first validates the tracked 'old' value and then delegates to
// the overload without OldOpndValue.
// When bound_ctrl is not being forced to zero (!CombBCZ) and 'old' is a known
// immediate, the combine is only attempted if:
//   - OrigMI has a register src1,
//   - the immediate is the identity value for OrigMI's opcode, and
//   - src1 is a 32-bit VGPR;
// in that case src1 replaces CombOldVGPR as the combined 'old' register.
// NOTE(review): the statements following each "failed" message are not visible
// in this excerpt - presumably each returns nullptr.
316 MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
318 RegSubRegPair CombOldVGPR,
319 MachineOperand *OldOpndValue,
320 bool CombBCZ) const {
321 assert(CombOldVGPR.Reg);
322 if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
323 auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
324 if (!Src1 || !Src1->isReg()) {
325 LLVM_DEBUG(dbgs() << " failed: no src1 or it isn't a register\n");
328 if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) {
329 LLVM_DEBUG(dbgs() << " failed: old immediate isn't an identity\n");
// Use src1 as the 'old' value of the combined instruction.
332 CombOldVGPR = getRegSubRegPair(*Src1);
333 if (!isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)) {
334 LLVM_DEBUG(dbgs() << " failed: src1 isn't a VGPR32 register\n");
338 return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ);
341 // returns true if MI doesn't have OpndName immediate operand or the
// operand's immediate, ANDed with Mask, equals Value
343 bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
344 int64_t Value, int64_t Mask) const {
345 auto *Imm = TII->getNamedOperand(MI, OpndName);
// NOTE(review): the missing-operand early exit described above is on lines not
// visible in this excerpt.
349 assert(Imm->isImm());
350 return (Imm->getImm() & Mask) == Value;
// Tries to fold MovMI (a V_MOV_B32_dpp) into every instruction that uses its
// result. Newly built instructions are collected in DPPMIs and the
// instructions they would replace in OrigMIs; at the end exactly one of the
// two lists is erased, selected by Rollback, so the combine is all-or-nothing.
// NOTE(review): many lines of this function (early returns, the assignments to
// CombBCZ and Rollback inside the loop, the final return) are not visible in
// this excerpt; presumably the function returns !Rollback.
353 bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
354 assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
355 LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);
// The mov's destination must be a virtual register.
357 auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
358 assert(DstOpnd && DstOpnd->isReg());
359 auto DPPMovReg = DstOpnd->getReg();
360 if (DPPMovReg.isPhysical()) {
361 LLVM_DEBUG(dbgs() << " failed: dpp move writes physreg\n");
// EXEC must not possibly change between the mov and any use of its result.
364 if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
365 LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
// Read the mov's DPP controls: the row and bank masks are "all lanes" when
// both are 0xF.
370 auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
371 assert(RowMaskOpnd && RowMaskOpnd->isImm());
372 auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
373 assert(BankMaskOpnd && BankMaskOpnd->isImm());
374 const bool MaskAllLanes = RowMaskOpnd->getImm() == 0xF &&
375 BankMaskOpnd->getImm() == 0xF;
377 auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
378 assert(BCZOpnd && BCZOpnd->isImm());
379 bool BoundCtrlZero = BCZOpnd->getImm();
// Both register inputs of the mov ('old' and src0) must be virtual registers.
381 auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
382 auto *SrcOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
383 assert(OldOpnd && OldOpnd->isReg());
384 assert(SrcOpnd && SrcOpnd->isReg());
385 if (OldOpnd->getReg().isPhysical() || SrcOpnd->getReg().isPhysical()) {
386 LLVM_DEBUG(dbgs() << " failed: dpp move reads physreg\n");
390 auto * const OldOpndValue = getOldOpndValue(*OldOpnd);
391 // OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something else
392 // We could use: assert(!OldOpndValue || OldOpndValue->isImm())
393 // but the third option is used to distinguish undef from non-immediate
394 // to reuse IMPLICIT_DEF instruction later
395 assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);
// Decide whether the combined instruction uses bound_ctrl zero (CombBCZ),
// following the two cases described in the file header.
// NOTE(review): the assignments to CombBCZ are on lines not visible here.
397 bool CombBCZ = false;
399 if (MaskAllLanes && BoundCtrlZero) { // [1]
// Otherwise 'old' has to be a known immediate ...
402 if (!OldOpndValue || !OldOpndValue->isImm()) {
403 LLVM_DEBUG(dbgs() << " failed: the DPP mov isn't combinable\n");
// ... whose defining instruction lives in the same basic block as the mov.
407 if (OldOpndValue->getParent()->getParent() != MovMI.getParent()) {
409 " failed: old reg def and mov should be in the same BB\n");
413 if (OldOpndValue->getImm() == 0) {
415 assert(!BoundCtrlZero); // by check [1]
418 } else if (BoundCtrlZero) {
419 assert(!MaskAllLanes); // by check [1]
421 " failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n");
426 LLVM_DEBUG(dbgs() << " old=";
430 dbgs() << *OldOpndValue;
431 dbgs() << ", bound_ctrl=" << CombBCZ << '\n');
// OrigMIs: instructions to delete on success. DPPMIs: instructions to delete
// on rollback. RegSeqWithOpNos: REG_SEQUENCE users looked through below,
// mapped to the operand numbers recorded for them.
433 SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs;
434 DenseMap<MachineInstr*, SmallVector<unsigned, 4>> RegSeqWithOpNos;
435 auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
436 // try to reuse previous old reg if it's undefined (IMPLICIT_DEF)
// When 'old' is not already undef, a fresh VGPR defined by a new IMPLICIT_DEF
// (inserted before the mov) is used instead; the IMPLICIT_DEF is tracked in
// DPPMIs so that a rollback removes it again.
437 if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
438 CombOldVGPR = RegSubRegPair(
439 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
440 auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
441 TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
442 DPPMIs.push_back(UndefInst.getInstr());
445 OrigMIs.push_back(&MovMI);
446 bool Rollback = true;
// Worklist of non-debug use operands of the mov's result.
447 SmallVector<MachineOperand*, 16> Uses;
449 for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) {
450 Uses.push_back(&Use);
453 while (!Uses.empty()) {
454 MachineOperand *Use = Uses.pop_back_val();
457 auto &OrigMI = *Use->getParent();
458 LLVM_DEBUG(dbgs() << " try: " << OrigMI);
460 auto OrigOp = OrigMI.getOpcode();
// A REG_SEQUENCE user is looked through rather than combined.
461 if (OrigOp == AMDGPU::REG_SEQUENCE) {
462 Register FwdReg = OrigMI.getOperand(0).getReg();
463 unsigned FwdSubReg = 0;
465 if (execMayBeModifiedBeforeAnyUse(*MRI, FwdReg, OrigMI)) {
466 LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
// Operands from index 1 on are scanned in steps of two; when the register at
// OpNo is the mov's result, the immediate at OpNo + 1 is taken as the
// subregister index it is inserted at.
471 unsigned OpNo, E = OrigMI.getNumOperands();
472 for (OpNo = 1; OpNo < E; OpNo += 2) {
473 if (OrigMI.getOperand(OpNo).getReg() == DPPMovReg) {
474 FwdSubReg = OrigMI.getOperand(OpNo + 1).getImm();
// Visit the uses of the sequence result that read that subregister.
// NOTE(review): the action taken for a match is on a line not visible here -
// presumably the use is queued in Uses.
482 for (auto &Op : MRI->use_nodbg_operands(FwdReg)) {
483 if (Op.getSubReg() == FwdSubReg)
486 RegSeqWithOpNos[&OrigMI].push_back(OpNo);
// Only VOP1/VOP2/VOP3 users are candidates. A VOP3 user additionally needs a
// 32-bit encoding and default modifiers apart from abs/neg.
490 if (TII->isVOP3(OrigOp)) {
491 if (!TII->hasVALU32BitEncoding(OrigOp)) {
492 LLVM_DEBUG(dbgs() << " failed: VOP3 hasn't e32 equivalent\n");
495 // check if other than abs|neg modifiers are set (opsel for example)
496 const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
497 if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
498 !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
499 !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) ||
500 !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) {
501 LLVM_DEBUG(dbgs() << " failed: VOP3 has non-default modifiers\n");
504 } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) {
505 LLVM_DEBUG(dbgs() << " failed: not VOP1/2/3\n");
509 LLVM_DEBUG(dbgs() << " combining: " << OrigMI);
// Case 1: the mov's result is src0 of the user - combine directly.
510 if (Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
511 if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
512 OldOpndValue, CombBCZ)) {
513 DPPMIs.push_back(DPPInst);
// Case 2: the mov's result is src1 of a commutable user - commute a temporary
// clone of the user, combine the clone, then erase the clone.
516 } else if (OrigMI.isCommutable() &&
517 Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
518 auto *BB = OrigMI.getParent();
519 auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
520 BB->insert(OrigMI, NewMI);
521 if (TII->commuteInstruction(*NewMI)) {
522 LLVM_DEBUG(dbgs() << " commuted: " << *NewMI);
523 if (auto *DPPInst = createDPPInst(*NewMI, MovMI, CombOldVGPR,
524 OldOpndValue, CombBCZ)) {
525 DPPMIs.push_back(DPPInst);
529 LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n");
530 NewMI->eraseFromParent();
532 LLVM_DEBUG(dbgs() << " failed: no suitable operands\n");
535 OrigMIs.push_back(&OrigMI);
// Leftover worklist entries mean the loop stopped early: force a rollback.
538 Rollback |= !Uses.empty();
// Erase the new instructions on rollback, the replaced ones on success.
540 for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs))
541 MI->eraseFromParent();
// Clean up the recorded REG_SEQUENCEs: one whose result has no non-debug uses
// left is erased; the last two lines mark the recorded operands as undef.
// NOTE(review): the branch structure between these statements (and whether
// this cleanup is skipped on rollback) is not fully visible in this excerpt.
544 for (auto &S : RegSeqWithOpNos) {
545 if (MRI->use_nodbg_empty(S.first->getOperand(0).getReg())) {
546 S.first->eraseFromParent();
549 while (!S.second.empty())
550 S.first->getOperand(S.second.pop_back_val()).setIsUndef(true);
557 bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
558 auto &ST = MF.getSubtarget<GCNSubtarget>();
559 if (!ST.hasDPP() || skipFunction(MF.getFunction()))
562 MRI = &MF.getRegInfo();
563 TII = ST.getInstrInfo();
565 assert(MRI->isSSA() && "Must be run on SSA");
567 bool Changed = false;
568 for (auto &MBB : MF) {
569 for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) {
571 if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
573 ++NumDPPMovsCombined;
574 } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) {
575 auto Split = TII->expandMovDPP64(MI);
576 for (auto M : { Split.first, Split.second }) {
577 if (combineDPPMov(*M))
578 ++NumDPPMovsCombined;