//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with close by immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
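//
// A VMEM example of the same merge (illustrative, not from the original
// comment):
//  buffer_load_dword v4, off, s[0:3], 0 offset:4
//  buffer_load_dword v5, off, s[0:3], 0 offset:8
// ==>
//  buffer_load_dwordx2 v[4:5], off, s[0:3], 0 offset:4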
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough to fit in the 8 bits, we can add to the base
//   pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
class SILoadStoreOptimizer : public MachineFunctionPass {
  enum InstClassEnum {
    DS_READ_WRITE,
    S_BUFFER_LOAD_IMM,
    BUFFER_LOAD_OFFEN,
    BUFFER_LOAD_OFFSET,
    BUFFER_STORE_OFFEN,
    BUFFER_STORE_OFFSET,
  };

  struct CombineInfo {
    MachineBasicBlock::iterator I;
    MachineBasicBlock::iterator Paired;
    unsigned EltSize;
    unsigned Offset0;
    unsigned Offset1;
    unsigned BaseOff;
    InstClassEnum InstClass;
    bool GLC0;
    bool GLC1;
    bool SLC0;
    bool SLC1;
    bool UseST64;
    bool IsX2;
    SmallVector<MachineInstr*, 8> InstsToMove;
  };

private:
  const SISubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;

  static bool offsetsCanBeCombined(CombineInfo &CI);

  bool findMatchingInst(CombineInfo &CI);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
                                    bool &IsOffen) const;
  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load / Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace.
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load / Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load / Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}
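// Move each instruction in InstsToMove to immediately after I, preserving
// their relative order.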
static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}
static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
  // XXX: Should this be looking for implicit defs?
  for (const MachineOperand &Def : MI.defs())
    Defs.insert(Def.getReg());
}
static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      const SIInstrInfo *TII,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) ||
    TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}
// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool
addToListsIfDependent(MachineInstr &MI,
                      DenseSet<unsigned> &Defs,
                      SmallVectorImpl<MachineInstr*> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.

    if (Use.isReg() && Use.readsReg() && Defs.count(Use.getReg())) {
      Insts.push_back(&MI);
      addDefsToList(MI, Defs);
      return true;
    }
  }

  return false;
}
static bool
canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                        ArrayRef<MachineInstr*> InstsToMove,
                        const SIInstrInfo *TII,
                        AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
      return false;
  }
  return true;
}
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
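  // For those, the merged access must cover consecutive elements: two dword
  // accesses whose element offsets differ by 1, or two x2 accesses whose
  // element offsets differ by 2, with matching cache-policy bits.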
  if (CI.InstClass != DS_READ_WRITE) {
    unsigned Diff = CI.IsX2 ? 2 : 1;
    return (EltOffset0 + Diff == EltOffset1 ||
            EltOffset1 + Diff == EltOffset0) &&
           CI.GLC0 == CI.GLC1 &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  }
  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
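  // For example (assuming EltSize == 4), byte offsets 1024 and 1280 give
  // element offsets 256 and 320; neither fits in 8 bits, but both are
  // multiples of 64 and can be encoded as ST64 offsets 4 and 5.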
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }
  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }
  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
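  // For example (assuming EltSize == 4), byte offsets 1024 and 1032 give
  // element offsets 256 and 258. Neither fits in 8 bits, but after
  // subtracting a base offset of 1024 the element offsets become 0 and 2.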
  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}
bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  unsigned AddrOpName[3] = {0};
  int AddrIdx[3];
  const MachineOperand *AddrReg[3];
  unsigned NumAddresses = 0;

  switch (CI.InstClass) {
  case DS_READ_WRITE:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
    break;
  case S_BUFFER_LOAD_IMM:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
    break;
  case BUFFER_LOAD_OFFEN:
  case BUFFER_STORE_OFFEN:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
    break;
  case BUFFER_LOAD_OFFSET:
  case BUFFER_STORE_OFFSET:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
    break;
  }
  for (unsigned i = 0; i < NumAddresses; i++) {
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);

    // We only ever merge operations with the same base address register, so
    // don't bother scanning forward if there are no other uses.
    if (AddrReg[i]->isReg() &&
        (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
         MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
      return false;
  }

  ++MBBI;

  DenseSet<unsigned> DefsToMove;
  addDefsToList(*CI.I, DefsToMove);
  for ( ; MBBI != E; ++MBBI) {
    if (MBBI->getOpcode() != CI.I->getOpcode()) {
      // This is not a matching DS instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged with.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        CI.InstsToMove.push_back(&*MBBI);
        addDefsToList(*MBBI, DefsToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction; any uses of I will need to
      // be moved down as well.
      addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
      continue;
    bool Match = true;
    for (unsigned i = 0; i < NumAddresses; i++) {
      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);

      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
            AddrReg[i]->getImm() != AddrRegNext.getImm()) {
          Match = false;
          break;
        }

        continue;
      }

      // Check same base pointer. Be careful of subregisters, which can occur
      // with vectors of pointers.
      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
        Match = false;
        break;
      }
    }
    if (Match) {
      int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
                                                 AMDGPU::OpName::offset);
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
      CI.Paired = MBBI;

      if (CI.InstClass == DS_READ_WRITE) {
        CI.Offset0 &= 0xffff;
        CI.Offset1 &= 0xffff;
      } else {
        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
        }
      }

      // Check both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
          return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
      return false;
  }

  return false;
}
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ?
    AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = CI.UseST64 ?
    read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
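  // The two original results become subregisters of one wider register: for
  // EltSize == 4 the b32 results are sub0 and sub1 of a 64-bit pair, and for
  // EltSize == 8 the b64 results are the two halves of a 128-bit register.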
  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");
  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    unsigned AddOpc = STM->hasAddNoCarry() ?
      AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
    BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
      .addImm(CI.BaseOff)
      .addReg(AddrReg->getReg());
  }

  MachineInstrBuilder Read2 =
    BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
      .addReg(BaseReg, BaseRegFlags) // addr
      .addImm(NewOffset0) // offset0
      .addImm(NewOffset1) // offset1
      .addImm(0) // gds
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  (void)Read2;
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ?
    AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = CI.UseST64 ?
    write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");
  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = Addr->getReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    unsigned AddOpc = STM->hasAddNoCarry() ?
      AMDGPU::V_ADD_U32_e32 : AMDGPU::V_ADD_I32_e32;
    BuildMI(*MBB, CI.Paired, DL, TII->get(AddOpc), BaseReg)
      .addImm(CI.BaseOff)
      .addReg(Addr->getReg());
  }

  MachineInstrBuilder Write2 =
    BuildMI(*MBB, CI.Paired, DL, Write2Desc)
      .addReg(BaseReg, BaseRegFlags) // addr
      .add(*Data0) // data0
      .add(*Data1) // data1
      .addImm(NewOffset0) // offset0
      .addImm(NewOffset1) // offset1
      .addImm(0) // gds
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  moveInstsAfter(Write2, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
    AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;

  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0) // glc
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  unsigned Opcode;

  if (CI.InstClass == BUFFER_LOAD_OFFEN) {
    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
      AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
  } else {
    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
      AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  }

  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);

  if (CI.InstClass == BUFFER_LOAD_OFFEN)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0) // glc
      .addImm(CI.SLC0) // slc
      .addImm(0) // tfe
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}
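// Return the wider (x2 or x4) opcode that a pair of these stores merges
// into, or 0 if I is not a mergeable buffer store. IsX2 and IsOffen report
// whether I already stores two dwords and whether it uses a VGPR address.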
unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
  const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
  IsX2 = false;
  IsOffen = false;

  switch (I.getOpcode()) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    IsX2 = true;
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
    IsX2 = true;
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
    IsX2 = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
    IsX2 = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
  }
  return 0;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  bool Unused1, Unused2;
  unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
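  // A REG_SEQUENCE packs the two original data operands into the wider
  // source register, each in the subregister matching its offset order.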
  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  if (CI.InstClass == BUFFER_STORE_OFFEN)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
      .addImm(CI.GLC0) // glc
      .addImm(CI.SLC0) // slc
      .addImm(0) // tfe
      .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));

  moveInstsAfter(MIB, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}
// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    CombineInfo CI;
    CI.I = I;
    unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
        Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {
      CI.InstClass = DS_READ_WRITE;
      CI.EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
      } else {
        ++I;
      }

      continue;
    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
               Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
               Opc == AMDGPU::DS_WRITE_B64_gfx9) {
      CI.InstClass = DS_READ_WRITE;
      CI.EltSize
        = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
      } else {
        ++I;
      }

      continue;
    }
    if (STM->hasSBufferLoadStoreAtomicDwordxN() &&
        (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
         Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) {
      // EltSize is in units of the offset encoding.
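      // (getSMRDEncodedOffset(*STM, 4) is 1 on targets that encode the SMRD
      // offset in dwords and 4 where it is encoded in bytes.)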
      CI.InstClass = S_BUFFER_LOAD_IMM;
      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
      CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeSBufferLoadImmPair(CI);
      } else {
        ++I;
      }

      continue;
    }
    if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
        Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
      if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
          Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
        CI.InstClass = BUFFER_LOAD_OFFEN;
      else
        CI.InstClass = BUFFER_LOAD_OFFSET;

      CI.EltSize = 4;
      CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
                Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferLoadPair(CI);
      } else {
        ++I;
      }

      continue;
    }
    bool StoreIsX2, IsOffen;
    if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
      CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
      CI.EltSize = 4;
      CI.IsX2 = StoreIsX2;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferStorePair(CI);
      } else {
        ++I;
      }

      continue;
    }

    ++I;
  }

  return Modified;
}
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<SISubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;
  for (MachineBasicBlock &MBB : MF) {
    Modified |= optimizeBlock(MBB);

    // Run again to convert x2 to x4.
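    // (The first pass merges pairs of dword accesses into x2 forms; a second
    // pass over the block can then merge two x2 results into an x4.)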
    Modified |= optimizeBlock(MBB);
  }

  return Modified;
}