1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPUSubtarget.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUInstructionSelector.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/MC/MCSubtargetInfo.h"
26 #include "llvm/IR/MDBuilder.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
32 #define DEBUG_TYPE "amdgpu-subtarget"
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #define GET_SUBTARGETINFO_CTOR
36 #define AMDGPUSubtarget GCNSubtarget
37 #include "AMDGPUGenSubtargetInfo.inc"
38 #define GET_SUBTARGETINFO_TARGET_DESC
39 #define GET_SUBTARGETINFO_CTOR
40 #undef AMDGPUSubtarget
41 #include "R600GenSubtargetInfo.inc"
43 static cl::opt<bool> DisablePowerSched(
44 "amdgpu-disable-power-sched",
45 cl::desc("Disable scheduling to minimize mAI power bursts"),
48 GCNSubtarget::~GCNSubtarget() = default;
51 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
52 StringRef GPU, StringRef FS) {
53 SmallString<256> FullFS("+promote-alloca,");
55 ParseSubtargetFeatures(GPU, FullFS);
// FIXME: I don't think Evergreen has any useful support for
// denormals, but this should be checked. Should we issue a warning somewhere
59 // if someone tries to enable these?
60 if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
61 FP32Denormals = false;
64 HasMulU24 = getGeneration() >= EVERGREEN;
65 HasMulI24 = hasCaymanISA();
71 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
72 StringRef GPU, StringRef FS) {
73 // Determine default and user-specified characteristics
74 // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
75 // enabled, but some instructions do not respect them and they run at the
76 // double precision rate, so don't enable by default.
78 // We want to be able to turn these off, but making this a subtarget feature
// for SI has the unhelpful behavior that it unsets everything else if you
// disable it.
//
// Similarly we want enable-prt-strict-null to be on by default and not to
// unset everything else if it is disabled.
85 // Assuming ECC is enabled is the conservative default.
86 SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");
88 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
89 FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
// FIXME: I don't think Evergreen has any useful support for
// denormals, but this should be checked. Should we issue a warning somewhere
93 // if someone tries to enable these?
94 if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
95 FullFS += "+fp64-fp16-denormals,";
97 FullFS += "-fp32-denormals,";
100 FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
102 // Disable mutually exclusive bits.
103 if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
104 if (FS.find_lower("wavefrontsize16") == StringRef::npos)
105 FullFS += "-wavefrontsize16,";
106 if (FS.find_lower("wavefrontsize32") == StringRef::npos)
107 FullFS += "-wavefrontsize32,";
108 if (FS.find_lower("wavefrontsize64") == StringRef::npos)
109 FullFS += "-wavefrontsize64,";
114 ParseSubtargetFeatures(GPU, FullFS);
116 // We don't support FP64 for EG/NI atm.
117 assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
119 // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
120 // on VI and newer hardware to avoid assertion failures due to missing ADDR64
121 // variants of MUBUF instructions.
122 if (!hasAddr64() && !FS.contains("flat-for-global")) {
123 FlatForGlobal = true;
126 // Set defaults if needed.
127 if (MaxPrivateElementSize == 0)
128 MaxPrivateElementSize = 4;
130 if (LDSBankCount == 0)
133 if (TT.getArch() == Triple::amdgcn) {
134 if (LocalMemorySize == 0)
135 LocalMemorySize = 32768;
137 // Do something sensible for unspecified target.
138 if (!HasMovrel && !HasVGPRIndexMode)
142 // Don't crash on invalid devices.
143 if (WavefrontSize == 0)
146 HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
148 if (DoesNotSupportXNACK && EnableXNACK) {
149 ToggleFeature(AMDGPU::FeatureXNACK);
153 // ECC is on by default, but turn it off if the hardware doesn't support it
// anyway. This matters for the gfx9 targets with d16 loads, which don't
// support ECC.
156 if (DoesNotSupportSRAMECC && EnableSRAMECC) {
157 ToggleFeature(AMDGPU::FeatureSRAMECC);
158 EnableSRAMECC = false;
164 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
166 Has16BitInsts(false),
167 HasMadMixInsts(false),
168 FP32Denormals(false),
171 HasVOP3PInsts(false),
174 HasInv2PiInlineImm(false),
175 HasFminFmaxLegacy(true),
176 EnablePromoteAlloca(false),
177 HasTrigReducedRange(false),
182 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
183 const GCNTargetMachine &TM) :
184 AMDGPUGenSubtargetInfo(TT, GPU, FS),
187 Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
188 InstrItins(getInstrItineraryForCPU(GPU)),
190 MaxPrivateElementSize(0),
193 HalfRate64Ops(false),
195 FP64FP16Denormals(false),
196 FlatForGlobal(false),
197 AutoWaitcntBeforeBarrier(false),
199 UnalignedScratchAccess(false),
200 UnalignedBufferAccess(false),
202 HasApertureRegs(false),
204 DoesNotSupportXNACK(false),
208 EnableLoadStoreOpt(false),
209 EnableUnsafeDSOffsetFolding(false),
210 EnableSIScheduler(false),
212 EnablePRTStrictNull(false),
221 GFX7GFX8GFX9Insts(false),
223 HasSMemRealTime(false),
225 HasFmaMixInsts(false),
227 HasVGPRIndexMode(false),
228 HasScalarStores(false),
229 HasScalarAtomics(false),
231 HasSDWAScalar(false),
234 HasSDWAOutModsVOPC(false),
238 HasNSAEncoding(false),
247 HasPkFmacF16Inst(false),
248 HasAtomicFaddInsts(false),
249 EnableSRAMECC(false),
250 DoesNotSupportSRAMECC(false),
251 HasNoSdstCMPX(false),
253 HasRegisterBanking(false),
254 HasVOP3Literal(false),
255 HasNoDataDepHazard(false),
256 FlatAddressSpace(false),
257 FlatInstOffsets(false),
258 FlatGlobalInsts(false),
259 FlatScratchInsts(false),
260 ScalarFlatScratchInsts(false),
261 AddNoCarryInsts(false),
262 HasUnpackedD16VMem(false),
263 LDSMisalignedBug(false),
265 ScalarizeGlobal(false),
267 HasVcmpxPermlaneHazard(false),
268 HasVMEMtoScalarWriteHazard(false),
269 HasSMEMtoVectorWriteHazard(false),
270 HasInstFwdPrefetchBug(false),
271 HasVcmpxExecWARHazard(false),
272 HasLdsBranchVmemWARHazard(false),
273 HasNSAtoVMEMBug(false),
274 HasOffset3fBug(false),
275 HasFlatSegmentOffsetBug(false),
277 FeatureDisable(false),
278 InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
280 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
281 CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
282 Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
283 RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
284 InstSelector.reset(new AMDGPUInstructionSelector(
285 *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
288 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
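  // Prior to GFX10 only one SGPR or literal operand may be read over the
  // constant bus per VALU instruction; GFX10 relaxes the limit for most
  // opcodes, but the 64-bit shifts listed below keep the stricter limit.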
289 if (getGeneration() < GFX10)
293 case AMDGPU::V_LSHLREV_B64:
294 case AMDGPU::V_LSHLREV_B64_gfx10:
295 case AMDGPU::V_LSHL_B64:
296 case AMDGPU::V_LSHRREV_B64:
297 case AMDGPU::V_LSHRREV_B64_gfx10:
298 case AMDGPU::V_LSHR_B64:
299 case AMDGPU::V_ASHRREV_I64:
300 case AMDGPU::V_ASHRREV_I64_gfx10:
301 case AMDGPU::V_ASHR_I64:
308 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
309 const Function &F) const {
311 return getLocalMemorySize();
312 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
313 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
314 if (!WorkGroupsPerCu)
316 unsigned MaxWaves = getMaxWavesPerEU();
317 return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
320 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
321 const Function &F) const {
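  // Illustrative example (hypothetical numbers): with 65536 bytes of LDS,
  // MaxWaves = 10 and 4 work groups per CU, Limit = 65536 * 10 / 4 = 163840;
  // a kernel using 8192 bytes of LDS gets 163840 / 8192 = 20 waves, clamped
  // to the [1, MaxWaves] range, i.e. 10.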
322 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
323 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
324 if (!WorkGroupsPerCu)
326 unsigned MaxWaves = getMaxWavesPerEU();
327 unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
328 unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
329 NumWaves = std::min(NumWaves, MaxWaves);
330 NumWaves = std::max(NumWaves, 1u);
335 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
336 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
337 return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
340 std::pair<unsigned, unsigned>
341 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
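  // For a wave64 target this yields [128, 256] for compute/kernel calling
  // conventions, [1, 64] for the graphics shader stages, and [1, 1024]
  // otherwise.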
343 case CallingConv::AMDGPU_CS:
344 case CallingConv::AMDGPU_KERNEL:
345 case CallingConv::SPIR_KERNEL:
346 return std::make_pair(getWavefrontSize() * 2,
347 std::max(getWavefrontSize() * 4, 256u));
348 case CallingConv::AMDGPU_VS:
349 case CallingConv::AMDGPU_LS:
350 case CallingConv::AMDGPU_HS:
351 case CallingConv::AMDGPU_ES:
352 case CallingConv::AMDGPU_GS:
353 case CallingConv::AMDGPU_PS:
354 return std::make_pair(1, getWavefrontSize());
356 return std::make_pair(1, 16 * getWavefrontSize());
360 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
361 const Function &F) const {
362 // FIXME: 1024 if function.
363 // Default minimum/maximum flat work group sizes.
364 std::pair<unsigned, unsigned> Default =
365 getDefaultFlatWorkGroupSize(F.getCallingConv());
367 // Requested minimum/maximum flat work group sizes.
368 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
369 F, "amdgpu-flat-work-group-size", Default);
371 // Make sure requested minimum is less than requested maximum.
372 if (Requested.first > Requested.second)
375 // Make sure requested values do not violate subtarget's specifications.
376 if (Requested.first < getMinFlatWorkGroupSize())
378 if (Requested.second > getMaxFlatWorkGroupSize())
384 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
385 const Function &F) const {
386 // Default minimum/maximum number of waves per execution unit.
387 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
389 // Default/requested minimum/maximum flat work group sizes.
390 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
392 // If minimum/maximum flat work group sizes were explicitly requested using
393 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
394 // number of waves per execution unit to values implied by requested
395 // minimum/maximum flat work group sizes.
396 unsigned MinImpliedByFlatWorkGroupSize =
397 getMaxWavesPerEU(FlatWorkGroupSizes.second);
398 bool RequestedFlatWorkGroupSize = false;
400 if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
401 Default.first = MinImpliedByFlatWorkGroupSize;
402 RequestedFlatWorkGroupSize = true;
405 // Requested minimum/maximum number of waves per execution unit.
406 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
407 F, "amdgpu-waves-per-eu", Default, true);
409 // Make sure requested minimum is less than requested maximum.
410 if (Requested.second && Requested.first > Requested.second)
413 // Make sure requested values do not violate subtarget's specifications.
414 if (Requested.first < getMinWavesPerEU() ||
415 Requested.first > getMaxWavesPerEU())
417 if (Requested.second > getMaxWavesPerEU())
420 // Make sure requested values are compatible with values implied by requested
421 // minimum/maximum flat work group sizes.
422 if (RequestedFlatWorkGroupSize &&
423 Requested.first < MinImpliedByFlatWorkGroupSize)
429 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
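  // Attach !range metadata to calls of the workitem-id / local-size
  // intrinsics, bounded by the kernel's flat work group size and, when
  // present, narrowed by the reqd_work_group_size metadata handled below.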
430 Function *Kernel = I->getParent()->getParent();
431 unsigned MinSize = 0;
432 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
433 bool IdQuery = false;
// If reqd_work_group_size is present it narrows the value down.
436 if (auto *CI = dyn_cast<CallInst>(I)) {
437 const Function *F = CI->getCalledFunction();
439 unsigned Dim = UINT_MAX;
440 switch (F->getIntrinsicID()) {
441 case Intrinsic::amdgcn_workitem_id_x:
442 case Intrinsic::r600_read_tidig_x:
445 case Intrinsic::r600_read_local_size_x:
448 case Intrinsic::amdgcn_workitem_id_y:
449 case Intrinsic::r600_read_tidig_y:
452 case Intrinsic::r600_read_local_size_y:
455 case Intrinsic::amdgcn_workitem_id_z:
456 case Intrinsic::r600_read_tidig_z:
459 case Intrinsic::r600_read_local_size_z:
466 if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
467 if (Node->getNumOperands() == 3)
468 MinSize = MaxSize = mdconst::extract<ConstantInt>(
469 Node->getOperand(Dim))->getZExtValue();
477 // Range metadata is [Lo, Hi). For ID query we need to pass max size
478 // as Hi. For size query we need to pass Hi + 1.
484 MDBuilder MDB(I->getContext());
485 MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
487 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
491 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
492 unsigned &MaxAlign) const {
493 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
494 F.getCallingConv() == CallingConv::SPIR_KERNEL);
496 const DataLayout &DL = F.getParent()->getDataLayout();
497 uint64_t ExplicitArgBytes = 0;
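  // Accumulate the explicit arguments, aligning each to its ABI type
  // alignment. Illustrative example: an i32 (4 bytes, align 4) followed by a
  // <4 x i32> (16 bytes, align 16) yields alignTo(4, 16) + 16 = 32 bytes,
  // with MaxAlign = 16.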
500 for (const Argument &Arg : F.args()) {
501 Type *ArgTy = Arg.getType();
503 unsigned Align = DL.getABITypeAlignment(ArgTy);
504 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
505 ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
506 MaxAlign = std::max(MaxAlign, Align);
509 return ExplicitArgBytes;
512 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
513 unsigned &MaxAlign) const {
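  // The kernarg segment consists of the explicit arguments starting at the
  // target's explicit kernarg offset, optionally followed by the implicit
  // arguments at their own alignment; the total is padded to a multiple of 4.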
514 uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
516 unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
518 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
519 unsigned ImplicitBytes = getImplicitArgNumBytes(F);
520 if (ImplicitBytes != 0) {
521 unsigned Alignment = getAlignmentForImplicitArgPtr();
522 TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
525 // Being able to dereference past the end is useful for emitting scalar loads.
526 return alignTo(TotalSize, 4);
529 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
530 const TargetMachine &TM) :
531 R600GenSubtargetInfo(TT, GPU, FS),
534 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
538 HasVertexCache(false),
543 TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
544 InstrItins(getInstrItineraryForCPU(GPU)) { }
546 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
547 unsigned NumRegionInstrs) const {
548 // Track register pressure so the scheduler can try to decrease
549 // pressure once register usage is above the threshold defined by
550 // SIRegisterInfo::getRegPressureSetLimit()
551 Policy.ShouldTrackPressure = true;
// Enabling both top down and bottom up scheduling seems to give us fewer
554 // register spills than just using one of these approaches on its own.
555 Policy.OnlyTopDown = false;
556 Policy.OnlyBottomUp = false;
558 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
559 if (!enableSIScheduler())
560 Policy.ShouldTrackLaneMasks = true;
563 bool GCNSubtarget::hasMadF16() const {
564 return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
567 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
568 if (getGeneration() >= AMDGPUSubtarget::GFX10)
571 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
593 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
615 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
616 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
617 if (getGeneration() >= AMDGPUSubtarget::GFX10)
618 return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
620 if (MFI.hasFlatScratchInit()) {
621 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
622 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
623 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
624 return 4; // FLAT_SCRATCH, VCC (in that order).
627 if (isXNACKEnabled())
628 return 4; // XNACK, VCC (in that order).
632 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
633 const Function &F = MF.getFunction();
634 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
636 // Compute maximum number of SGPRs function can use using default/requested
637 // minimum number of waves per execution unit.
638 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
639 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
640 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
642 // Check if maximum number of SGPRs was explicitly requested using
643 // "amdgpu-num-sgpr" attribute.
644 if (F.hasFnAttribute("amdgpu-num-sgpr")) {
645 unsigned Requested = AMDGPU::getIntegerAttribute(
646 F, "amdgpu-num-sgpr", MaxNumSGPRs);
648 // Make sure requested value does not violate subtarget's specifications.
649 if (Requested && (Requested <= getReservedNumSGPRs(MF)))
652 // If more SGPRs are required to support the input user/system SGPRs,
653 // increase to accommodate them.
655 // FIXME: This really ends up using the requested number of SGPRs + number
656 // of reserved special registers in total. Theoretically you could re-use
657 // the last input registers for these special registers, but this would
658 // require a lot of complexity to deal with the weird aliasing.
659 unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
660 if (Requested && Requested < InputNumSGPRs)
661 Requested = InputNumSGPRs;
663 // Make sure requested value is compatible with values implied by
664 // default/requested minimum/maximum number of waves per execution unit.
665 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
667 if (WavesPerEU.second &&
668 Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
672 MaxNumSGPRs = Requested;
675 if (hasSGPRInitBug())
676 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
678 return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
679 MaxAddressableNumSGPRs);
682 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
683 const Function &F = MF.getFunction();
684 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
686 // Compute maximum number of VGPRs function can use using default/requested
687 // minimum number of waves per execution unit.
688 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
689 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
691 // Check if maximum number of VGPRs was explicitly requested using
692 // "amdgpu-num-vgpr" attribute.
693 if (F.hasFnAttribute("amdgpu-num-vgpr")) {
694 unsigned Requested = AMDGPU::getIntegerAttribute(
695 F, "amdgpu-num-vgpr", MaxNumVGPRs);
697 // Make sure requested value is compatible with values implied by
698 // default/requested minimum/maximum number of waves per execution unit.
699 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
701 if (WavesPerEU.second &&
702 Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
706 MaxNumVGPRs = Requested;
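// Scheduler DAG mutation that keeps consecutive memory operations of the same
// kind (VMEM, FLAT, SMRD, DS) together by adding artificial edges so the
// post-RA scheduler does not move them apart.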
713 struct MemOpClusterMutation : ScheduleDAGMutation {
714 const SIInstrInfo *TII;
716 MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
718 void apply(ScheduleDAGInstrs *DAG) override {
719 SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // During DAG pre-processing, SUnits are in the original order of
    // the instructions before scheduling.
724 for (SUnit &SU : DAG->SUnits) {
725 MachineInstr &MI2 = *SU.getInstr();
726 if (!MI2.mayLoad() && !MI2.mayStore()) {
735 MachineInstr &MI1 = *SUa->getInstr();
736 if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
737 (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
738 (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
739 (TII->isDS(MI1) && TII->isDS(MI2))) {
740 SU.addPredBarrier(SUa);
742 for (const SDep &SI : SU.Preds) {
743 if (SI.getSUnit() != SUa)
744 SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
747 if (&SU != &DAG->ExitSU) {
748 for (const SDep &SI : SUa->Succs) {
749 if (SI.getSUnit() != &SU)
750 SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
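// Scheduler DAG mutation that fills the latency shadow of long-latency MFMA
// instructions with independent SALU instructions (see apply() below), so the
// shadow is not filled with VALU work that would cause power bursts.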
760 struct FillMFMAShadowMutation : ScheduleDAGMutation {
761 const SIInstrInfo *TII;
765 FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
767 bool isSALU(const SUnit *SU) const {
768 const MachineInstr *MI = SU->getInstr();
769 return MI && TII->isSALU(*MI) && !MI->isTerminator();
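  // Conservatively check whether the edge Pred -> Succ can be added without
  // creating a cycle: collect the transitive successors of Succ, then walk
  // Pred and its transitive predecessors and fail if the two sets intersect.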
772 bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
773 if (Pred->NodeNum < Succ->NodeNum)
776 SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
778 for (unsigned I = 0; I < Succs.size(); ++I) {
779 for (const SDep &SI : Succs[I]->Succs) {
780 const SUnit *SU = SI.getSUnit();
781 if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
786 SmallPtrSet<const SUnit*, 32> Visited;
787 while (!Preds.empty()) {
788 const SUnit *SU = Preds.pop_back_val();
789 if (llvm::find(Succs, SU) != Succs.end())
792 for (const SDep &SI : SU->Preds)
793 if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
794 Preds.push_back(SI.getSUnit());
  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
802 unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
803 SmallPtrSetImpl<SUnit *> &Visited) const {
804 SmallVector<SUnit *, 8> Worklist({To});
807 while (!Worklist.empty() && MaxChain-- > 0) {
808 SUnit *SU = Worklist.pop_back_val();
809 if (!Visited.insert(SU).second)
812 LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
813 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
815 if (SU->addPred(SDep(From, SDep::Artificial), false))
818 for (SDep &SI : From->Succs) {
819 SUnit *SUv = SI.getSUnit();
820 if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
821 SUv->addPred(SDep(SU, SDep::Artificial), false);
824 for (SDep &SI : SU->Succs) {
825 SUnit *Succ = SI.getSUnit();
826 if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
827 Worklist.push_back(Succ);
834 void apply(ScheduleDAGInstrs *DAGInstrs) override {
835 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
836 if (!ST.hasMAIInsts() || DisablePowerSched)
838 DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
839 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
840 if (!TSchedModel || DAG->SUnits.empty())
    // Scan for MFMA long latency instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the shadow with SALU rather than VALU instructions is
    // desirable to prevent power consumption bursts and throttling.
847 auto LastSALU = DAG->SUnits.begin();
848 auto E = DAG->SUnits.end();
849 SmallPtrSet<SUnit*, 32> Visited;
850 for (SUnit &SU : DAG->SUnits) {
851 MachineInstr &MAI = *SU.getInstr();
852 if (!TII->isMAI(MAI) ||
853 MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
854 MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
857 unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
859 LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
860 dbgs() << "Need " << Lat
861 << " instructions to cover latency.\n");
863 // Find up to Lat independent scalar instructions as early as
864 // possible such that they can be scheduled after this MFMA.
865 for ( ; Lat && LastSALU != E; ++LastSALU) {
866 if (Visited.count(&*LastSALU))
869 if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
872 Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
879 void GCNSubtarget::getPostRAMutations(
880 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
881 Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
882 Mutations.push_back(llvm::make_unique<FillMFMAShadowMutation>(&InstrInfo));
885 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
886 if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
887 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
889 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
892 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
893 if (TM.getTargetTriple().getArch() == Triple::amdgcn)
894 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
896 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));