//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;
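
// Parse the feature string and apply R600 family fixups to the parsed
// feature state. Returns *this so the result can be consumed by members
// (e.g. TLInfo) constructed later in R600Subtarget's initializer list.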
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // should be checked. Should we issue a warning somewhere if someone tries
  // to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}
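
// Parse the feature string and resolve interdependent defaults (denormal
// modes, flat-for-global, LDS bank count, MOVREL vs. VGPR indexing).
// Returns *this so InstrInfo's construction can consume the result.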
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // should be checked. Should we issue a warning somewhere if someone tries
  // to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +/-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT,
                                 const FeatureBitset &FeatureBits) :
  TargetTriple(TT),
  SubtargetFeatureBits(FeatureBits),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT, getFeatureBits()),
    TargetTriple(TT),
    Gen(SOUTHERN_ISLANDS),
    IsaVersion(ISAVersion0_0_0),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    DX10Clamp(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    TrapHandler(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasInv2PiInlineImm(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDLInsts(false),
    D16PreservesUnusedBits(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  AS = AMDGPU::getAMDGPUAS(TT);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
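
// Maximum LDS bytes a single workgroup may use while still allowing
// NWaves waves per execution unit to stay resident.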
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
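
// Number of waves per execution unit that remain achievable when each
// workgroup uses the given number of LDS bytes, clamped to [1, MaxWaves].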
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}
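
// Default flat work group size range by calling convention: compute kernels
// default to 2x-4x the wavefront size, graphics shaders to a single wavefront.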
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}
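
// Flat work group size range for F: the "amdgpu-flat-work-group-size"
// attribute when valid, otherwise the calling convention default.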
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
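
// Waves-per-EU range for F from the "amdgpu-waves-per-eu" attribute,
// validated against subtarget limits and any explicitly requested flat
// work group sizes.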
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum does not exceed requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
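
// Attach !range metadata to a workitem ID or local size intrinsic call,
// narrowed by reqd_work_group_size when present. Returns true if metadata
// was added.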
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
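
// Sum of the ABI-aligned in-memory sizes of all explicit kernel arguments;
// the largest argument alignment is reported through MaxAlign.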
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}
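
// Total kernarg segment size: explicit arguments at their target-specific
// offset plus any implicit arguments, rounded up to a multiple of 4 so
// scalar loads may dereference past the end.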
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT, getFeatureBits()),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  DX10Clamp(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)),
  AS (AMDGPU::getAMDGPUAS(TT)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const {
  return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}
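
// Map an SGPR count to the number of waves per execution unit it permits;
// the thresholds differ between VI+ and earlier generations.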
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

// Map a VGPR count to the number of waves per execution unit it permits.
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}
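
// SGPRs reserved at the end of the register file for FLAT_SCRATCH, XNACK
// and VCC, depending on generation and enabled features.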
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}
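
// Maximum number of SGPRs this function may use, derived from the
// waves-per-EU limits and refined by the "amdgpu-num-sgpr" attribute.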
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
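
// Maximum number of VGPRs this function may use, derived from the
// waves-per-EU limits and refined by the "amdgpu-num-vgpr" attribute.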
unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}
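
// DAG mutation that links consecutive memory operations of the same kind
// (VMEM/FLAT/SMRD/DS) with artificial edges so the scheduler keeps them
// adjacent.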
namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1) && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}
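
// Recover the common AMDGPUSubtarget base from the concrete GCN or R600
// subtarget of a function.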
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}