1 //===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file implements the AArch64 specific subclass of TargetSubtarget.
11 //===----------------------------------------------------------------------===//
13 #include "AArch64Subtarget.h"
16 #include "AArch64InstrInfo.h"
17 #include "AArch64PBQPRegAlloc.h"
18 #include "AArch64TargetMachine.h"
19 #include "GISel/AArch64CallLowering.h"
20 #include "GISel/AArch64LegalizerInfo.h"
21 #include "GISel/AArch64RegisterBankInfo.h"
22 #include "MCTargetDesc/AArch64AddressingModes.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/IR/GlobalValue.h"
26 #include "llvm/Support/AArch64TargetParser.h"
27 #include "llvm/Support/TargetParser.h"
31 #define DEBUG_TYPE "aarch64-subtarget"
33 #define GET_SUBTARGETINFO_CTOR
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #include "AArch64GenSubtargetInfo.inc"
// Backend command-line flags controlling AArch64-specific codegen behavior.
// NOTE(review): the `static cl::opt<bool>` declarator lines for the first
// three flags (original lines 37/42/46) are not visible in this excerpt.
// Gate for the AArch64 early if-conversion pass; on by default.
38 EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
39 "converter pass"), cl::init(true), cl::Hidden);
41 // If OS supports TBI, use this flag to enable it.
43 UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
44 "an address is ignored"), cl::init(false), cl::Hidden);
// When set, calls to nonlazybind functions load the callee address from the
// GOT instead of calling directly (see classifyGlobalFunctionReference).
47 UseNonLazyBind("aarch64-enable-nonlazybind",
48 cl::desc("Call nonlazybind functions via direct GOT load"),
49 cl::init(false), cl::Hidden);
// Returned verbatim by AArch64Subtarget::useAA(); on by default.
51 static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
52 cl::desc("Enable the use of AA during codegen."));
// Resolve the CPU / tune-CPU / feature-string triple into subtarget features
// and then derive per-CPU tuning properties. Called from the constructor's
// member-initializer list (before the constructor body runs), so it must not
// rely on members initialized later.
54 AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
55 StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
56 // Determine default and user-specified characteristics
// Fall back to the generic CPU model when none was requested.
58 if (CPUString.empty())
59 CPUString = "generic";
// With no explicit tune CPU, tune for the target CPU itself.
61 if (TuneCPUString.empty())
62 TuneCPUString = CPUString;
// Tablegen-generated parser; sets the subtarget feature bits.
64 ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
65 initializeProperties();
// NOTE(review): the `return *this;` and closing brace (original lines
// 66-68) fall outside this excerpt.
// Set per-CPU tuning knobs (alignment preferences, interleave factors,
// software-prefetch parameters, ...) based on the detected processor family.
// NOTE(review): this excerpt is sampled — every `case`/`break` label of the
// switch below (and several assignment lines) is missing, so which CPU each
// group of assignments belongs to cannot be determined from this view.
// The *LogAlignment fields are log2 values (e.g. 4 => 16-byte alignment).
70 void AArch64Subtarget::initializeProperties() {
71 // Initialize CPU specific properties. We should add a tablegen feature for
72 // this in the future so we can specify it together with the subtarget
74 switch (ARMProcFamily) {
84 PrefFunctionLogAlignment = 4;
87 MaxInterleaveFactor = 4;
88 PrefFunctionLogAlignment = 4;
91 PrefFunctionLogAlignment = 3;
103 PrefFunctionLogAlignment = 4;
108 PrefFunctionLogAlignment = 4;
113 PrefFunctionLogAlignment = 3;
114 PrefLoopLogAlignment = 2;
115 MaxInterleaveFactor = 4;
// PrefetchDistance/MinPrefetchStride/MaxPrefetchIterationsAhead tune the
// LoopDataPrefetch pass's software prefetch insertion for this core.
116 PrefetchDistance = 128;
117 MinPrefetchStride = 1024;
118 MaxPrefetchIterationsAhead = 4;
128 PrefetchDistance = 280;
129 MinPrefetchStride = 2048;
130 MaxPrefetchIterationsAhead = 3;
133 MaxInterleaveFactor = 4;
// Cap jump-table size for this core; larger switches lower to compares.
134 MaxJumpTableSize = 20;
135 PrefFunctionLogAlignment = 5;
136 PrefLoopLogAlignment = 4;
139 MaxInterleaveFactor = 4;
140 // FIXME: remove this to enable 64-bit SLP if performance looks good.
141 MinVectorRegisterBitWidth = 128;
143 PrefetchDistance = 820;
144 MinPrefetchStride = 2048;
145 MaxPrefetchIterationsAhead = 8;
148 MaxInterleaveFactor = 4;
// Cost-model override: vector element insert/extract is cheap on this core.
149 VectorInsertExtractBaseCost = 2;
151 PrefetchDistance = 740;
152 MinPrefetchStride = 1024;
153 MaxPrefetchIterationsAhead = 11;
154 // FIXME: remove this to enable 64-bit SLP if performance looks good.
155 MinVectorRegisterBitWidth = 128;
158 PrefFunctionLogAlignment = 3;
161 PrefFunctionLogAlignment = 4;
162 PrefLoopLogAlignment = 5;
// Pad loop headers for alignment only up to this many bytes.
163 MaxBytesForLoopAlignment = 16;
166 PrefFunctionLogAlignment = 4;
167 PrefLoopLogAlignment = 5;
168 MaxBytesForLoopAlignment = 16;
172 PrefFunctionLogAlignment = 4;
173 PrefLoopLogAlignment = 5;
174 MaxBytesForLoopAlignment = 16;
178 PrefFunctionLogAlignment = 4;
180 MaxInterleaveFactor = 4;
183 MaxInterleaveFactor = 4;
184 // FIXME: remove this to enable 64-bit SLP if performance looks good.
185 MinVectorRegisterBitWidth = 128;
189 PrefFunctionLogAlignment = 3;
190 PrefLoopLogAlignment = 2;
191 MaxInterleaveFactor = 4;
192 PrefetchDistance = 128;
193 MinPrefetchStride = 1024;
194 MaxPrefetchIterationsAhead = 4;
195 // FIXME: remove this to enable 64-bit SLP if performance looks good.
196 MinVectorRegisterBitWidth = 128;
203 PrefFunctionLogAlignment = 3;
204 PrefLoopLogAlignment = 2;
205 // FIXME: remove this to enable 64-bit SLP if performance looks good.
206 MinVectorRegisterBitWidth = 128;
210 PrefFunctionLogAlignment = 4;
211 PrefLoopLogAlignment = 2;
215 PrefFunctionLogAlignment = 4;
216 PrefLoopLogAlignment = 2;
217 MaxInterleaveFactor = 4;
218 PrefetchDistance = 128;
219 MinPrefetchStride = 1024;
220 MaxPrefetchIterationsAhead = 4;
221 // FIXME: remove this to enable 64-bit SLP if performance looks good.
222 MinVectorRegisterBitWidth = 128;
226 PrefFunctionLogAlignment = 6;
227 PrefLoopLogAlignment = 6;
228 MaxInterleaveFactor = 4;
// NOTE(review): the switch's default case and the function's closing braces
// (original lines ≈229-231) fall outside this excerpt.
// Construct the subtarget: parse features (via
// initializeSubtargetDependencies, invoked inside the init list before
// InstrInfo is built) and set up the GlobalISel pipeline objects.
// MinSVEVectorSizeInBitsOverride/MaxSVEVectorSizeInBitsOverride pin the
// SVE vector length range when the user constrains it (0 = unconstrained —
// TODO confirm against callers; the default is not visible here).
233 AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
234 const std::string &TuneCPU,
235 const std::string &FS,
236 const TargetMachine &TM, bool LittleEndian,
237 unsigned MinSVEVectorSizeInBitsOverride,
238 unsigned MaxSVEVectorSizeInBitsOverride)
239 : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
// Bit vectors sized to the full GPR64 register class: one bit per register.
240 ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
241 CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
242 IsLittle(LittleEndian),
243 MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
244 MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
// Features must be parsed before InstrInfo/TLInfo are constructed.
245 InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
// Reserve X18 on OSes whose ABI designates it as the platform register.
247 if (AArch64::isX18ReservedByDefault(TT))
248 ReserveXRegister.set(18);
// GlobalISel support objects, owned by this subtarget.
250 CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
251 InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
252 Legalizer.reset(new AArch64LegalizerInfo(*this));
254 auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());
256 // FIXME: At this point, we can't rely on Subtarget having RBI.
257 // It's awkward to mix passing RBI and the Subtarget; should we pass
259 InstSelector.reset(createAArch64InstructionSelector(
260 *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));
// RegBankInfo takes ownership of RBI (created raw above so the instruction
// selector could be constructed with it first).
262 RegBankInfo.reset(RBI);
// NOTE(review): the constructor's closing brace (original line ≈263) falls
// outside this excerpt; additional init-list entries between lines 245 and
// 247 may also be missing.
// GlobalISel hook: non-owning pointer to the call-lowering implementation
// created in the constructor.
265 const CallLowering *AArch64Subtarget::getCallLowering() const {
266 return CallLoweringInfo.get();
// GlobalISel hook: non-owning pointer to the inline-asm lowering helper.
269 const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
270 return InlineAsmLoweringInfo.get();
// GlobalISel hook: non-owning pointer to the instruction selector.
273 InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
274 return InstSelector.get();
// GlobalISel hook: non-owning pointer to the legalizer rules.
277 const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
278 return Legalizer.get();
// GlobalISel hook: non-owning pointer to the register-bank information.
281 const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
282 return RegBankInfo.get();
285 /// Find the target operand flags that describe how a global value should be
286 /// referenced for the current subtarget.
// NOTE(review): the return-type line (original line 287, presumably
// `unsigned`) is not visible in this excerpt.
288 AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
289 const TargetMachine &TM) const {
290 // MachO large model always goes via a GOT, simply to get a single 8-byte
291 // absolute relocation on all global addresses.
292 if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
293 return AArch64II::MO_GOT;
// Non-dso-local globals are reached indirectly: dllimport and COFF stubs on
// Windows, otherwise a plain GOT access.
295 if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
296 if (GV->hasDLLImportStorageClass())
297 return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
298 if (getTargetTriple().isOSWindows())
299 return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
300 return AArch64II::MO_GOT;
// NOTE(review): this if-block's closing brace (original line ≈301) is not
// visible in this excerpt.
303 // The small code model's direct accesses use ADRP, which cannot
304 // necessarily produce the value 0 (if the code is above 4GB).
305 // Same for the tiny code model, where we have a pc relative LDR.
// Extern-weak symbols may resolve to null, so force a GOT load there.
306 if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
307 GV->hasExternalWeakLinkage())
308 return AArch64II::MO_GOT;
310 // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
311 // that their nominal addresses are tagged and outside of the code model. In
312 // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
313 // tag if necessary based on MO_TAGGED.
// Functions are never tagged, hence the FunctionType exclusion.
314 if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
315 return AArch64II::MO_NC | AArch64II::MO_TAGGED;
// Default: direct, flag-free reference.
317 return AArch64II::MO_NO_FLAG;
// Like ClassifyGlobalReference, but for direct calls/references to function
// symbols; returns AArch64II::MO_* operand flags.
320 unsigned AArch64Subtarget::classifyGlobalFunctionReference(
321 const GlobalValue *GV, const TargetMachine &TM) const {
322 // MachO large model always goes via a GOT, because we don't have the
323 // relocations available to do anything else..
324 if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
325 !GV->hasInternalLinkage())
326 return AArch64II::MO_GOT;
328 // NonLazyBind goes via GOT unless we know it's available locally.
// Gated by the -aarch64-enable-nonlazybind flag defined at the top of file.
329 auto *F = dyn_cast<Function>(GV);
330 if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
331 !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
332 return AArch64II::MO_GOT;
334 // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
335 if (getTargetTriple().isOSWindows())
336 return ClassifyGlobalReference(GV, TM);
338 return AArch64II::MO_NO_FLAG;
// MachineScheduler hook: enable bi-directional scheduling and make the
// latency heuristic controllable for AArch64.
341 void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
342 unsigned NumRegionInstrs) const {
343 // LNT run (at least on Cyclone) showed reasonably significant gains for
344 // bi-directional scheduling. 253.perlbmk.
345 Policy.OnlyTopDown = false;
346 Policy.OnlyBottomUp = false;
347 // Enabling or Disabling the latency heuristic is a close call: It seems to
348 // help nearly no benchmark on out-of-order architectures, on the other hand
349 // it regresses register pressure on a few benchmarking.
// DisableLatencySchedHeuristic is declared elsewhere (not visible in this
// excerpt) — presumably another cl::opt or subtarget field; verify.
350 Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
// Early if-conversion is gated solely by the -aarch64-early-ifcvt flag
// (default true).
353 bool AArch64Subtarget::enableEarlyIfConversion() const {
354 return EnableEarlyIfConvert;
// Whether codegen may assume the OS enables TBI (top-byte-ignore) so the top
// byte of addresses can be disregarded. Requires the -aarch64-use-tbi flag;
// for iOS, TBI is assumed available from iOS 8 onward.
357 bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
358 if (!UseAddressTopByteIgnored)
// NOTE(review): the `return false;` body of this if (original line ≈359)
// and the function's trailing return/brace (lines ≈363-367) are not visible
// in this excerpt.
361 if (TargetTriple.isiOS()) {
362 return TargetTriple.getiOSVersion() >= VersionTuple(8);
// PBQP register-allocator hook: on cores that want balanced FP ops (see
// balanceFPOps()), add the Cortex-A57 FP chaining constraint; otherwise none.
368 std::unique_ptr<PBQPRAConstraint>
369 AArch64Subtarget::getCustomPBQPConstraints() const {
370 return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
// Hook invoked after a .mir file is parsed; backfills MaxCallFrameSize when
// the MIR did not specify it.
373 void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
374 // We usually compute max call frame size after ISel. Do the computation now
375 // if the .mir file didn't specify it. Note that this will probably give you
376 // bogus values after PEI has eliminated the callframe setup/destroy pseudo
377 // instructions, specify explicitly if you need it to be correct.
378 MachineFrameInfo &MFI = MF.getFrameInfo();
379 if (!MFI.isMaxCallFrameSizeComputed())
380 MFI.computeMaxCallFrameSize(MF);
// Forwarding of the -aarch64-use-aa flag (default true): lets codegen consult
// alias analysis.
383 bool AArch64Subtarget::useAA() const { return UseAA; }