1 //===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file implements the AArch64 specific subclass of TargetSubtarget.
11 //===----------------------------------------------------------------------===//
13 #include "AArch64Subtarget.h"
16 #include "AArch64InstrInfo.h"
17 #include "AArch64PBQPRegAlloc.h"
18 #include "AArch64TargetMachine.h"
19 #include "GISel/AArch64CallLowering.h"
20 #include "GISel/AArch64LegalizerInfo.h"
21 #include "GISel/AArch64RegisterBankInfo.h"
22 #include "MCTargetDesc/AArch64AddressingModes.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/IR/GlobalValue.h"
26 #include "llvm/Support/AArch64TargetParser.h"
27 #include "llvm/Support/TargetParser.h"
31 #define DEBUG_TYPE "aarch64-subtarget"
33 #define GET_SUBTARGETINFO_CTOR
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #include "AArch64GenSubtargetInfo.inc"
// Backend command-line flags controlling AArch64-specific codegen behavior.
// NOTE(review): the `static cl::opt<bool>` declarator lines for the first
// three flags (original lines 37/42/46) are not visible in this excerpt.
// Gate for the AArch64 early if-conversion pass; on by default.
38 EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
39 "converter pass"), cl::init(true), cl::Hidden);
41 // If OS supports TBI, use this flag to enable it.
43 UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
44 "an address is ignored"), cl::init(false), cl::Hidden);
// When set, calls to nonlazybind functions load the callee address from the
// GOT instead of calling directly (see classifyGlobalFunctionReference).
47 UseNonLazyBind("aarch64-enable-nonlazybind",
48 cl::desc("Call nonlazybind functions via direct GOT load"),
49 cl::init(false), cl::Hidden);
// Returned verbatim by AArch64Subtarget::useAA(); on by default.
51 static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
52 cl::desc("Enable the use of AA during codegen."));
// Resolve the CPU / tune-CPU / feature-string triple into subtarget features
// and then derive per-CPU tuning properties. Called from the constructor's
// member-initializer list (before the constructor body runs), so it must not
// rely on members initialized later.
54 AArch64Subtarget &AArch64Subtarget::initializeSubtargetDependencies(
55 StringRef FS, StringRef CPUString, StringRef TuneCPUString) {
56 // Determine default and user-specified characteristics
// Fall back to the generic CPU model when none was requested.
58 if (CPUString.empty())
59 CPUString = "generic";
// With no explicit tune CPU, tune for the target CPU itself.
61 if (TuneCPUString.empty())
62 TuneCPUString = CPUString;
// Tablegen-generated parser; sets the subtarget feature bits.
64 ParseSubtargetFeatures(CPUString, TuneCPUString, FS);
65 initializeProperties();
// NOTE(review): the `return *this;` and closing brace (original lines
// 66-68) fall outside this excerpt.
// Set per-CPU tuning knobs (alignment preferences, interleave factors,
// software-prefetch parameters, ...) based on the detected processor family.
// NOTE(review): this excerpt is sampled — every `case`/`break` label of the
// switch below (and several assignment lines) is missing, so which CPU each
// group of assignments belongs to cannot be determined from this view.
// The *LogAlignment fields are log2 values (e.g. 4 => 16-byte alignment).
70 void AArch64Subtarget::initializeProperties() {
71 // Initialize CPU specific properties. We should add a tablegen feature for
72 // this in the future so we can specify it together with the subtarget
74 switch (ARMProcFamily) {
84 PrefFunctionLogAlignment = 4;
87 MaxInterleaveFactor = 4;
88 PrefFunctionLogAlignment = 4;
91 PrefFunctionLogAlignment = 3;
103 PrefFunctionLogAlignment = 4;
108 PrefFunctionLogAlignment = 4;
113 PrefFunctionLogAlignment = 3;
114 PrefLoopLogAlignment = 2;
115 MaxInterleaveFactor = 4;
// PrefetchDistance/MinPrefetchStride/MaxPrefetchIterationsAhead tune the
// LoopDataPrefetch pass's software prefetch insertion for this core.
116 PrefetchDistance = 128;
117 MinPrefetchStride = 1024;
118 MaxPrefetchIterationsAhead = 4;
128 PrefetchDistance = 280;
129 MinPrefetchStride = 2048;
130 MaxPrefetchIterationsAhead = 3;
133 MaxInterleaveFactor = 4;
// Cap jump-table size for this core; larger switches lower to compares.
134 MaxJumpTableSize = 20;
135 PrefFunctionLogAlignment = 5;
136 PrefLoopLogAlignment = 4;
139 MaxInterleaveFactor = 4;
140 // FIXME: remove this to enable 64-bit SLP if performance looks good.
141 MinVectorRegisterBitWidth = 128;
143 PrefetchDistance = 820;
144 MinPrefetchStride = 2048;
145 MaxPrefetchIterationsAhead = 8;
148 MaxInterleaveFactor = 4;
// Cost-model override: vector element insert/extract is cheap on this core.
149 VectorInsertExtractBaseCost = 2;
151 PrefetchDistance = 740;
152 MinPrefetchStride = 1024;
153 MaxPrefetchIterationsAhead = 11;
154 // FIXME: remove this to enable 64-bit SLP if performance looks good.
155 MinVectorRegisterBitWidth = 128;
158 PrefFunctionLogAlignment = 3;
161 PrefFunctionLogAlignment = 4;
162 PrefLoopLogAlignment = 5;
// Pad loop headers for alignment only up to this many bytes.
163 MaxBytesForLoopAlignment = 16;
166 PrefFunctionLogAlignment = 4;
167 PrefLoopLogAlignment = 5;
168 MaxBytesForLoopAlignment = 16;
172 PrefFunctionLogAlignment = 4;
173 PrefLoopLogAlignment = 5;
174 MaxBytesForLoopAlignment = 16;
178 PrefFunctionLogAlignment = 4;
180 MaxInterleaveFactor = 4;
183 MaxInterleaveFactor = 4;
184 // FIXME: remove this to enable 64-bit SLP if performance looks good.
185 MinVectorRegisterBitWidth = 128;
189 PrefFunctionLogAlignment = 3;
190 PrefLoopLogAlignment = 2;
191 MaxInterleaveFactor = 4;
192 PrefetchDistance = 128;
193 MinPrefetchStride = 1024;
194 MaxPrefetchIterationsAhead = 4;
195 // FIXME: remove this to enable 64-bit SLP if performance looks good.
196 MinVectorRegisterBitWidth = 128;
203 PrefFunctionLogAlignment = 3;
204 PrefLoopLogAlignment = 2;
205 // FIXME: remove this to enable 64-bit SLP if performance looks good.
206 MinVectorRegisterBitWidth = 128;
210 PrefFunctionLogAlignment = 4;
211 PrefLoopLogAlignment = 2;
215 PrefFunctionLogAlignment = 4;
216 PrefLoopLogAlignment = 2;
217 MaxInterleaveFactor = 4;
218 PrefetchDistance = 128;
219 MinPrefetchStride = 1024;
220 MaxPrefetchIterationsAhead = 4;
221 // FIXME: remove this to enable 64-bit SLP if performance looks good.
222 MinVectorRegisterBitWidth = 128;
226 PrefFunctionLogAlignment = 6;
227 PrefLoopLogAlignment = 6;
228 MaxInterleaveFactor = 4;
// NOTE(review): the switch's default case and the function's closing braces
// (original lines ≈229-231) fall outside this excerpt.
// Construct the subtarget: parse features (via
// initializeSubtargetDependencies, invoked inside the init list before
// InstrInfo is built) and set up the GlobalISel pipeline objects.
// MinSVEVectorSizeInBitsOverride/MaxSVEVectorSizeInBitsOverride pin the
// SVE vector length range when the user constrains it (0 = unconstrained —
// TODO confirm against callers; the default is not visible here).
233 AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
234 const std::string &TuneCPU,
235 const std::string &FS,
236 const TargetMachine &TM, bool LittleEndian,
237 unsigned MinSVEVectorSizeInBitsOverride,
238 unsigned MaxSVEVectorSizeInBitsOverride)
239 : AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
// Bit vectors sized to the full GPR64 register class: one bit per register.
240 ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
241 CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
242 IsLittle(LittleEndian),
243 MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
244 MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
// Features must be parsed before InstrInfo/TLInfo are constructed.
245 InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
// Reserve X18 on OSes whose ABI designates it as the platform register.
247 if (AArch64::isX18ReservedByDefault(TT))
248 ReserveXRegister.set(18);
// GlobalISel support objects, owned by this subtarget.
250 CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
251 InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
252 Legalizer.reset(new AArch64LegalizerInfo(*this));
254 auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());
256 // FIXME: At this point, we can't rely on Subtarget having RBI.
257 // It's awkward to mix passing RBI and the Subtarget; should we pass
259 InstSelector.reset(createAArch64InstructionSelector(
260 *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));
// RegBankInfo takes ownership of RBI (created raw above so the instruction
// selector could be constructed with it first).
262 RegBankInfo.reset(RBI);
// NOTE(review): the constructor's closing brace (original line ≈263) falls
// outside this excerpt; additional init-list entries between lines 245 and
// 247 may also be missing.
// GlobalISel hook: non-owning pointer to the call-lowering implementation
// created in the constructor.
265 const CallLowering *AArch64Subtarget::getCallLowering() const {
266 return CallLoweringInfo.get();
// GlobalISel hook: non-owning pointer to the inline-asm lowering helper.
269 const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
270 return InlineAsmLoweringInfo.get();
// GlobalISel hook: non-owning pointer to the instruction selector.
273 InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
274 return InstSelector.get();
// GlobalISel hook: non-owning pointer to the legalizer rules.
277 const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
278 return Legalizer.get();
// GlobalISel hook: non-owning pointer to the register-bank information.
281 const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
282 return RegBankInfo.get();
285 /// Find the target operand flags that describe how a global value should be
286 /// referenced for the current subtarget.
// NOTE(review): the return-type line (original line 287, presumably
// `unsigned`) is not visible in this excerpt.
288 AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
289 const TargetMachine &TM) const {
290 // MachO large model always goes via a GOT, simply to get a single 8-byte
291 // absolute relocation on all global addresses.
292 if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
293 return AArch64II::MO_GOT;
// Non-dso-local globals are reached indirectly: dllimport and COFF stubs on
// Windows, otherwise a plain GOT access.
295 if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
296 if (GV->hasDLLImportStorageClass())
297 return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
298 if (getTargetTriple().isOSWindows())
299 return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
300 return AArch64II::MO_GOT;
// NOTE(review): this if-block's closing brace (original line ≈301) is not
// visible in this excerpt.
303 // The small code model's direct accesses use ADRP, which cannot
304 // necessarily produce the value 0 (if the code is above 4GB).
305 // Same for the tiny code model, where we have a pc relative LDR.
// Extern-weak symbols may resolve to null, so force a GOT load there.
306 if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
307 GV->hasExternalWeakLinkage())
308 return AArch64II::MO_GOT;
310 // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
311 // that their nominal addresses are tagged and outside of the code model. In
312 // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
313 // tag if necessary based on MO_TAGGED.
// Functions are never tagged, hence the FunctionType exclusion.
314 if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
315 return AArch64II::MO_NC | AArch64II::MO_TAGGED;
// Default: direct, flag-free reference.
317 return AArch64II::MO_NO_FLAG;
// Like ClassifyGlobalReference, but for direct calls/references to function
// symbols; returns AArch64II::MO_* operand flags.
320 unsigned AArch64Subtarget::classifyGlobalFunctionReference(
321 const GlobalValue *GV, const TargetMachine &TM) const {
322 // MachO large model always goes via a GOT, because we don't have the
323 // relocations available to do anything else..
324 if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
325 !GV->hasInternalLinkage())
326 return AArch64II::MO_GOT;
328 // NonLazyBind goes via GOT unless we know it's available locally.
// Gated by the -aarch64-enable-nonlazybind flag defined at the top of file.
329 auto *F = dyn_cast<Function>(GV);
330 if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
331 !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
332 return AArch64II::MO_GOT;
334 // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
335 if (getTargetTriple().isOSWindows())
336 return ClassifyGlobalReference(GV, TM);
338 return AArch64II::MO_NO_FLAG;
// MachineScheduler hook: enable bi-directional scheduling and make the
// latency heuristic controllable for AArch64.
341 void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
342 unsigned NumRegionInstrs) const {
343 // LNT run (at least on Cyclone) showed reasonably significant gains for
344 // bi-directional scheduling. 253.perlbmk.
345 Policy.OnlyTopDown = false;
346 Policy.OnlyBottomUp = false;
347 // Enabling or Disabling the latency heuristic is a close call: It seems to
348 // help nearly no benchmark on out-of-order architectures, on the other hand
349 // it regresses register pressure on a few benchmarking.
// DisableLatencySchedHeuristic is declared elsewhere (not visible in this
// excerpt) — presumably another cl::opt or subtarget field; verify.
350 Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
// Early if-conversion is gated solely by the -aarch64-early-ifcvt flag
// (default true).
353 bool AArch64Subtarget::enableEarlyIfConversion() const {
354 return EnableEarlyIfConvert;
// Whether codegen may assume the OS enables TBI (top-byte-ignore) so the top
// byte of addresses can be disregarded. Requires the -aarch64-use-tbi flag;
// for iOS, TBI is assumed available from iOS 8 onward.
357 bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
358 if (!UseAddressTopByteIgnored)
// NOTE(review): the `return false;` body of this if (original line ≈359)
// and the function's trailing return/brace (lines ≈363-367) are not visible
// in this excerpt.
361 if (TargetTriple.isiOS()) {
362 return TargetTriple.getiOSVersion() >= VersionTuple(8);
// PBQP register-allocator hook: on cores that want balanced FP ops (see
// balanceFPOps()), add the Cortex-A57 FP chaining constraint; otherwise none.
368 std::unique_ptr<PBQPRAConstraint>
369 AArch64Subtarget::getCustomPBQPConstraints() const {
370 return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
// Hook invoked after a .mir file is parsed; backfills MaxCallFrameSize when
// the MIR did not specify it.
373 void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
374 // We usually compute max call frame size after ISel. Do the computation now
375 // if the .mir file didn't specify it. Note that this will probably give you
376 // bogus values after PEI has eliminated the callframe setup/destroy pseudo
377 // instructions, specify explicitly if you need it to be correct.
378 MachineFrameInfo &MFI = MF.getFrameInfo();
379 if (!MFI.isMaxCallFrameSizeComputed())
380 MFI.computeMaxCallFrameSize(MF);
// Forwarding of the -aarch64-use-aa flag (default true): lets codegen consult
// alias analysis.
383 bool AArch64Subtarget::useAA() const { return UseAA; }