contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp

   1 //===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 /// \file This pass adds target attributes to functions which use intrinsics
  10 /// which will impact calling convention lowering.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "AMDGPU.h"
  15 #include "AMDGPUSubtarget.h"
  16 #include "Utils/AMDGPUBaseInfo.h"
  17 #include "llvm/ADT/SmallPtrSet.h"
  18 #include "llvm/ADT/SmallVector.h"
  19 #include "llvm/ADT/StringRef.h"
  20 #include "llvm/ADT/Triple.h"
  21 #include "llvm/Analysis/CallGraph.h"
  22 #include "llvm/Analysis/CallGraphSCCPass.h"
  23 #include "llvm/CodeGen/TargetPassConfig.h"
  24 #include "llvm/IR/Constant.h"
  25 #include "llvm/IR/Constants.h"
  26 #include "llvm/IR/Function.h"
  27 #include "llvm/IR/Instruction.h"
  28 #include "llvm/IR/Instructions.h"
  29 #include "llvm/IR/Intrinsics.h"
  30 #include "llvm/IR/Module.h"
  31 #include "llvm/IR/Type.h"
  32 #include "llvm/IR/Use.h"
  33 #include "llvm/Pass.h"
  34 #include "llvm/Support/Casting.h"
  35 #include "llvm/Support/ErrorHandling.h"
  36 #include "llvm/Target/TargetMachine.h"
  37
  38 #define DEBUG_TYPE "amdgpu-annotate-kernel-features"
  39
  40 using namespace llvm;
  41
  42 namespace {
  43
  44 class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
  45 private:
  46   const TargetMachine *TM = nullptr;
  47   SmallVector<CallGraphNode*, 8> NodeList;
  48
  49   bool addFeatureAttributes(Function &F);
  50   bool processUniformWorkGroupAttribute();
  51   bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
  52
  53 public:
  54   static char ID;
  55
  56   AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
  57
  58   bool doInitialization(CallGraph &CG) override;
  59   bool runOnSCC(CallGraphSCC &SCC) override;
  60
  61   StringRef getPassName() const override {
  62     return "AMDGPU Annotate Kernel Features";
  63   }
  64
  65   void getAnalysisUsage(AnalysisUsage &AU) const override {
  66     AU.setPreservesAll();
  67     CallGraphSCCPass::getAnalysisUsage(AU);
  68   }
  69
  70   static bool visitConstantExpr(const ConstantExpr *CE);
  71   static bool visitConstantExprsRecursively(
  72     const Constant *EntryC,
  73     SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
  74     bool HasApertureRegs);
  75 };
  76
  77 } // end anonymous namespace
  78
  79 char AMDGPUAnnotateKernelFeatures::ID = 0;
  80
  81 char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
  82
  83 INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
  84                 "Add AMDGPU function attributes", false, false)
  85
  86
  87 // The queue ptr is only needed when casting to flat, not from it.
  88 static bool castRequiresQueuePtr(unsigned SrcAS) {
  89   return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
  90 }
  91
  92 static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  93   return castRequiresQueuePtr(ASC->getSrcAddressSpace());
  94 }
  95
  96 static bool isDSAddress(const Constant *C) {
  97   const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  98   if (!GV)
  99     return false;
 100   unsigned AS = GV->getAddressSpace();
 101   return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
 102 }
 103
 104 bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
 105   if (CE->getOpcode() == Instruction::AddrSpaceCast) {
 106     unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
 107     return castRequiresQueuePtr(SrcAS);
 108   }
 109
 110   return false;
 111 }
 112
 113 bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
 114   const Constant *EntryC,
 115   SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
 116   bool IsFunc, bool HasApertureRegs) {
 117
 118   if (!ConstantExprVisited.insert(EntryC).second)
 119     return false;
 120
 121   SmallVector<const Constant *, 16> Stack;
 122   Stack.push_back(EntryC);
 123
 124   while (!Stack.empty()) {
 125     const Constant *C = Stack.pop_back_val();
 126
 127     // We need to trap on DS globals in non-entry functions.
 128     if (IsFunc && isDSAddress(C))
 129       return true;
 130
 131     // Check this constant expression.
 132     if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
 133       if (!HasApertureRegs && visitConstantExpr(CE))
 134         return true;
 135     }
 136
 137     // Visit all sub-expressions.
 138     for (const Use &U : C->operands()) {
 139       const auto *OpC = dyn_cast<Constant>(U);
 140       if (!OpC)
 141         continue;
 142
 143       if (!ConstantExprVisited.insert(OpC).second)
 144         continue;
 145
 146       Stack.push_back(OpC);
 147     }
 148   }
 149
 150   return false;
 151 }
 152
 153 // We do not need to note the x workitem or workgroup id because they are always
 154 // initialized.
 155 //
 156 // TODO: We should not add the attributes if the known compile time workgroup
 157 // size is 1 for y/z.
 158 static StringRef intrinsicToAttrName(Intrinsic::ID ID,
 159                                      bool &NonKernelOnly,
 160                                      bool &IsQueuePtr) {
 161   switch (ID) {
 162   case Intrinsic::amdgcn_workitem_id_x:
 163     NonKernelOnly = true;
 164     return "amdgpu-work-item-id-x";
 165   case Intrinsic::amdgcn_workgroup_id_x:
 166     NonKernelOnly = true;
 167     return "amdgpu-work-group-id-x";
 168   case Intrinsic::amdgcn_workitem_id_y:
 169   case Intrinsic::r600_read_tidig_y:
 170     return "amdgpu-work-item-id-y";
 171   case Intrinsic::amdgcn_workitem_id_z:
 172   case Intrinsic::r600_read_tidig_z:
 173     return "amdgpu-work-item-id-z";
 174   case Intrinsic::amdgcn_workgroup_id_y:
 175   case Intrinsic::r600_read_tgid_y:
 176     return "amdgpu-work-group-id-y";
 177   case Intrinsic::amdgcn_workgroup_id_z:
 178   case Intrinsic::r600_read_tgid_z:
 179     return "amdgpu-work-group-id-z";
 180   case Intrinsic::amdgcn_dispatch_ptr:
 181     return "amdgpu-dispatch-ptr";
 182   case Intrinsic::amdgcn_dispatch_id:
 183     return "amdgpu-dispatch-id";
 184   case Intrinsic::amdgcn_kernarg_segment_ptr:
 185     return "amdgpu-kernarg-segment-ptr";
 186   case Intrinsic::amdgcn_implicitarg_ptr:
 187     return "amdgpu-implicitarg-ptr";
 188   case Intrinsic::amdgcn_queue_ptr:
 189   case Intrinsic::amdgcn_is_shared:
 190   case Intrinsic::amdgcn_is_private:
 191     // TODO: Does not require queue ptr on gfx9+
 192   case Intrinsic::trap:
 193   case Intrinsic::debugtrap:
 194     IsQueuePtr = true;
 195     return "amdgpu-queue-ptr";
 196   default:
 197     return "";
 198   }
 199 }
 200
 201 static bool handleAttr(Function &Parent, const Function &Callee,
 202                        StringRef Name) {
 203   if (Callee.hasFnAttribute(Name)) {
 204     Parent.addFnAttr(Name);
 205     return true;
 206   }
 207   return false;
 208 }
 209
 210 static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
 211                                    bool &NeedQueuePtr) {
 212   // X ids unnecessarily propagated to kernels.
 213   static constexpr StringLiteral AttrNames[] = {
 214       "amdgpu-work-item-id-x",      "amdgpu-work-item-id-y",
 215       "amdgpu-work-item-id-z",      "amdgpu-work-group-id-x",
 216       "amdgpu-work-group-id-y",     "amdgpu-work-group-id-z",
 217       "amdgpu-dispatch-ptr",        "amdgpu-dispatch-id",
 218       "amdgpu-implicitarg-ptr"};
 219
 220   if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
 221     NeedQueuePtr = true;
 222
 223   for (StringRef AttrName : AttrNames)
 224     handleAttr(Parent, Callee, AttrName);
 225 }
 226
 227 bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
 228   bool Changed = false;
 229
 230   for (auto *Node : reverse(NodeList)) {
 231     Function *Caller = Node->getFunction();
 232
 233     for (auto I : *Node) {
 234       Function *Callee = std::get<1>(I)->getFunction();
 235       if (Callee)
 236         Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
 237     }
 238   }
 239
 240   return Changed;
 241 }
 242
 243 bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
 244        Function &Caller, Function &Callee) {
 245
 246   // Check for externally defined function
 247   if (!Callee.hasExactDefinition()) {
 248     Callee.addFnAttr("uniform-work-group-size", "false");
 249     if (!Caller.hasFnAttribute("uniform-work-group-size"))
 250       Caller.addFnAttr("uniform-work-group-size", "false");
 251
 252     return true;
 253   }
 254   // Check if the Caller has the attribute
 255   if (Caller.hasFnAttribute("uniform-work-group-size")) {
 256     // Check if the value of the attribute is true
 257     if (Caller.getFnAttribute("uniform-work-group-size")
 258         .getValueAsString().equals("true")) {
 259       // Propagate the attribute to the Callee, if it does not have it
 260       if (!Callee.hasFnAttribute("uniform-work-group-size")) {
 261         Callee.addFnAttr("uniform-work-group-size", "true");
 262         return true;
 263       }
 264     } else {
 265       Callee.addFnAttr("uniform-work-group-size", "false");
 266       return true;
 267     }
 268   } else {
 269     // If the attribute is absent, set it as false
 270     Caller.addFnAttr("uniform-work-group-size", "false");
 271     Callee.addFnAttr("uniform-work-group-size", "false");
 272     return true;
 273   }
 274   return false;
 275 }
 276
 277 bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
 278   const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
 279   bool HasApertureRegs = ST.hasApertureRegs();
 280   SmallPtrSet<const Constant *, 8> ConstantExprVisited;
 281
 282   bool HaveStackObjects = false;
 283   bool Changed = false;
 284   bool NeedQueuePtr = false;
 285   bool HaveCall = false;
 286   bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
 287
 288   for (BasicBlock &BB : F) {
 289     for (Instruction &I : BB) {
 290       if (isa<AllocaInst>(I)) {
 291         HaveStackObjects = true;
 292         continue;
 293       }
 294
 295       if (auto *CB = dyn_cast<CallBase>(&I)) {
 296         const Function *Callee =
 297             dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
 298
 299         // TODO: Do something with indirect calls.
 300         if (!Callee) {
 301           if (!CB->isInlineAsm())
 302             HaveCall = true;
 303           continue;
 304         }
 305
 306         Intrinsic::ID IID = Callee->getIntrinsicID();
 307         if (IID == Intrinsic::not_intrinsic) {
 308           HaveCall = true;
 309           copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
 310           Changed = true;
 311         } else {
 312           bool NonKernelOnly = false;
 313
 314           if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
 315             F.addFnAttr("amdgpu-kernarg-segment-ptr");
 316           } else {
 317             StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
 318                                                      NeedQueuePtr);
 319             if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
 320               F.addFnAttr(AttrName);
 321               Changed = true;
 322             }
 323           }
 324         }
 325       }
 326
 327       if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
 328         continue;
 329
 330       if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
 331         if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
 332           NeedQueuePtr = true;
 333           continue;
 334         }
 335       }
 336
 337       for (const Use &U : I.operands()) {
 338         const auto *OpC = dyn_cast<Constant>(U);
 339         if (!OpC)
 340           continue;
 341
 342         if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
 343                                           HasApertureRegs)) {
 344           NeedQueuePtr = true;
 345           break;
 346         }
 347       }
 348     }
 349   }
 350
 351   if (NeedQueuePtr) {
 352     F.addFnAttr("amdgpu-queue-ptr");
 353     Changed = true;
 354   }
 355
 356   // TODO: We could refine this to captured pointers that could possibly be
 357   // accessed by flat instructions. For now this is mostly a poor way of
 358   // estimating whether there are calls before argument lowering.
 359   if (!IsFunc && HaveCall) {
 360     F.addFnAttr("amdgpu-calls");
 361     Changed = true;
 362   }
 363
 364   if (HaveStackObjects) {
 365     F.addFnAttr("amdgpu-stack-objects");
 366     Changed = true;
 367   }
 368
 369   return Changed;
 370 }
 371
 372 bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
 373   bool Changed = false;
 374
 375   for (CallGraphNode *I : SCC) {
 376     // Build a list of CallGraphNodes from most number of uses to least
 377     if (I->getNumReferences())
 378       NodeList.push_back(I);
 379     else {
 380       processUniformWorkGroupAttribute();
 381       NodeList.clear();
 382     }
 383
 384     Function *F = I->getFunction();
 385     // Add feature attributes
 386     if (!F || F->isDeclaration())
 387       continue;
 388     Changed |= addFeatureAttributes(*F);
 389   }
 390
 391   return Changed;
 392 }
 393
 394 bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
 395   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
 396   if (!TPC)
 397     report_fatal_error("TargetMachine is required");
 398
 399   TM = &TPC->getTM<TargetMachine>();
 400   return false;
 401 }
 402
 403 Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
 404   return new AMDGPUAnnotateKernelFeatures();
 405 }