contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp

   1 //===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file This pass adds target attributes to functions which use intrinsics
  11 /// which will impact calling convention lowering.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "AMDGPU.h"
  16 #include "AMDGPUSubtarget.h"
  17 #include "Utils/AMDGPUBaseInfo.h"
  18 #include "llvm/ADT/SmallPtrSet.h"
  19 #include "llvm/ADT/SmallVector.h"
  20 #include "llvm/ADT/StringRef.h"
  21 #include "llvm/ADT/Triple.h"
  22 #include "llvm/Analysis/CallGraph.h"
  23 #include "llvm/Analysis/CallGraphSCCPass.h"
  24 #include "llvm/CodeGen/TargetPassConfig.h"
  25 #include "llvm/IR/CallSite.h"
  26 #include "llvm/IR/Constant.h"
  27 #include "llvm/IR/Constants.h"
  28 #include "llvm/IR/Function.h"
  29 #include "llvm/IR/Instruction.h"
  30 #include "llvm/IR/Instructions.h"
  31 #include "llvm/IR/Intrinsics.h"
  32 #include "llvm/IR/Module.h"
  33 #include "llvm/IR/Type.h"
  34 #include "llvm/IR/Use.h"
  35 #include "llvm/Pass.h"
  36 #include "llvm/Support/Casting.h"
  37 #include "llvm/Support/ErrorHandling.h"
  38 #include "llvm/Target/TargetMachine.h"
  39
  40 #define DEBUG_TYPE "amdgpu-annotate-kernel-features"
  41
  42 using namespace llvm;
  43
  44 namespace {
  45
  46 class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
  47 private:
  48   const TargetMachine *TM = nullptr;
  49   AMDGPUAS AS;
  50
  51   bool addFeatureAttributes(Function &F);
  52
  53 public:
  54   static char ID;
  55
  56   AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
  57
  58   bool doInitialization(CallGraph &CG) override;
  59   bool runOnSCC(CallGraphSCC &SCC) override;
  60
  61   StringRef getPassName() const override {
  62     return "AMDGPU Annotate Kernel Features";
  63   }
  64
  65   void getAnalysisUsage(AnalysisUsage &AU) const override {
  66     AU.setPreservesAll();
  67     CallGraphSCCPass::getAnalysisUsage(AU);
  68   }
  69
  70   static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS AS);
  71   static bool visitConstantExprsRecursively(
  72     const Constant *EntryC,
  73     SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
  74     AMDGPUAS AS);
  75 };
  76
  77 } // end anonymous namespace
  78
  79 char AMDGPUAnnotateKernelFeatures::ID = 0;
  80
  81 char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
  82
  83 INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
  84                 "Add AMDGPU function attributes", false, false)
  85
  86
  87 // The queue ptr is only needed when casting to flat, not from it.
  88 static bool castRequiresQueuePtr(unsigned SrcAS, const AMDGPUAS &AS) {
  89   return SrcAS == AS.LOCAL_ADDRESS || SrcAS == AS.PRIVATE_ADDRESS;
  90 }
  91
  92 static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC,
  93     const AMDGPUAS &AS) {
  94   return castRequiresQueuePtr(ASC->getSrcAddressSpace(), AS);
  95 }
  96
  97 bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE,
  98     AMDGPUAS AS) {
  99   if (CE->getOpcode() == Instruction::AddrSpaceCast) {
 100     unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
 101     return castRequiresQueuePtr(SrcAS, AS);
 102   }
 103
 104   return false;
 105 }
 106
 107 bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
 108   const Constant *EntryC,
 109   SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
 110   AMDGPUAS AS) {
 111
 112   if (!ConstantExprVisited.insert(EntryC).second)
 113     return false;
 114
 115   SmallVector<const Constant *, 16> Stack;
 116   Stack.push_back(EntryC);
 117
 118   while (!Stack.empty()) {
 119     const Constant *C = Stack.pop_back_val();
 120
 121     // Check this constant expression.
 122     if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
 123       if (visitConstantExpr(CE, AS))
 124         return true;
 125     }
 126
 127     // Visit all sub-expressions.
 128     for (const Use &U : C->operands()) {
 129       const auto *OpC = dyn_cast<Constant>(U);
 130       if (!OpC)
 131         continue;
 132
 133       if (!ConstantExprVisited.insert(OpC).second)
 134         continue;
 135
 136       Stack.push_back(OpC);
 137     }
 138   }
 139
 140   return false;
 141 }
 142
 143 // We do not need to note the x workitem or workgroup id because they are always
 144 // initialized.
 145 //
 146 // TODO: We should not add the attributes if the known compile time workgroup
 147 // size is 1 for y/z.
 148 static StringRef intrinsicToAttrName(Intrinsic::ID ID,
 149                                      bool &NonKernelOnly,
 150                                      bool &IsQueuePtr) {
 151   switch (ID) {
 152   case Intrinsic::amdgcn_workitem_id_x:
 153     NonKernelOnly = true;
 154     return "amdgpu-work-item-id-x";
 155   case Intrinsic::amdgcn_workgroup_id_x:
 156     NonKernelOnly = true;
 157     return "amdgpu-work-group-id-x";
 158   case Intrinsic::amdgcn_workitem_id_y:
 159   case Intrinsic::r600_read_tidig_y:
 160     return "amdgpu-work-item-id-y";
 161   case Intrinsic::amdgcn_workitem_id_z:
 162   case Intrinsic::r600_read_tidig_z:
 163     return "amdgpu-work-item-id-z";
 164   case Intrinsic::amdgcn_workgroup_id_y:
 165   case Intrinsic::r600_read_tgid_y:
 166     return "amdgpu-work-group-id-y";
 167   case Intrinsic::amdgcn_workgroup_id_z:
 168   case Intrinsic::r600_read_tgid_z:
 169     return "amdgpu-work-group-id-z";
 170   case Intrinsic::amdgcn_dispatch_ptr:
 171     return "amdgpu-dispatch-ptr";
 172   case Intrinsic::amdgcn_dispatch_id:
 173     return "amdgpu-dispatch-id";
 174   case Intrinsic::amdgcn_kernarg_segment_ptr:
 175     return "amdgpu-kernarg-segment-ptr";
 176   case Intrinsic::amdgcn_implicitarg_ptr:
 177     return "amdgpu-implicitarg-ptr";
 178   case Intrinsic::amdgcn_queue_ptr:
 179   case Intrinsic::trap:
 180   case Intrinsic::debugtrap:
 181     IsQueuePtr = true;
 182     return "amdgpu-queue-ptr";
 183   default:
 184     return "";
 185   }
 186 }
 187
 188 static bool handleAttr(Function &Parent, const Function &Callee,
 189                        StringRef Name) {
 190   if (Callee.hasFnAttribute(Name)) {
 191     Parent.addFnAttr(Name);
 192     return true;
 193   }
 194
 195   return false;
 196 }
 197
 198 static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
 199                                    bool &NeedQueuePtr) {
 200   // X ids unnecessarily propagated to kernels.
 201   static const StringRef AttrNames[] = {
 202     { "amdgpu-work-item-id-x" },
 203     { "amdgpu-work-item-id-y" },
 204     { "amdgpu-work-item-id-z" },
 205     { "amdgpu-work-group-id-x" },
 206     { "amdgpu-work-group-id-y" },
 207     { "amdgpu-work-group-id-z" },
 208     { "amdgpu-dispatch-ptr" },
 209     { "amdgpu-dispatch-id" },
 210     { "amdgpu-kernarg-segment-ptr" },
 211     { "amdgpu-implicitarg-ptr" }
 212   };
 213
 214   if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
 215     NeedQueuePtr = true;
 216
 217   for (StringRef AttrName : AttrNames)
 218     handleAttr(Parent, Callee, AttrName);
 219 }
 220
 221 bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
 222   const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
 223   bool HasFlat = ST.hasFlatAddressSpace();
 224   bool HasApertureRegs = ST.hasApertureRegs();
 225   SmallPtrSet<const Constant *, 8> ConstantExprVisited;
 226
 227   bool Changed = false;
 228   bool NeedQueuePtr = false;
 229   bool HaveCall = false;
 230   bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
 231
 232   for (BasicBlock &BB : F) {
 233     for (Instruction &I : BB) {
 234       CallSite CS(&I);
 235       if (CS) {
 236         Function *Callee = CS.getCalledFunction();
 237
 238         // TODO: Do something with indirect calls.
 239         if (!Callee) {
 240           if (!CS.isInlineAsm())
 241             HaveCall = true;
 242           continue;
 243         }
 244
 245         Intrinsic::ID IID = Callee->getIntrinsicID();
 246         if (IID == Intrinsic::not_intrinsic) {
 247           HaveCall = true;
 248           copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
 249           Changed = true;
 250         } else {
 251           bool NonKernelOnly = false;
 252           StringRef AttrName = intrinsicToAttrName(IID,
 253                                                    NonKernelOnly, NeedQueuePtr);
 254           if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
 255             F.addFnAttr(AttrName);
 256             Changed = true;
 257           }
 258         }
 259       }
 260
 261       if (NeedQueuePtr || HasApertureRegs)
 262         continue;
 263
 264       if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
 265         if (castRequiresQueuePtr(ASC, AS)) {
 266           NeedQueuePtr = true;
 267           continue;
 268         }
 269       }
 270
 271       for (const Use &U : I.operands()) {
 272         const auto *OpC = dyn_cast<Constant>(U);
 273         if (!OpC)
 274           continue;
 275
 276         if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) {
 277           NeedQueuePtr = true;
 278           break;
 279         }
 280       }
 281     }
 282   }
 283
 284   if (NeedQueuePtr) {
 285     F.addFnAttr("amdgpu-queue-ptr");
 286     Changed = true;
 287   }
 288
 289   // TODO: We could refine this to captured pointers that could possibly be
 290   // accessed by flat instructions. For now this is mostly a poor way of
 291   // estimating whether there are calls before argument lowering.
 292   if (HasFlat && !IsFunc && HaveCall) {
 293     F.addFnAttr("amdgpu-flat-scratch");
 294     Changed = true;
 295   }
 296
 297   return Changed;
 298 }
 299
 300 bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
 301   Module &M = SCC.getCallGraph().getModule();
 302   Triple TT(M.getTargetTriple());
 303
 304   bool Changed = false;
 305   for (CallGraphNode *I : SCC) {
 306     Function *F = I->getFunction();
 307     if (!F || F->isDeclaration())
 308       continue;
 309
 310     Changed |= addFeatureAttributes(*F);
 311   }
 312
 313   return Changed;
 314 }
 315
 316 bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
 317   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
 318   if (!TPC)
 319     report_fatal_error("TargetMachine is required");
 320
 321   AS = AMDGPU::getAMDGPUAS(CG.getModule());
 322   TM = &TPC->getTM<TargetMachine>();
 323   return false;
 324 }
 325
 326 Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
 327   return new AMDGPUAnnotateKernelFeatures();
 328 }