contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

   1 //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This pass eliminates allocas by either converting them into vectors or
  10 // by migrating them to local address space.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "AMDGPU.h"
  15 #include "AMDGPUSubtarget.h"
  16 #include "Utils/AMDGPUBaseInfo.h"
  17 #include "llvm/ADT/APInt.h"
  18 #include "llvm/ADT/None.h"
  19 #include "llvm/ADT/STLExtras.h"
  20 #include "llvm/ADT/StringRef.h"
  21 #include "llvm/ADT/Triple.h"
  22 #include "llvm/ADT/Twine.h"
  23 #include "llvm/Analysis/CaptureTracking.h"
  24 #include "llvm/Analysis/ValueTracking.h"
  25 #include "llvm/CodeGen/TargetPassConfig.h"
  26 #include "llvm/IR/Attributes.h"
  27 #include "llvm/IR/BasicBlock.h"
  28 #include "llvm/IR/Constant.h"
  29 #include "llvm/IR/Constants.h"
  30 #include "llvm/IR/DataLayout.h"
  31 #include "llvm/IR/DerivedTypes.h"
  32 #include "llvm/IR/Function.h"
  33 #include "llvm/IR/GlobalValue.h"
  34 #include "llvm/IR/GlobalVariable.h"
  35 #include "llvm/IR/IRBuilder.h"
  36 #include "llvm/IR/Instruction.h"
  37 #include "llvm/IR/Instructions.h"
  38 #include "llvm/IR/IntrinsicInst.h"
  39 #include "llvm/IR/Intrinsics.h"
  40 #include "llvm/IR/IntrinsicsAMDGPU.h"
  41 #include "llvm/IR/IntrinsicsR600.h"
  42 #include "llvm/IR/LLVMContext.h"
  43 #include "llvm/IR/Metadata.h"
  44 #include "llvm/IR/Module.h"
  45 #include "llvm/IR/Type.h"
  46 #include "llvm/IR/User.h"
  47 #include "llvm/IR/Value.h"
  48 #include "llvm/Pass.h"
  49 #include "llvm/Support/Casting.h"
  50 #include "llvm/Support/Debug.h"
  51 #include "llvm/Support/ErrorHandling.h"
  52 #include "llvm/Support/MathExtras.h"
  53 #include "llvm/Support/raw_ostream.h"
  54 #include "llvm/Target/TargetMachine.h"
  55 #include <algorithm>
  56 #include <cassert>
  57 #include <cstdint>
  58 #include <map>
  59 #include <tuple>
  60 #include <utility>
  61 #include <vector>
  62
  63 #define DEBUG_TYPE "amdgpu-promote-alloca"
  64
  65 using namespace llvm;
  66
  67 namespace {
  68
  69 static cl::opt<bool> DisablePromoteAllocaToVector(
  70   "disable-promote-alloca-to-vector",
  71   cl::desc("Disable promote alloca to vector"),
  72   cl::init(false));
  73
  74 static cl::opt<bool> DisablePromoteAllocaToLDS(
  75   "disable-promote-alloca-to-lds",
  76   cl::desc("Disable promote alloca to LDS"),
  77   cl::init(false));
  78
  79 // FIXME: This can create globals so should be a module pass.
  80 class AMDGPUPromoteAlloca : public FunctionPass {
  81 private:
  82   const TargetMachine *TM;
  83   Module *Mod = nullptr;
  84   const DataLayout *DL = nullptr;
  85
  86   // FIXME: This should be per-kernel.
  87   uint32_t LocalMemLimit = 0;
  88   uint32_t CurrentLocalMemUsage = 0;
  89
  90   bool IsAMDGCN = false;
  91   bool IsAMDHSA = false;
  92
  93   std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  94   Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
  95
  96   /// BaseAlloca is the alloca root the search started from.
  97   /// Val may be that alloca or a recursive user of it.
  98   bool collectUsesWithPtrTypes(Value *BaseAlloca,
  99                                Value *Val,
 100                                std::vector<Value*> &WorkList) const;
 101
 102   /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
 103   /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
 104   /// Returns true if both operands are derived from the same alloca. Val should
 105   /// be the same value as one of the input operands of UseInst.
 106   bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
 107                                        Instruction *UseInst,
 108                                        int OpIdx0, int OpIdx1) const;
 109
 110   /// Check whether we have enough local memory for promotion.
 111   bool hasSufficientLocalMem(const Function &F);
 112
 113 public:
 114   static char ID;
 115
 116   AMDGPUPromoteAlloca() : FunctionPass(ID) {}
 117
 118   bool doInitialization(Module &M) override;
 119   bool runOnFunction(Function &F) override;
 120
 121   StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
 122
 123   bool handleAlloca(AllocaInst &I, bool SufficientLDS);
 124
 125   void getAnalysisUsage(AnalysisUsage &AU) const override {
 126     AU.setPreservesCFG();
 127     FunctionPass::getAnalysisUsage(AU);
 128   }
 129 };
 130
 131 } // end anonymous namespace
 132
// Storage for the pass identifier that the constructor hands to FunctionPass.
char AMDGPUPromoteAlloca::ID = 0;

// Registers the pass under the DEBUG_TYPE name with the description below.
INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
                "AMDGPU promote alloca to vector or LDS", false, false)

// Exposes the pass identifier outside this file under the llvm namespace.
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
 139
 140 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
 141   Mod = &M;
 142   DL = &Mod->getDataLayout();
 143
 144   return false;
 145 }
 146
 147 bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
 148   if (skipFunction(F))
 149     return false;
 150
 151   if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
 152     TM = &TPC->getTM<TargetMachine>();
 153   else
 154     return false;
 155
 156   const Triple &TT = TM->getTargetTriple();
 157   IsAMDGCN = TT.getArch() == Triple::amdgcn;
 158   IsAMDHSA = TT.getOS() == Triple::AMDHSA;
 159
 160   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
 161   if (!ST.isPromoteAllocaEnabled())
 162     return false;
 163
 164   bool SufficientLDS = hasSufficientLocalMem(F);
 165   bool Changed = false;
 166   BasicBlock &EntryBB = *F.begin();
 167
 168   SmallVector<AllocaInst *, 16> Allocas;
 169   for (Instruction &I : EntryBB) {
 170     if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
 171       Allocas.push_back(AI);
 172   }
 173
 174   for (AllocaInst *AI : Allocas) {
 175     if (handleAlloca(*AI, SufficientLDS))
 176       Changed = true;
 177   }
 178
 179   return Changed;
 180 }
 181
 182 std::pair<Value *, Value *>
 183 AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
 184   const Function &F = *Builder.GetInsertBlock()->getParent();
 185   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
 186
 187   if (!IsAMDHSA) {
 188     Function *LocalSizeYFn
 189       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
 190     Function *LocalSizeZFn
 191       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
 192
 193     CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
 194     CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
 195
 196     ST.makeLIDRangeMetadata(LocalSizeY);
 197     ST.makeLIDRangeMetadata(LocalSizeZ);
 198
 199     return std::make_pair(LocalSizeY, LocalSizeZ);
 200   }
 201
 202   // We must read the size out of the dispatch pointer.
 203   assert(IsAMDGCN);
 204
 205   // We are indexing into this struct, and want to extract the workgroup_size_*
 206   // fields.
 207   //
 208   //   typedef struct hsa_kernel_dispatch_packet_s {
 209   //     uint16_t header;
 210   //     uint16_t setup;
 211   //     uint16_t workgroup_size_x ;
 212   //     uint16_t workgroup_size_y;
 213   //     uint16_t workgroup_size_z;
 214   //     uint16_t reserved0;
 215   //     uint32_t grid_size_x ;
 216   //     uint32_t grid_size_y ;
 217   //     uint32_t grid_size_z;
 218   //
 219   //     uint32_t private_segment_size;
 220   //     uint32_t group_segment_size;
 221   //     uint64_t kernel_object;
 222   //
 223   // #ifdef HSA_LARGE_MODEL
 224   //     void *kernarg_address;
 225   // #elif defined HSA_LITTLE_ENDIAN
 226   //     void *kernarg_address;
 227   //     uint32_t reserved1;
 228   // #else
 229   //     uint32_t reserved1;
 230   //     void *kernarg_address;
 231   // #endif
 232   //     uint64_t reserved2;
 233   //     hsa_signal_t completion_signal; // uint64_t wrapper
 234   //   } hsa_kernel_dispatch_packet_t
 235   //
 236   Function *DispatchPtrFn
 237     = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
 238
 239   CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
 240   DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
 241   DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
 242
 243   // Size of the dispatch packet struct.
 244   DispatchPtr->addDereferenceableAttr(AttributeList::ReturnIndex, 64);
 245
 246   Type *I32Ty = Type::getInt32Ty(Mod->getContext());
 247   Value *CastDispatchPtr = Builder.CreateBitCast(
 248     DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
 249
 250   // We could do a single 64-bit load here, but it's likely that the basic
 251   // 32-bit and extract sequence is already present, and it is probably easier
 252   // to CSE this. The loads should be mergable later anyway.
 253   Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
 254   LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, 4);
 255
 256   Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
 257   LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, 4);
 258
 259   MDNode *MD = MDNode::get(Mod->getContext(), None);
 260   LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
 261   LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
 262   ST.makeLIDRangeMetadata(LoadZU);
 263
 264   // Extract y component. Upper half of LoadZU should be zero already.
 265   Value *Y = Builder.CreateLShr(LoadXY, 16);
 266
 267   return std::make_pair(Y, LoadZU);
 268 }
 269
 270 Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
 271   const AMDGPUSubtarget &ST =
 272       AMDGPUSubtarget::get(*TM, *Builder.GetInsertBlock()->getParent());
 273   Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
 274
 275   switch (N) {
 276   case 0:
 277     IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
 278                       : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
 279     break;
 280   case 1:
 281     IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
 282                       : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
 283     break;
 284
 285   case 2:
 286     IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
 287                       : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
 288     break;
 289   default:
 290     llvm_unreachable("invalid dimension");
 291   }
 292
 293   Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
 294   CallInst *CI = Builder.CreateCall(WorkitemIdFn);
 295   ST.makeLIDRangeMetadata(CI);
 296
 297   return CI;
 298 }
 299
 300 static VectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
 301   return VectorType::get(ArrayTy->getElementType(),
 302                          ArrayTy->getNumElements());
 303 }
 304
 305 static Value *
 306 calculateVectorIndex(Value *Ptr,
 307                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
 308   GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
 309
 310   auto I = GEPIdx.find(GEP);
 311   return I == GEPIdx.end() ? nullptr : I->second;
 312 }
 313
 314 static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
 315   // FIXME we only support simple cases
 316   if (GEP->getNumOperands() != 3)
 317     return nullptr;
 318
 319   ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
 320   if (!I0 || !I0->isZero())
 321     return nullptr;
 322
 323   return GEP->getOperand(2);
 324 }
 325
 326 // Not an instruction handled below to turn into a vector.
 327 //
 328 // TODO: Check isTriviallyVectorizable for calls and handle other
 329 // instructions.
 330 static bool canVectorizeInst(Instruction *Inst, User *User) {
 331   switch (Inst->getOpcode()) {
 332   case Instruction::Load: {
 333     // Currently only handle the case where the Pointer Operand is a GEP.
 334     // Also we could not vectorize volatile or atomic loads.
 335     LoadInst *LI = cast<LoadInst>(Inst);
 336     if (isa<AllocaInst>(User) &&
 337         LI->getPointerOperandType() == User->getType() &&
 338         isa<VectorType>(LI->getType()))
 339       return true;
 340     return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple();
 341   }
 342   case Instruction::BitCast:
 343     return true;
 344   case Instruction::Store: {
 345     // Must be the stored pointer operand, not a stored value, plus
 346     // since it should be canonical form, the User should be a GEP.
 347     // Also we could not vectorize volatile or atomic stores.
 348     StoreInst *SI = cast<StoreInst>(Inst);
 349     if (isa<AllocaInst>(User) &&
 350         SI->getPointerOperandType() == User->getType() &&
 351         isa<VectorType>(SI->getValueOperand()->getType()))
 352       return true;
 353     return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple();
 354   }
 355   default:
 356     return false;
 357   }
 358 }
 359
 360 static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
 361
 362   if (DisablePromoteAllocaToVector) {
 363     LLVM_DEBUG(dbgs() << "  Promotion alloca to vector is disabled\n");
 364     return false;
 365   }
 366
 367   Type *AT = Alloca->getAllocatedType();
 368   SequentialType *AllocaTy = dyn_cast<SequentialType>(AT);
 369
 370   LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
 371
 372   // FIXME: There is no reason why we can't support larger arrays, we
 373   // are just being conservative for now.
 374   // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these
 375   // could also be promoted but we don't currently handle this case
 376   if (!AllocaTy ||
 377       AllocaTy->getNumElements() > 16 ||
 378       AllocaTy->getNumElements() < 2 ||
 379       !VectorType::isValidElementType(AllocaTy->getElementType())) {
 380     LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
 381     return false;
 382   }
 383
 384   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
 385   std::vector<Value*> WorkList;
 386   for (User *AllocaUser : Alloca->users()) {
 387     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
 388     if (!GEP) {
 389       if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
 390         return false;
 391
 392       WorkList.push_back(AllocaUser);
 393       continue;
 394     }
 395
 396     Value *Index = GEPToVectorIndex(GEP);
 397
 398     // If we can't compute a vector index from this GEP, then we can't
 399     // promote this alloca to vector.
 400     if (!Index) {
 401       LLVM_DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP
 402                         << '\n');
 403       return false;
 404     }
 405
 406     GEPVectorIdx[GEP] = Index;
 407     for (User *GEPUser : AllocaUser->users()) {
 408       if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
 409         return false;
 410
 411       WorkList.push_back(GEPUser);
 412     }
 413   }
 414
 415   VectorType *VectorTy = dyn_cast<VectorType>(AllocaTy);
 416   if (!VectorTy)
 417     VectorTy = arrayTypeToVecType(cast<ArrayType>(AllocaTy));
 418
 419   LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
 420                     << *VectorTy << '\n');
 421
 422   for (Value *V : WorkList) {
 423     Instruction *Inst = cast<Instruction>(V);
 424     IRBuilder<> Builder(Inst);
 425     switch (Inst->getOpcode()) {
 426     case Instruction::Load: {
 427       if (Inst->getType() == AT)
 428         break;
 429
 430       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
 431       Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
 432       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
 433
 434       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
 435       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
 436       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
 437       Inst->replaceAllUsesWith(ExtractElement);
 438       Inst->eraseFromParent();
 439       break;
 440     }
 441     case Instruction::Store: {
 442       StoreInst *SI = cast<StoreInst>(Inst);
 443       if (SI->getValueOperand()->getType() == AT)
 444         break;
 445
 446       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
 447       Value *Ptr = SI->getPointerOperand();
 448       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
 449       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
 450       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
 451       Value *NewVecValue = Builder.CreateInsertElement(VecValue,
 452                                                        SI->getValueOperand(),
 453                                                        Index);
 454       Builder.CreateStore(NewVecValue, BitCast);
 455       Inst->eraseFromParent();
 456       break;
 457     }
 458     case Instruction::BitCast:
 459     case Instruction::AddrSpaceCast:
 460       break;
 461
 462     default:
 463       llvm_unreachable("Inconsistency in instructions promotable to vector");
 464     }
 465   }
 466   return true;
 467 }
 468
 469 static bool isCallPromotable(CallInst *CI) {
 470   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
 471   if (!II)
 472     return false;
 473
 474   switch (II->getIntrinsicID()) {
 475   case Intrinsic::memcpy:
 476   case Intrinsic::memmove:
 477   case Intrinsic::memset:
 478   case Intrinsic::lifetime_start:
 479   case Intrinsic::lifetime_end:
 480   case Intrinsic::invariant_start:
 481   case Intrinsic::invariant_end:
 482   case Intrinsic::launder_invariant_group:
 483   case Intrinsic::strip_invariant_group:
 484   case Intrinsic::objectsize:
 485     return true;
 486   default:
 487     return false;
 488   }
 489 }
 490
 491 bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
 492                                                           Value *Val,
 493                                                           Instruction *Inst,
 494                                                           int OpIdx0,
 495                                                           int OpIdx1) const {
 496   // Figure out which operand is the one we might not be promoting.
 497   Value *OtherOp = Inst->getOperand(OpIdx0);
 498   if (Val == OtherOp)
 499     OtherOp = Inst->getOperand(OpIdx1);
 500
 501   if (isa<ConstantPointerNull>(OtherOp))
 502     return true;
 503
 504   Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
 505   if (!isa<AllocaInst>(OtherObj))
 506     return false;
 507
 508   // TODO: We should be able to replace undefs with the right pointer type.
 509
 510   // TODO: If we know the other base object is another promotable
 511   // alloca, not necessarily this alloca, we can do this. The
 512   // important part is both must have the same address space at
 513   // the end.
 514   if (OtherObj != BaseAlloca) {
 515     LLVM_DEBUG(
 516         dbgs() << "Found a binary instruction with another alloca object\n");
 517     return false;
 518   }
 519
 520   return true;
 521 }
 522
 523 bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
 524   Value *BaseAlloca,
 525   Value *Val,
 526   std::vector<Value*> &WorkList) const {
 527
 528   for (User *User : Val->users()) {
 529     if (is_contained(WorkList, User))
 530       continue;
 531
 532     if (CallInst *CI = dyn_cast<CallInst>(User)) {
 533       if (!isCallPromotable(CI))
 534         return false;
 535
 536       WorkList.push_back(User);
 537       continue;
 538     }
 539
 540     Instruction *UseInst = cast<Instruction>(User);
 541     if (UseInst->getOpcode() == Instruction::PtrToInt)
 542       return false;
 543
 544     if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
 545       if (LI->isVolatile())
 546         return false;
 547
 548       continue;
 549     }
 550
 551     if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
 552       if (SI->isVolatile())
 553         return false;
 554
 555       // Reject if the stored value is not the pointer operand.
 556       if (SI->getPointerOperand() != Val)
 557         return false;
 558     } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
 559       if (RMW->isVolatile())
 560         return false;
 561     } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
 562       if (CAS->isVolatile())
 563         return false;
 564     }
 565
 566     // Only promote a select if we know that the other select operand
 567     // is from another pointer that will also be promoted.
 568     if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
 569       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
 570         return false;
 571
 572       // May need to rewrite constant operands.
 573       WorkList.push_back(ICmp);
 574     }
 575
 576     if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
 577       // Give up if the pointer may be captured.
 578       if (PointerMayBeCaptured(UseInst, true, true))
 579         return false;
 580       // Don't collect the users of this.
 581       WorkList.push_back(User);
 582       continue;
 583     }
 584
 585     if (!User->getType()->isPointerTy())
 586       continue;
 587
 588     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
 589       // Be conservative if an address could be computed outside the bounds of
 590       // the alloca.
 591       if (!GEP->isInBounds())
 592         return false;
 593     }
 594
 595     // Only promote a select if we know that the other select operand is from
 596     // another pointer that will also be promoted.
 597     if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
 598       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
 599         return false;
 600     }
 601
 602     // Repeat for phis.
 603     if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
 604       // TODO: Handle more complex cases. We should be able to replace loops
 605       // over arrays.
 606       switch (Phi->getNumIncomingValues()) {
 607       case 1:
 608         break;
 609       case 2:
 610         if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
 611           return false;
 612         break;
 613       default:
 614         return false;
 615       }
 616     }
 617
 618     WorkList.push_back(User);
 619     if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
 620       return false;
 621   }
 622
 623   return true;
 624 }
 625
 626 bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
 627
 628   FunctionType *FTy = F.getFunctionType();
 629   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
 630
 631   // If the function has any arguments in the local address space, then it's
 632   // possible these arguments require the entire local memory space, so
 633   // we cannot use local memory in the pass.
 634   for (Type *ParamTy : FTy->params()) {
 635     PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
 636     if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
 637       LocalMemLimit = 0;
 638       LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
 639                            "local memory disabled.\n");
 640       return false;
 641     }
 642   }
 643
 644   LocalMemLimit = ST.getLocalMemorySize();
 645   if (LocalMemLimit == 0)
 646     return false;
 647
 648   const DataLayout &DL = Mod->getDataLayout();
 649
 650   // Check how much local memory is being used by global objects
 651   CurrentLocalMemUsage = 0;
 652   for (GlobalVariable &GV : Mod->globals()) {
 653     if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
 654       continue;
 655
 656     for (const User *U : GV.users()) {
 657       const Instruction *Use = dyn_cast<Instruction>(U);
 658       if (!Use)
 659         continue;
 660
 661       if (Use->getParent()->getParent() == &F) {
 662         unsigned Align = GV.getAlignment();
 663         if (Align == 0)
 664           Align = DL.getABITypeAlignment(GV.getValueType());
 665
 666         // FIXME: Try to account for padding here. The padding is currently
 667         // determined from the inverse order of uses in the function. I'm not
 668         // sure if the use list order is in any way connected to this, so the
 669         // total reported size is likely incorrect.
 670         uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
 671         CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
 672         CurrentLocalMemUsage += AllocSize;
 673         break;
 674       }
 675     }
 676   }
 677
 678   unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
 679                                                           F);
 680
 681   // Restrict local memory usage so that we don't drastically reduce occupancy,
 682   // unless it is already significantly reduced.
 683
 684   // TODO: Have some sort of hint or other heuristics to guess occupancy based
 685   // on other factors..
 686   unsigned OccupancyHint = ST.getWavesPerEU(F).second;
 687   if (OccupancyHint == 0)
 688     OccupancyHint = 7;
 689
 690   // Clamp to max value.
 691   OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
 692
 693   // Check the hint but ignore it if it's obviously wrong from the existing LDS
 694   // usage.
 695   MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
 696
 697
 698   // Round up to the next tier of usage.
 699   unsigned MaxSizeWithWaveCount
 700     = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
 701
 702   // Program is possibly broken by using more local mem than available.
 703   if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
 704     return false;
 705
 706   LocalMemLimit = MaxSizeWithWaveCount;
 707
 708   LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
 709                     << " bytes of LDS\n"
 710                     << "  Rounding size to " << MaxSizeWithWaveCount
 711                     << " with a maximum occupancy of " << MaxOccupancy << '\n'
 712                     << " and " << (LocalMemLimit - CurrentLocalMemUsage)
 713                     << " available for promotion\n");
 714
 715   return true;
 716 }
 717
 718 // FIXME: Should try to pick the most likely to be profitable allocas first.
 719 bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
 720   // Array allocations are probably not worth handling, since an allocation of
 721   // the array type is the canonical form.
 722   if (!I.isStaticAlloca() || I.isArrayAllocation())
 723     return false;
 724
 725   IRBuilder<> Builder(&I);
 726
 727   // First try to replace the alloca with a vector
 728   Type *AllocaTy = I.getAllocatedType();
 729
 730   LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
 731
 732   if (tryPromoteAllocaToVector(&I))
 733     return true; // Promoted to vector.
 734
 735   if (DisablePromoteAllocaToLDS)
 736     return false;
 737
 738   const Function &ContainingFunction = *I.getParent()->getParent();
 739   CallingConv::ID CC = ContainingFunction.getCallingConv();
 740
 741   // Don't promote the alloca to LDS for shader calling conventions as the work
 742   // item ID intrinsics are not supported for these calling conventions.
 743   // Furthermore not all LDS is available for some of the stages.
 744   switch (CC) {
 745   case CallingConv::AMDGPU_KERNEL:
 746   case CallingConv::SPIR_KERNEL:
 747     break;
 748   default:
 749     LLVM_DEBUG(
 750         dbgs()
 751         << " promote alloca to LDS not supported with calling convention.\n");
 752     return false;
 753   }
 754
 755   // Not likely to have sufficient local memory for promotion.
 756   if (!SufficientLDS)
 757     return false;
 758
 759   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);
 760   unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
 761
 762   const DataLayout &DL = Mod->getDataLayout();
 763
 764   unsigned Align = I.getAlignment();
 765   if (Align == 0)
 766     Align = DL.getABITypeAlignment(I.getAllocatedType());
 767
 768   // FIXME: This computed padding is likely wrong since it depends on inverse
 769   // usage order.
 770   //
 771   // FIXME: It is also possible that if we're allowed to use all of the memory
 772   // could could end up using more than the maximum due to alignment padding.
 773
 774   uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
 775   uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
 776   NewSize += AllocSize;
 777
 778   if (NewSize > LocalMemLimit) {
 779     LLVM_DEBUG(dbgs() << "  " << AllocSize
 780                       << " bytes of local memory not available to promote\n");
 781     return false;
 782   }
 783
 784   CurrentLocalMemUsage = NewSize;
 785
 786   std::vector<Value*> WorkList;
 787
 788   if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
 789     LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
 790     return false;
 791   }
 792
 793   LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
 794
 795   Function *F = I.getParent()->getParent();
 796
 797   Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
 798   GlobalVariable *GV = new GlobalVariable(
 799       *Mod, GVTy, false, GlobalValue::InternalLinkage,
 800       UndefValue::get(GVTy),
 801       Twine(F->getName()) + Twine('.') + I.getName(),
 802       nullptr,
 803       GlobalVariable::NotThreadLocal,
 804       AMDGPUAS::LOCAL_ADDRESS);
 805   GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
 806   GV->setAlignment(MaybeAlign(I.getAlignment()));
 807
 808   Value *TCntY, *TCntZ;
 809
 810   std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
 811   Value *TIdX = getWorkitemID(Builder, 0);
 812   Value *TIdY = getWorkitemID(Builder, 1);
 813   Value *TIdZ = getWorkitemID(Builder, 2);
 814
 815   Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
 816   Tmp0 = Builder.CreateMul(Tmp0, TIdX);
 817   Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
 818   Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
 819   TID = Builder.CreateAdd(TID, TIdZ);
 820
 821   Value *Indices[] = {
 822     Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
 823     TID
 824   };
 825
 826   Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
 827   I.mutateType(Offset->getType());
 828   I.replaceAllUsesWith(Offset);
 829   I.eraseFromParent();
 830
 831   for (Value *V : WorkList) {
 832     CallInst *Call = dyn_cast<CallInst>(V);
 833     if (!Call) {
 834       if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
 835         Value *Src0 = CI->getOperand(0);
 836         Type *EltTy = Src0->getType()->getPointerElementType();
 837         PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
 838
 839         if (isa<ConstantPointerNull>(CI->getOperand(0)))
 840           CI->setOperand(0, ConstantPointerNull::get(NewTy));
 841
 842         if (isa<ConstantPointerNull>(CI->getOperand(1)))
 843           CI->setOperand(1, ConstantPointerNull::get(NewTy));
 844
 845         continue;
 846       }
 847
 848       // The operand's value should be corrected on its own and we don't want to
 849       // touch the users.
 850       if (isa<AddrSpaceCastInst>(V))
 851         continue;
 852
 853       Type *EltTy = V->getType()->getPointerElementType();
 854       PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
 855
 856       // FIXME: It doesn't really make sense to try to do this for all
 857       // instructions.
 858       V->mutateType(NewTy);
 859
 860       // Adjust the types of any constant operands.
 861       if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
 862         if (isa<ConstantPointerNull>(SI->getOperand(1)))
 863           SI->setOperand(1, ConstantPointerNull::get(NewTy));
 864
 865         if (isa<ConstantPointerNull>(SI->getOperand(2)))
 866           SI->setOperand(2, ConstantPointerNull::get(NewTy));
 867       } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
 868         for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
 869           if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
 870             Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
 871         }
 872       }
 873
 874       continue;
 875     }
 876
 877     IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
 878     Builder.SetInsertPoint(Intr);
 879     switch (Intr->getIntrinsicID()) {
 880     case Intrinsic::lifetime_start:
 881     case Intrinsic::lifetime_end:
 882       // These intrinsics are for address space 0 only
 883       Intr->eraseFromParent();
 884       continue;
 885     case Intrinsic::memcpy: {
 886       MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
 887       Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlign(),
 888                            MemCpy->getRawSource(), MemCpy->getSourceAlign(),
 889                            MemCpy->getLength(), MemCpy->isVolatile());
 890       Intr->eraseFromParent();
 891       continue;
 892     }
 893     case Intrinsic::memmove: {
 894       MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
 895       Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlign(),
 896                             MemMove->getRawSource(), MemMove->getSourceAlign(),
 897                             MemMove->getLength(), MemMove->isVolatile());
 898       Intr->eraseFromParent();
 899       continue;
 900     }
 901     case Intrinsic::memset: {
 902       MemSetInst *MemSet = cast<MemSetInst>(Intr);
 903       Builder.CreateMemSet(
 904           MemSet->getRawDest(), MemSet->getValue(), MemSet->getLength(),
 905           MaybeAlign(MemSet->getDestAlignment()), MemSet->isVolatile());
 906       Intr->eraseFromParent();
 907       continue;
 908     }
 909     case Intrinsic::invariant_start:
 910     case Intrinsic::invariant_end:
 911     case Intrinsic::launder_invariant_group:
 912     case Intrinsic::strip_invariant_group:
 913       Intr->eraseFromParent();
 914       // FIXME: I think the invariant marker should still theoretically apply,
 915       // but the intrinsics need to be changed to accept pointers with any
 916       // address space.
 917       continue;
 918     case Intrinsic::objectsize: {
 919       Value *Src = Intr->getOperand(0);
 920       Type *SrcTy = Src->getType()->getPointerElementType();
 921       Function *ObjectSize = Intrinsic::getDeclaration(Mod,
 922         Intrinsic::objectsize,
 923         { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
 924       );
 925
 926       CallInst *NewCall = Builder.CreateCall(
 927           ObjectSize,
 928           {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
 929       Intr->replaceAllUsesWith(NewCall);
 930       Intr->eraseFromParent();
 931       continue;
 932     }
 933     default:
 934       Intr->print(errs());
 935       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
 936     }
 937   }
 938   return true;
 939 }
 940
 941 FunctionPass *llvm::createAMDGPUPromoteAlloca() {
 942   return new AMDGPUPromoteAlloca();
 943 }