contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

   1 //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This pass eliminates allocas by either converting them into vectors or
  11 // by migrating them to local address space.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "AMDGPU.h"
  16 #include "AMDGPUSubtarget.h"
  17 #include "llvm/Analysis/ValueTracking.h"
  18 #include "llvm/IR/IRBuilder.h"
  19 #include "llvm/IR/IntrinsicInst.h"
  20 #include "llvm/IR/MDBuilder.h"
  21 #include "llvm/Support/Debug.h"
  22 #include "llvm/Support/raw_ostream.h"
  23
  24 #define DEBUG_TYPE "amdgpu-promote-alloca"
  25
  26 using namespace llvm;
  27
  28 namespace {
  29
  30 // FIXME: This can create globals so should be a module pass.
  31 class AMDGPUPromoteAlloca : public FunctionPass {
  32 private:
  33   const TargetMachine *TM;
  34   Module *Mod;
  35   const DataLayout *DL;
  36   MDNode *MaxWorkGroupSizeRange;
  37
  38   // FIXME: This should be per-kernel.
  39   uint32_t LocalMemLimit;
  40   uint32_t CurrentLocalMemUsage;
  41
  42   bool IsAMDGCN;
  43   bool IsAMDHSA;
  44
  45   std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  46   Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
  47
  48   /// BaseAlloca is the alloca root the search started from.
  49   /// Val may be that alloca or a recursive user of it.
  50   bool collectUsesWithPtrTypes(Value *BaseAlloca,
  51                                Value *Val,
  52                                std::vector<Value*> &WorkList) const;
  53
  54   /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
  55   /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
  56   /// Returns true if both operands are derived from the same alloca. Val should
  57   /// be the same value as one of the input operands of UseInst.
  58   bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
  59                                        Instruction *UseInst,
  60                                        int OpIdx0, int OpIdx1) const;
  61
  62 public:
  63   static char ID;
  64
  65   AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
  66     FunctionPass(ID),
  67     TM(TM_),
  68     Mod(nullptr),
  69     DL(nullptr),
  70     MaxWorkGroupSizeRange(nullptr),
  71     LocalMemLimit(0),
  72     CurrentLocalMemUsage(0),
  73     IsAMDGCN(false),
  74     IsAMDHSA(false) { }
  75
  76   bool doInitialization(Module &M) override;
  77   bool runOnFunction(Function &F) override;
  78
  79   const char *getPassName() const override {
  80     return "AMDGPU Promote Alloca";
  81   }
  82
  83   void handleAlloca(AllocaInst &I);
  84
  85   void getAnalysisUsage(AnalysisUsage &AU) const override {
  86     AU.setPreservesCFG();
  87     FunctionPass::getAnalysisUsage(AU);
  88   }
  89 };
  90
  91 } // End anonymous namespace
  92
// Storage for the static pass ID declared in the class.
char AMDGPUPromoteAlloca::ID = 0;

// Pass registration under the DEBUG_TYPE name with the description below.
// NOTE(review): the two trailing 'false' arguments are presumably the usual
// "CFG only" / "is analysis" flags -- confirm against the macro definition.
INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
                   "AMDGPU promote alloca to vector or LDS", false, false)

// Makes the pass ID reachable through the llvm namespace.
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
  99
 100
 101 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
 102   if (!TM)
 103     return false;
 104
 105   Mod = &M;
 106   DL = &Mod->getDataLayout();
 107
 108   // The maximum workitem id.
 109   //
 110   // FIXME: Should get as subtarget property. Usually runtime enforced max is
 111   // 256.
 112   MDBuilder MDB(Mod->getContext());
 113   MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
 114
 115   const Triple &TT = TM->getTargetTriple();
 116
 117   IsAMDGCN = TT.getArch() == Triple::amdgcn;
 118   IsAMDHSA = TT.getOS() == Triple::AMDHSA;
 119
 120   return false;
 121 }
 122
 123 bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
 124   if (!TM || skipFunction(F))
 125     return false;
 126
 127   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
 128   if (!ST.isPromoteAllocaEnabled())
 129     return false;
 130
 131   FunctionType *FTy = F.getFunctionType();
 132
 133   // If the function has any arguments in the local address space, then it's
 134   // possible these arguments require the entire local memory space, so
 135   // we cannot use local memory in the pass.
 136   for (Type *ParamTy : FTy->params()) {
 137     PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
 138     if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
 139       LocalMemLimit = 0;
 140       DEBUG(dbgs() << "Function has local memory argument. Promoting to "
 141                       "local memory disabled.\n");
 142       return false;
 143     }
 144   }
 145
 146   LocalMemLimit = ST.getLocalMemorySize();
 147   if (LocalMemLimit == 0)
 148     return false;
 149
 150   const DataLayout &DL = Mod->getDataLayout();
 151
 152   // Check how much local memory is being used by global objects
 153   CurrentLocalMemUsage = 0;
 154   for (GlobalVariable &GV : Mod->globals()) {
 155     if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
 156       continue;
 157
 158     for (const User *U : GV.users()) {
 159       const Instruction *Use = dyn_cast<Instruction>(U);
 160       if (!Use)
 161         continue;
 162
 163       if (Use->getParent()->getParent() == &F) {
 164         unsigned Align = GV.getAlignment();
 165         if (Align == 0)
 166           Align = DL.getABITypeAlignment(GV.getValueType());
 167
 168         // FIXME: Try to account for padding here. The padding is currently
 169         // determined from the inverse order of uses in the function. I'm not
 170         // sure if the use list order is in any way connected to this, so the
 171         // total reported size is likely incorrect.
 172         uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
 173         CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
 174         CurrentLocalMemUsage += AllocSize;
 175         break;
 176       }
 177     }
 178   }
 179
 180   unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);
 181
 182   // Restrict local memory usage so that we don't drastically reduce occupancy,
 183   // unless it is already significantly reduced.
 184
 185   // TODO: Have some sort of hint or other heuristics to guess occupancy based
 186   // on other factors..
 187   unsigned OccupancyHint
 188     = AMDGPU::getIntegerAttribute(F, "amdgpu-max-waves-per-eu", 0);
 189   if (OccupancyHint == 0)
 190     OccupancyHint = 7;
 191
 192   // Clamp to max value.
 193   OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerCU());
 194
 195   // Check the hint but ignore it if it's obviously wrong from the existing LDS
 196   // usage.
 197   MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
 198
 199
 200   // Round up to the next tier of usage.
 201   unsigned MaxSizeWithWaveCount
 202     = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy);
 203
 204   // Program is possibly broken by using more local mem than available.
 205   if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
 206     return false;
 207
 208   LocalMemLimit = MaxSizeWithWaveCount;
 209
 210   DEBUG(
 211     dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
 212     << "  Rounding size to " << MaxSizeWithWaveCount
 213     << " with a maximum occupancy of " << MaxOccupancy << '\n'
 214     << " and " << (LocalMemLimit - CurrentLocalMemUsage)
 215     << " available for promotion\n"
 216   );
 217
 218   BasicBlock &EntryBB = *F.begin();
 219   for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
 220     AllocaInst *AI = dyn_cast<AllocaInst>(I);
 221
 222     ++I;
 223     if (AI)
 224       handleAlloca(*AI);
 225   }
 226
 227   return true;
 228 }
 229
 230 std::pair<Value *, Value *>
 231 AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
 232   if (!IsAMDHSA) {
 233     Function *LocalSizeYFn
 234       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
 235     Function *LocalSizeZFn
 236       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
 237
 238     CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
 239     CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
 240
 241     LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
 242     LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
 243
 244     return std::make_pair(LocalSizeY, LocalSizeZ);
 245   }
 246
 247   // We must read the size out of the dispatch pointer.
 248   assert(IsAMDGCN);
 249
 250   // We are indexing into this struct, and want to extract the workgroup_size_*
 251   // fields.
 252   //
 253   //   typedef struct hsa_kernel_dispatch_packet_s {
 254   //     uint16_t header;
 255   //     uint16_t setup;
 256   //     uint16_t workgroup_size_x ;
 257   //     uint16_t workgroup_size_y;
 258   //     uint16_t workgroup_size_z;
 259   //     uint16_t reserved0;
 260   //     uint32_t grid_size_x ;
 261   //     uint32_t grid_size_y ;
 262   //     uint32_t grid_size_z;
 263   //
 264   //     uint32_t private_segment_size;
 265   //     uint32_t group_segment_size;
 266   //     uint64_t kernel_object;
 267   //
 268   // #ifdef HSA_LARGE_MODEL
 269   //     void *kernarg_address;
 270   // #elif defined HSA_LITTLE_ENDIAN
 271   //     void *kernarg_address;
 272   //     uint32_t reserved1;
 273   // #else
 274   //     uint32_t reserved1;
 275   //     void *kernarg_address;
 276   // #endif
 277   //     uint64_t reserved2;
 278   //     hsa_signal_t completion_signal; // uint64_t wrapper
 279   //   } hsa_kernel_dispatch_packet_t
 280   //
 281   Function *DispatchPtrFn
 282     = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
 283
 284   CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
 285   DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
 286   DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
 287
 288   // Size of the dispatch packet struct.
 289   DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);
 290
 291   Type *I32Ty = Type::getInt32Ty(Mod->getContext());
 292   Value *CastDispatchPtr = Builder.CreateBitCast(
 293     DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
 294
 295   // We could do a single 64-bit load here, but it's likely that the basic
 296   // 32-bit and extract sequence is already present, and it is probably easier
 297   // to CSE this. The loads should be mergable later anyway.
 298   Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
 299   LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
 300
 301   Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
 302   LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
 303
 304   MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
 305   LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
 306   LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
 307   LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
 308
 309   // Extract y component. Upper half of LoadZU should be zero already.
 310   Value *Y = Builder.CreateLShr(LoadXY, 16);
 311
 312   return std::make_pair(Y, LoadZU);
 313 }
 314
 315 Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
 316   Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
 317
 318   switch (N) {
 319   case 0:
 320     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
 321       : Intrinsic::r600_read_tidig_x;
 322     break;
 323   case 1:
 324     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
 325       : Intrinsic::r600_read_tidig_y;
 326     break;
 327
 328   case 2:
 329     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
 330       : Intrinsic::r600_read_tidig_z;
 331     break;
 332   default:
 333     llvm_unreachable("invalid dimension");
 334   }
 335
 336   Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
 337   CallInst *CI = Builder.CreateCall(WorkitemIdFn);
 338   CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
 339
 340   return CI;
 341 }
 342
 343 static VectorType *arrayTypeToVecType(Type *ArrayTy) {
 344   return VectorType::get(ArrayTy->getArrayElementType(),
 345                          ArrayTy->getArrayNumElements());
 346 }
 347
 348 static Value *
 349 calculateVectorIndex(Value *Ptr,
 350                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
 351   GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
 352
 353   auto I = GEPIdx.find(GEP);
 354   return I == GEPIdx.end() ? nullptr : I->second;
 355 }
 356
 357 static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
 358   // FIXME we only support simple cases
 359   if (GEP->getNumOperands() != 3)
 360     return nullptr;
 361
 362   ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
 363   if (!I0 || !I0->isZero())
 364     return nullptr;
 365
 366   return GEP->getOperand(2);
 367 }
 368
 369 // Not an instruction handled below to turn into a vector.
 370 //
 371 // TODO: Check isTriviallyVectorizable for calls and handle other
 372 // instructions.
 373 static bool canVectorizeInst(Instruction *Inst, User *User) {
 374   switch (Inst->getOpcode()) {
 375   case Instruction::Load:
 376   case Instruction::BitCast:
 377   case Instruction::AddrSpaceCast:
 378     return true;
 379   case Instruction::Store: {
 380     // Must be the stored pointer operand, not a stored value.
 381     StoreInst *SI = cast<StoreInst>(Inst);
 382     return SI->getPointerOperand() == User;
 383   }
 384   default:
 385     return false;
 386   }
 387 }
 388
 389 static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
 390   ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
 391
 392   DEBUG(dbgs() << "Alloca candidate for vectorization\n");
 393
 394   // FIXME: There is no reason why we can't support larger arrays, we
 395   // are just being conservative for now.
 396   if (!AllocaTy ||
 397       AllocaTy->getElementType()->isVectorTy() ||
 398       AllocaTy->getNumElements() > 4 ||
 399       AllocaTy->getNumElements() < 2) {
 400     DEBUG(dbgs() << "  Cannot convert type to vector\n");
 401     return false;
 402   }
 403
 404   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
 405   std::vector<Value*> WorkList;
 406   for (User *AllocaUser : Alloca->users()) {
 407     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
 408     if (!GEP) {
 409       if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
 410         return false;
 411
 412       WorkList.push_back(AllocaUser);
 413       continue;
 414     }
 415
 416     Value *Index = GEPToVectorIndex(GEP);
 417
 418     // If we can't compute a vector index from this GEP, then we can't
 419     // promote this alloca to vector.
 420     if (!Index) {
 421       DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
 422       return false;
 423     }
 424
 425     GEPVectorIdx[GEP] = Index;
 426     for (User *GEPUser : AllocaUser->users()) {
 427       if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
 428         return false;
 429
 430       WorkList.push_back(GEPUser);
 431     }
 432   }
 433
 434   VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
 435
 436   DEBUG(dbgs() << "  Converting alloca to vector "
 437         << *AllocaTy << " -> " << *VectorTy << '\n');
 438
 439   for (Value *V : WorkList) {
 440     Instruction *Inst = cast<Instruction>(V);
 441     IRBuilder<> Builder(Inst);
 442     switch (Inst->getOpcode()) {
 443     case Instruction::Load: {
 444       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
 445       Value *Ptr = Inst->getOperand(0);
 446       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
 447
 448       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
 449       Value *VecValue = Builder.CreateLoad(BitCast);
 450       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
 451       Inst->replaceAllUsesWith(ExtractElement);
 452       Inst->eraseFromParent();
 453       break;
 454     }
 455     case Instruction::Store: {
 456       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
 457
 458       Value *Ptr = Inst->getOperand(1);
 459       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
 460       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
 461       Value *VecValue = Builder.CreateLoad(BitCast);
 462       Value *NewVecValue = Builder.CreateInsertElement(VecValue,
 463                                                        Inst->getOperand(0),
 464                                                        Index);
 465       Builder.CreateStore(NewVecValue, BitCast);
 466       Inst->eraseFromParent();
 467       break;
 468     }
 469     case Instruction::BitCast:
 470     case Instruction::AddrSpaceCast:
 471       break;
 472
 473     default:
 474       llvm_unreachable("Inconsistency in instructions promotable to vector");
 475     }
 476   }
 477   return true;
 478 }
 479
 480 static bool isCallPromotable(CallInst *CI) {
 481   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
 482   if (!II)
 483     return false;
 484
 485   switch (II->getIntrinsicID()) {
 486   case Intrinsic::memcpy:
 487   case Intrinsic::memmove:
 488   case Intrinsic::memset:
 489   case Intrinsic::lifetime_start:
 490   case Intrinsic::lifetime_end:
 491   case Intrinsic::invariant_start:
 492   case Intrinsic::invariant_end:
 493   case Intrinsic::invariant_group_barrier:
 494   case Intrinsic::objectsize:
 495     return true;
 496   default:
 497     return false;
 498   }
 499 }
 500
 501 bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
 502                                                           Value *Val,
 503                                                           Instruction *Inst,
 504                                                           int OpIdx0,
 505                                                           int OpIdx1) const {
 506   // Figure out which operand is the one we might not be promoting.
 507   Value *OtherOp = Inst->getOperand(OpIdx0);
 508   if (Val == OtherOp)
 509     OtherOp = Inst->getOperand(OpIdx1);
 510
 511   if (isa<ConstantPointerNull>(OtherOp))
 512     return true;
 513
 514   Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
 515   if (!isa<AllocaInst>(OtherObj))
 516     return false;
 517
 518   // TODO: We should be able to replace undefs with the right pointer type.
 519
 520   // TODO: If we know the other base object is another promotable
 521   // alloca, not necessarily this alloca, we can do this. The
 522   // important part is both must have the same address space at
 523   // the end.
 524   if (OtherObj != BaseAlloca) {
 525     DEBUG(dbgs() << "Found a binary instruction with another alloca object\n");
 526     return false;
 527   }
 528
 529   return true;
 530 }
 531
 532 bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
 533   Value *BaseAlloca,
 534   Value *Val,
 535   std::vector<Value*> &WorkList) const {
 536
 537   for (User *User : Val->users()) {
 538     if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
 539       continue;
 540
 541     if (CallInst *CI = dyn_cast<CallInst>(User)) {
 542       if (!isCallPromotable(CI))
 543         return false;
 544
 545       WorkList.push_back(User);
 546       continue;
 547     }
 548
 549     Instruction *UseInst = cast<Instruction>(User);
 550     if (UseInst->getOpcode() == Instruction::PtrToInt)
 551       return false;
 552
 553     if (LoadInst *LI = dyn_cast_or_null<LoadInst>(UseInst)) {
 554       if (LI->isVolatile())
 555         return false;
 556
 557       continue;
 558     }
 559
 560     if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
 561       if (SI->isVolatile())
 562         return false;
 563
 564       // Reject if the stored value is not the pointer operand.
 565       if (SI->getPointerOperand() != Val)
 566         return false;
 567     } else if (AtomicRMWInst *RMW = dyn_cast_or_null<AtomicRMWInst>(UseInst)) {
 568       if (RMW->isVolatile())
 569         return false;
 570     } else if (AtomicCmpXchgInst *CAS
 571                = dyn_cast_or_null<AtomicCmpXchgInst>(UseInst)) {
 572       if (CAS->isVolatile())
 573         return false;
 574     }
 575
 576     // Only promote a select if we know that the other select operand
 577     // is from another pointer that will also be promoted.
 578     if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
 579       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
 580         return false;
 581
 582       // May need to rewrite constant operands.
 583       WorkList.push_back(ICmp);
 584     }
 585
 586     if (!User->getType()->isPointerTy())
 587       continue;
 588
 589     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
 590       // Be conservative if an address could be computed outside the bounds of
 591       // the alloca.
 592       if (!GEP->isInBounds())
 593         return false;
 594     }
 595
 596     // Only promote a select if we know that the other select operand is from
 597     // another pointer that will also be promoted.
 598     if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
 599       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
 600         return false;
 601     }
 602
 603     // Repeat for phis.
 604     if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
 605       // TODO: Handle more complex cases. We should be able to replace loops
 606       // over arrays.
 607       switch (Phi->getNumIncomingValues()) {
 608       case 1:
 609         break;
 610       case 2:
 611         if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
 612           return false;
 613         break;
 614       default:
 615         return false;
 616       }
 617     }
 618
 619     WorkList.push_back(User);
 620     if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
 621       return false;
 622   }
 623
 624   return true;
 625 }
 626
 627 // FIXME: Should try to pick the most likely to be profitable allocas first.
 628 void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
 629   // Array allocations are probably not worth handling, since an allocation of
 630   // the array type is the canonical form.
 631   if (!I.isStaticAlloca() || I.isArrayAllocation())
 632     return;
 633
 634   IRBuilder<> Builder(&I);
 635
 636   // First try to replace the alloca with a vector
 637   Type *AllocaTy = I.getAllocatedType();
 638
 639   DEBUG(dbgs() << "Trying to promote " << I << '\n');
 640
 641   if (tryPromoteAllocaToVector(&I)) {
 642     DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
 643     return;
 644   }
 645
 646   const Function &ContainingFunction = *I.getParent()->getParent();
 647
 648   // Don't promote the alloca to LDS for shader calling conventions as the work
 649   // item ID intrinsics are not supported for these calling conventions.
 650   // Furthermore not all LDS is available for some of the stages.
 651   if (AMDGPU::isShader(ContainingFunction.getCallingConv()))
 652     return;
 653
 654   // FIXME: We should also try to get this value from the reqd_work_group_size
 655   // function attribute if it is available.
 656   unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);
 657
 658   const DataLayout &DL = Mod->getDataLayout();
 659
 660   unsigned Align = I.getAlignment();
 661   if (Align == 0)
 662     Align = DL.getABITypeAlignment(I.getAllocatedType());
 663
 664   // FIXME: This computed padding is likely wrong since it depends on inverse
 665   // usage order.
 666   //
 667   // FIXME: It is also possible that if we're allowed to use all of the memory
 668   // could could end up using more than the maximum due to alignment padding.
 669
 670   uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
 671   uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
 672   NewSize += AllocSize;
 673
 674   if (NewSize > LocalMemLimit) {
 675     DEBUG(dbgs() << "  " << AllocSize
 676           << " bytes of local memory not available to promote\n");
 677     return;
 678   }
 679
 680   CurrentLocalMemUsage = NewSize;
 681
 682   std::vector<Value*> WorkList;
 683
 684   if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
 685     DEBUG(dbgs() << " Do not know how to convert all uses\n");
 686     return;
 687   }
 688
 689   DEBUG(dbgs() << "Promoting alloca to local memory\n");
 690
 691   Function *F = I.getParent()->getParent();
 692
 693   Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
 694   GlobalVariable *GV = new GlobalVariable(
 695       *Mod, GVTy, false, GlobalValue::InternalLinkage,
 696       UndefValue::get(GVTy),
 697       Twine(F->getName()) + Twine('.') + I.getName(),
 698       nullptr,
 699       GlobalVariable::NotThreadLocal,
 700       AMDGPUAS::LOCAL_ADDRESS);
 701   GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
 702   GV->setAlignment(I.getAlignment());
 703
 704   Value *TCntY, *TCntZ;
 705
 706   std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
 707   Value *TIdX = getWorkitemID(Builder, 0);
 708   Value *TIdY = getWorkitemID(Builder, 1);
 709   Value *TIdZ = getWorkitemID(Builder, 2);
 710
 711   Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
 712   Tmp0 = Builder.CreateMul(Tmp0, TIdX);
 713   Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
 714   Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
 715   TID = Builder.CreateAdd(TID, TIdZ);
 716
 717   Value *Indices[] = {
 718     Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
 719     TID
 720   };
 721
 722   Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
 723   I.mutateType(Offset->getType());
 724   I.replaceAllUsesWith(Offset);
 725   I.eraseFromParent();
 726
 727   for (Value *V : WorkList) {
 728     CallInst *Call = dyn_cast<CallInst>(V);
 729     if (!Call) {
 730       if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
 731         Value *Src0 = CI->getOperand(0);
 732         Type *EltTy = Src0->getType()->getPointerElementType();
 733         PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
 734
 735         if (isa<ConstantPointerNull>(CI->getOperand(0)))
 736           CI->setOperand(0, ConstantPointerNull::get(NewTy));
 737
 738         if (isa<ConstantPointerNull>(CI->getOperand(1)))
 739           CI->setOperand(1, ConstantPointerNull::get(NewTy));
 740
 741         continue;
 742       }
 743
 744       // The operand's value should be corrected on its own.
 745       if (isa<AddrSpaceCastInst>(V))
 746         continue;
 747
 748       Type *EltTy = V->getType()->getPointerElementType();
 749       PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
 750
 751       // FIXME: It doesn't really make sense to try to do this for all
 752       // instructions.
 753       V->mutateType(NewTy);
 754
 755       // Adjust the types of any constant operands.
 756       if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
 757         if (isa<ConstantPointerNull>(SI->getOperand(1)))
 758           SI->setOperand(1, ConstantPointerNull::get(NewTy));
 759
 760         if (isa<ConstantPointerNull>(SI->getOperand(2)))
 761           SI->setOperand(2, ConstantPointerNull::get(NewTy));
 762       } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
 763         for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
 764           if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
 765             Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
 766         }
 767       }
 768
 769       continue;
 770     }
 771
 772     IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
 773     Builder.SetInsertPoint(Intr);
 774     switch (Intr->getIntrinsicID()) {
 775     case Intrinsic::lifetime_start:
 776     case Intrinsic::lifetime_end:
 777       // These intrinsics are for address space 0 only
 778       Intr->eraseFromParent();
 779       continue;
 780     case Intrinsic::memcpy: {
 781       MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
 782       Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
 783                            MemCpy->getLength(), MemCpy->getAlignment(),
 784                            MemCpy->isVolatile());
 785       Intr->eraseFromParent();
 786       continue;
 787     }
 788     case Intrinsic::memmove: {
 789       MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
 790       Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
 791                             MemMove->getLength(), MemMove->getAlignment(),
 792                             MemMove->isVolatile());
 793       Intr->eraseFromParent();
 794       continue;
 795     }
 796     case Intrinsic::memset: {
 797       MemSetInst *MemSet = cast<MemSetInst>(Intr);
 798       Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
 799                            MemSet->getLength(), MemSet->getAlignment(),
 800                            MemSet->isVolatile());
 801       Intr->eraseFromParent();
 802       continue;
 803     }
 804     case Intrinsic::invariant_start:
 805     case Intrinsic::invariant_end:
 806     case Intrinsic::invariant_group_barrier:
 807       Intr->eraseFromParent();
 808       // FIXME: I think the invariant marker should still theoretically apply,
 809       // but the intrinsics need to be changed to accept pointers with any
 810       // address space.
 811       continue;
 812     case Intrinsic::objectsize: {
 813       Value *Src = Intr->getOperand(0);
 814       Type *SrcTy = Src->getType()->getPointerElementType();
 815       Function *ObjectSize = Intrinsic::getDeclaration(Mod,
 816         Intrinsic::objectsize,
 817         { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
 818       );
 819
 820       CallInst *NewCall
 821         = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
 822       Intr->replaceAllUsesWith(NewCall);
 823       Intr->eraseFromParent();
 824       continue;
 825     }
 826     default:
 827       Intr->dump();
 828       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
 829     }
 830   }
 831 }
 832
 833 FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
 834   return new AMDGPUPromoteAlloca(TM);
 835 }