contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

   1 //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This pass eliminates allocas by either converting them into vectors or
  11 // by migrating them to local address space.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "AMDGPU.h"
  16 #include "AMDGPUSubtarget.h"
  17 #include "llvm/Analysis/ValueTracking.h"
  18 #include "llvm/IR/IRBuilder.h"
  19 #include "llvm/IR/IntrinsicInst.h"
  20 #include "llvm/IR/MDBuilder.h"
  21 #include "llvm/Support/Debug.h"
  22 #include "llvm/Support/raw_ostream.h"
  23
  24 #define DEBUG_TYPE "amdgpu-promote-alloca"
  25
  26 using namespace llvm;
  27
  28 namespace {
  29
  30 // FIXME: This can create globals so should be a module pass.
  31 class AMDGPUPromoteAlloca : public FunctionPass {
  32 private:
  33   const TargetMachine *TM;
  34   Module *Mod;
  35   const DataLayout *DL;
  36   MDNode *MaxWorkGroupSizeRange;
  37
  38   // FIXME: This should be per-kernel.
  39   uint32_t LocalMemLimit;
  40   uint32_t CurrentLocalMemUsage;
  41
  42   bool IsAMDGCN;
  43   bool IsAMDHSA;
  44
  45   std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  46   Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
  47
  48   /// BaseAlloca is the alloca root the search started from.
  49   /// Val may be that alloca or a recursive user of it.
  50   bool collectUsesWithPtrTypes(Value *BaseAlloca,
  51                                Value *Val,
  52                                std::vector<Value*> &WorkList) const;
  53
  54   /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
  55   /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
  56   /// Returns true if both operands are derived from the same alloca. Val should
  57   /// be the same value as one of the input operands of UseInst.
  58   bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
  59                                        Instruction *UseInst,
  60                                        int OpIdx0, int OpIdx1) const;
  61
  62 public:
  63   static char ID;
  64
  65   AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
  66     FunctionPass(ID),
  67     TM(TM_),
  68     Mod(nullptr),
  69     DL(nullptr),
  70     MaxWorkGroupSizeRange(nullptr),
  71     LocalMemLimit(0),
  72     CurrentLocalMemUsage(0),
  73     IsAMDGCN(false),
  74     IsAMDHSA(false) { }
  75
  76   bool doInitialization(Module &M) override;
  77   bool runOnFunction(Function &F) override;
  78
  79   StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
  80
  81   void handleAlloca(AllocaInst &I);
  82
  83   void getAnalysisUsage(AnalysisUsage &AU) const override {
  84     AU.setPreservesCFG();
  85     FunctionPass::getAnalysisUsage(AU);
  86   }
  87 };
  88
  89 } // End anonymous namespace
  90
  91 char AMDGPUPromoteAlloca::ID = 0;
  92
  93 INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
  94                    "AMDGPU promote alloca to vector or LDS", false, false)
  95
  96 char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
  97
  98
  99 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
 100   if (!TM)
 101     return false;
 102
 103   Mod = &M;
 104   DL = &Mod->getDataLayout();
 105
 106   // The maximum workitem id.
 107   //
 108   // FIXME: Should get as subtarget property. Usually runtime enforced max is
 109   // 256.
 110   MDBuilder MDB(Mod->getContext());
 111   MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
 112
 113   const Triple &TT = TM->getTargetTriple();
 114
 115   IsAMDGCN = TT.getArch() == Triple::amdgcn;
 116   IsAMDHSA = TT.getOS() == Triple::AMDHSA;
 117
 118   return false;
 119 }
 120
 121 bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
 122   if (!TM || skipFunction(F))
 123     return false;
 124
 125   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
 126   if (!ST.isPromoteAllocaEnabled())
 127     return false;
 128
 129   FunctionType *FTy = F.getFunctionType();
 130
 131   // If the function has any arguments in the local address space, then it's
 132   // possible these arguments require the entire local memory space, so
 133   // we cannot use local memory in the pass.
 134   for (Type *ParamTy : FTy->params()) {
 135     PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
 136     if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
 137       LocalMemLimit = 0;
 138       DEBUG(dbgs() << "Function has local memory argument. Promoting to "
 139                       "local memory disabled.\n");
 140       return false;
 141     }
 142   }
 143
 144   LocalMemLimit = ST.getLocalMemorySize();
 145   if (LocalMemLimit == 0)
 146     return false;
 147
 148   const DataLayout &DL = Mod->getDataLayout();
 149
 150   // Check how much local memory is being used by global objects
 151   CurrentLocalMemUsage = 0;
 152   for (GlobalVariable &GV : Mod->globals()) {
 153     if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
 154       continue;
 155
 156     for (const User *U : GV.users()) {
 157       const Instruction *Use = dyn_cast<Instruction>(U);
 158       if (!Use)
 159         continue;
 160
 161       if (Use->getParent()->getParent() == &F) {
 162         unsigned Align = GV.getAlignment();
 163         if (Align == 0)
 164           Align = DL.getABITypeAlignment(GV.getValueType());
 165
 166         // FIXME: Try to account for padding here. The padding is currently
 167         // determined from the inverse order of uses in the function. I'm not
 168         // sure if the use list order is in any way connected to this, so the
 169         // total reported size is likely incorrect.
 170         uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
 171         CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
 172         CurrentLocalMemUsage += AllocSize;
 173         break;
 174       }
 175     }
 176   }
 177
 178   unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);
 179
 180   // Restrict local memory usage so that we don't drastically reduce occupancy,
 181   // unless it is already significantly reduced.
 182
 183   // TODO: Have some sort of hint or other heuristics to guess occupancy based
 184   // on other factors..
 185   unsigned OccupancyHint = ST.getWavesPerEU(F).second;
 186   if (OccupancyHint == 0)
 187     OccupancyHint = 7;
 188
 189   // Clamp to max value.
 190   OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
 191
 192   // Check the hint but ignore it if it's obviously wrong from the existing LDS
 193   // usage.
 194   MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
 195
 196
 197   // Round up to the next tier of usage.
 198   unsigned MaxSizeWithWaveCount
 199     = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy);
 200
 201   // Program is possibly broken by using more local mem than available.
 202   if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
 203     return false;
 204
 205   LocalMemLimit = MaxSizeWithWaveCount;
 206
 207   DEBUG(
 208     dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
 209     << "  Rounding size to " << MaxSizeWithWaveCount
 210     << " with a maximum occupancy of " << MaxOccupancy << '\n'
 211     << " and " << (LocalMemLimit - CurrentLocalMemUsage)
 212     << " available for promotion\n"
 213   );
 214
 215   BasicBlock &EntryBB = *F.begin();
 216   for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
 217     AllocaInst *AI = dyn_cast<AllocaInst>(I);
 218
 219     ++I;
 220     if (AI)
 221       handleAlloca(*AI);
 222   }
 223
 224   return true;
 225 }
 226
 227 std::pair<Value *, Value *>
 228 AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
 229   if (!IsAMDHSA) {
 230     Function *LocalSizeYFn
 231       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
 232     Function *LocalSizeZFn
 233       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
 234
 235     CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
 236     CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
 237
 238     LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
 239     LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
 240
 241     return std::make_pair(LocalSizeY, LocalSizeZ);
 242   }
 243
 244   // We must read the size out of the dispatch pointer.
 245   assert(IsAMDGCN);
 246
 247   // We are indexing into this struct, and want to extract the workgroup_size_*
 248   // fields.
 249   //
 250   //   typedef struct hsa_kernel_dispatch_packet_s {
 251   //     uint16_t header;
 252   //     uint16_t setup;
 253   //     uint16_t workgroup_size_x ;
 254   //     uint16_t workgroup_size_y;
 255   //     uint16_t workgroup_size_z;
 256   //     uint16_t reserved0;
 257   //     uint32_t grid_size_x ;
 258   //     uint32_t grid_size_y ;
 259   //     uint32_t grid_size_z;
 260   //
 261   //     uint32_t private_segment_size;
 262   //     uint32_t group_segment_size;
 263   //     uint64_t kernel_object;
 264   //
 265   // #ifdef HSA_LARGE_MODEL
 266   //     void *kernarg_address;
 267   // #elif defined HSA_LITTLE_ENDIAN
 268   //     void *kernarg_address;
 269   //     uint32_t reserved1;
 270   // #else
 271   //     uint32_t reserved1;
 272   //     void *kernarg_address;
 273   // #endif
 274   //     uint64_t reserved2;
 275   //     hsa_signal_t completion_signal; // uint64_t wrapper
 276   //   } hsa_kernel_dispatch_packet_t
 277   //
 278   Function *DispatchPtrFn
 279     = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
 280
 281   CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
 282   DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
 283   DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
 284
 285   // Size of the dispatch packet struct.
 286   DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);
 287
 288   Type *I32Ty = Type::getInt32Ty(Mod->getContext());
 289   Value *CastDispatchPtr = Builder.CreateBitCast(
 290     DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
 291
 292   // We could do a single 64-bit load here, but it's likely that the basic
 293   // 32-bit and extract sequence is already present, and it is probably easier
 294   // to CSE this. The loads should be mergable later anyway.
 295   Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
 296   LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
 297
 298   Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
 299   LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
 300
 301   MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
 302   LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
 303   LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
 304   LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
 305
 306   // Extract y component. Upper half of LoadZU should be zero already.
 307   Value *Y = Builder.CreateLShr(LoadXY, 16);
 308
 309   return std::make_pair(Y, LoadZU);
 310 }
 311
 312 Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
 313   Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
 314
 315   switch (N) {
 316   case 0:
 317     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
 318       : Intrinsic::r600_read_tidig_x;
 319     break;
 320   case 1:
 321     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
 322       : Intrinsic::r600_read_tidig_y;
 323     break;
 324
 325   case 2:
 326     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
 327       : Intrinsic::r600_read_tidig_z;
 328     break;
 329   default:
 330     llvm_unreachable("invalid dimension");
 331   }
 332
 333   Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
 334   CallInst *CI = Builder.CreateCall(WorkitemIdFn);
 335   CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
 336
 337   return CI;
 338 }
 339
 340 static VectorType *arrayTypeToVecType(Type *ArrayTy) {
 341   return VectorType::get(ArrayTy->getArrayElementType(),
 342                          ArrayTy->getArrayNumElements());
 343 }
 344
 345 static Value *
 346 calculateVectorIndex(Value *Ptr,
 347                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
 348   GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
 349
 350   auto I = GEPIdx.find(GEP);
 351   return I == GEPIdx.end() ? nullptr : I->second;
 352 }
 353
 354 static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
 355   // FIXME we only support simple cases
 356   if (GEP->getNumOperands() != 3)
 357     return nullptr;
 358
 359   ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
 360   if (!I0 || !I0->isZero())
 361     return nullptr;
 362
 363   return GEP->getOperand(2);
 364 }
 365
 366 // Not an instruction handled below to turn into a vector.
 367 //
 368 // TODO: Check isTriviallyVectorizable for calls and handle other
 369 // instructions.
 370 static bool canVectorizeInst(Instruction *Inst, User *User) {
 371   switch (Inst->getOpcode()) {
 372   case Instruction::Load:
 373   case Instruction::BitCast:
 374   case Instruction::AddrSpaceCast:
 375     return true;
 376   case Instruction::Store: {
 377     // Must be the stored pointer operand, not a stored value.
 378     StoreInst *SI = cast<StoreInst>(Inst);
 379     return SI->getPointerOperand() == User;
 380   }
 381   default:
 382     return false;
 383   }
 384 }
 385
 386 static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
 387   ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
 388
 389   DEBUG(dbgs() << "Alloca candidate for vectorization\n");
 390
 391   // FIXME: There is no reason why we can't support larger arrays, we
 392   // are just being conservative for now.
 393   if (!AllocaTy ||
 394       AllocaTy->getElementType()->isVectorTy() ||
 395       AllocaTy->getNumElements() > 4 ||
 396       AllocaTy->getNumElements() < 2) {
 397     DEBUG(dbgs() << "  Cannot convert type to vector\n");
 398     return false;
 399   }
 400
 401   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
 402   std::vector<Value*> WorkList;
 403   for (User *AllocaUser : Alloca->users()) {
 404     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
 405     if (!GEP) {
 406       if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
 407         return false;
 408
 409       WorkList.push_back(AllocaUser);
 410       continue;
 411     }
 412
 413     Value *Index = GEPToVectorIndex(GEP);
 414
 415     // If we can't compute a vector index from this GEP, then we can't
 416     // promote this alloca to vector.
 417     if (!Index) {
 418       DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
 419       return false;
 420     }
 421
 422     GEPVectorIdx[GEP] = Index;
 423     for (User *GEPUser : AllocaUser->users()) {
 424       if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
 425         return false;
 426
 427       WorkList.push_back(GEPUser);
 428     }
 429   }
 430
 431   VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
 432
 433   DEBUG(dbgs() << "  Converting alloca to vector "
 434         << *AllocaTy << " -> " << *VectorTy << '\n');
 435
 436   for (Value *V : WorkList) {
 437     Instruction *Inst = cast<Instruction>(V);
 438     IRBuilder<> Builder(Inst);
 439     switch (Inst->getOpcode()) {
 440     case Instruction::Load: {
 441       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
 442       Value *Ptr = Inst->getOperand(0);
 443       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
 444
 445       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
 446       Value *VecValue = Builder.CreateLoad(BitCast);
 447       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
 448       Inst->replaceAllUsesWith(ExtractElement);
 449       Inst->eraseFromParent();
 450       break;
 451     }
 452     case Instruction::Store: {
 453       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
 454
 455       Value *Ptr = Inst->getOperand(1);
 456       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
 457       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
 458       Value *VecValue = Builder.CreateLoad(BitCast);
 459       Value *NewVecValue = Builder.CreateInsertElement(VecValue,
 460                                                        Inst->getOperand(0),
 461                                                        Index);
 462       Builder.CreateStore(NewVecValue, BitCast);
 463       Inst->eraseFromParent();
 464       break;
 465     }
 466     case Instruction::BitCast:
 467     case Instruction::AddrSpaceCast:
 468       break;
 469
 470     default:
 471       llvm_unreachable("Inconsistency in instructions promotable to vector");
 472     }
 473   }
 474   return true;
 475 }
 476
 477 static bool isCallPromotable(CallInst *CI) {
 478   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
 479   if (!II)
 480     return false;
 481
 482   switch (II->getIntrinsicID()) {
 483   case Intrinsic::memcpy:
 484   case Intrinsic::memmove:
 485   case Intrinsic::memset:
 486   case Intrinsic::lifetime_start:
 487   case Intrinsic::lifetime_end:
 488   case Intrinsic::invariant_start:
 489   case Intrinsic::invariant_end:
 490   case Intrinsic::invariant_group_barrier:
 491   case Intrinsic::objectsize:
 492     return true;
 493   default:
 494     return false;
 495   }
 496 }
 497
 498 bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
 499                                                           Value *Val,
 500                                                           Instruction *Inst,
 501                                                           int OpIdx0,
 502                                                           int OpIdx1) const {
 503   // Figure out which operand is the one we might not be promoting.
 504   Value *OtherOp = Inst->getOperand(OpIdx0);
 505   if (Val == OtherOp)
 506     OtherOp = Inst->getOperand(OpIdx1);
 507
 508   if (isa<ConstantPointerNull>(OtherOp))
 509     return true;
 510
 511   Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
 512   if (!isa<AllocaInst>(OtherObj))
 513     return false;
 514
 515   // TODO: We should be able to replace undefs with the right pointer type.
 516
 517   // TODO: If we know the other base object is another promotable
 518   // alloca, not necessarily this alloca, we can do this. The
 519   // important part is both must have the same address space at
 520   // the end.
 521   if (OtherObj != BaseAlloca) {
 522     DEBUG(dbgs() << "Found a binary instruction with another alloca object\n");
 523     return false;
 524   }
 525
 526   return true;
 527 }
 528
 529 bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
 530   Value *BaseAlloca,
 531   Value *Val,
 532   std::vector<Value*> &WorkList) const {
 533
 534   for (User *User : Val->users()) {
 535     if (is_contained(WorkList, User))
 536       continue;
 537
 538     if (CallInst *CI = dyn_cast<CallInst>(User)) {
 539       if (!isCallPromotable(CI))
 540         return false;
 541
 542       WorkList.push_back(User);
 543       continue;
 544     }
 545
 546     Instruction *UseInst = cast<Instruction>(User);
 547     if (UseInst->getOpcode() == Instruction::PtrToInt)
 548       return false;
 549
 550     if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
 551       if (LI->isVolatile())
 552         return false;
 553
 554       continue;
 555     }
 556
 557     if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
 558       if (SI->isVolatile())
 559         return false;
 560
 561       // Reject if the stored value is not the pointer operand.
 562       if (SI->getPointerOperand() != Val)
 563         return false;
 564     } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
 565       if (RMW->isVolatile())
 566         return false;
 567     } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
 568       if (CAS->isVolatile())
 569         return false;
 570     }
 571
 572     // Only promote a select if we know that the other select operand
 573     // is from another pointer that will also be promoted.
 574     if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
 575       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
 576         return false;
 577
 578       // May need to rewrite constant operands.
 579       WorkList.push_back(ICmp);
 580     }
 581
 582     if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
 583       // Don't collect the users of this.
 584       WorkList.push_back(User);
 585       continue;
 586     }
 587
 588     if (!User->getType()->isPointerTy())
 589       continue;
 590
 591     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
 592       // Be conservative if an address could be computed outside the bounds of
 593       // the alloca.
 594       if (!GEP->isInBounds())
 595         return false;
 596     }
 597
 598     // Only promote a select if we know that the other select operand is from
 599     // another pointer that will also be promoted.
 600     if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
 601       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
 602         return false;
 603     }
 604
 605     // Repeat for phis.
 606     if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
 607       // TODO: Handle more complex cases. We should be able to replace loops
 608       // over arrays.
 609       switch (Phi->getNumIncomingValues()) {
 610       case 1:
 611         break;
 612       case 2:
 613         if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
 614           return false;
 615         break;
 616       default:
 617         return false;
 618       }
 619     }
 620
 621     WorkList.push_back(User);
 622     if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
 623       return false;
 624   }
 625
 626   return true;
 627 }
 628
 629 // FIXME: Should try to pick the most likely to be profitable allocas first.
 630 void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
 631   // Array allocations are probably not worth handling, since an allocation of
 632   // the array type is the canonical form.
 633   if (!I.isStaticAlloca() || I.isArrayAllocation())
 634     return;
 635
 636   IRBuilder<> Builder(&I);
 637
 638   // First try to replace the alloca with a vector
 639   Type *AllocaTy = I.getAllocatedType();
 640
 641   DEBUG(dbgs() << "Trying to promote " << I << '\n');
 642
 643   if (tryPromoteAllocaToVector(&I)) {
 644     DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
 645     return;
 646   }
 647
 648   const Function &ContainingFunction = *I.getParent()->getParent();
 649
 650   // Don't promote the alloca to LDS for shader calling conventions as the work
 651   // item ID intrinsics are not supported for these calling conventions.
 652   // Furthermore not all LDS is available for some of the stages.
 653   if (AMDGPU::isShader(ContainingFunction.getCallingConv()))
 654     return;
 655
 656   const AMDGPUSubtarget &ST =
 657     TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
 658   // FIXME: We should also try to get this value from the reqd_work_group_size
 659   // function attribute if it is available.
 660   unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
 661
 662   const DataLayout &DL = Mod->getDataLayout();
 663
 664   unsigned Align = I.getAlignment();
 665   if (Align == 0)
 666     Align = DL.getABITypeAlignment(I.getAllocatedType());
 667
 668   // FIXME: This computed padding is likely wrong since it depends on inverse
 669   // usage order.
 670   //
 671   // FIXME: It is also possible that if we're allowed to use all of the memory
 672   // could could end up using more than the maximum due to alignment padding.
 673
 674   uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
 675   uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
 676   NewSize += AllocSize;
 677
 678   if (NewSize > LocalMemLimit) {
 679     DEBUG(dbgs() << "  " << AllocSize
 680           << " bytes of local memory not available to promote\n");
 681     return;
 682   }
 683
 684   CurrentLocalMemUsage = NewSize;
 685
 686   std::vector<Value*> WorkList;
 687
 688   if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
 689     DEBUG(dbgs() << " Do not know how to convert all uses\n");
 690     return;
 691   }
 692
 693   DEBUG(dbgs() << "Promoting alloca to local memory\n");
 694
 695   Function *F = I.getParent()->getParent();
 696
 697   Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
 698   GlobalVariable *GV = new GlobalVariable(
 699       *Mod, GVTy, false, GlobalValue::InternalLinkage,
 700       UndefValue::get(GVTy),
 701       Twine(F->getName()) + Twine('.') + I.getName(),
 702       nullptr,
 703       GlobalVariable::NotThreadLocal,
 704       AMDGPUAS::LOCAL_ADDRESS);
 705   GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
 706   GV->setAlignment(I.getAlignment());
 707
 708   Value *TCntY, *TCntZ;
 709
 710   std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
 711   Value *TIdX = getWorkitemID(Builder, 0);
 712   Value *TIdY = getWorkitemID(Builder, 1);
 713   Value *TIdZ = getWorkitemID(Builder, 2);
 714
 715   Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
 716   Tmp0 = Builder.CreateMul(Tmp0, TIdX);
 717   Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
 718   Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
 719   TID = Builder.CreateAdd(TID, TIdZ);
 720
 721   Value *Indices[] = {
 722     Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
 723     TID
 724   };
 725
 726   Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
 727   I.mutateType(Offset->getType());
 728   I.replaceAllUsesWith(Offset);
 729   I.eraseFromParent();
 730
 731   for (Value *V : WorkList) {
 732     CallInst *Call = dyn_cast<CallInst>(V);
 733     if (!Call) {
 734       if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
 735         Value *Src0 = CI->getOperand(0);
 736         Type *EltTy = Src0->getType()->getPointerElementType();
 737         PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
 738
 739         if (isa<ConstantPointerNull>(CI->getOperand(0)))
 740           CI->setOperand(0, ConstantPointerNull::get(NewTy));
 741
 742         if (isa<ConstantPointerNull>(CI->getOperand(1)))
 743           CI->setOperand(1, ConstantPointerNull::get(NewTy));
 744
 745         continue;
 746       }
 747
 748       // The operand's value should be corrected on its own and we don't want to
 749       // touch the users.
 750       if (isa<AddrSpaceCastInst>(V))
 751         continue;
 752
 753       Type *EltTy = V->getType()->getPointerElementType();
 754       PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
 755
 756       // FIXME: It doesn't really make sense to try to do this for all
 757       // instructions.
 758       V->mutateType(NewTy);
 759
 760       // Adjust the types of any constant operands.
 761       if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
 762         if (isa<ConstantPointerNull>(SI->getOperand(1)))
 763           SI->setOperand(1, ConstantPointerNull::get(NewTy));
 764
 765         if (isa<ConstantPointerNull>(SI->getOperand(2)))
 766           SI->setOperand(2, ConstantPointerNull::get(NewTy));
 767       } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
 768         for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
 769           if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
 770             Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
 771         }
 772       }
 773
 774       continue;
 775     }
 776
 777     IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
 778     Builder.SetInsertPoint(Intr);
 779     switch (Intr->getIntrinsicID()) {
 780     case Intrinsic::lifetime_start:
 781     case Intrinsic::lifetime_end:
 782       // These intrinsics are for address space 0 only
 783       Intr->eraseFromParent();
 784       continue;
 785     case Intrinsic::memcpy: {
 786       MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
 787       Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
 788                            MemCpy->getLength(), MemCpy->getAlignment(),
 789                            MemCpy->isVolatile());
 790       Intr->eraseFromParent();
 791       continue;
 792     }
 793     case Intrinsic::memmove: {
 794       MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
 795       Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
 796                             MemMove->getLength(), MemMove->getAlignment(),
 797                             MemMove->isVolatile());
 798       Intr->eraseFromParent();
 799       continue;
 800     }
 801     case Intrinsic::memset: {
 802       MemSetInst *MemSet = cast<MemSetInst>(Intr);
 803       Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
 804                            MemSet->getLength(), MemSet->getAlignment(),
 805                            MemSet->isVolatile());
 806       Intr->eraseFromParent();
 807       continue;
 808     }
 809     case Intrinsic::invariant_start:
 810     case Intrinsic::invariant_end:
 811     case Intrinsic::invariant_group_barrier:
 812       Intr->eraseFromParent();
 813       // FIXME: I think the invariant marker should still theoretically apply,
 814       // but the intrinsics need to be changed to accept pointers with any
 815       // address space.
 816       continue;
 817     case Intrinsic::objectsize: {
 818       Value *Src = Intr->getOperand(0);
 819       Type *SrcTy = Src->getType()->getPointerElementType();
 820       Function *ObjectSize = Intrinsic::getDeclaration(Mod,
 821         Intrinsic::objectsize,
 822         { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
 823       );
 824
 825       CallInst *NewCall
 826         = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
 827       Intr->replaceAllUsesWith(NewCall);
 828       Intr->eraseFromParent();
 829       continue;
 830     }
 831     default:
 832       Intr->dump();
 833       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
 834     }
 835   }
 836 }
 837
 838 FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
 839   return new AMDGPUPromoteAlloca(TM);
 840 }