contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp

   1 //===-- AtomicExpandPass.cpp - Expand atomic instructions -------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file contains a pass (at IR level) to replace atomic instructions with
  11 // __atomic_* library calls, or target specific instruction which implement the
  12 // same semantics in a way which better fits the target backend.  This can
  13 // include the use of (intrinsic-based) load-linked/store-conditional loops,
  14 // AtomicCmpXchg, or type coercions.
  15 //
  16 //===----------------------------------------------------------------------===//
  17
  18 #include "llvm/CodeGen/AtomicExpandUtils.h"
  19 #include "llvm/CodeGen/Passes.h"
  20 #include "llvm/IR/Function.h"
  21 #include "llvm/IR/IRBuilder.h"
  22 #include "llvm/IR/InstIterator.h"
  23 #include "llvm/IR/Instructions.h"
  24 #include "llvm/IR/Intrinsics.h"
  25 #include "llvm/IR/Module.h"
  26 #include "llvm/Support/Debug.h"
  27 #include "llvm/Support/raw_ostream.h"
  28 #include "llvm/Target/TargetLowering.h"
  29 #include "llvm/Target/TargetMachine.h"
  30 #include "llvm/Target/TargetSubtargetInfo.h"
  31
  32 using namespace llvm;
  33
  34 #define DEBUG_TYPE "atomic-expand"
  35
  36 namespace {
  37   class AtomicExpand: public FunctionPass {
  38     const TargetMachine *TM;
  39     const TargetLowering *TLI;
  40   public:
  41     static char ID; // Pass identification, replacement for typeid
  42     explicit AtomicExpand(const TargetMachine *TM = nullptr)
  43       : FunctionPass(ID), TM(TM), TLI(nullptr) {
  44       initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
  45     }
  46
  47     bool runOnFunction(Function &F) override;
  48
  49   private:
  50     bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
  51                                bool IsStore, bool IsLoad);
  52     IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
  53     LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
  54     bool tryExpandAtomicLoad(LoadInst *LI);
  55     bool expandAtomicLoadToLL(LoadInst *LI);
  56     bool expandAtomicLoadToCmpXchg(LoadInst *LI);
  57     StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
  58     bool expandAtomicStore(StoreInst *SI);
  59     bool tryExpandAtomicRMW(AtomicRMWInst *AI);
  60     Value *
  61     insertRMWLLSCLoop(IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
  62                       AtomicOrdering MemOpOrder,
  63                       function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
  64     void expandAtomicOpToLLSC(
  65         Instruction *I, Type *ResultTy, Value *Addr, AtomicOrdering MemOpOrder,
  66         function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
  67     void expandPartwordAtomicRMW(
  68         AtomicRMWInst *I,
  69         TargetLoweringBase::AtomicExpansionKind ExpansionKind);
  70     void expandPartwordCmpXchg(AtomicCmpXchgInst *I);
  71
  72     AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
  73     static Value *insertRMWCmpXchgLoop(
  74         IRBuilder<> &Builder, Type *ResultType, Value *Addr,
  75         AtomicOrdering MemOpOrder,
  76         function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
  77         CreateCmpXchgInstFun CreateCmpXchg);
  78
  79     bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
  80     bool isIdempotentRMW(AtomicRMWInst *AI);
  81     bool simplifyIdempotentRMW(AtomicRMWInst *AI);
  82
  83     bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, unsigned Align,
  84                                  Value *PointerOperand, Value *ValueOperand,
  85                                  Value *CASExpected, AtomicOrdering Ordering,
  86                                  AtomicOrdering Ordering2,
  87                                  ArrayRef<RTLIB::Libcall> Libcalls);
  88     void expandAtomicLoadToLibcall(LoadInst *LI);
  89     void expandAtomicStoreToLibcall(StoreInst *LI);
  90     void expandAtomicRMWToLibcall(AtomicRMWInst *I);
  91     void expandAtomicCASToLibcall(AtomicCmpXchgInst *I);
  92
  93     friend bool
  94     llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
  95                                    CreateCmpXchgInstFun CreateCmpXchg);
  96   };
  97 }
  98
  99 char AtomicExpand::ID = 0;
 100 char &llvm::AtomicExpandID = AtomicExpand::ID;
 101 INITIALIZE_TM_PASS(AtomicExpand, "atomic-expand", "Expand Atomic instructions",
 102                    false, false)
 103
 104 FunctionPass *llvm::createAtomicExpandPass(const TargetMachine *TM) {
 105   return new AtomicExpand(TM);
 106 }
 107
 108 namespace {
 109 // Helper functions to retrieve the size of atomic instructions.
 110 unsigned getAtomicOpSize(LoadInst *LI) {
 111   const DataLayout &DL = LI->getModule()->getDataLayout();
 112   return DL.getTypeStoreSize(LI->getType());
 113 }
 114
 115 unsigned getAtomicOpSize(StoreInst *SI) {
 116   const DataLayout &DL = SI->getModule()->getDataLayout();
 117   return DL.getTypeStoreSize(SI->getValueOperand()->getType());
 118 }
 119
 120 unsigned getAtomicOpSize(AtomicRMWInst *RMWI) {
 121   const DataLayout &DL = RMWI->getModule()->getDataLayout();
 122   return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
 123 }
 124
 125 unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
 126   const DataLayout &DL = CASI->getModule()->getDataLayout();
 127   return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
 128 }
 129
 130 // Helper functions to retrieve the alignment of atomic instructions.
 131 unsigned getAtomicOpAlign(LoadInst *LI) {
 132   unsigned Align = LI->getAlignment();
 133   // In the future, if this IR restriction is relaxed, we should
 134   // return DataLayout::getABITypeAlignment when there's no align
 135   // value.
 136   assert(Align != 0 && "An atomic LoadInst always has an explicit alignment");
 137   return Align;
 138 }
 139
 140 unsigned getAtomicOpAlign(StoreInst *SI) {
 141   unsigned Align = SI->getAlignment();
 142   // In the future, if this IR restriction is relaxed, we should
 143   // return DataLayout::getABITypeAlignment when there's no align
 144   // value.
 145   assert(Align != 0 && "An atomic StoreInst always has an explicit alignment");
 146   return Align;
 147 }
 148
 149 unsigned getAtomicOpAlign(AtomicRMWInst *RMWI) {
 150   // TODO(PR27168): This instruction has no alignment attribute, but unlike the
 151   // default alignment for load/store, the default here is to assume
 152   // it has NATURAL alignment, not DataLayout-specified alignment.
 153   const DataLayout &DL = RMWI->getModule()->getDataLayout();
 154   return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
 155 }
 156
 157 unsigned getAtomicOpAlign(AtomicCmpXchgInst *CASI) {
 158   // TODO(PR27168): same comment as above.
 159   const DataLayout &DL = CASI->getModule()->getDataLayout();
 160   return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
 161 }
 162
 163 // Determine if a particular atomic operation has a supported size,
 164 // and is of appropriate alignment, to be passed through for target
 165 // lowering. (Versus turning into a __atomic libcall)
 166 template <typename Inst>
 167 bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
 168   unsigned Size = getAtomicOpSize(I);
 169   unsigned Align = getAtomicOpAlign(I);
 170   return Align >= Size && Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
 171 }
 172
 173 } // end anonymous namespace
 174
 175 bool AtomicExpand::runOnFunction(Function &F) {
 176   if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand())
 177     return false;
 178   TLI = TM->getSubtargetImpl(F)->getTargetLowering();
 179
 180   SmallVector<Instruction *, 1> AtomicInsts;
 181
 182   // Changing control-flow while iterating through it is a bad idea, so gather a
 183   // list of all atomic instructions before we start.
 184   for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
 185     Instruction *I = &*II;
 186     if (I->isAtomic() && !isa<FenceInst>(I))
 187       AtomicInsts.push_back(I);
 188   }
 189
 190   bool MadeChange = false;
 191   for (auto I : AtomicInsts) {
 192     auto LI = dyn_cast<LoadInst>(I);
 193     auto SI = dyn_cast<StoreInst>(I);
 194     auto RMWI = dyn_cast<AtomicRMWInst>(I);
 195     auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
 196     assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction");
 197
 198     // If the Size/Alignment is not supported, replace with a libcall.
 199     if (LI) {
 200       if (!atomicSizeSupported(TLI, LI)) {
 201         expandAtomicLoadToLibcall(LI);
 202         MadeChange = true;
 203         continue;
 204       }
 205     } else if (SI) {
 206       if (!atomicSizeSupported(TLI, SI)) {
 207         expandAtomicStoreToLibcall(SI);
 208         MadeChange = true;
 209         continue;
 210       }
 211     } else if (RMWI) {
 212       if (!atomicSizeSupported(TLI, RMWI)) {
 213         expandAtomicRMWToLibcall(RMWI);
 214         MadeChange = true;
 215         continue;
 216       }
 217     } else if (CASI) {
 218       if (!atomicSizeSupported(TLI, CASI)) {
 219         expandAtomicCASToLibcall(CASI);
 220         MadeChange = true;
 221         continue;
 222       }
 223     }
 224
 225     if (TLI->shouldInsertFencesForAtomic(I)) {
 226       auto FenceOrdering = AtomicOrdering::Monotonic;
 227       bool IsStore, IsLoad;
 228       if (LI && isAcquireOrStronger(LI->getOrdering())) {
 229         FenceOrdering = LI->getOrdering();
 230         LI->setOrdering(AtomicOrdering::Monotonic);
 231         IsStore = false;
 232         IsLoad = true;
 233       } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
 234         FenceOrdering = SI->getOrdering();
 235         SI->setOrdering(AtomicOrdering::Monotonic);
 236         IsStore = true;
 237         IsLoad = false;
 238       } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
 239                           isAcquireOrStronger(RMWI->getOrdering()))) {
 240         FenceOrdering = RMWI->getOrdering();
 241         RMWI->setOrdering(AtomicOrdering::Monotonic);
 242         IsStore = IsLoad = true;
 243       } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
 244                  (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
 245                   isAcquireOrStronger(CASI->getSuccessOrdering()))) {
 246         // If a compare and swap is lowered to LL/SC, we can do smarter fence
 247         // insertion, with a stronger one on the success path than on the
 248         // failure path. As a result, fence insertion is directly done by
 249         // expandAtomicCmpXchg in that case.
 250         FenceOrdering = CASI->getSuccessOrdering();
 251         CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
 252         CASI->setFailureOrdering(AtomicOrdering::Monotonic);
 253         IsStore = IsLoad = true;
 254       }
 255
 256       if (FenceOrdering != AtomicOrdering::Monotonic) {
 257         MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad);
 258       }
 259     }
 260
 261     if (LI) {
 262       if (LI->getType()->isFloatingPointTy()) {
 263         // TODO: add a TLI hook to control this so that each target can
 264         // convert to lowering the original type one at a time.
 265         LI = convertAtomicLoadToIntegerType(LI);
 266         assert(LI->getType()->isIntegerTy() && "invariant broken");
 267         MadeChange = true;
 268       }
 269
 270       MadeChange |= tryExpandAtomicLoad(LI);
 271     } else if (SI) {
 272       if (SI->getValueOperand()->getType()->isFloatingPointTy()) {
 273         // TODO: add a TLI hook to control this so that each target can
 274         // convert to lowering the original type one at a time.
 275         SI = convertAtomicStoreToIntegerType(SI);
 276         assert(SI->getValueOperand()->getType()->isIntegerTy() &&
 277                "invariant broken");
 278         MadeChange = true;
 279       }
 280
 281       if (TLI->shouldExpandAtomicStoreInIR(SI))
 282         MadeChange |= expandAtomicStore(SI);
 283     } else if (RMWI) {
 284       // There are two different ways of expanding RMW instructions:
 285       // - into a load if it is idempotent
 286       // - into a Cmpxchg/LL-SC loop otherwise
 287       // we try them in that order.
 288
 289       if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
 290         MadeChange = true;
 291       } else {
 292         MadeChange |= tryExpandAtomicRMW(RMWI);
 293       }
 294     } else if (CASI) {
 295       // TODO: when we're ready to make the change at the IR level, we can
 296       // extend convertCmpXchgToInteger for floating point too.
 297       assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() &&
 298              "unimplemented - floating point not legal at IR level");
 299       if (CASI->getCompareOperand()->getType()->isPointerTy() ) {
 300         // TODO: add a TLI hook to control this so that each target can
 301         // convert to lowering the original type one at a time.
 302         CASI = convertCmpXchgToIntegerType(CASI);
 303         assert(CASI->getCompareOperand()->getType()->isIntegerTy() &&
 304                "invariant broken");
 305         MadeChange = true;
 306       }
 307
 308       unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
 309       unsigned ValueSize = getAtomicOpSize(CASI);
 310       if (ValueSize < MinCASSize) {
 311         assert(!TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
 312                "MinCmpXchgSizeInBits not yet supported for LL/SC expansions.");
 313         expandPartwordCmpXchg(CASI);
 314       } else {
 315         if (TLI->shouldExpandAtomicCmpXchgInIR(CASI))
 316           MadeChange |= expandAtomicCmpXchg(CASI);
 317       }
 318     }
 319   }
 320   return MadeChange;
 321 }
 322
 323 bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
 324                                          bool IsStore, bool IsLoad) {
 325   IRBuilder<> Builder(I);
 326
 327   auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad);
 328
 329   auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad);
 330   // The trailing fence is emitted before the instruction instead of after
 331   // because there is no easy way of setting Builder insertion point after
 332   // an instruction. So we must erase it from the BB, and insert it back
 333   // in the right place.
 334   // We have a guard here because not every atomic operation generates a
 335   // trailing fence.
 336   if (TrailingFence) {
 337     TrailingFence->removeFromParent();
 338     TrailingFence->insertAfter(I);
 339   }
 340
 341   return (LeadingFence || TrailingFence);
 342 }
 343
 344 /// Get the iX type with the same bitwidth as T.
 345 IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
 346                                                        const DataLayout &DL) {
 347   EVT VT = TLI->getValueType(DL, T);
 348   unsigned BitWidth = VT.getStoreSizeInBits();
 349   assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
 350   return IntegerType::get(T->getContext(), BitWidth);
 351 }
 352
 353 /// Convert an atomic load of a non-integral type to an integer load of the
 354 /// equivalent bitwidth.  See the function comment on
 355 /// convertAtomicStoreToIntegerType for background.
 356 LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
 357   auto *M = LI->getModule();
 358   Type *NewTy = getCorrespondingIntegerType(LI->getType(),
 359                                             M->getDataLayout());
 360
 361   IRBuilder<> Builder(LI);
 362
 363   Value *Addr = LI->getPointerOperand();
 364   Type *PT = PointerType::get(NewTy,
 365                               Addr->getType()->getPointerAddressSpace());
 366   Value *NewAddr = Builder.CreateBitCast(Addr, PT);
 367
 368   auto *NewLI = Builder.CreateLoad(NewAddr);
 369   NewLI->setAlignment(LI->getAlignment());
 370   NewLI->setVolatile(LI->isVolatile());
 371   NewLI->setAtomic(LI->getOrdering(), LI->getSynchScope());
 372   DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
 373
 374   Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
 375   LI->replaceAllUsesWith(NewVal);
 376   LI->eraseFromParent();
 377   return NewLI;
 378 }
 379
 380 bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
 381   switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
 382   case TargetLoweringBase::AtomicExpansionKind::None:
 383     return false;
 384   case TargetLoweringBase::AtomicExpansionKind::LLSC:
 385     expandAtomicOpToLLSC(
 386         LI, LI->getType(), LI->getPointerOperand(), LI->getOrdering(),
 387         [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
 388     return true;
 389   case TargetLoweringBase::AtomicExpansionKind::LLOnly:
 390     return expandAtomicLoadToLL(LI);
 391   case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
 392     return expandAtomicLoadToCmpXchg(LI);
 393   }
 394   llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
 395 }
 396
 397 bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
 398   IRBuilder<> Builder(LI);
 399
 400   // On some architectures, load-linked instructions are atomic for larger
 401   // sizes than normal loads. For example, the only 64-bit load guaranteed
 402   // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
 403   Value *Val =
 404       TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering());
 405   TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
 406
 407   LI->replaceAllUsesWith(Val);
 408   LI->eraseFromParent();
 409
 410   return true;
 411 }
 412
 413 bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
 414   IRBuilder<> Builder(LI);
 415   AtomicOrdering Order = LI->getOrdering();
 416   Value *Addr = LI->getPointerOperand();
 417   Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
 418   Constant *DummyVal = Constant::getNullValue(Ty);
 419
 420   Value *Pair = Builder.CreateAtomicCmpXchg(
 421       Addr, DummyVal, DummyVal, Order,
 422       AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
 423   Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded");
 424
 425   LI->replaceAllUsesWith(Loaded);
 426   LI->eraseFromParent();
 427
 428   return true;
 429 }
 430
 431 /// Convert an atomic store of a non-integral type to an integer store of the
 432 /// equivalent bitwidth.  We used to not support floating point or vector
 433 /// atomics in the IR at all.  The backends learned to deal with the bitcast
 434 /// idiom because that was the only way of expressing the notion of a atomic
 435 /// float or vector store.  The long term plan is to teach each backend to
 436 /// instruction select from the original atomic store, but as a migration
 437 /// mechanism, we convert back to the old format which the backends understand.
 438 /// Each backend will need individual work to recognize the new format.
 439 StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
 440   IRBuilder<> Builder(SI);
 441   auto *M = SI->getModule();
 442   Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
 443                                             M->getDataLayout());
 444   Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);
 445
 446   Value *Addr = SI->getPointerOperand();
 447   Type *PT = PointerType::get(NewTy,
 448                               Addr->getType()->getPointerAddressSpace());
 449   Value *NewAddr = Builder.CreateBitCast(Addr, PT);
 450
 451   StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
 452   NewSI->setAlignment(SI->getAlignment());
 453   NewSI->setVolatile(SI->isVolatile());
 454   NewSI->setAtomic(SI->getOrdering(), SI->getSynchScope());
 455   DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
 456   SI->eraseFromParent();
 457   return NewSI;
 458 }
 459
 460 bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
 461   // This function is only called on atomic stores that are too large to be
 462   // atomic if implemented as a native store. So we replace them by an
 463   // atomic swap, that can be implemented for example as a ldrex/strex on ARM
 464   // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.
 465   // It is the responsibility of the target to only signal expansion via
 466   // shouldExpandAtomicRMW in cases where this is required and possible.
 467   IRBuilder<> Builder(SI);
 468   AtomicRMWInst *AI =
 469       Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
 470                               SI->getValueOperand(), SI->getOrdering());
 471   SI->eraseFromParent();
 472
 473   // Now we have an appropriate swap instruction, lower it as usual.
 474   return tryExpandAtomicRMW(AI);
 475 }
 476
 477 static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
 478                                  Value *Loaded, Value *NewVal,
 479                                  AtomicOrdering MemOpOrder,
 480                                  Value *&Success, Value *&NewLoaded) {
 481   Value* Pair = Builder.CreateAtomicCmpXchg(
 482       Addr, Loaded, NewVal, MemOpOrder,
 483       AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
 484   Success = Builder.CreateExtractValue(Pair, 1, "success");
 485   NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
 486 }
 487
 488 /// Emit IR to implement the given atomicrmw operation on values in registers,
 489 /// returning the new value.
 490 static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
 491                               Value *Loaded, Value *Inc) {
 492   Value *NewVal;
 493   switch (Op) {
 494   case AtomicRMWInst::Xchg:
 495     return Inc;
 496   case AtomicRMWInst::Add:
 497     return Builder.CreateAdd(Loaded, Inc, "new");
 498   case AtomicRMWInst::Sub:
 499     return Builder.CreateSub(Loaded, Inc, "new");
 500   case AtomicRMWInst::And:
 501     return Builder.CreateAnd(Loaded, Inc, "new");
 502   case AtomicRMWInst::Nand:
 503     return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
 504   case AtomicRMWInst::Or:
 505     return Builder.CreateOr(Loaded, Inc, "new");
 506   case AtomicRMWInst::Xor:
 507     return Builder.CreateXor(Loaded, Inc, "new");
 508   case AtomicRMWInst::Max:
 509     NewVal = Builder.CreateICmpSGT(Loaded, Inc);
 510     return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
 511   case AtomicRMWInst::Min:
 512     NewVal = Builder.CreateICmpSLE(Loaded, Inc);
 513     return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
 514   case AtomicRMWInst::UMax:
 515     NewVal = Builder.CreateICmpUGT(Loaded, Inc);
 516     return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
 517   case AtomicRMWInst::UMin:
 518     NewVal = Builder.CreateICmpULE(Loaded, Inc);
 519     return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
 520   default:
 521     llvm_unreachable("Unknown atomic op");
 522   }
 523 }
 524
 525 bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
 526   switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
 527   case TargetLoweringBase::AtomicExpansionKind::None:
 528     return false;
 529   case TargetLoweringBase::AtomicExpansionKind::LLSC: {
 530     unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
 531     unsigned ValueSize = getAtomicOpSize(AI);
 532     if (ValueSize < MinCASSize) {
 533       llvm_unreachable(
 534           "MinCmpXchgSizeInBits not yet supported for LL/SC architectures.");
 535     } else {
 536       auto PerformOp = [&](IRBuilder<> &Builder, Value *Loaded) {
 537         return performAtomicOp(AI->getOperation(), Builder, Loaded,
 538                                AI->getValOperand());
 539       };
 540       expandAtomicOpToLLSC(AI, AI->getType(), AI->getPointerOperand(),
 541                            AI->getOrdering(), PerformOp);
 542     }
 543     return true;
 544   }
 545   case TargetLoweringBase::AtomicExpansionKind::CmpXChg: {
 546     unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
 547     unsigned ValueSize = getAtomicOpSize(AI);
 548     if (ValueSize < MinCASSize) {
 549       expandPartwordAtomicRMW(AI,
 550                               TargetLoweringBase::AtomicExpansionKind::CmpXChg);
 551     } else {
 552       expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
 553     }
 554     return true;
 555   }
 556   default:
 557     llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
 558   }
 559 }
 560
 561 namespace {
 562
 563 /// Result values from createMaskInstrs helper.
 564 struct PartwordMaskValues {
 565   Type *WordType;
 566   Type *ValueType;
 567   Value *AlignedAddr;
 568   Value *ShiftAmt;
 569   Value *Mask;
 570   Value *Inv_Mask;
 571 };
 572 } // end anonymous namespace
 573
 574 /// This is a helper function which builds instructions to provide
 575 /// values necessary for partword atomic operations. It takes an
 576 /// incoming address, Addr, and ValueType, and constructs the address,
 577 /// shift-amounts and masks needed to work with a larger value of size
 578 /// WordSize.
 579 ///
 580 /// AlignedAddr: Addr rounded down to a multiple of WordSize
 581 ///
 582 /// ShiftAmt: Number of bits to right-shift a WordSize value loaded
 583 ///           from AlignAddr for it to have the same value as if
 584 ///           ValueType was loaded from Addr.
 585 ///
 586 /// Mask: Value to mask with the value loaded from AlignAddr to
 587 ///       include only the part that would've been loaded from Addr.
 588 ///
 589 /// Inv_Mask: The inverse of Mask.
 590
 591 static PartwordMaskValues createMaskInstrs(IRBuilder<> &Builder, Instruction *I,
 592                                            Type *ValueType, Value *Addr,
 593                                            unsigned WordSize) {
 594   PartwordMaskValues Ret;
 595
 596   BasicBlock *BB = I->getParent();
 597   Function *F = BB->getParent();
 598   Module *M = I->getModule();
 599
 600   LLVMContext &Ctx = F->getContext();
 601   const DataLayout &DL = M->getDataLayout();
 602
 603   unsigned ValueSize = DL.getTypeStoreSize(ValueType);
 604
 605   assert(ValueSize < WordSize);
 606
 607   Ret.ValueType = ValueType;
 608   Ret.WordType = Type::getIntNTy(Ctx, WordSize * 8);
 609
 610   Type *WordPtrType =
 611       Ret.WordType->getPointerTo(Addr->getType()->getPointerAddressSpace());
 612
 613   Value *AddrInt = Builder.CreatePtrToInt(Addr, DL.getIntPtrType(Ctx));
 614   Ret.AlignedAddr = Builder.CreateIntToPtr(
 615       Builder.CreateAnd(AddrInt, ~(uint64_t)(WordSize - 1)), WordPtrType,
 616       "AlignedAddr");
 617
 618   Value *PtrLSB = Builder.CreateAnd(AddrInt, WordSize - 1, "PtrLSB");
 619   if (DL.isLittleEndian()) {
 620     // turn bytes into bits
 621     Ret.ShiftAmt = Builder.CreateShl(PtrLSB, 3);
 622   } else {
 623     // turn bytes into bits, and count from the other side.
 624     Ret.ShiftAmt =
 625         Builder.CreateShl(Builder.CreateXor(PtrLSB, WordSize - ValueSize), 3);
 626   }
 627
 628   Ret.ShiftAmt = Builder.CreateTrunc(Ret.ShiftAmt, Ret.WordType, "ShiftAmt");
 629   Ret.Mask = Builder.CreateShl(
 630       ConstantInt::get(Ret.WordType, (1 << ValueSize * 8) - 1), Ret.ShiftAmt,
 631       "Mask");
 632   Ret.Inv_Mask = Builder.CreateNot(Ret.Mask, "Inv_Mask");
 633
 634   return Ret;
 635 }
 636
 637 /// Emit IR to implement a masked version of a given atomicrmw
 638 /// operation. (That is, only the bits under the Mask should be
 639 /// affected by the operation)
 640 static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
 641                                     IRBuilder<> &Builder, Value *Loaded,
 642                                     Value *Shifted_Inc, Value *Inc,
 643                                     const PartwordMaskValues &PMV) {
 644   switch (Op) {
 645   case AtomicRMWInst::Xchg: {
 646     Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
 647     Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, Shifted_Inc);
 648     return FinalVal;
 649   }
 650   case AtomicRMWInst::Or:
 651   case AtomicRMWInst::Xor:
 652     // Or/Xor won't affect any other bits, so can just be done
 653     // directly.
 654     return performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
 655   case AtomicRMWInst::Add:
 656   case AtomicRMWInst::Sub:
 657   case AtomicRMWInst::And:
 658   case AtomicRMWInst::Nand: {
 659     // The other arithmetic ops need to be masked into place.
 660     Value *NewVal = performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
 661     Value *NewVal_Masked = Builder.CreateAnd(NewVal, PMV.Mask);
 662     Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
 663     Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Masked);
 664     return FinalVal;
 665   }
 666   case AtomicRMWInst::Max:
 667   case AtomicRMWInst::Min:
 668   case AtomicRMWInst::UMax:
 669   case AtomicRMWInst::UMin: {
 670     // Finally, comparison ops will operate on the full value, so
 671     // truncate down to the original size, and expand out again after
 672     // doing the operation.
 673     Value *Loaded_Shiftdown = Builder.CreateTrunc(
 674         Builder.CreateLShr(Loaded, PMV.ShiftAmt), PMV.ValueType);
 675     Value *NewVal = performAtomicOp(Op, Builder, Loaded_Shiftdown, Inc);
 676     Value *NewVal_Shiftup = Builder.CreateShl(
 677         Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
 678     Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
 679     Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shiftup);
 680     return FinalVal;
 681   }
 682   default:
 683     llvm_unreachable("Unknown atomic op");
 684   }
 685 }
 686
 687 /// Expand a sub-word atomicrmw operation into an appropriate
 688 /// word-sized operation.
 689 ///
 690 /// It will create an LL/SC or cmpxchg loop, as appropriate, the same
 691 /// way as a typical atomicrmw expansion. The only difference here is
 692 /// that the operation inside of the loop must operate only upon a
 693 /// part of the value.
 694 void AtomicExpand::expandPartwordAtomicRMW(
 695     AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) {
 696
 697   assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg);
 698
 699   AtomicOrdering MemOpOrder = AI->getOrdering();
 700
 701   IRBuilder<> Builder(AI);
 702
 703   PartwordMaskValues PMV =
 704       createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
 705                        TLI->getMinCmpXchgSizeInBits() / 8);
 706
 707   Value *ValOperand_Shifted =
 708       Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
 709                         PMV.ShiftAmt, "ValOperand_Shifted");
 710
 711   auto PerformPartwordOp = [&](IRBuilder<> &Builder, Value *Loaded) {
 712     return performMaskedAtomicOp(AI->getOperation(), Builder, Loaded,
 713                                  ValOperand_Shifted, AI->getValOperand(), PMV);
 714   };
 715
 716   // TODO: When we're ready to support LLSC conversions too, use
 717   // insertRMWLLSCLoop here for ExpansionKind==LLSC.
 718   Value *OldResult =
 719       insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder,
 720                            PerformPartwordOp, createCmpXchgInstFun);
 721   Value *FinalOldResult = Builder.CreateTrunc(
 722       Builder.CreateLShr(OldResult, PMV.ShiftAmt), PMV.ValueType);
 723   AI->replaceAllUsesWith(FinalOldResult);
 724   AI->eraseFromParent();
 725 }
 726
 727 void AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
 728   // The basic idea here is that we're expanding a cmpxchg of a
 729   // smaller memory size up to a word-sized cmpxchg. To do this, we
 730   // need to add a retry-loop for strong cmpxchg, so that
 731   // modifications to other parts of the word don't cause a spurious
 732   // failure.
 733
 734   // This generates code like the following:
 735   //     [[Setup mask values PMV.*]]
 736   //     %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt
 737   //     %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt
 738   //     %InitLoaded = load i32* %addr
 739   //     %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask
 740   //     br partword.cmpxchg.loop
 741   // partword.cmpxchg.loop:
 742   //     %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ],
 743   //        [ %OldVal_MaskOut, %partword.cmpxchg.failure ]
 744   //     %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted
 745   //     %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted
 746   //     %NewCI = cmpxchg i32* %PMV.AlignedAddr, i32 %FullWord_Cmp,
 747   //        i32 %FullWord_NewVal success_ordering failure_ordering
 748   //     %OldVal = extractvalue { i32, i1 } %NewCI, 0
 749   //     %Success = extractvalue { i32, i1 } %NewCI, 1
 750   //     br i1 %Success, label %partword.cmpxchg.end,
 751   //        label %partword.cmpxchg.failure
 752   // partword.cmpxchg.failure:
 753   //     %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask
 754   //     %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut
 755   //     br i1 %ShouldContinue, label %partword.cmpxchg.loop,
 756   //         label %partword.cmpxchg.end
 757   // partword.cmpxchg.end:
 758   //    %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt
 759   //    %FinalOldVal = trunc i32 %tmp1 to i8
 760   //    %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0
 761   //    %Res = insertvalue { i8, i1 } %25, i1 %Success, 1
 762
 763   Value *Addr = CI->getPointerOperand();
 764   Value *Cmp = CI->getCompareOperand();
 765   Value *NewVal = CI->getNewValOperand();
 766
 767   BasicBlock *BB = CI->getParent();
 768   Function *F = BB->getParent();
 769   IRBuilder<> Builder(CI);
 770   LLVMContext &Ctx = Builder.getContext();
 771
 772   const int WordSize = TLI->getMinCmpXchgSizeInBits() / 8;
 773
 774   BasicBlock *EndBB =
 775       BB->splitBasicBlock(CI->getIterator(), "partword.cmpxchg.end");
 776   auto FailureBB =
 777       BasicBlock::Create(Ctx, "partword.cmpxchg.failure", F, EndBB);
 778   auto LoopBB = BasicBlock::Create(Ctx, "partword.cmpxchg.loop", F, FailureBB);
 779
 780   // The split call above "helpfully" added a branch at the end of BB
 781   // (to the wrong place).
 782   std::prev(BB->end())->eraseFromParent();
 783   Builder.SetInsertPoint(BB);
 784
 785   PartwordMaskValues PMV = createMaskInstrs(
 786       Builder, CI, CI->getCompareOperand()->getType(), Addr, WordSize);
 787
 788   // Shift the incoming values over, into the right location in the word.
 789   Value *NewVal_Shifted =
 790       Builder.CreateShl(Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
 791   Value *Cmp_Shifted =
 792       Builder.CreateShl(Builder.CreateZExt(Cmp, PMV.WordType), PMV.ShiftAmt);
 793
 794   // Load the entire current word, and mask into place the expected and new
 795   // values
 796   LoadInst *InitLoaded = Builder.CreateLoad(PMV.WordType, PMV.AlignedAddr);
 797   InitLoaded->setVolatile(CI->isVolatile());
 798   Value *InitLoaded_MaskOut = Builder.CreateAnd(InitLoaded, PMV.Inv_Mask);
 799   Builder.CreateBr(LoopBB);
 800
 801   // partword.cmpxchg.loop:
 802   Builder.SetInsertPoint(LoopBB);
 803   PHINode *Loaded_MaskOut = Builder.CreatePHI(PMV.WordType, 2);
 804   Loaded_MaskOut->addIncoming(InitLoaded_MaskOut, BB);
 805
 806   // Mask/Or the expected and new values into place in the loaded word.
 807   Value *FullWord_NewVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shifted);
 808   Value *FullWord_Cmp = Builder.CreateOr(Loaded_MaskOut, Cmp_Shifted);
 809   AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg(
 810       PMV.AlignedAddr, FullWord_Cmp, FullWord_NewVal, CI->getSuccessOrdering(),
 811       CI->getFailureOrdering(), CI->getSynchScope());
 812   NewCI->setVolatile(CI->isVolatile());
 813   // When we're building a strong cmpxchg, we need a loop, so you
 814   // might think we could use a weak cmpxchg inside. But, using strong
 815   // allows the below comparison for ShouldContinue, and we're
 816   // expecting the underlying cmpxchg to be a machine instruction,
 817   // which is strong anyways.
 818   NewCI->setWeak(CI->isWeak());
 819
 820   Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
 821   Value *Success = Builder.CreateExtractValue(NewCI, 1);
 822
 823   if (CI->isWeak())
 824     Builder.CreateBr(EndBB);
 825   else
 826     Builder.CreateCondBr(Success, EndBB, FailureBB);
 827
 828   // partword.cmpxchg.failure:
 829   Builder.SetInsertPoint(FailureBB);
 830   // Upon failure, verify that the masked-out part of the loaded value
 831   // has been modified.  If it didn't, abort the cmpxchg, since the
 832   // masked-in part must've.
 833   Value *OldVal_MaskOut = Builder.CreateAnd(OldVal, PMV.Inv_Mask);
 834   Value *ShouldContinue = Builder.CreateICmpNE(Loaded_MaskOut, OldVal_MaskOut);
 835   Builder.CreateCondBr(ShouldContinue, LoopBB, EndBB);
 836
 837   // Add the second value to the phi from above
 838   Loaded_MaskOut->addIncoming(OldVal_MaskOut, FailureBB);
 839
 840   // partword.cmpxchg.end:
 841   Builder.SetInsertPoint(CI);
 842
 843   Value *FinalOldVal = Builder.CreateTrunc(
 844       Builder.CreateLShr(OldVal, PMV.ShiftAmt), PMV.ValueType);
 845   Value *Res = UndefValue::get(CI->getType());
 846   Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
 847   Res = Builder.CreateInsertValue(Res, Success, 1);
 848
 849   CI->replaceAllUsesWith(Res);
 850   CI->eraseFromParent();
 851 }
 852
 853 void AtomicExpand::expandAtomicOpToLLSC(
 854     Instruction *I, Type *ResultType, Value *Addr, AtomicOrdering MemOpOrder,
 855     function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
 856   IRBuilder<> Builder(I);
 857   Value *Loaded =
 858       insertRMWLLSCLoop(Builder, ResultType, Addr, MemOpOrder, PerformOp);
 859
 860   I->replaceAllUsesWith(Loaded);
 861   I->eraseFromParent();
 862 }
 863
 864 Value *AtomicExpand::insertRMWLLSCLoop(
 865     IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
 866     AtomicOrdering MemOpOrder,
 867     function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
 868   LLVMContext &Ctx = Builder.getContext();
 869   BasicBlock *BB = Builder.GetInsertBlock();
 870   Function *F = BB->getParent();
 871
 872   // Given: atomicrmw some_op iN* %addr, iN %incr ordering
 873   //
 874   // The standard expansion we produce is:
 875   //     [...]
 876   // atomicrmw.start:
 877   //     %loaded = @load.linked(%addr)
 878   //     %new = some_op iN %loaded, %incr
 879   //     %stored = @store_conditional(%new, %addr)
 880   //     %try_again = icmp i32 ne %stored, 0
 881   //     br i1 %try_again, label %loop, label %atomicrmw.end
 882   // atomicrmw.end:
 883   //     [...]
 884   BasicBlock *ExitBB =
 885       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
 886   BasicBlock *LoopBB =  BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
 887
 888   // The split call above "helpfully" added a branch at the end of BB (to the
 889   // wrong place).
 890   std::prev(BB->end())->eraseFromParent();
 891   Builder.SetInsertPoint(BB);
 892   Builder.CreateBr(LoopBB);
 893
 894   // Start the main loop block now that we've taken care of the preliminaries.
 895   Builder.SetInsertPoint(LoopBB);
 896   Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
 897
 898   Value *NewVal = PerformOp(Builder, Loaded);
 899
 900   Value *StoreSuccess =
 901       TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
 902   Value *TryAgain = Builder.CreateICmpNE(
 903       StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
 904   Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
 905
 906   Builder.SetInsertPoint(ExitBB, ExitBB->begin());
 907   return Loaded;
 908 }
 909
 910 /// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
 911 /// the equivalent bitwidth.  We used to not support pointer cmpxchg in the
 912 /// IR.  As a migration step, we convert back to what use to be the standard
 913 /// way to represent a pointer cmpxchg so that we can update backends one by
 914 /// one.
 915 AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
 916   auto *M = CI->getModule();
 917   Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(),
 918                                             M->getDataLayout());
 919
 920   IRBuilder<> Builder(CI);
 921
 922   Value *Addr = CI->getPointerOperand();
 923   Type *PT = PointerType::get(NewTy,
 924                               Addr->getType()->getPointerAddressSpace());
 925   Value *NewAddr = Builder.CreateBitCast(Addr, PT);
 926
 927   Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy);
 928   Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy);
 929
 930
 931   auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal,
 932                                             CI->getSuccessOrdering(),
 933                                             CI->getFailureOrdering(),
 934                                             CI->getSynchScope());
 935   NewCI->setVolatile(CI->isVolatile());
 936   NewCI->setWeak(CI->isWeak());
 937   DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");
 938
 939   Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
 940   Value *Succ = Builder.CreateExtractValue(NewCI, 1);
 941
 942   OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType());
 943
 944   Value *Res = UndefValue::get(CI->getType());
 945   Res = Builder.CreateInsertValue(Res, OldVal, 0);
 946   Res = Builder.CreateInsertValue(Res, Succ, 1);
 947
 948   CI->replaceAllUsesWith(Res);
 949   CI->eraseFromParent();
 950   return NewCI;
 951 }
 952
 953
 954 bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
 955   AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
 956   AtomicOrdering FailureOrder = CI->getFailureOrdering();
 957   Value *Addr = CI->getPointerOperand();
 958   BasicBlock *BB = CI->getParent();
 959   Function *F = BB->getParent();
 960   LLVMContext &Ctx = F->getContext();
 961   // If shouldInsertFencesForAtomic() returns true, then the target does not
 962   // want to deal with memory orders, and emitLeading/TrailingFence should take
 963   // care of everything. Otherwise, emitLeading/TrailingFence are no-op and we
 964   // should preserve the ordering.
 965   bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI);
 966   AtomicOrdering MemOpOrder =
 967       ShouldInsertFencesForAtomic ? AtomicOrdering::Monotonic : SuccessOrder;
 968
 969   // In implementations which use a barrier to achieve release semantics, we can
 970   // delay emitting this barrier until we know a store is actually going to be
 971   // attempted. The cost of this delay is that we need 2 copies of the block
 972   // emitting the load-linked, affecting code size.
 973   //
 974   // Ideally, this logic would be unconditional except for the minsize check
 975   // since in other cases the extra blocks naturally collapse down to the
 976   // minimal loop. Unfortunately, this puts too much stress on later
 977   // optimisations so we avoid emitting the extra logic in those cases too.
 978   bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&
 979                            SuccessOrder != AtomicOrdering::Monotonic &&
 980                            SuccessOrder != AtomicOrdering::Acquire &&
 981                            !F->optForMinSize();
 982
 983   // There's no overhead for sinking the release barrier in a weak cmpxchg, so
 984   // do it even on minsize.
 985   bool UseUnconditionalReleaseBarrier = F->optForMinSize() && !CI->isWeak();
 986
 987   // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
 988   //
 989   // The full expansion we produce is:
 990   //     [...]
 991   // cmpxchg.start:
 992   //     %unreleasedload = @load.linked(%addr)
 993   //     %should_store = icmp eq %unreleasedload, %desired
 994   //     br i1 %should_store, label %cmpxchg.fencedstore,
 995   //                          label %cmpxchg.nostore
 996   // cmpxchg.releasingstore:
 997   //     fence?
 998   //     br label cmpxchg.trystore
 999   // cmpxchg.trystore:
1000   //     %loaded.trystore = phi [%unreleasedload, %releasingstore],
1001   //                            [%releasedload, %cmpxchg.releasedload]
1002   //     %stored = @store_conditional(%new, %addr)
1003   //     %success = icmp eq i32 %stored, 0
1004   //     br i1 %success, label %cmpxchg.success,
1005   //                     label %cmpxchg.releasedload/%cmpxchg.failure
1006   // cmpxchg.releasedload:
1007   //     %releasedload = @load.linked(%addr)
1008   //     %should_store = icmp eq %releasedload, %desired
1009   //     br i1 %should_store, label %cmpxchg.trystore,
1010   //                          label %cmpxchg.failure
1011   // cmpxchg.success:
1012   //     fence?
1013   //     br label %cmpxchg.end
1014   // cmpxchg.nostore:
1015   //     %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
1016   //                           [%releasedload,
1017   //                               %cmpxchg.releasedload/%cmpxchg.trystore]
1018   //     @load_linked_fail_balance()?
1019   //     br label %cmpxchg.failure
1020   // cmpxchg.failure:
1021   //     fence?
1022   //     br label %cmpxchg.end
1023   // cmpxchg.end:
1024   //     %loaded = phi [%loaded.nostore, %cmpxchg.failure],
1025   //                   [%loaded.trystore, %cmpxchg.trystore]
1026   //     %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
1027   //     %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
1028   //     %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
1029   //     [...]
1030   BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end");
1031   auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
1032   auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);
1033   auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);
1034   auto ReleasedLoadBB =
1035       BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB);
1036   auto TryStoreBB =
1037       BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB);
1038   auto ReleasingStoreBB =
1039       BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB);
1040   auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB);
1041
1042   // This grabs the DebugLoc from CI
1043   IRBuilder<> Builder(CI);
1044
1045   // The split call above "helpfully" added a branch at the end of BB (to the
1046   // wrong place), but we might want a fence too. It's easiest to just remove
1047   // the branch entirely.
1048   std::prev(BB->end())->eraseFromParent();
1049   Builder.SetInsertPoint(BB);
1050   if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
1051     TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
1052                           /*IsLoad=*/true);
1053   Builder.CreateBr(StartBB);
1054
1055   // Start the main loop block now that we've taken care of the preliminaries.
1056   Builder.SetInsertPoint(StartBB);
1057   Value *UnreleasedLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
1058   Value *ShouldStore = Builder.CreateICmpEQ(
1059       UnreleasedLoad, CI->getCompareOperand(), "should_store");
1060
1061   // If the cmpxchg doesn't actually need any ordering when it fails, we can
1062   // jump straight past that fence instruction (if it exists).
1063   Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);
1064
1065   Builder.SetInsertPoint(ReleasingStoreBB);
1066   if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
1067     TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
1068                           /*IsLoad=*/true);
1069   Builder.CreateBr(TryStoreBB);
1070
1071   Builder.SetInsertPoint(TryStoreBB);
1072   Value *StoreSuccess = TLI->emitStoreConditional(
1073       Builder, CI->getNewValOperand(), Addr, MemOpOrder);
1074   StoreSuccess = Builder.CreateICmpEQ(
1075       StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
1076   BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
1077   Builder.CreateCondBr(StoreSuccess, SuccessBB,
1078                        CI->isWeak() ? FailureBB : RetryBB);
1079
1080   Builder.SetInsertPoint(ReleasedLoadBB);
1081   Value *SecondLoad;
1082   if (HasReleasedLoadBB) {
1083     SecondLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
1084     ShouldStore = Builder.CreateICmpEQ(SecondLoad, CI->getCompareOperand(),
1085                                        "should_store");
1086
1087     // If the cmpxchg doesn't actually need any ordering when it fails, we can
1088     // jump straight past that fence instruction (if it exists).
1089     Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
1090   } else
1091     Builder.CreateUnreachable();
1092
1093   // Make sure later instructions don't get reordered with a fence if
1094   // necessary.
1095   Builder.SetInsertPoint(SuccessBB);
1096   if (ShouldInsertFencesForAtomic)
1097     TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true,
1098                            /*IsLoad=*/true);
1099   Builder.CreateBr(ExitBB);
1100
1101   Builder.SetInsertPoint(NoStoreBB);
1102   // In the failing case, where we don't execute the store-conditional, the
1103   // target might want to balance out the load-linked with a dedicated
1104   // instruction (e.g., on ARM, clearing the exclusive monitor).
1105   TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
1106   Builder.CreateBr(FailureBB);
1107
1108   Builder.SetInsertPoint(FailureBB);
1109   if (ShouldInsertFencesForAtomic)
1110     TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
1111                            /*IsLoad=*/true);
1112   Builder.CreateBr(ExitBB);
1113
1114   // Finally, we have control-flow based knowledge of whether the cmpxchg
1115   // succeeded or not. We expose this to later passes by converting any
1116   // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
1117   // PHI.
1118   Builder.SetInsertPoint(ExitBB, ExitBB->begin());
1119   PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
1120   Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
1121   Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);
1122
1123   // Setup the builder so we can create any PHIs we need.
1124   Value *Loaded;
1125   if (!HasReleasedLoadBB)
1126     Loaded = UnreleasedLoad;
1127   else {
1128     Builder.SetInsertPoint(TryStoreBB, TryStoreBB->begin());
1129     PHINode *TryStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
1130     TryStoreLoaded->addIncoming(UnreleasedLoad, ReleasingStoreBB);
1131     TryStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);
1132
1133     Builder.SetInsertPoint(NoStoreBB, NoStoreBB->begin());
1134     PHINode *NoStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
1135     NoStoreLoaded->addIncoming(UnreleasedLoad, StartBB);
1136     NoStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);
1137
1138     Builder.SetInsertPoint(ExitBB, ++ExitBB->begin());
1139     PHINode *ExitLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
1140     ExitLoaded->addIncoming(TryStoreLoaded, SuccessBB);
1141     ExitLoaded->addIncoming(NoStoreLoaded, FailureBB);
1142
1143     Loaded = ExitLoaded;
1144   }
1145
1146   // Look for any users of the cmpxchg that are just comparing the loaded value
1147   // against the desired one, and replace them with the CFG-derived version.
1148   SmallVector<ExtractValueInst *, 2> PrunedInsts;
1149   for (auto User : CI->users()) {
1150     ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
1151     if (!EV)
1152       continue;
1153
1154     assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
1155            "weird extraction from { iN, i1 }");
1156
1157     if (EV->getIndices()[0] == 0)
1158       EV->replaceAllUsesWith(Loaded);
1159     else
1160       EV->replaceAllUsesWith(Success);
1161
1162     PrunedInsts.push_back(EV);
1163   }
1164
1165   // We can remove the instructions now we're no longer iterating through them.
1166   for (auto EV : PrunedInsts)
1167     EV->eraseFromParent();
1168
1169   if (!CI->use_empty()) {
1170     // Some use of the full struct return that we don't understand has happened,
1171     // so we've got to reconstruct it properly.
1172     Value *Res;
1173     Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0);
1174     Res = Builder.CreateInsertValue(Res, Success, 1);
1175
1176     CI->replaceAllUsesWith(Res);
1177   }
1178
1179   CI->eraseFromParent();
1180   return true;
1181 }
1182
1183 bool AtomicExpand::isIdempotentRMW(AtomicRMWInst* RMWI) {
1184   auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
1185   if(!C)
1186     return false;
1187
1188   AtomicRMWInst::BinOp Op = RMWI->getOperation();
1189   switch(Op) {
1190     case AtomicRMWInst::Add:
1191     case AtomicRMWInst::Sub:
1192     case AtomicRMWInst::Or:
1193     case AtomicRMWInst::Xor:
1194       return C->isZero();
1195     case AtomicRMWInst::And:
1196       return C->isMinusOne();
1197     // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/...
1198     default:
1199       return false;
1200   }
1201 }
1202
1203 bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) {
1204   if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
1205     tryExpandAtomicLoad(ResultingLoad);
1206     return true;
1207   }
1208   return false;
1209 }
1210
1211 Value *AtomicExpand::insertRMWCmpXchgLoop(
1212     IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
1213     AtomicOrdering MemOpOrder,
1214     function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
1215     CreateCmpXchgInstFun CreateCmpXchg) {
1216   LLVMContext &Ctx = Builder.getContext();
1217   BasicBlock *BB = Builder.GetInsertBlock();
1218   Function *F = BB->getParent();
1219
1220   // Given: atomicrmw some_op iN* %addr, iN %incr ordering
1221   //
1222   // The standard expansion we produce is:
1223   //     [...]
1224   //     %init_loaded = load atomic iN* %addr
1225   //     br label %loop
1226   // loop:
1227   //     %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
1228   //     %new = some_op iN %loaded, %incr
1229   //     %pair = cmpxchg iN* %addr, iN %loaded, iN %new
1230   //     %new_loaded = extractvalue { iN, i1 } %pair, 0
1231   //     %success = extractvalue { iN, i1 } %pair, 1
1232   //     br i1 %success, label %atomicrmw.end, label %loop
1233   // atomicrmw.end:
1234   //     [...]
1235   BasicBlock *ExitBB =
1236       BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
1237   BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
1238
1239   // The split call above "helpfully" added a branch at the end of BB (to the
1240   // wrong place), but we want a load. It's easiest to just remove
1241   // the branch entirely.
1242   std::prev(BB->end())->eraseFromParent();
1243   Builder.SetInsertPoint(BB);
1244   LoadInst *InitLoaded = Builder.CreateLoad(ResultTy, Addr);
1245   // Atomics require at least natural alignment.
1246   InitLoaded->setAlignment(ResultTy->getPrimitiveSizeInBits() / 8);
1247   Builder.CreateBr(LoopBB);
1248
1249   // Start the main loop block now that we've taken care of the preliminaries.
1250   Builder.SetInsertPoint(LoopBB);
1251   PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded");
1252   Loaded->addIncoming(InitLoaded, BB);
1253
1254   Value *NewVal = PerformOp(Builder, Loaded);
1255
1256   Value *NewLoaded = nullptr;
1257   Value *Success = nullptr;
1258
1259   CreateCmpXchg(Builder, Addr, Loaded, NewVal,
1260                 MemOpOrder == AtomicOrdering::Unordered
1261                     ? AtomicOrdering::Monotonic
1262                     : MemOpOrder,
1263                 Success, NewLoaded);
1264   assert(Success && NewLoaded);
1265
1266   Loaded->addIncoming(NewLoaded, LoopBB);
1267
1268   Builder.CreateCondBr(Success, ExitBB, LoopBB);
1269
1270   Builder.SetInsertPoint(ExitBB, ExitBB->begin());
1271   return NewLoaded;
1272 }
1273
1274 // Note: This function is exposed externally by AtomicExpandUtils.h
1275 bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
1276                                     CreateCmpXchgInstFun CreateCmpXchg) {
1277   IRBuilder<> Builder(AI);
1278   Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop(
1279       Builder, AI->getType(), AI->getPointerOperand(), AI->getOrdering(),
1280       [&](IRBuilder<> &Builder, Value *Loaded) {
1281         return performAtomicOp(AI->getOperation(), Builder, Loaded,
1282                                AI->getValOperand());
1283       },
1284       CreateCmpXchg);
1285
1286   AI->replaceAllUsesWith(Loaded);
1287   AI->eraseFromParent();
1288   return true;
1289 }
1290
1291 // In order to use one of the sized library calls such as
1292 // __atomic_fetch_add_4, the alignment must be sufficient, the size
1293 // must be one of the potentially-specialized sizes, and the value
1294 // type must actually exist in C on the target (otherwise, the
1295 // function wouldn't actually be defined.)
1296 static bool canUseSizedAtomicCall(unsigned Size, unsigned Align,
1297                                   const DataLayout &DL) {
1298   // TODO: "LargestSize" is an approximation for "largest type that
1299   // you can express in C". It seems to be the case that int128 is
1300   // supported on all 64-bit platforms, otherwise only up to 64-bit
1301   // integers are supported. If we get this wrong, then we'll try to
1302   // call a sized libcall that doesn't actually exist. There should
1303   // really be some more reliable way in LLVM of determining integer
1304   // sizes which are valid in the target's C ABI...
1305   unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 16 : 8;
1306   return Align >= Size &&
1307          (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
1308          Size <= LargestSize;
1309 }
1310
1311 void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) {
1312   static const RTLIB::Libcall Libcalls[6] = {
1313       RTLIB::ATOMIC_LOAD,   RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
1314       RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
1315   unsigned Size = getAtomicOpSize(I);
1316   unsigned Align = getAtomicOpAlign(I);
1317
1318   bool expanded = expandAtomicOpToLibcall(
1319       I, Size, Align, I->getPointerOperand(), nullptr, nullptr,
1320       I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
1321   (void)expanded;
1322   assert(expanded && "expandAtomicOpToLibcall shouldn't fail tor Load");
1323 }
1324
1325 void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) {
1326   static const RTLIB::Libcall Libcalls[6] = {
1327       RTLIB::ATOMIC_STORE,   RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
1328       RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
1329   unsigned Size = getAtomicOpSize(I);
1330   unsigned Align = getAtomicOpAlign(I);
1331
1332   bool expanded = expandAtomicOpToLibcall(
1333       I, Size, Align, I->getPointerOperand(), I->getValueOperand(), nullptr,
1334       I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
1335   (void)expanded;
1336   assert(expanded && "expandAtomicOpToLibcall shouldn't fail tor Store");
1337 }
1338
1339 void AtomicExpand::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
1340   static const RTLIB::Libcall Libcalls[6] = {
1341       RTLIB::ATOMIC_COMPARE_EXCHANGE,   RTLIB::ATOMIC_COMPARE_EXCHANGE_1,
1342       RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
1343       RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
1344   unsigned Size = getAtomicOpSize(I);
1345   unsigned Align = getAtomicOpAlign(I);
1346
1347   bool expanded = expandAtomicOpToLibcall(
1348       I, Size, Align, I->getPointerOperand(), I->getNewValOperand(),
1349       I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(),
1350       Libcalls);
1351   (void)expanded;
1352   assert(expanded && "expandAtomicOpToLibcall shouldn't fail tor CAS");
1353 }
1354
1355 static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
1356   static const RTLIB::Libcall LibcallsXchg[6] = {
1357       RTLIB::ATOMIC_EXCHANGE,   RTLIB::ATOMIC_EXCHANGE_1,
1358       RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
1359       RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
1360   static const RTLIB::Libcall LibcallsAdd[6] = {
1361       RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_ADD_1,
1362       RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
1363       RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
1364   static const RTLIB::Libcall LibcallsSub[6] = {
1365       RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_SUB_1,
1366       RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
1367       RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
1368   static const RTLIB::Libcall LibcallsAnd[6] = {
1369       RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_AND_1,
1370       RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
1371       RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
1372   static const RTLIB::Libcall LibcallsOr[6] = {
1373       RTLIB::UNKNOWN_LIBCALL,   RTLIB::ATOMIC_FETCH_OR_1,
1374       RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
1375       RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
1376   static const RTLIB::Libcall LibcallsXor[6] = {
1377       RTLIB::UNKNOWN_LIBCALL,    RTLIB::ATOMIC_FETCH_XOR_1,
1378       RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
1379       RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
1380   static const RTLIB::Libcall LibcallsNand[6] = {
1381       RTLIB::UNKNOWN_LIBCALL,     RTLIB::ATOMIC_FETCH_NAND_1,
1382       RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
1383       RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};
1384
1385   switch (Op) {
1386   case AtomicRMWInst::BAD_BINOP:
1387     llvm_unreachable("Should not have BAD_BINOP.");
1388   case AtomicRMWInst::Xchg:
1389     return makeArrayRef(LibcallsXchg);
1390   case AtomicRMWInst::Add:
1391     return makeArrayRef(LibcallsAdd);
1392   case AtomicRMWInst::Sub:
1393     return makeArrayRef(LibcallsSub);
1394   case AtomicRMWInst::And:
1395     return makeArrayRef(LibcallsAnd);
1396   case AtomicRMWInst::Or:
1397     return makeArrayRef(LibcallsOr);
1398   case AtomicRMWInst::Xor:
1399     return makeArrayRef(LibcallsXor);
1400   case AtomicRMWInst::Nand:
1401     return makeArrayRef(LibcallsNand);
1402   case AtomicRMWInst::Max:
1403   case AtomicRMWInst::Min:
1404   case AtomicRMWInst::UMax:
1405   case AtomicRMWInst::UMin:
1406     // No atomic libcalls are available for max/min/umax/umin.
1407     return {};
1408   }
1409   llvm_unreachable("Unexpected AtomicRMW operation.");
1410 }
1411
1412 void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
1413   ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(I->getOperation());
1414
1415   unsigned Size = getAtomicOpSize(I);
1416   unsigned Align = getAtomicOpAlign(I);
1417
1418   bool Success = false;
1419   if (!Libcalls.empty())
1420     Success = expandAtomicOpToLibcall(
1421         I, Size, Align, I->getPointerOperand(), I->getValOperand(), nullptr,
1422         I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
1423
1424   // The expansion failed: either there were no libcalls at all for
1425   // the operation (min/max), or there were only size-specialized
1426   // libcalls (add/sub/etc) and we needed a generic. So, expand to a
1427   // CAS libcall, via a CAS loop, instead.
1428   if (!Success) {
1429     expandAtomicRMWToCmpXchg(I, [this](IRBuilder<> &Builder, Value *Addr,
1430                                        Value *Loaded, Value *NewVal,
1431                                        AtomicOrdering MemOpOrder,
1432                                        Value *&Success, Value *&NewLoaded) {
1433       // Create the CAS instruction normally...
1434       AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
1435           Addr, Loaded, NewVal, MemOpOrder,
1436           AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
1437       Success = Builder.CreateExtractValue(Pair, 1, "success");
1438       NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
1439
1440       // ...and then expand the CAS into a libcall.
1441       expandAtomicCASToLibcall(Pair);
1442     });
1443   }
1444 }
1445
1446 // A helper routine for the above expandAtomic*ToLibcall functions.
1447 //
1448 // 'Libcalls' contains an array of enum values for the particular
1449 // ATOMIC libcalls to be emitted. All of the other arguments besides
1450 // 'I' are extracted from the Instruction subclass by the
1451 // caller. Depending on the particular call, some will be null.
1452 bool AtomicExpand::expandAtomicOpToLibcall(
1453     Instruction *I, unsigned Size, unsigned Align, Value *PointerOperand,
1454     Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
1455     AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
1456   assert(Libcalls.size() == 6);
1457
1458   LLVMContext &Ctx = I->getContext();
1459   Module *M = I->getModule();
1460   const DataLayout &DL = M->getDataLayout();
1461   IRBuilder<> Builder(I);
1462   IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());
1463
1464   bool UseSizedLibcall = canUseSizedAtomicCall(Size, Align, DL);
1465   Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);
1466
1467   unsigned AllocaAlignment = DL.getPrefTypeAlignment(SizedIntTy);
1468
1469   // TODO: the "order" argument type is "int", not int32. So
1470   // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
1471   ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size);
1472   assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
1473   Constant *OrderingVal =
1474       ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering));
1475   Constant *Ordering2Val = nullptr;
1476   if (CASExpected) {
1477     assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO");
1478     Ordering2Val =
1479         ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering2));
1480   }
1481   bool HasResult = I->getType() != Type::getVoidTy(Ctx);
1482
1483   RTLIB::Libcall RTLibType;
1484   if (UseSizedLibcall) {
1485     switch (Size) {
1486     case 1: RTLibType = Libcalls[1]; break;
1487     case 2: RTLibType = Libcalls[2]; break;
1488     case 4: RTLibType = Libcalls[3]; break;
1489     case 8: RTLibType = Libcalls[4]; break;
1490     case 16: RTLibType = Libcalls[5]; break;
1491     }
1492   } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) {
1493     RTLibType = Libcalls[0];
1494   } else {
1495     // Can't use sized function, and there's no generic for this
1496     // operation, so give up.
1497     return false;
1498   }
1499
1500   // Build up the function call. There's two kinds. First, the sized
1501   // variants.  These calls are going to be one of the following (with
1502   // N=1,2,4,8,16):
1503   //  iN    __atomic_load_N(iN *ptr, int ordering)
1504   //  void  __atomic_store_N(iN *ptr, iN val, int ordering)
1505   //  iN    __atomic_{exchange|fetch_*}_N(iN *ptr, iN val, int ordering)
1506   //  bool  __atomic_compare_exchange_N(iN *ptr, iN *expected, iN desired,
1507   //                                    int success_order, int failure_order)
1508   //
1509   // Note that these functions can be used for non-integer atomic
1510   // operations, the values just need to be bitcast to integers on the
1511   // way in and out.
1512   //
1513   // And, then, the generic variants. They look like the following:
1514   //  void  __atomic_load(size_t size, void *ptr, void *ret, int ordering)
1515   //  void  __atomic_store(size_t size, void *ptr, void *val, int ordering)
1516   //  void  __atomic_exchange(size_t size, void *ptr, void *val, void *ret,
1517   //                          int ordering)
1518   //  bool  __atomic_compare_exchange(size_t size, void *ptr, void *expected,
1519   //                                  void *desired, int success_order,
1520   //                                  int failure_order)
1521   //
1522   // The different signatures are built up depending on the
1523   // 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
1524   // variables.
1525
1526   AllocaInst *AllocaCASExpected = nullptr;
1527   Value *AllocaCASExpected_i8 = nullptr;
1528   AllocaInst *AllocaValue = nullptr;
1529   Value *AllocaValue_i8 = nullptr;
1530   AllocaInst *AllocaResult = nullptr;
1531   Value *AllocaResult_i8 = nullptr;
1532
1533   Type *ResultTy;
1534   SmallVector<Value *, 6> Args;
1535   AttributeSet Attr;
1536
1537   // 'size' argument.
1538   if (!UseSizedLibcall) {
1539     // Note, getIntPtrType is assumed equivalent to size_t.
1540     Args.push_back(ConstantInt::get(DL.getIntPtrType(Ctx), Size));
1541   }
1542
1543   // 'ptr' argument.
1544   Value *PtrVal =
1545       Builder.CreateBitCast(PointerOperand, Type::getInt8PtrTy(Ctx));
1546   Args.push_back(PtrVal);
1547
1548   // 'expected' argument, if present.
1549   if (CASExpected) {
1550     AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
1551     AllocaCASExpected->setAlignment(AllocaAlignment);
1552     AllocaCASExpected_i8 =
1553         Builder.CreateBitCast(AllocaCASExpected, Type::getInt8PtrTy(Ctx));
1554     Builder.CreateLifetimeStart(AllocaCASExpected_i8, SizeVal64);
1555     Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment);
1556     Args.push_back(AllocaCASExpected_i8);
1557   }
1558
1559   // 'val' argument ('desired' for cas), if present.
1560   if (ValueOperand) {
1561     if (UseSizedLibcall) {
1562       Value *IntValue =
1563           Builder.CreateBitOrPointerCast(ValueOperand, SizedIntTy);
1564       Args.push_back(IntValue);
1565     } else {
1566       AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
1567       AllocaValue->setAlignment(AllocaAlignment);
1568       AllocaValue_i8 =
1569           Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx));
1570       Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64);
1571       Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment);
1572       Args.push_back(AllocaValue_i8);
1573     }
1574   }
1575
1576   // 'ret' argument.
1577   if (!CASExpected && HasResult && !UseSizedLibcall) {
1578     AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
1579     AllocaResult->setAlignment(AllocaAlignment);
1580     AllocaResult_i8 =
1581         Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx));
1582     Builder.CreateLifetimeStart(AllocaResult_i8, SizeVal64);
1583     Args.push_back(AllocaResult_i8);
1584   }
1585
1586   // 'ordering' ('success_order' for cas) argument.
1587   Args.push_back(OrderingVal);
1588
1589   // 'failure_order' argument, if present.
1590   if (Ordering2Val)
1591     Args.push_back(Ordering2Val);
1592
1593   // Now, the return type.
1594   if (CASExpected) {
1595     ResultTy = Type::getInt1Ty(Ctx);
1596     Attr = Attr.addAttribute(Ctx, AttributeSet::ReturnIndex, Attribute::ZExt);
1597   } else if (HasResult && UseSizedLibcall)
1598     ResultTy = SizedIntTy;
1599   else
1600     ResultTy = Type::getVoidTy(Ctx);
1601
1602   // Done with setting up arguments and return types, create the call:
1603   SmallVector<Type *, 6> ArgTys;
1604   for (Value *Arg : Args)
1605     ArgTys.push_back(Arg->getType());
1606   FunctionType *FnType = FunctionType::get(ResultTy, ArgTys, false);
1607   Constant *LibcallFn =
1608       M->getOrInsertFunction(TLI->getLibcallName(RTLibType), FnType, Attr);
1609   CallInst *Call = Builder.CreateCall(LibcallFn, Args);
1610   Call->setAttributes(Attr);
1611   Value *Result = Call;
1612
1613   // And then, extract the results...
1614   if (ValueOperand && !UseSizedLibcall)
1615     Builder.CreateLifetimeEnd(AllocaValue_i8, SizeVal64);
1616
1617   if (CASExpected) {
1618     // The final result from the CAS is {load of 'expected' alloca, bool result
1619     // from call}
1620     Type *FinalResultTy = I->getType();
1621     Value *V = UndefValue::get(FinalResultTy);
1622     Value *ExpectedOut =
1623         Builder.CreateAlignedLoad(AllocaCASExpected, AllocaAlignment);
1624     Builder.CreateLifetimeEnd(AllocaCASExpected_i8, SizeVal64);
1625     V = Builder.CreateInsertValue(V, ExpectedOut, 0);
1626     V = Builder.CreateInsertValue(V, Result, 1);
1627     I->replaceAllUsesWith(V);
1628   } else if (HasResult) {
1629     Value *V;
1630     if (UseSizedLibcall)
1631       V = Builder.CreateBitOrPointerCast(Result, I->getType());
1632     else {
1633       V = Builder.CreateAlignedLoad(AllocaResult, AllocaAlignment);
1634       Builder.CreateLifetimeEnd(AllocaResult_i8, SizeVal64);
1635     }
1636     I->replaceAllUsesWith(V);
1637   }
1638   I->eraseFromParent();
1639   return true;
1640 }