//===- DivergenceAnalysis.cpp --------- Divergence Analysis Implementation -==//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements a general divergence analysis for loop vectorization
// and GPU programs. It determines which branches and values in a loop or GPU
// program are divergent. It can help branch optimizations such as jump
// threading and loop unswitching to make better decisions.
//
// GPU programs typically use the SIMD execution model, where multiple threads
// in the same execution group have to execute in lock-step. Therefore, if the
// code contains divergent branches (i.e., threads in a group do not agree on
// which path of the branch to take), the group of threads has to execute all
// the paths from that branch with different subsets of threads enabled until
// they re-converge at a point where all the threads are enabled again.
//
// Due to this execution model, some optimizations such as jump
// threading and loop unswitching can interfere with thread re-convergence.
// Therefore, an analysis that computes which branches in a GPU program are
// divergent can help the compiler to selectively run these optimizations.
//
// This implementation is derived from the Vectorization Analysis of the
// Region Vectorizer (RV). That implementation in turn is based on the approach
// described in
//
//   Improving Performance of OpenCL on CPUs
//   Ralf Karrenberg and Sebastian Hack
//   CC '12
//
// This DivergenceAnalysis implementation is generic in the sense that it does
// not itself identify original sources of divergence.
// Instead, specialized adapter classes (LoopDivergenceAnalysis for loops and
// GPUDivergenceAnalysis for GPU programs) identify the sources of divergence
// (e.g., special variables that hold the thread ID or the loop's iteration
// variable).
//
// The generic implementation propagates divergence to variables that are data
// or sync dependent on a source of divergence.
//
// While data dependency is a well-known concept, the notion of sync dependency
// deserves more explanation. Sync dependence characterizes the control flow
// aspect of the propagation of branch divergence. For example,
//
//   %cond = icmp slt i32 %tid, 10
//   br i1 %cond, label %then, label %else
// then:
//   br label %merge
// else:
//   br label %merge
// merge:
//   %a = phi i32 [ 0, %then ], [ 1, %else ]
//
// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
// because %tid is not on its use-def chains, %a is sync dependent on %tid
// because the branch "br i1 %cond" depends on %tid and affects which value %a
// is assigned to.
//
// The sync dependence detection (which branch induces divergence in which join
// points) is implemented in the SyncDependenceAnalysis.
//
// The current DivergenceAnalysis implementation has the following limitations:
// 1. intra-procedural. It conservatively considers the arguments of a
//    non-kernel-entry function and the return value of a function call as
//    divergent.
// 2. memory as black box. It conservatively considers values loaded from
//    generic or local address spaces as divergent. This can be improved by
//    leveraging pointer analysis and/or by modelling non-escaping memory
//    objects in SSA form.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "divergence-analysis"

// class DivergenceAnalysis
DivergenceAnalysis::DivergenceAnalysis(
    const Function &F, const Loop *RegionLoop, const DominatorTree &DT,
    const LoopInfo &LI, SyncDependenceAnalysis &SDA, bool IsLCSSAForm)
    : F(F), RegionLoop(RegionLoop), DT(DT), LI(LI), SDA(SDA),
      IsLCSSAForm(IsLCSSAForm) {}
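
// Mark DivVal as a divergent value. Only instructions and function arguments
// can be registered as divergent.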
void DivergenceAnalysis::markDivergent(const Value &DivVal) {
  assert(isa<Instruction>(DivVal) || isa<Argument>(DivVal));
  assert(!isAlwaysUniform(DivVal) && "cannot be divergent");
  DivergentValues.insert(&DivVal);
}
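
// Mark UniVal as always-uniform; divergence is never propagated to such
// values.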
void DivergenceAnalysis::addUniformOverride(const Value &UniVal) {
  UniformOverrides.insert(&UniVal);
}
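
// Return true if the multi-successor terminator Term is divergent, i.e. its
// branch or switch condition is a divergent value.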
bool DivergenceAnalysis::updateTerminator(const Instruction &Term) const {
  if (Term.getNumSuccessors() <= 1)
    return false;
  if (auto *BranchTerm = dyn_cast<BranchInst>(&Term)) {
    assert(BranchTerm->isConditional());
    return isDivergent(*BranchTerm->getCondition());
  }
  if (auto *SwitchTerm = dyn_cast<SwitchInst>(&Term)) {
    return isDivergent(*SwitchTerm->getCondition());
  }
  if (isa<InvokeInst>(Term)) {
    return false; // ignore abnormal executions through landingpad
  }

  llvm_unreachable("unexpected terminator");
}
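
// Return true if any operand of I is divergent.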
bool DivergenceAnalysis::updateNormalInstruction(const Instruction &I) const {
  // TODO function calls with side effects, etc
  for (const auto &Op : I.operands()) {
    if (isDivergent(*Op))
      return true;
  }
  return false;
}
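
// Return true if Val is temporally divergent when observed from
// ObservingBlock: some divergent loop that carries the definition of Val
// terminates before control reaches ObservingBlock, so different threads may
// observe values from different iterations.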
bool DivergenceAnalysis::isTemporalDivergent(const BasicBlock &ObservingBlock,
                                             const Value &Val) const {
  const auto *Inst = dyn_cast<const Instruction>(&Val);
  if (!Inst)
    return false;
  // check whether any divergent loop carrying Val terminates before control
  // proceeds to ObservingBlock
  for (const auto *Loop = LI.getLoopFor(Inst->getParent());
       Loop != RegionLoop && !Loop->contains(&ObservingBlock);
       Loop = Loop->getParentLoop()) {
    if (DivergentLoops.find(Loop) != DivergentLoops.end())
      return true;
  }

  return false;
}
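
// Return true if the PHI node Phi is divergent: either its parent block joins
// divergent disjoint paths, or one of its incoming values is divergent or
// temporally divergent.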
bool DivergenceAnalysis::updatePHINode(const PHINode &Phi) const {
  // joining divergent disjoint paths in the parent block of Phi
  if (!Phi.hasConstantOrUndefValue() && isJoinDivergent(*Phi.getParent())) {
    return true;
  }

  // An incoming value could be divergent by itself.
  // Otherwise, an incoming value could be uniform within the loop
  // that carries its definition but it may appear divergent
  // from outside the loop. This happens when divergent loop exits
  // drop definitions of that uniform value in different iterations.
  //
  //   for (int i = 0; i < n; ++i) {    // 'i' is uniform inside the loop
  //     if (i % thread_id == 0) break; // divergent loop exit
  //   }
  //   int divI = i;                    // divI is divergent
  for (size_t i = 0; i < Phi.getNumIncomingValues(); ++i) {
    const auto *InVal = Phi.getIncomingValue(i);
    if (isDivergent(*InVal) ||
        isTemporalDivergent(*Phi.getParent(), *InVal)) {
      return true;
    }
  }
  return false;
}
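
// Return true if I (resp. BB) belongs to the region under analysis: the
// blocks of RegionLoop if a region loop is set, otherwise the whole
// function F.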
bool DivergenceAnalysis::inRegion(const Instruction &I) const {
  return I.getParent() && inRegion(*I.getParent());
}

bool DivergenceAnalysis::inRegion(const BasicBlock &BB) const {
  return RegionLoop ? RegionLoop->contains(&BB) : BB.getParent() == &F;
}

// marks all users of loop-carried values of the loop headed by LoopHeader as
// divergent
void DivergenceAnalysis::taintLoopLiveOuts(const BasicBlock &LoopHeader) {
  auto *DivLoop = LI.getLoopFor(&LoopHeader);
  assert(DivLoop && "loopHeader is not actually part of a loop");

  SmallVector<BasicBlock *, 8> TaintStack;
  DivLoop->getExitBlocks(TaintStack);

  // Potential users of loop-carried values could be anywhere in the
  // dominance region of DivLoop (including its fringes for phi nodes)
  DenseSet<const BasicBlock *> Visited;
  for (auto *Block : TaintStack) {
    Visited.insert(Block);
  }
  Visited.insert(&LoopHeader);

  while (!TaintStack.empty()) {
    auto *UserBlock = TaintStack.back();
    TaintStack.pop_back();

    // don't spread divergence beyond the region
    if (!inRegion(*UserBlock))
      continue;

    assert(!DivLoop->contains(UserBlock) &&
           "irreducible control flow detected");

    // phi nodes at the fringes of the dominance region
    if (!DT.dominates(&LoopHeader, UserBlock)) {
      // all PHI nodes of UserBlock become divergent
      for (auto &Phi : UserBlock->phis()) {
        Worklist.push_back(&Phi);
      }
      continue;
    }

    // taint outside users of values carried by DivLoop
    for (auto &I : *UserBlock) {
      if (isAlwaysUniform(I))
        continue;
      if (isDivergent(I))
        continue;

      for (auto &Op : I.operands()) {
        auto *OpInst = dyn_cast<Instruction>(&Op);
        if (!OpInst)
          continue;
        if (DivLoop->contains(OpInst->getParent())) {
          markDivergent(I);
          pushUsers(I);
          break;
        }
      }
    }

    // visit all blocks in the dominance region
    for (auto *SuccBlock : successors(UserBlock)) {
      if (!Visited.insert(SuccBlock).second) {
        continue;
      }
      TaintStack.push_back(SuccBlock);
    }
  }
}
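
// Push all not-yet-divergent PHI nodes of Block onto the worklist.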
void DivergenceAnalysis::pushPHINodes(const BasicBlock &Block) {
  for (const auto &Phi : Block.phis()) {
    if (isDivergent(Phi))
      continue;
    Worklist.push_back(&Phi);
  }
}
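
// Push all not-yet-divergent users of V that lie inside the region onto the
// worklist.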
void DivergenceAnalysis::pushUsers(const Value &V) {
  for (const auto *User : V.users()) {
    const auto *UserInst = dyn_cast<const Instruction>(User);
    if (!UserInst)
      continue;

    if (isDivergent(*UserInst))
      continue;

    // only propagate divergence inside the region
    if (!inRegion(*UserInst))
      continue;
    Worklist.push_back(UserInst);
  }
}
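
// Handle JoinBlock, reached on disjoint paths from a divergent branch inside
// BranchLoop: push its non-divergent PHI nodes to the worklist. Returns true
// if JoinBlock is a divergent loop exit of BranchLoop; otherwise JoinBlock is
// marked join-divergent.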
bool DivergenceAnalysis::propagateJoinDivergence(const BasicBlock &JoinBlock,
                                                 const Loop *BranchLoop) {
  LLVM_DEBUG(dbgs() << "\tpropJoinDiv " << JoinBlock.getName() << "\n");

  // ignore divergence outside the region
  if (!inRegion(JoinBlock)) {
    return false;
  }

  // push non-divergent phi nodes in JoinBlock to the worklist
  pushPHINodes(JoinBlock);

  // JoinBlock is a divergent loop exit
  if (BranchLoop && !BranchLoop->contains(&JoinBlock)) {
    return true;
  }

  // disjoint-paths divergent at JoinBlock
  markBlockJoinDivergent(JoinBlock);
  return false;
}
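
// Propagate the control-induced divergence of the divergent terminator Term:
// mark Term divergent and visit all join points computed by the
// SyncDependenceAnalysis; if a loop exit becomes divergent, the enclosing
// loop is marked divergent as well.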
void DivergenceAnalysis::propagateBranchDivergence(const Instruction &Term) {
  LLVM_DEBUG(dbgs() << "propBranchDiv " << Term.getParent()->getName() << "\n");

  markDivergent(Term);

  const auto *BranchLoop = LI.getLoopFor(Term.getParent());

  // whether there is a divergent loop exit from BranchLoop (if any)
  bool IsBranchLoopDivergent = false;

  // iterate over all blocks reachable on disjoint paths from Term within the
  // loop; this also covers loop exits that become divergent due to Term.
  for (const auto *JoinBlock : SDA.join_blocks(Term)) {
    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
  }

  // BranchLoop is a divergent loop due to the divergent branch Term
  if (IsBranchLoopDivergent) {
    assert(BranchLoop);
    if (!DivergentLoops.insert(BranchLoop).second) {
      return;
    }
    propagateLoopDivergence(*BranchLoop);
  }
}
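
// Propagate the divergence caused by a divergent exit of ExitingLoop: taint
// the live-outs of the loop (unless the function is in LCSSA form) and visit
// the join points of its exits, possibly marking the parent loop divergent in
// turn.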
void DivergenceAnalysis::propagateLoopDivergence(const Loop &ExitingLoop) {
  LLVM_DEBUG(dbgs() << "propLoopDiv " << ExitingLoop.getName() << "\n");

  // don't propagate beyond the region
  if (!inRegion(*ExitingLoop.getHeader()))
    return;

  const auto *BranchLoop = ExitingLoop.getParentLoop();

  // Uses of loop-carried values could occur anywhere
  // within the dominance region of the definition. All loop-carried
  // definitions are dominated by the loop header (reducible control).
  // Thus all users have to be in the dominance region of the loop header,
  // except PHI nodes that can also live at the fringes of the dom region
  // (incoming defining value).
  if (!IsLCSSAForm)
    taintLoopLiveOuts(*ExitingLoop.getHeader());

  // whether there is a divergent loop exit from BranchLoop (if any)
  bool IsBranchLoopDivergent = false;

  // iterate over all blocks reachable on disjoint paths from exits of
  // ExitingLoop; this also covers loop exits (of BranchLoop) that in turn
  // become divergent.
  for (const auto *JoinBlock : SDA.join_blocks(ExitingLoop)) {
    IsBranchLoopDivergent |= propagateJoinDivergence(*JoinBlock, BranchLoop);
  }

  // BranchLoop is divergent due to a divergent loop exit in ExitingLoop
  if (IsBranchLoopDivergent) {
    assert(BranchLoop);
    if (!DivergentLoops.insert(BranchLoop).second) {
      return;
    }
    propagateLoopDivergence(*BranchLoop);
  }
}
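
// Worklist-driven fixed-point propagation: seed the worklist with the users
// of all known divergent values, then keep updating instructions until no new
// divergence is discovered.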
void DivergenceAnalysis::compute() {
  for (auto *DivVal : DivergentValues) {
    pushUsers(*DivVal);
  }

  // propagate divergence
  while (!Worklist.empty()) {
    const Instruction &I = *Worklist.back();
    Worklist.pop_back();
    // maintain uniformity of overrides
    if (isAlwaysUniform(I))
      continue;
    bool WasDivergent = isDivergent(I);
    if (WasDivergent)
      continue;
    // propagate divergence caused by terminator
    if (I.isTerminator()) {
      if (updateTerminator(I)) {
        // propagate control divergence to affected instructions
        propagateBranchDivergence(I);
        continue;
      }
    }
    // update divergence of I due to divergent operands
    bool DivergentUpd = false;
    const auto *Phi = dyn_cast<const PHINode>(&I);
    if (Phi) {
      DivergentUpd = updatePHINode(*Phi);
    } else {
      DivergentUpd = updateNormalInstruction(I);
    }
    // propagate value divergence to users
    if (DivergentUpd) {
      markDivergent(I);
      pushUsers(I);
    }
  }
}

bool DivergenceAnalysis::isAlwaysUniform(const Value &V) const {
  return UniformOverrides.find(&V) != UniformOverrides.end();
}

bool DivergenceAnalysis::isDivergent(const Value &V) const {
  return DivergentValues.find(&V) != DivergentValues.end();
}

void DivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
  if (DivergentValues.empty())
    return;
  // iterate instructions using instructions() to ensure a deterministic order.
  for (auto &I : instructions(F)) {
    if (isDivergent(I))
      OS << "DIVERGENT:" << I << '\n';
  }
}

// class GPUDivergenceAnalysis
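// Seed the generic DivergenceAnalysis with the divergence sources and
// always-uniform values reported by the target (TTI), then run the
// propagation over the whole kernel.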
GPUDivergenceAnalysis::GPUDivergenceAnalysis(Function &F,
                                             const DominatorTree &DT,
                                             const PostDominatorTree &PDT,
                                             const LoopInfo &LI,
                                             const TargetTransformInfo &TTI)
    : SDA(DT, PDT, LI), DA(F, nullptr, DT, LI, SDA, false) {
  for (auto &I : instructions(F)) {
    if (TTI.isSourceOfDivergence(&I)) {
      DA.markDivergent(I);
    } else if (TTI.isAlwaysUniform(&I)) {
      DA.addUniformOverride(I);
    }
  }
  for (auto &Arg : F.args()) {
    if (TTI.isSourceOfDivergence(&Arg)) {
      DA.markDivergent(Arg);
    }
  }
  DA.compute();
}

bool GPUDivergenceAnalysis::isDivergent(const Value &val) const {
  return DA.isDivergent(val);
}

void GPUDivergenceAnalysis::print(raw_ostream &OS, const Module *mod) const {
  OS << "Divergence of kernel " << DA.getFunction().getName() << " {\n";