//===--- SyncDependenceAnalysis.cpp - Compute Control Divergence Effects --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements an algorithm that returns for a divergent branch
// the set of basic blocks whose phi nodes become divergent due to divergent
// control. These are the blocks that are reachable by two disjoint paths from
// the branch or loop exits that have a reaching path that is disjoint from a
// path to the loop latch.
//
// The SyncDependenceAnalysis is used in the DivergenceAnalysis to model
// control-induced divergence in phi nodes.
//
//
// -- Reference --
// The algorithm is presented in Section 5 of 
//
//   An abstract interpretation for SPMD divergence
//       on reducible control flow graphs.
//   Julian Rosemann, Simon Moll and Sebastian Hack
//   POPL '21
//
//
// -- Sync dependence --
// Sync dependence characterizes the control flow aspect of the
// propagation of branch divergence. For example,
//
//   %cond = icmp slt i32 %tid, 10
//   br i1 %cond, label %then, label %else
// then:
//   br label %merge
// else:
//   br label %merge
// merge:
//   %a = phi i32 [ 0, %then ], [ 1, %else ]
//
// Suppose %tid holds the thread ID. Although %a is not data dependent on %tid
// because %tid is not on its use-def chains, %a is sync dependent on %tid
// because the branch "br i1 %cond" depends on %tid and affects which value %a
// is assigned to.
//
//
// -- Reduction to SSA construction --
// There are two disjoint paths from A to X, if a certain variant of SSA
// construction places a phi node in X under the following set-up scheme.
//
// This variant of SSA construction ignores incoming undef values.
// That is paths from the entry without a definition do not result in
// phi nodes.
//
//       entry
//     /      \
//    A        \
//  /   \       Y
// B     C     /
//  \   /  \  /
//    D     E
//     \   /
//       F
//
// Assume that A contains a divergent branch. We are interested
// in the set of all blocks where each block is reachable from A
// via two disjoint paths. This would be the set {D, F} in this
// case.
// To generally reduce this query to SSA construction we introduce
// a virtual variable x and assign to x different values in each
// successor block of A.
//
//           entry
//         /      \
//        A        \
//      /   \       Y
// x = 0   x = 1   /
//      \  /   \  /
//        D     E
//         \   /
//           F
//
// Our flavor of SSA construction for x will construct the following
//
//            entry
//          /      \
//         A        \
//       /   \       Y
// x0 = 0   x1 = 1  /
//       \   /   \ /
//     x2 = phi   E
//         \     /
//         x3 = phi
//
// The blocks D and F contain phi nodes and are thus each reachable
// by two disjoins paths from A.
//
// -- Remarks --
// * In case of loop exits we need to check the disjoint path criterion for loops.
//   To this end, we check whether the definition of x differs between the
//   loop exit and the loop header (_after_ SSA construction).
//
// -- Known Limitations & Future Work --
// * The algorithm requires reducible loops because the implementation
//   implicitly performs a single iteration of the underlying data flow analysis.
//   This was done for pragmatism, simplicity and speed.
//
//   Relevant related work for extending the algorithm to irreducible control:
//     A simple algorithm for global data flow analysis problems.
//     Matthew S. Hecht and Jeffrey D. Ullman.
//     SIAM Journal on Computing, 4(4):519–532, December 1975.
//
// * Another reason for requiring reducible loops is that points of
//   synchronization in irreducible loops aren't 'obvious' - there is no unique
//   header where threads 'should' synchronize when entering or coming back
//   around from the latch.
//
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/SyncDependenceAnalysis.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"

#include <functional>
#include <stack>
#include <unordered_set>

#define DEBUG_TYPE "sync-dependence"

// The SDA algorithm operates on a modified CFG - we modify the edges leaving
// loop headers as follows:
//
// * We remove all edges leaving all loop headers.
// * We add additional edges from the loop headers to their exit blocks.
//
// The modification is virtual, that is whenever we visit a loop header we
// pretend it had different successors.
namespace {
using namespace llvm;

// Custom Post-Order Traveral
//
// We cannot use the vanilla (R)PO computation of LLVM because:
// * We (virtually) modify the CFG.
// * We want a loop-compact block enumeration, that is the numbers assigned to
//   blocks of a loop form an interval
//   
using POCB = std::function<void(const BasicBlock &)>;
using VisitedSet = std::set<const BasicBlock *>;
using BlockStack = std::vector<const BasicBlock *>;

// forward
static void computeLoopPO(const LoopInfo &LI, Loop &Loop, POCB CallBack,
                          VisitedSet &Finalized);

// for a nested region (top-level loop or nested loop)
static void computeStackPO(BlockStack &Stack, const LoopInfo &LI, Loop *Loop,
                           POCB CallBack, VisitedSet &Finalized) {
  const auto *LoopHeader = Loop ? Loop->getHeader() : nullptr;
  while (!Stack.empty()) {
    const auto *NextBB = Stack.back();

    auto *NestedLoop = LI.getLoopFor(NextBB);
    bool IsNestedLoop = NestedLoop != Loop;

    // Treat the loop as a node
    if (IsNestedLoop) {
      SmallVector<BasicBlock *, 3> NestedExits;
      NestedLoop->getUniqueExitBlocks(NestedExits);
      bool PushedNodes = false;
      for (const auto *NestedExitBB : NestedExits) {
        if (NestedExitBB == LoopHeader)
          continue;
        if (Loop && !Loop->contains(NestedExitBB))
          continue;
        if (Finalized.count(NestedExitBB))
          continue;
        PushedNodes = true;
        Stack.push_back(NestedExitBB);
      }
      if (!PushedNodes) {
        // All loop exits finalized -> finish this node
        Stack.pop_back();
        computeLoopPO(LI, *NestedLoop, CallBack, Finalized);
      }
      continue;
    }

    // DAG-style
    bool PushedNodes = false;
    for (const auto *SuccBB : successors(NextBB)) {
      if (SuccBB == LoopHeader)
        continue;
      if (Loop && !Loop->contains(SuccBB))
        continue;
      if (Finalized.count(SuccBB))
        continue;
      PushedNodes = true;
      Stack.push_back(SuccBB);
    }
    if (!PushedNodes) {
      // Never push nodes twice
      Stack.pop_back();
      if (!Finalized.insert(NextBB).second)
        continue;
      CallBack(*NextBB);
    }
  }
}

static void computeTopLevelPO(Function &F, const LoopInfo &LI, POCB CallBack) {
  VisitedSet Finalized;
  BlockStack Stack;
  Stack.reserve(24); // FIXME made-up number
  Stack.push_back(&F.getEntryBlock());
  computeStackPO(Stack, LI, nullptr, CallBack, Finalized);
}

static void computeLoopPO(const LoopInfo &LI, Loop &Loop, POCB CallBack,
                          VisitedSet &Finalized) {
  /// Call CallBack on all loop blocks.
  std::vector<const BasicBlock *> Stack;
  const auto *LoopHeader = Loop.getHeader();

  // Visit the header last
  Finalized.insert(LoopHeader);
  CallBack(*LoopHeader);

  // Initialize with immediate successors
  for (const auto *BB : successors(LoopHeader)) {
    if (!Loop.contains(BB))
      continue;
    if (BB == LoopHeader)
      continue;
    Stack.push_back(BB);
  }

  // Compute PO inside region
  computeStackPO(Stack, LI, &Loop, CallBack, Finalized);
}

} // namespace

namespace llvm {

ControlDivergenceDesc SyncDependenceAnalysis::EmptyDivergenceDesc;

SyncDependenceAnalysis::SyncDependenceAnalysis(const DominatorTree &DT,
                                               const PostDominatorTree &PDT,
                                               const LoopInfo &LI)
    : DT(DT), PDT(PDT), LI(LI) {
  computeTopLevelPO(*DT.getRoot()->getParent(), LI,
                    [&](const BasicBlock &BB) { LoopPO.appendBlock(BB); });
}

SyncDependenceAnalysis::~SyncDependenceAnalysis() {}

// divergence propagator for reducible CFGs
struct DivergencePropagator {
  const ModifiedPO &LoopPOT;
  const DominatorTree &DT;
  const PostDominatorTree &PDT;
  const LoopInfo &LI;
  const BasicBlock &DivTermBlock;

  // * if BlockLabels[IndexOf(B)] == C then C is the dominating definition at
  //   block B
  // * if BlockLabels[IndexOf(B)] ~ undef then we haven't seen B yet
  // * if BlockLabels[IndexOf(B)] == B then B is a join point of disjoint paths
  // from X or B is an immediate successor of X (initial value).
  using BlockLabelVec = std::vector<const BasicBlock *>;
  BlockLabelVec BlockLabels;
  // divergent join and loop exit descriptor.
  std::unique_ptr<ControlDivergenceDesc> DivDesc;

  DivergencePropagator(const ModifiedPO &LoopPOT, const DominatorTree &DT,
                       const PostDominatorTree &PDT, const LoopInfo &LI,
                       const BasicBlock &DivTermBlock)
      : LoopPOT(LoopPOT), DT(DT), PDT(PDT), LI(LI), DivTermBlock(DivTermBlock),
        BlockLabels(LoopPOT.size(), nullptr),
        DivDesc(new ControlDivergenceDesc) {}

  void printDefs(raw_ostream &Out) {
    Out << "Propagator::BlockLabels {\n";
    for (int BlockIdx = (int)BlockLabels.size() - 1; BlockIdx > 0; --BlockIdx) {
      const auto *Label = BlockLabels[BlockIdx];
      Out << LoopPOT.getBlockAt(BlockIdx)->getName().str() << "(" << BlockIdx
          << ") : ";
      if (!Label) {
        Out << "<null>\n";
      } else {
        Out << Label->getName() << "\n";
      }
    }
    Out << "}\n";
  }

  // Push a definition (\p PushedLabel) to \p SuccBlock and return whether this
  // causes a divergent join.
  bool computeJoin(const BasicBlock &SuccBlock, const BasicBlock &PushedLabel) {
    auto SuccIdx = LoopPOT.getIndexOf(SuccBlock);

    // unset or same reaching label
    const auto *OldLabel = BlockLabels[SuccIdx];
    if (!OldLabel || (OldLabel == &PushedLabel)) {
      BlockLabels[SuccIdx] = &PushedLabel;
      return false;
    }

    // Update the definition
    BlockLabels[SuccIdx] = &SuccBlock;
    return true;
  }

  // visiting a virtual loop exit edge from the loop header --> temporal
  // divergence on join
  bool visitLoopExitEdge(const BasicBlock &ExitBlock,
                         const BasicBlock &DefBlock, bool FromParentLoop) {
    // Pushing from a non-parent loop cannot cause temporal divergence.
    if (!FromParentLoop)
      return visitEdge(ExitBlock, DefBlock);

    if (!computeJoin(ExitBlock, DefBlock))
      return false;

    // Identified a divergent loop exit
    DivDesc->LoopDivBlocks.insert(&ExitBlock);
    LLVM_DEBUG(dbgs() << "\tDivergent loop exit: " << ExitBlock.getName()
                      << "\n");
    return true;
  }

  // process \p SuccBlock with reaching definition \p DefBlock
  bool visitEdge(const BasicBlock &SuccBlock, const BasicBlock &DefBlock) {
    if (!computeJoin(SuccBlock, DefBlock))
      return false;

    // Divergent, disjoint paths join.
    DivDesc->JoinDivBlocks.insert(&SuccBlock);
    LLVM_DEBUG(dbgs() << "\tDivergent join: " << SuccBlock.getName());
    return true;
  }

  std::unique_ptr<ControlDivergenceDesc> computeJoinPoints() {
    assert(DivDesc);

    LLVM_DEBUG(dbgs() << "SDA:computeJoinPoints: " << DivTermBlock.getName()
                      << "\n");

    const auto *DivBlockLoop = LI.getLoopFor(&DivTermBlock);

    // Early stopping criterion
    int FloorIdx = LoopPOT.size() - 1;
    const BasicBlock *FloorLabel = nullptr;

    // bootstrap with branch targets
    int BlockIdx = 0;

    for (const auto *SuccBlock : successors(&DivTermBlock)) {
      auto SuccIdx = LoopPOT.getIndexOf(*SuccBlock);
      BlockLabels[SuccIdx] = SuccBlock;

      // Find the successor with the highest index to start with
      BlockIdx = std::max<int>(BlockIdx, SuccIdx);
      FloorIdx = std::min<int>(FloorIdx, SuccIdx);

      // Identify immediate divergent loop exits
      if (!DivBlockLoop)
        continue;

      const auto *BlockLoop = LI.getLoopFor(SuccBlock);
      if (BlockLoop && DivBlockLoop->contains(BlockLoop))
        continue;
      DivDesc->LoopDivBlocks.insert(SuccBlock);
      LLVM_DEBUG(dbgs() << "\tImmediate divergent loop exit: "
                        << SuccBlock->getName() << "\n");
    }

    // propagate definitions at the immediate successors of the node in RPO
    for (; BlockIdx >= FloorIdx; --BlockIdx) {
      LLVM_DEBUG(dbgs() << "Before next visit:\n"; printDefs(dbgs()));

      // Any label available here
      const auto *Label = BlockLabels[BlockIdx];
      if (!Label)
        continue;

      // Ok. Get the block
      const auto *Block = LoopPOT.getBlockAt(BlockIdx);
      LLVM_DEBUG(dbgs() << "SDA::joins. visiting " << Block->getName() << "\n");

      auto *BlockLoop = LI.getLoopFor(Block);
      bool IsLoopHeader = BlockLoop && BlockLoop->getHeader() == Block;
      bool CausedJoin = false;
      int LoweredFloorIdx = FloorIdx;
      if (IsLoopHeader) {
        // Disconnect from immediate successors and propagate directly to loop
        // exits.
        SmallVector<BasicBlock *, 4> BlockLoopExits;
        BlockLoop->getExitBlocks(BlockLoopExits);

        bool IsParentLoop = BlockLoop->contains(&DivTermBlock);
        for (const auto *BlockLoopExit : BlockLoopExits) {
          CausedJoin |= visitLoopExitEdge(*BlockLoopExit, *Label, IsParentLoop);
          LoweredFloorIdx = std::min<int>(LoweredFloorIdx,
                                          LoopPOT.getIndexOf(*BlockLoopExit));
        }
      } else {
        // Acyclic successor case
        for (const auto *SuccBlock : successors(Block)) {
          CausedJoin |= visitEdge(*SuccBlock, *Label);
          LoweredFloorIdx =
              std::min<int>(LoweredFloorIdx, LoopPOT.getIndexOf(*SuccBlock));
        }
      }

      // Floor update
      if (CausedJoin) {
        // 1. Different labels pushed to successors
        FloorIdx = LoweredFloorIdx;
      } else if (FloorLabel != Label) {
        // 2. No join caused BUT we pushed a label that is different than the
        // last pushed label
        FloorIdx = LoweredFloorIdx;
        FloorLabel = Label;
      }
    }

    LLVM_DEBUG(dbgs() << "SDA::joins. After propagation:\n"; printDefs(dbgs()));

    return std::move(DivDesc);
  }
};

#ifndef NDEBUG
static void printBlockSet(ConstBlockSet &Blocks, raw_ostream &Out) {
  Out << "[";
  ListSeparator LS;
  for (const auto *BB : Blocks)
    Out << LS << BB->getName();
  Out << "]";
}
#endif

const ControlDivergenceDesc &
SyncDependenceAnalysis::getJoinBlocks(const Instruction &Term) {
  // trivial case
  if (Term.getNumSuccessors() <= 1) {
    return EmptyDivergenceDesc;
  }

  // already available in cache?
  auto ItCached = CachedControlDivDescs.find(&Term);
  if (ItCached != CachedControlDivDescs.end())
    return *ItCached->second;

  // compute all join points
  // Special handling of divergent loop exits is not needed for LCSSA
  const auto &TermBlock = *Term.getParent();
  DivergencePropagator Propagator(LoopPO, DT, PDT, LI, TermBlock);
  auto DivDesc = Propagator.computeJoinPoints();

  LLVM_DEBUG(dbgs() << "Result (" << Term.getParent()->getName() << "):\n";
             dbgs() << "JoinDivBlocks: ";
             printBlockSet(DivDesc->JoinDivBlocks, dbgs());
             dbgs() << "\nLoopDivBlocks: ";
             printBlockSet(DivDesc->LoopDivBlocks, dbgs()); dbgs() << "\n";);

  auto ItInserted = CachedControlDivDescs.emplace(&Term, std::move(DivDesc));
  assert(ItInserted.second);
  return *ItInserted.first->second;
}

} // namespace llvm