contrib/llvm/lib/Target/ARM/ARMParallelDSP.cpp

   1 //===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// Armv6 introduced instructions to perform 32-bit SIMD operations. The
  12 /// purpose of this pass is do some IR pattern matching to create ACLE
  13 /// DSP intrinsics, which map on these 32-bit SIMD operations.
  14 /// This pass runs only when unaligned accesses is supported/enabled.
  15 //
  16 //===----------------------------------------------------------------------===//
  17
  18 #include "llvm/ADT/Statistic.h"
  19 #include "llvm/ADT/SmallPtrSet.h"
  20 #include "llvm/Analysis/AliasAnalysis.h"
  21 #include "llvm/Analysis/LoopAccessAnalysis.h"
  22 #include "llvm/Analysis/LoopPass.h"
  23 #include "llvm/Analysis/LoopInfo.h"
  24 #include "llvm/IR/Instructions.h"
  25 #include "llvm/IR/NoFolder.h"
  26 #include "llvm/Transforms/Scalar.h"
  27 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  28 #include "llvm/Transforms/Utils/LoopUtils.h"
  29 #include "llvm/Pass.h"
  30 #include "llvm/PassRegistry.h"
  31 #include "llvm/PassSupport.h"
  32 #include "llvm/Support/Debug.h"
  33 #include "llvm/IR/PatternMatch.h"
  34 #include "llvm/CodeGen/TargetPassConfig.h"
  35 #include "ARM.h"
  36 #include "ARMSubtarget.h"
  37
  38 using namespace llvm;
  39 using namespace PatternMatch;
  40
  41 #define DEBUG_TYPE "arm-parallel-dsp"
  42
  43 STATISTIC(NumSMLAD , "Number of smlad instructions generated");
  44
  45 static cl::opt<bool>
  46 DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
  47                    cl::desc("Disable the ARM Parallel DSP pass"));
  48
  49 namespace {
  50   struct OpChain;
  51   struct BinOpChain;
  52   struct Reduction;
  53
  54   using OpChainList     = SmallVector<std::unique_ptr<OpChain>, 8>;
  55   using ReductionList   = SmallVector<Reduction, 8>;
  56   using ValueList       = SmallVector<Value*, 8>;
  57   using MemInstList     = SmallVector<Instruction*, 8>;
  58   using PMACPair        = std::pair<BinOpChain*,BinOpChain*>;
  59   using PMACPairList    = SmallVector<PMACPair, 8>;
  60   using Instructions    = SmallVector<Instruction*,16>;
  61   using MemLocList      = SmallVector<MemoryLocation, 4>;
  62
  63   struct OpChain {
  64     Instruction   *Root;
  65     ValueList     AllValues;
  66     MemInstList   VecLd;    // List of all load instructions.
  67     MemLocList    MemLocs;  // All memory locations read by this tree.
  68     bool          ReadOnly = true;
  69
  70     OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
  71     virtual ~OpChain() = default;
  72
  73     void SetMemoryLocations() {
  74       const auto Size = LocationSize::unknown();
  75       for (auto *V : AllValues) {
  76         if (auto *I = dyn_cast<Instruction>(V)) {
  77           if (I->mayWriteToMemory())
  78             ReadOnly = false;
  79           if (auto *Ld = dyn_cast<LoadInst>(V))
  80             MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
  81         }
  82       }
  83     }
  84
  85     unsigned size() const { return AllValues.size(); }
  86   };
  87
  88   // 'BinOpChain' and 'Reduction' are just some bookkeeping data structures.
  89   // 'Reduction' contains the phi-node and accumulator statement from where we
  90   // start pattern matching, and 'BinOpChain' the multiplication
  91   // instructions that are candidates for parallel execution.
  92   struct BinOpChain : public OpChain {
  93     ValueList     LHS;      // List of all (narrow) left hand operands.
  94     ValueList     RHS;      // List of all (narrow) right hand operands.
  95     bool Exchange = false;
  96
  97     BinOpChain(Instruction *I, ValueList &lhs, ValueList &rhs) :
  98       OpChain(I, lhs), LHS(lhs), RHS(rhs) {
  99         for (auto *V : RHS)
 100           AllValues.push_back(V);
 101       }
 102
 103     bool AreSymmetrical(BinOpChain *Other);
 104   };
 105
 106   struct Reduction {
 107     PHINode         *Phi;             // The Phi-node from where we start
 108                                       // pattern matching.
 109     Instruction     *AccIntAdd;       // The accumulating integer add statement,
 110                                       // i.e, the reduction statement.
 111     OpChainList     MACCandidates;    // The MAC candidates associated with
 112                                       // this reduction statement.
 113     PMACPairList    PMACPairs;
 114     Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
 115   };
 116
 117   class ARMParallelDSP : public LoopPass {
 118     ScalarEvolution   *SE;
 119     AliasAnalysis     *AA;
 120     TargetLibraryInfo *TLI;
 121     DominatorTree     *DT;
 122     LoopInfo          *LI;
 123     Loop              *L;
 124     const DataLayout  *DL;
 125     Module            *M;
 126     std::map<LoadInst*, LoadInst*> LoadPairs;
 127     std::map<LoadInst*, SmallVector<LoadInst*, 4>> SequentialLoads;
 128
 129     bool RecordSequentialLoads(BasicBlock *Header);
 130     bool InsertParallelMACs(Reduction &Reduction);
 131     bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
 132     void CreateParallelMACPairs(Reduction &R);
 133     Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
 134                                  Instruction *Acc, bool Exchange,
 135                                  Instruction *InsertAfter);
 136
 137     /// Try to match and generate: SMLAD, SMLADX - Signed Multiply Accumulate
 138     /// Dual performs two signed 16x16-bit multiplications. It adds the
 139     /// products to a 32-bit accumulate operand. Optionally, the instruction can
 140     /// exchange the halfwords of the second operand before performing the
 141     /// arithmetic.
 142     bool MatchSMLAD(Function &F);
 143
 144   public:
 145     static char ID;
 146
 147     ARMParallelDSP() : LoopPass(ID) { }
 148
 149     void getAnalysisUsage(AnalysisUsage &AU) const override {
 150       LoopPass::getAnalysisUsage(AU);
 151       AU.addRequired<AssumptionCacheTracker>();
 152       AU.addRequired<ScalarEvolutionWrapperPass>();
 153       AU.addRequired<AAResultsWrapperPass>();
 154       AU.addRequired<TargetLibraryInfoWrapperPass>();
 155       AU.addRequired<LoopInfoWrapperPass>();
 156       AU.addRequired<DominatorTreeWrapperPass>();
 157       AU.addRequired<TargetPassConfig>();
 158       AU.addPreserved<LoopInfoWrapperPass>();
 159       AU.setPreservesCFG();
 160     }
 161
 162     bool runOnLoop(Loop *TheLoop, LPPassManager &) override {
 163       if (DisableParallelDSP)
 164         return false;
 165       L = TheLoop;
 166       SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
 167       AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 168       TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
 169       DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
 170       LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
 171       auto &TPC = getAnalysis<TargetPassConfig>();
 172
 173       BasicBlock *Header = TheLoop->getHeader();
 174       if (!Header)
 175         return false;
 176
 177       // TODO: We assume the loop header and latch to be the same block.
 178       // This is not a fundamental restriction, but lifting this would just
 179       // require more work to do the transformation and then patch up the CFG.
 180       if (Header != TheLoop->getLoopLatch()) {
 181         LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not "
 182                              "running pass ARMParallelDSP\n");
 183         return false;
 184       }
 185
 186       Function &F = *Header->getParent();
 187       M = F.getParent();
 188       DL = &M->getDataLayout();
 189
 190       auto &TM = TPC.getTM<TargetMachine>();
 191       auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
 192
 193       if (!ST->allowsUnalignedMem()) {
 194         LLVM_DEBUG(dbgs() << "Unaligned memory access not supported: not "
 195                              "running pass ARMParallelDSP\n");
 196         return false;
 197       }
 198
 199       if (!ST->hasDSP()) {
 200         LLVM_DEBUG(dbgs() << "DSP extension not enabled: not running pass "
 201                              "ARMParallelDSP\n");
 202         return false;
 203       }
 204
 205       LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
 206       bool Changes = false;
 207
 208       LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
 209       LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
 210
 211       if (!RecordSequentialLoads(Header)) {
 212         LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
 213         return false;
 214       }
 215
 216       Changes = MatchSMLAD(F);
 217       return Changes;
 218     }
 219   };
 220 }
 221
 222 // MaxBitwidth: the maximum supported bitwidth of the elements in the DSP
 223 // instructions, which is set to 16. So here we should collect all i8 and i16
 224 // narrow operations.
 225 // TODO: we currently only collect i16, and will support i8 later, so that's
 226 // why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
 227 template<unsigned MaxBitWidth>
 228 static bool IsNarrowSequence(Value *V, ValueList &VL) {
 229   LLVM_DEBUG(dbgs() << "Is narrow sequence? "; V->dump());
 230   ConstantInt *CInt;
 231
 232   if (match(V, m_ConstantInt(CInt))) {
 233     // TODO: if a constant is used, it needs to fit within the bit width.
 234     return false;
 235   }
 236
 237   auto *I = dyn_cast<Instruction>(V);
 238   if (!I)
 239    return false;
 240
 241   Value *Val, *LHS, *RHS;
 242   if (match(V, m_Trunc(m_Value(Val)))) {
 243     if (cast<TruncInst>(I)->getDestTy()->getIntegerBitWidth() == MaxBitWidth)
 244       return IsNarrowSequence<MaxBitWidth>(Val, VL);
 245   } else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) {
 246     // TODO: we need to implement sadd16/sadd8 for this, which enables to
 247     // also do the rewrite for smlad8.ll, but it is unsupported for now.
 248     LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
 249     return false;
 250   } else if (match(V, m_ZExtOrSExt(m_Value(Val)))) {
 251     if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) {
 252       LLVM_DEBUG(dbgs() << "No, wrong SrcTy size: " <<
 253         cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() << "\n");
 254       return false;
 255     }
 256
 257     if (match(Val, m_Load(m_Value()))) {
 258       LLVM_DEBUG(dbgs() << "Yes, found narrow Load:\t"; Val->dump());
 259       VL.push_back(Val);
 260       VL.push_back(I);
 261       return true;
 262     }
 263   }
 264   LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
 265   return false;
 266 }
 267
 268 template<typename MemInst>
 269 static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1,
 270                                   const DataLayout &DL, ScalarEvolution &SE) {
 271   if (!MemOp0->isSimple() || !MemOp1->isSimple()) {
 272     LLVM_DEBUG(dbgs() << "No, not touching volatile access\n");
 273     return false;
 274   }
 275   if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) {
 276     LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n");
 277     return true;
 278   }
 279   LLVM_DEBUG(dbgs() << "No, accesses aren't consecutive.\n");
 280   return false;
 281 }
 282
 283 bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
 284                                         MemInstList &VecMem) {
 285   if (!Ld0 || !Ld1)
 286     return false;
 287
 288   LLVM_DEBUG(dbgs() << "Are consecutive loads:\n";
 289     dbgs() << "Ld0:"; Ld0->dump();
 290     dbgs() << "Ld1:"; Ld1->dump();
 291   );
 292
 293   if (!Ld0->hasOneUse() || !Ld1->hasOneUse()) {
 294     LLVM_DEBUG(dbgs() << "No, load has more than one use.\n");
 295     return false;
 296   }
 297
 298   if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
 299     return false;
 300
 301   VecMem.clear();
 302   VecMem.push_back(Ld0);
 303   VecMem.push_back(Ld1);
 304   return true;
 305 }
 306
 307 /// Iterate through the block and record base, offset pairs of loads as well as
 308 /// maximal sequences of sequential loads.
 309 bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *Header) {
 310   SmallVector<LoadInst*, 8> Loads;
 311   for (auto &I : *Header) {
 312     auto *Ld = dyn_cast<LoadInst>(&I);
 313     if (!Ld)
 314       continue;
 315     Loads.push_back(Ld);
 316   }
 317
 318   std::map<LoadInst*, LoadInst*> BaseLoads;
 319
 320   for (auto *Ld0 : Loads) {
 321     for (auto *Ld1 : Loads) {
 322       if (Ld0 == Ld1)
 323         continue;
 324
 325       if (AreSequentialAccesses<LoadInst>(Ld0, Ld1, *DL, *SE)) {
 326         LoadPairs[Ld0] = Ld1;
 327         if (BaseLoads.count(Ld0)) {
 328           LoadInst *Base = BaseLoads[Ld0];
 329           BaseLoads[Ld1] = Base;
 330           SequentialLoads[Base].push_back(Ld1);
 331         } else {
 332           BaseLoads[Ld1] = Ld0;
 333           SequentialLoads[Ld0].push_back(Ld1);
 334         }
 335       }
 336     }
 337   }
 338   return LoadPairs.size() > 1;
 339 }
 340
 341 void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) {
 342   OpChainList &Candidates = R.MACCandidates;
 343   PMACPairList &PMACPairs = R.PMACPairs;
 344   const unsigned Elems = Candidates.size();
 345
 346   if (Elems < 2)
 347     return;
 348
 349   auto CanPair = [&](BinOpChain *PMul0, BinOpChain *PMul1) {
 350     if (!PMul0->AreSymmetrical(PMul1))
 351       return false;
 352
 353     // The first elements of each vector should be loads with sexts. If we
 354     // find that its two pairs of consecutive loads, then these can be
 355     // transformed into two wider loads and the users can be replaced with
 356     // DSP intrinsics.
 357     for (unsigned x = 0; x < PMul0->LHS.size(); x += 2) {
 358       auto *Ld0 = dyn_cast<LoadInst>(PMul0->LHS[x]);
 359       auto *Ld1 = dyn_cast<LoadInst>(PMul1->LHS[x]);
 360       auto *Ld2 = dyn_cast<LoadInst>(PMul0->RHS[x]);
 361       auto *Ld3 = dyn_cast<LoadInst>(PMul1->RHS[x]);
 362
 363       if (!Ld0 || !Ld1 || !Ld2 || !Ld3)
 364         return false;
 365
 366       LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n"
 367                  << "\t Ld0: " << *Ld0 << "\n"
 368                  << "\t Ld1: " << *Ld1 << "\n"
 369                  << "and operands " << x + 2 << ":\n"
 370                  << "\t Ld2: " << *Ld2 << "\n"
 371                  << "\t Ld3: " << *Ld3 << "\n");
 372
 373       if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
 374         if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
 375           LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
 376           PMACPairs.push_back(std::make_pair(PMul0, PMul1));
 377           return true;
 378         } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
 379           LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
 380           LLVM_DEBUG(dbgs() << "    exchanging Ld2 and Ld3\n");
 381           PMul1->Exchange = true;
 382           PMACPairs.push_back(std::make_pair(PMul0, PMul1));
 383           return true;
 384         }
 385       } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
 386                  AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
 387         LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
 388         LLVM_DEBUG(dbgs() << "    exchanging Ld0 and Ld1\n");
 389         LLVM_DEBUG(dbgs() << "    and swapping muls\n");
 390         PMul0->Exchange = true;
 391         // Only the second operand can be exchanged, so swap the muls.
 392         PMACPairs.push_back(std::make_pair(PMul1, PMul0));
 393         return true;
 394       }
 395     }
 396     return false;
 397   };
 398
 399   SmallPtrSet<const Instruction*, 4> Paired;
 400   for (unsigned i = 0; i < Elems; ++i) {
 401     BinOpChain *PMul0 = static_cast<BinOpChain*>(Candidates[i].get());
 402     if (Paired.count(PMul0->Root))
 403       continue;
 404
 405     for (unsigned j = 0; j < Elems; ++j) {
 406       if (i == j)
 407         continue;
 408
 409       BinOpChain *PMul1 = static_cast<BinOpChain*>(Candidates[j].get());
 410       if (Paired.count(PMul1->Root))
 411         continue;
 412
 413       const Instruction *Mul0 = PMul0->Root;
 414       const Instruction *Mul1 = PMul1->Root;
 415       if (Mul0 == Mul1)
 416         continue;
 417
 418       assert(PMul0 != PMul1 && "expected different chains");
 419
 420       LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n";
 421                  dbgs() << "- "; Mul0->dump();
 422                  dbgs() << "- "; Mul1->dump());
 423
 424       LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
 425       if (CanPair(PMul0, PMul1)) {
 426         Paired.insert(Mul0);
 427         Paired.insert(Mul1);
 428         break;
 429       }
 430     }
 431   }
 432 }
 433
 434 bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction) {
 435   Instruction *Acc = Reduction.Phi;
 436   Instruction *InsertAfter = Reduction.AccIntAdd;
 437
 438   for (auto &Pair : Reduction.PMACPairs) {
 439     BinOpChain *PMul0 = Pair.first;
 440     BinOpChain *PMul1 = Pair.second;
 441     LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
 442                dbgs() << "- "; PMul0->Root->dump();
 443                dbgs() << "- "; PMul1->Root->dump());
 444
 445     auto *VecLd0 = cast<LoadInst>(PMul0->VecLd[0]);
 446     auto *VecLd1 = cast<LoadInst>(PMul1->VecLd[0]);
 447     Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, PMul1->Exchange, InsertAfter);
 448     InsertAfter = Acc;
 449   }
 450
 451   if (Acc != Reduction.Phi) {
 452     LLVM_DEBUG(dbgs() << "Replace Accumulate: "; Acc->dump());
 453     Reduction.AccIntAdd->replaceAllUsesWith(Acc);
 454     return true;
 455   }
 456   return false;
 457 }
 458
 459 static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
 460                             ReductionList &Reductions) {
 461   RecurrenceDescriptor RecDesc;
 462   const bool HasFnNoNaNAttr =
 463     F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
 464   const BasicBlock *Latch = TheLoop->getLoopLatch();
 465
 466   // We need a preheader as getIncomingValueForBlock assumes there is one.
 467   if (!TheLoop->getLoopPreheader()) {
 468     LLVM_DEBUG(dbgs() << "No preheader found, bailing out\n");
 469     return;
 470   }
 471
 472   for (PHINode &Phi : Header->phis()) {
 473     const auto *Ty = Phi.getType();
 474     if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
 475       continue;
 476
 477     const bool IsReduction =
 478       RecurrenceDescriptor::AddReductionVar(&Phi,
 479                                             RecurrenceDescriptor::RK_IntegerAdd,
 480                                             TheLoop, HasFnNoNaNAttr, RecDesc);
 481     if (!IsReduction)
 482       continue;
 483
 484     Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
 485     if (!Acc)
 486       continue;
 487
 488     Reductions.push_back(Reduction(&Phi, Acc));
 489   }
 490
 491   LLVM_DEBUG(
 492     dbgs() << "\nAccumulating integer additions (reductions) found:\n";
 493     for (auto &R : Reductions) {
 494       dbgs() << "-  "; R.Phi->dump();
 495       dbgs() << "-> "; R.AccIntAdd->dump();
 496     }
 497   );
 498 }
 499
 500 static void AddMACCandidate(OpChainList &Candidates,
 501                             Instruction *Mul,
 502                             Value *MulOp0, Value *MulOp1) {
 503   LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
 504   assert(Mul->getOpcode() == Instruction::Mul &&
 505          "expected mul instruction");
 506   ValueList LHS;
 507   ValueList RHS;
 508   if (IsNarrowSequence<16>(MulOp0, LHS) &&
 509       IsNarrowSequence<16>(MulOp1, RHS)) {
 510     LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
 511     Candidates.push_back(make_unique<BinOpChain>(Mul, LHS, RHS));
 512   }
 513 }
 514
 515 static void MatchParallelMACSequences(Reduction &R,
 516                                       OpChainList &Candidates) {
 517   Instruction *Acc = R.AccIntAdd;
 518   LLVM_DEBUG(dbgs() << "\n- Analysing:\t" << *Acc);
 519
 520   // Returns false to signal the search should be stopped.
 521   std::function<bool(Value*)> Match =
 522     [&Candidates, &Match](Value *V) -> bool {
 523
 524     auto *I = dyn_cast<Instruction>(V);
 525     if (!I)
 526       return false;
 527
 528     switch (I->getOpcode()) {
 529     case Instruction::Add:
 530       if (Match(I->getOperand(0)) || (Match(I->getOperand(1))))
 531         return true;
 532       break;
 533     case Instruction::Mul: {
 534       Value *MulOp0 = I->getOperand(0);
 535       Value *MulOp1 = I->getOperand(1);
 536       if (isa<SExtInst>(MulOp0) && isa<SExtInst>(MulOp1))
 537         AddMACCandidate(Candidates, I, MulOp0, MulOp1);
 538       return false;
 539     }
 540     case Instruction::SExt:
 541       return Match(I->getOperand(0));
 542     }
 543     return false;
 544   };
 545
 546   while (Match (Acc));
 547   LLVM_DEBUG(dbgs() << "Finished matching MAC sequences, found "
 548              << Candidates.size() << " candidates.\n");
 549 }
 550
 551 // Collects all instructions that are not part of the MAC chains, which is the
 552 // set of instructions that can potentially alias with the MAC operands.
 553 static void AliasCandidates(BasicBlock *Header, Instructions &Reads,
 554                             Instructions &Writes) {
 555   for (auto &I : *Header) {
 556     if (I.mayReadFromMemory())
 557       Reads.push_back(&I);
 558     if (I.mayWriteToMemory())
 559       Writes.push_back(&I);
 560   }
 561 }
 562
 563 // Check whether statements in the basic block that write to memory alias with
 564 // the memory locations accessed by the MAC-chains.
 565 // TODO: we need the read statements when we accept more complicated chains.
 566 static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
 567                        Instructions &Writes, OpChainList &MACCandidates) {
 568   LLVM_DEBUG(dbgs() << "Alias checks:\n");
 569   for (auto &MAC : MACCandidates) {
 570     LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
 571
 572     // At the moment, we allow only simple chains that only consist of reads,
 573     // accumulate their result with an integer add, and thus that don't write
 574     // memory, and simply bail if they do.
 575     if (!MAC->ReadOnly)
 576       return true;
 577
 578     // Now for all writes in the basic block, check that they don't alias with
 579     // the memory locations accessed by our MAC-chain:
 580     for (auto *I : Writes) {
 581       LLVM_DEBUG(dbgs() << "- "; I->dump());
 582       assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
 583       for (auto &MemLoc : MAC->MemLocs) {
 584         if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
 585                                           ModRefInfo::ModRef))) {
 586           LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
 587           return true;
 588         }
 589       }
 590     }
 591   }
 592
 593   LLVM_DEBUG(dbgs() << "OK: no aliases found!\n");
 594   return false;
 595 }
 596
 597 static bool CheckMACMemory(OpChainList &Candidates) {
 598   for (auto &C : Candidates) {
 599     // A mul has 2 operands, and a narrow op consist of sext and a load; thus
 600     // we expect at least 4 items in this operand value list.
 601     if (C->size() < 4) {
 602       LLVM_DEBUG(dbgs() << "Operand list too short.\n");
 603       return false;
 604     }
 605     C->SetMemoryLocations();
 606     ValueList &LHS = static_cast<BinOpChain*>(C.get())->LHS;
 607     ValueList &RHS = static_cast<BinOpChain*>(C.get())->RHS;
 608
 609     // Use +=2 to skip over the expected extend instructions.
 610     for (unsigned i = 0, e = LHS.size(); i < e; i += 2) {
 611       if (!isa<LoadInst>(LHS[i]) || !isa<LoadInst>(RHS[i]))
 612         return false;
 613     }
 614   }
 615   return true;
 616 }
 617
 618 // Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
 619 // multiplications.
 620 // To use SMLAD:
 621 // 1) we first need to find integer add reduction PHIs,
 622 // 2) then from the PHI, look for this pattern:
 623 //
 624 // acc0 = phi i32 [0, %entry], [%acc1, %loop.body]
 625 // ld0 = load i16
 626 // sext0 = sext i16 %ld0 to i32
 627 // ld1 = load i16
 628 // sext1 = sext i16 %ld1 to i32
 629 // mul0 = mul %sext0, %sext1
 630 // ld2 = load i16
 631 // sext2 = sext i16 %ld2 to i32
 632 // ld3 = load i16
 633 // sext3 = sext i16 %ld3 to i32
 634 // mul1 = mul i32 %sext2, %sext3
 635 // add0 = add i32 %mul0, %acc0
 636 // acc1 = add i32 %add0, %mul1
 637 //
 638 // Which can be selected to:
 639 //
 640 // ldr.h r0
 641 // ldr.h r1
 642 // smlad r2, r0, r1, r2
 643 //
 644 // If constants are used instead of loads, these will need to be hoisted
 645 // out and into a register.
 646 //
 647 // If loop invariants are used instead of loads, these need to be packed
 648 // before the loop begins.
 649 //
 650 bool ARMParallelDSP::MatchSMLAD(Function &F) {
 651   BasicBlock *Header = L->getHeader();
 652   LLVM_DEBUG(dbgs() << "= Matching SMLAD =\n";
 653              dbgs() << "Header block:\n"; Header->dump();
 654              dbgs() << "Loop info:\n\n"; L->dump());
 655
 656   bool Changed = false;
 657   ReductionList Reductions;
 658   MatchReductions(F, L, Header, Reductions);
 659
 660   for (auto &R : Reductions) {
 661     OpChainList MACCandidates;
 662     MatchParallelMACSequences(R, MACCandidates);
 663     if (!CheckMACMemory(MACCandidates))
 664       continue;
 665
 666     R.MACCandidates = std::move(MACCandidates);
 667
 668     LLVM_DEBUG(dbgs() << "MAC candidates:\n";
 669       for (auto &M : R.MACCandidates)
 670         M->Root->dump();
 671       dbgs() << "\n";);
 672   }
 673
 674   // Collect all instructions that may read or write memory. Our alias
 675   // analysis checks bail out if any of these instructions aliases with an
 676   // instruction from the MAC-chain.
 677   Instructions Reads, Writes;
 678   AliasCandidates(Header, Reads, Writes);
 679
 680   for (auto &R : Reductions) {
 681     if (AreAliased(AA, Reads, Writes, R.MACCandidates))
 682       return false;
 683     CreateParallelMACPairs(R);
 684     Changed |= InsertParallelMACs(R);
 685   }
 686
 687   LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
 688   return Changed;
 689 }
 690
 691 static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst &BaseLoad,
 692                                const Type *LoadTy) {
 693   const unsigned AddrSpace = BaseLoad.getPointerAddressSpace();
 694
 695   Value *VecPtr = IRB.CreateBitCast(BaseLoad.getPointerOperand(),
 696                                     LoadTy->getPointerTo(AddrSpace));
 697   return IRB.CreateAlignedLoad(VecPtr, BaseLoad.getAlignment());
 698 }
 699
 700 Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
 701                                              Instruction *Acc, bool Exchange,
 702                                              Instruction *InsertAfter) {
 703   LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n"
 704              << "- " << *VecLd0 << "\n"
 705              << "- " << *VecLd1 << "\n"
 706              << "- " << *Acc << "\n"
 707              << "Exchange: " << Exchange << "\n");
 708
 709   IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
 710                               ++BasicBlock::iterator(InsertAfter));
 711
 712   // Replace the reduction chain with an intrinsic call
 713   const Type *Ty = IntegerType::get(M->getContext(), 32);
 714   LoadInst *NewLd0 = CreateLoadIns(Builder, VecLd0[0], Ty);
 715   LoadInst *NewLd1 = CreateLoadIns(Builder, VecLd1[0], Ty);
 716   Value* Args[] = { NewLd0, NewLd1, Acc };
 717   Function *SMLAD = nullptr;
 718   if (Exchange)
 719     SMLAD = Acc->getType()->isIntegerTy(32) ?
 720       Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) :
 721       Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx);
 722   else
 723     SMLAD = Acc->getType()->isIntegerTy(32) ?
 724       Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
 725       Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);
 726   CallInst *Call = Builder.CreateCall(SMLAD, Args);
 727   NumSMLAD++;
 728   return Call;
 729 }
 730
 731 // Compare the value lists in Other to this chain.
 732 bool BinOpChain::AreSymmetrical(BinOpChain *Other) {
 733   // Element-by-element comparison of Value lists returning true if they are
 734   // instructions with the same opcode or constants with the same value.
 735   auto CompareValueList = [](const ValueList &VL0,
 736                              const ValueList &VL1) {
 737     if (VL0.size() != VL1.size()) {
 738       LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
 739                         << VL0.size() << " != " << VL1.size() << "\n");
 740       return false;
 741     }
 742
 743     const unsigned Pairs = VL0.size();
 744     LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
 745
 746     for (unsigned i = 0; i < Pairs; ++i) {
 747       const Value *V0 = VL0[i];
 748       const Value *V1 = VL1[i];
 749       const auto *Inst0 = dyn_cast<Instruction>(V0);
 750       const auto *Inst1 = dyn_cast<Instruction>(V1);
 751
 752       LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
 753                 dbgs() << "mul1: "; V0->dump();
 754                 dbgs() << "mul2: "; V1->dump());
 755
 756       if (!Inst0 || !Inst1)
 757         return false;
 758
 759       if (Inst0->isSameOperationAs(Inst1)) {
 760         LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
 761         continue;
 762       }
 763
 764       const APInt *C0, *C1;
 765       if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
 766         return false;
 767     }
 768
 769     LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
 770     return true;
 771   };
 772
 773   return CompareValueList(LHS, Other->LHS) &&
 774          CompareValueList(RHS, Other->RHS);
 775 }
 776
 777 Pass *llvm::createARMParallelDSPPass() {
 778   return new ARMParallelDSP();
 779 }
 780
 781 char ARMParallelDSP::ID = 0;
 782
 783 INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp",
 784                 "Transform loops to use DSP intrinsics", false, false)
 785 INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp",
 786                 "Transform loops to use DSP intrinsics", false, false)