contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp

   1 //===--------- X86InterleavedAccess.cpp ----------------------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===--------------------------------------------------------------------===//
   9 ///
  10 /// \file
  11 /// This file contains the X86 implementation of the interleaved accesses
  12 /// optimization generating X86-specific instructions/intrinsics for
  13 /// interleaved access groups.
  14 ///
  15 //===--------------------------------------------------------------------===//
  16
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/Support/MathExtras.h"
  20
  21 using namespace llvm;
  22
  23 namespace {
  24 /// \brief This class holds necessary information to represent an interleaved
  25 /// access group and supports utilities to lower the group into
  26 /// X86-specific instructions/intrinsics.
  27 ///  E.g. A group of interleaving access loads (Factor = 2; accessing every
  28 ///       other element)
  29 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
  30 ///        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
  31 ///        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
  32 class X86InterleavedAccessGroup {
  33   /// \brief Reference to the wide-load instruction of an interleaved access
  34   /// group.
  35   Instruction *const Inst;
  36
  37   /// \brief Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
  38   ArrayRef<ShuffleVectorInst *> Shuffles;
  39
  40   /// \brief Reference to the starting index of each user-shuffle.
  41   ArrayRef<unsigned> Indices;
  42
  43   /// \brief Reference to the interleaving stride in terms of elements.
  44   const unsigned Factor;
  45
  46   /// \brief Reference to the underlying target.
  47   const X86Subtarget &Subtarget;
  48
  49   const DataLayout &DL;
  50
  51   IRBuilder<> &Builder;
  52
  53   /// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
  54   /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
  55   void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
  56                  SmallVectorImpl<Instruction *> &DecomposedVectors);
  57
  58   /// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
  59   /// returns the transposed-vectors in \p TransposedVectors.
  60   /// E.g.
  61   /// InputVectors:
  62   ///   In-V0 = p1, p2, p3, p4
  63   ///   In-V1 = q1, q2, q3, q4
  64   ///   In-V2 = r1, r2, r3, r4
  65   ///   In-V3 = s1, s2, s3, s4
  66   /// OutputVectors:
  67   ///   Out-V0 = p1, q1, r1, s1
  68   ///   Out-V1 = p2, q2, r2, s2
  69   ///   Out-V2 = p3, q3, r3, s3
  70   ///   Out-V3 = P4, q4, r4, s4
  71   void transpose_4x4(ArrayRef<Instruction *> InputVectors,
  72                      SmallVectorImpl<Value *> &TrasposedVectors);
  73
  74 public:
  75   /// In order to form an interleaved access group X86InterleavedAccessGroup
  76   /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
  77   /// \p Shuffs, reference to the first indices of each interleaved-vector
  78   /// \p 'Ind' and the interleaving stride factor \p F. In order to generate
  79   /// X86-specific instructions/intrinsics it also requires the underlying
  80   /// target information \p STarget.
  81   explicit X86InterleavedAccessGroup(Instruction *I,
  82                                      ArrayRef<ShuffleVectorInst *> Shuffs,
  83                                      ArrayRef<unsigned> Ind, const unsigned F,
  84                                      const X86Subtarget &STarget,
  85                                      IRBuilder<> &B)
  86       : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
  87         DL(Inst->getModule()->getDataLayout()), Builder(B) {}
  88
  89   /// \brief Returns true if this interleaved access group can be lowered into
  90   /// x86-specific instructions/intrinsics, false otherwise.
  91   bool isSupported() const;
  92
  93   /// \brief Lowers this interleaved access group into X86-specific
  94   /// instructions/intrinsics.
  95   bool lowerIntoOptimizedSequence();
  96 };
  97 } // end anonymous namespace
  98
  99 bool X86InterleavedAccessGroup::isSupported() const {
 100   VectorType *ShuffleVecTy = Shuffles[0]->getType();
 101   uint64_t ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy);
 102   Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
 103
 104   // Currently, lowering is supported for 4-element vectors of 64 bits on AVX.
 105   uint64_t ExpectedShuffleVecSize;
 106   if (isa<LoadInst>(Inst))
 107     ExpectedShuffleVecSize = 256;
 108   else
 109     ExpectedShuffleVecSize = 1024;
 110
 111   if (!Subtarget.hasAVX() || ShuffleVecSize != ExpectedShuffleVecSize ||
 112       DL.getTypeSizeInBits(ShuffleEltTy) != 64 || Factor != 4)
 113     return false;
 114
 115   return true;
 116 }
 117
 118 void X86InterleavedAccessGroup::decompose(
 119     Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
 120     SmallVectorImpl<Instruction *> &DecomposedVectors) {
 121
 122   assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
 123          "Expected Load or Shuffle");
 124
 125   Type *VecTy = VecInst->getType();
 126   (void)VecTy;
 127   assert(VecTy->isVectorTy() &&
 128          DL.getTypeSizeInBits(VecTy) >=
 129              DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
 130          "Invalid Inst-size!!!");
 131
 132   if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
 133     Value *Op0 = SVI->getOperand(0);
 134     Value *Op1 = SVI->getOperand(1);
 135
 136     // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
 137     for (unsigned i = 0; i < NumSubVectors; ++i)
 138       DecomposedVectors.push_back(
 139           cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
 140               Op0, Op1, createSequentialMask(Builder, Indices[i],
 141                                              SubVecTy->getVectorNumElements(), 0))));
 142     return;
 143   }
 144
 145   // Decompose the load instruction.
 146   LoadInst *LI = cast<LoadInst>(VecInst);
 147   Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
 148   Value *VecBasePtr =
 149       Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
 150
 151   // Generate N loads of T type.
 152   for (unsigned i = 0; i < NumSubVectors; i++) {
 153     // TODO: Support inbounds GEP.
 154     Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
 155     Instruction *NewLoad =
 156         Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
 157     DecomposedVectors.push_back(NewLoad);
 158   }
 159 }
 160
 161 void X86InterleavedAccessGroup::transpose_4x4(
 162     ArrayRef<Instruction *> Matrix,
 163     SmallVectorImpl<Value *> &TransposedMatrix) {
 164   assert(Matrix.size() == 4 && "Invalid matrix size");
 165   TransposedMatrix.resize(4);
 166
 167   // dst = src1[0,1],src2[0,1]
 168   uint32_t IntMask1[] = {0, 1, 4, 5};
 169   ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
 170   Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
 171   Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
 172
 173   // dst = src1[2,3],src2[2,3]
 174   uint32_t IntMask2[] = {2, 3, 6, 7};
 175   Mask = makeArrayRef(IntMask2, 4);
 176   Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
 177   Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
 178
 179   // dst = src1[0],src2[0],src1[2],src2[2]
 180   uint32_t IntMask3[] = {0, 4, 2, 6};
 181   Mask = makeArrayRef(IntMask3, 4);
 182   TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
 183   TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
 184
 185   // dst = src1[1],src2[1],src1[3],src2[3]
 186   uint32_t IntMask4[] = {1, 5, 3, 7};
 187   Mask = makeArrayRef(IntMask4, 4);
 188   TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
 189   TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
 190 }
 191
 192 // Lowers this interleaved access group into X86-specific
 193 // instructions/intrinsics.
 194 bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
 195   SmallVector<Instruction *, 4> DecomposedVectors;
 196   SmallVector<Value *, 4> TransposedVectors;
 197   VectorType *ShuffleTy = Shuffles[0]->getType();
 198
 199   if (isa<LoadInst>(Inst)) {
 200     // Try to generate target-sized register(/instruction).
 201     decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
 202
 203     // Perform matrix-transposition in order to compute interleaved
 204     // results by generating some sort of (optimized) target-specific
 205     // instructions.
 206     transpose_4x4(DecomposedVectors, TransposedVectors);
 207
 208     // Now replace the unoptimized-interleaved-vectors with the
 209     // transposed-interleaved vectors.
 210     for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
 211       Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
 212
 213     return true;
 214   }
 215
 216   Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
 217   unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;
 218
 219   // Lower the interleaved stores:
 220   //   1. Decompose the interleaved wide shuffle into individual shuffle
 221   //   vectors.
 222   decompose(Shuffles[0], Factor,
 223             VectorType::get(ShuffleEltTy, NumSubVecElems), DecomposedVectors);
 224
 225   //   2. Transpose the interleaved-vectors into vectors of contiguous
 226   //      elements.
 227   transpose_4x4(DecomposedVectors, TransposedVectors);
 228
 229   //   3. Concatenate the contiguous-vectors back into a wide vector.
 230   Value *WideVec = concatenateVectors(Builder, TransposedVectors);
 231
 232   //   4. Generate a store instruction for wide-vec.
 233   StoreInst *SI = cast<StoreInst>(Inst);
 234   Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
 235                              SI->getAlignment());
 236
 237   return true;
 238 }
 239
 240 // Lower interleaved load(s) into target specific instructions/
 241 // intrinsics. Lowering sequence varies depending on the vector-types, factor,
 242 // number of shuffles and ISA.
 243 // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
 244 bool X86TargetLowering::lowerInterleavedLoad(
 245     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
 246     ArrayRef<unsigned> Indices, unsigned Factor) const {
 247   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
 248          "Invalid interleave factor");
 249   assert(!Shuffles.empty() && "Empty shufflevector input");
 250   assert(Shuffles.size() == Indices.size() &&
 251          "Unmatched number of shufflevectors and indices");
 252
 253   // Create an interleaved access group.
 254   IRBuilder<> Builder(LI);
 255   X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
 256                                 Builder);
 257
 258   return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
 259 }
 260
 261 bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
 262                                               ShuffleVectorInst *SVI,
 263                                               unsigned Factor) const {
 264   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
 265          "Invalid interleave factor");
 266
 267   assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
 268          "Invalid interleaved store");
 269
 270   // Holds the indices of SVI that correspond to the starting index of each
 271   // interleaved shuffle.
 272   SmallVector<unsigned, 4> Indices;
 273   auto Mask = SVI->getShuffleMask();
 274   for (unsigned i = 0; i < Factor; i++)
 275     Indices.push_back(Mask[i]);
 276
 277   ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);
 278
 279   // Create an interleaved access group.
 280   IRBuilder<> Builder(SI);
 281   X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
 282                                 Builder);
 283
 284   return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
 285 }