//===- ScalarizeMaskedMemIntrin.cpp - Scalarize unsupported masked mem ----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass replaces masked memory intrinsics - when unsupported by the target
// - with a chain of basic blocks, that deal with the elements one-by-one if the
// appropriate mask bit is set.
//
//===----------------------------------------------------------------------===//
17 #include "llvm/ADT/Twine.h"
18 #include "llvm/Analysis/TargetTransformInfo.h"
19 #include "llvm/CodeGen/TargetSubtargetInfo.h"
20 #include "llvm/IR/BasicBlock.h"
21 #include "llvm/IR/Constant.h"
22 #include "llvm/IR/Constants.h"
23 #include "llvm/IR/DerivedTypes.h"
24 #include "llvm/IR/Function.h"
25 #include "llvm/IR/IRBuilder.h"
26 #include "llvm/IR/InstrTypes.h"
27 #include "llvm/IR/Instruction.h"
28 #include "llvm/IR/Instructions.h"
29 #include "llvm/IR/IntrinsicInst.h"
30 #include "llvm/IR/Intrinsics.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/IR/Value.h"
33 #include "llvm/Pass.h"
34 #include "llvm/Support/Casting.h"
40 #define DEBUG_TYPE "scalarize-masked-mem-intrin"
44 class ScalarizeMaskedMemIntrin : public FunctionPass {
45 const TargetTransformInfo *TTI = nullptr;
48 static char ID; // Pass identification, replacement for typeid
50 explicit ScalarizeMaskedMemIntrin() : FunctionPass(ID) {
51 initializeScalarizeMaskedMemIntrinPass(*PassRegistry::getPassRegistry());
54 bool runOnFunction(Function &F) override;
56 StringRef getPassName() const override {
57 return "Scalarize Masked Memory Intrinsics";
60 void getAnalysisUsage(AnalysisUsage &AU) const override {
61 AU.addRequired<TargetTransformInfoWrapperPass>();
65 bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
66 bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
69 } // end anonymous namespace
71 char ScalarizeMaskedMemIntrin::ID = 0;
73 INITIALIZE_PASS(ScalarizeMaskedMemIntrin, DEBUG_TYPE,
74 "Scalarize unsupported masked memory intrinsics", false, false)
76 FunctionPass *llvm::createScalarizeMaskedMemIntrinPass() {
77 return new ScalarizeMaskedMemIntrin();
80 static bool isConstantIntVector(Value *Mask) {
81 Constant *C = dyn_cast<Constant>(Mask);
85 unsigned NumElts = Mask->getType()->getVectorNumElements();
86 for (unsigned i = 0; i != NumElts; ++i) {
87 Constant *CElt = C->getAggregateElement(i);
88 if (!CElt || !isa<ConstantInt>(CElt))
95 // Translate a masked load intrinsic like
96 // <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
97 // <16 x i1> %mask, <16 x i32> %passthru)
98 // to a chain of basic blocks, with loading element one-by-one if
99 // the appropriate mask bit is set
101 // %1 = bitcast i8* %addr to i32*
102 // %2 = extractelement <16 x i1> %mask, i32 0
103 // br i1 %2, label %cond.load, label %else
105 // cond.load: ; preds = %0
106 // %3 = getelementptr i32* %1, i32 0
108 // %5 = insertelement <16 x i32> %passthru, i32 %4, i32 0
111 // else: ; preds = %0, %cond.load
112 // %res.phi.else = phi <16 x i32> [ %5, %cond.load ], [ undef, %0 ]
113 // %6 = extractelement <16 x i1> %mask, i32 1
114 // br i1 %6, label %cond.load1, label %else2
116 // cond.load1: ; preds = %else
117 // %7 = getelementptr i32* %1, i32 1
119 // %9 = insertelement <16 x i32> %res.phi.else, i32 %8, i32 1
122 // else2: ; preds = %else, %cond.load1
123 // %res.phi.else3 = phi <16 x i32> [ %9, %cond.load1 ], [ %res.phi.else, %else ]
124 // %10 = extractelement <16 x i1> %mask, i32 2
125 // br i1 %10, label %cond.load4, label %else5
127 static void scalarizeMaskedLoad(CallInst *CI) {
128 Value *Ptr = CI->getArgOperand(0);
129 Value *Alignment = CI->getArgOperand(1);
130 Value *Mask = CI->getArgOperand(2);
131 Value *Src0 = CI->getArgOperand(3);
133 unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
134 VectorType *VecType = cast<VectorType>(CI->getType());
136 Type *EltTy = VecType->getElementType();
138 IRBuilder<> Builder(CI->getContext());
139 Instruction *InsertPt = CI;
140 BasicBlock *IfBlock = CI->getParent();
142 Builder.SetInsertPoint(InsertPt);
143 Builder.SetCurrentDebugLocation(CI->getDebugLoc());
145 // Short-cut if the mask is all-true.
146 if (isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue()) {
147 Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal);
148 CI->replaceAllUsesWith(NewI);
149 CI->eraseFromParent();
153 // Adjust alignment for the scalar instruction.
154 AlignVal = MinAlign(AlignVal, EltTy->getPrimitiveSizeInBits() / 8);
155 // Bitcast %addr fron i8* to EltTy*
157 EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
158 Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
159 unsigned VectorWidth = VecType->getNumElements();
162 Value *VResult = Src0;
164 if (isConstantIntVector(Mask)) {
165 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
166 if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
169 Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
170 LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
172 Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
174 CI->replaceAllUsesWith(VResult);
175 CI->eraseFromParent();
179 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
180 // Fill the "else" block, created in the previous iteration
182 // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
183 // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
184 // br i1 %mask_1, label %cond.load, label %else
188 Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
190 // Create "cond" block
192 // %EltAddr = getelementptr i32* %1, i32 0
193 // %Elt = load i32* %EltAddr
194 // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
196 BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(),
198 Builder.SetInsertPoint(InsertPt);
201 Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
202 LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
203 Value *NewVResult = Builder.CreateInsertElement(VResult, Load,
204 Builder.getInt32(Idx));
206 // Create "else" block, fill it in the next iteration
207 BasicBlock *NewIfBlock =
208 CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
209 Builder.SetInsertPoint(InsertPt);
210 Instruction *OldBr = IfBlock->getTerminator();
211 BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
212 OldBr->eraseFromParent();
213 BasicBlock *PrevIfBlock = IfBlock;
214 IfBlock = NewIfBlock;
216 // Create the phi to join the new and previous value.
217 PHINode *Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
218 Phi->addIncoming(NewVResult, CondBlock);
219 Phi->addIncoming(VResult, PrevIfBlock);
223 CI->replaceAllUsesWith(VResult);
224 CI->eraseFromParent();
227 // Translate a masked store intrinsic, like
228 // void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
230 // to a chain of basic blocks, that stores element one-by-one if
231 // the appropriate mask bit is set
233 // %1 = bitcast i8* %addr to i32*
234 // %2 = extractelement <16 x i1> %mask, i32 0
235 // br i1 %2, label %cond.store, label %else
237 // cond.store: ; preds = %0
238 // %3 = extractelement <16 x i32> %val, i32 0
239 // %4 = getelementptr i32* %1, i32 0
240 // store i32 %3, i32* %4
243 // else: ; preds = %0, %cond.store
244 // %5 = extractelement <16 x i1> %mask, i32 1
245 // br i1 %5, label %cond.store1, label %else2
247 // cond.store1: ; preds = %else
248 // %6 = extractelement <16 x i32> %val, i32 1
249 // %7 = getelementptr i32* %1, i32 1
250 // store i32 %6, i32* %7
253 static void scalarizeMaskedStore(CallInst *CI) {
254 Value *Src = CI->getArgOperand(0);
255 Value *Ptr = CI->getArgOperand(1);
256 Value *Alignment = CI->getArgOperand(2);
257 Value *Mask = CI->getArgOperand(3);
259 unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
260 VectorType *VecType = cast<VectorType>(Src->getType());
262 Type *EltTy = VecType->getElementType();
264 IRBuilder<> Builder(CI->getContext());
265 Instruction *InsertPt = CI;
266 BasicBlock *IfBlock = CI->getParent();
267 Builder.SetInsertPoint(InsertPt);
268 Builder.SetCurrentDebugLocation(CI->getDebugLoc());
270 // Short-cut if the mask is all-true.
271 if (isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue()) {
272 Builder.CreateAlignedStore(Src, Ptr, AlignVal);
273 CI->eraseFromParent();
277 // Adjust alignment for the scalar instruction.
278 AlignVal = MinAlign(AlignVal, EltTy->getPrimitiveSizeInBits() / 8);
279 // Bitcast %addr fron i8* to EltTy*
281 EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
282 Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
283 unsigned VectorWidth = VecType->getNumElements();
285 if (isConstantIntVector(Mask)) {
286 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
287 if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
289 Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
291 Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
292 Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
294 CI->eraseFromParent();
298 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
299 // Fill the "else" block, created in the previous iteration
301 // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
302 // br i1 %mask_1, label %cond.store, label %else
305 Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
307 // Create "cond" block
309 // %OneElt = extractelement <16 x i32> %Src, i32 Idx
310 // %EltAddr = getelementptr i32* %1, i32 0
311 // %store i32 %OneElt, i32* %EltAddr
313 BasicBlock *CondBlock =
314 IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
315 Builder.SetInsertPoint(InsertPt);
317 Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
319 Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
320 Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
322 // Create "else" block, fill it in the next iteration
323 BasicBlock *NewIfBlock =
324 CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
325 Builder.SetInsertPoint(InsertPt);
326 Instruction *OldBr = IfBlock->getTerminator();
327 BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
328 OldBr->eraseFromParent();
329 IfBlock = NewIfBlock;
331 CI->eraseFromParent();
334 // Translate a masked gather intrinsic like
335 // <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4,
336 // <16 x i1> %Mask, <16 x i32> %Src)
337 // to a chain of basic blocks, with loading element one-by-one if
338 // the appropriate mask bit is set
340 // %Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
341 // %Mask0 = extractelement <16 x i1> %Mask, i32 0
342 // br i1 %Mask0, label %cond.load, label %else
345 // %Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
346 // %Load0 = load i32, i32* %Ptr0, align 4
347 // %Res0 = insertelement <16 x i32> undef, i32 %Load0, i32 0
351 // %res.phi.else = phi <16 x i32>[%Res0, %cond.load], [undef, %0]
352 // %Mask1 = extractelement <16 x i1> %Mask, i32 1
353 // br i1 %Mask1, label %cond.load1, label %else2
356 // %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
357 // %Load1 = load i32, i32* %Ptr1, align 4
358 // %Res1 = insertelement <16 x i32> %res.phi.else, i32 %Load1, i32 1
361 // %Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
362 // ret <16 x i32> %Result
363 static void scalarizeMaskedGather(CallInst *CI) {
364 Value *Ptrs = CI->getArgOperand(0);
365 Value *Alignment = CI->getArgOperand(1);
366 Value *Mask = CI->getArgOperand(2);
367 Value *Src0 = CI->getArgOperand(3);
369 VectorType *VecType = cast<VectorType>(CI->getType());
371 IRBuilder<> Builder(CI->getContext());
372 Instruction *InsertPt = CI;
373 BasicBlock *IfBlock = CI->getParent();
374 Builder.SetInsertPoint(InsertPt);
375 unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
377 Builder.SetCurrentDebugLocation(CI->getDebugLoc());
380 Value *VResult = Src0;
381 unsigned VectorWidth = VecType->getNumElements();
383 // Shorten the way if the mask is a vector of constants.
384 if (isConstantIntVector(Mask)) {
385 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
386 if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
388 Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
391 Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx));
392 VResult = Builder.CreateInsertElement(
393 VResult, Load, Builder.getInt32(Idx), "Res" + Twine(Idx));
395 CI->replaceAllUsesWith(VResult);
396 CI->eraseFromParent();
400 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
401 // Fill the "else" block, created in the previous iteration
403 // %Mask1 = extractelement <16 x i1> %Mask, i32 1
404 // br i1 %Mask1, label %cond.load, label %else
407 Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx),
408 "Mask" + Twine(Idx));
410 // Create "cond" block
412 // %EltAddr = getelementptr i32* %1, i32 0
413 // %Elt = load i32* %EltAddr
414 // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
416 BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
417 Builder.SetInsertPoint(InsertPt);
419 Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
422 Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx));
423 Value *NewVResult = Builder.CreateInsertElement(VResult, Load,
424 Builder.getInt32(Idx),
427 // Create "else" block, fill it in the next iteration
428 BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
429 Builder.SetInsertPoint(InsertPt);
430 Instruction *OldBr = IfBlock->getTerminator();
431 BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
432 OldBr->eraseFromParent();
433 BasicBlock *PrevIfBlock = IfBlock;
434 IfBlock = NewIfBlock;
436 PHINode *Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
437 Phi->addIncoming(NewVResult, CondBlock);
438 Phi->addIncoming(VResult, PrevIfBlock);
442 CI->replaceAllUsesWith(VResult);
443 CI->eraseFromParent();
446 // Translate a masked scatter intrinsic, like
447 // void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4,
449 // to a chain of basic blocks, that stores element one-by-one if
450 // the appropriate mask bit is set.
452 // %Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
453 // %Mask0 = extractelement <16 x i1> %Mask, i32 0
454 // br i1 %Mask0, label %cond.store, label %else
457 // %Elt0 = extractelement <16 x i32> %Src, i32 0
458 // %Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
459 // store i32 %Elt0, i32* %Ptr0, align 4
463 // %Mask1 = extractelement <16 x i1> %Mask, i32 1
464 // br i1 %Mask1, label %cond.store1, label %else2
467 // %Elt1 = extractelement <16 x i32> %Src, i32 1
468 // %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
469 // store i32 %Elt1, i32* %Ptr1, align 4
472 static void scalarizeMaskedScatter(CallInst *CI) {
473 Value *Src = CI->getArgOperand(0);
474 Value *Ptrs = CI->getArgOperand(1);
475 Value *Alignment = CI->getArgOperand(2);
476 Value *Mask = CI->getArgOperand(3);
478 assert(isa<VectorType>(Src->getType()) &&
479 "Unexpected data type in masked scatter intrinsic");
480 assert(isa<VectorType>(Ptrs->getType()) &&
481 isa<PointerType>(Ptrs->getType()->getVectorElementType()) &&
482 "Vector of pointers is expected in masked scatter intrinsic");
484 IRBuilder<> Builder(CI->getContext());
485 Instruction *InsertPt = CI;
486 BasicBlock *IfBlock = CI->getParent();
487 Builder.SetInsertPoint(InsertPt);
488 Builder.SetCurrentDebugLocation(CI->getDebugLoc());
490 unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
491 unsigned VectorWidth = Src->getType()->getVectorNumElements();
493 // Shorten the way if the mask is a vector of constants.
494 if (isConstantIntVector(Mask)) {
495 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
496 if (cast<ConstantVector>(Mask)->getAggregateElement(Idx)->isNullValue())
498 Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
500 Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
502 Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
504 CI->eraseFromParent();
508 for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
509 // Fill the "else" block, created in the previous iteration
511 // %Mask1 = extractelement <16 x i1> %Mask, i32 Idx
512 // br i1 %Mask1, label %cond.store, label %else
514 Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx),
515 "Mask" + Twine(Idx));
517 // Create "cond" block
519 // %Elt1 = extractelement <16 x i32> %Src, i32 1
520 // %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
521 // %store i32 %Elt1, i32* %Ptr1
523 BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
524 Builder.SetInsertPoint(InsertPt);
526 Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
528 Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
530 Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
532 // Create "else" block, fill it in the next iteration
533 BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
534 Builder.SetInsertPoint(InsertPt);
535 Instruction *OldBr = IfBlock->getTerminator();
536 BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
537 OldBr->eraseFromParent();
538 IfBlock = NewIfBlock;
540 CI->eraseFromParent();
543 bool ScalarizeMaskedMemIntrin::runOnFunction(Function &F) {
544 bool EverMadeChange = false;
546 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
548 bool MadeChange = true;
551 for (Function::iterator I = F.begin(); I != F.end();) {
552 BasicBlock *BB = &*I++;
553 bool ModifiedDTOnIteration = false;
554 MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration);
556 // Restart BB iteration if the dominator tree of the Function was changed
557 if (ModifiedDTOnIteration)
561 EverMadeChange |= MadeChange;
564 return EverMadeChange;
567 bool ScalarizeMaskedMemIntrin::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
568 bool MadeChange = false;
570 BasicBlock::iterator CurInstIterator = BB.begin();
571 while (CurInstIterator != BB.end()) {
572 if (CallInst *CI = dyn_cast<CallInst>(&*CurInstIterator++))
573 MadeChange |= optimizeCallInst(CI, ModifiedDT);
581 bool ScalarizeMaskedMemIntrin::optimizeCallInst(CallInst *CI,
583 IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
585 switch (II->getIntrinsicID()) {
588 case Intrinsic::masked_load:
589 // Scalarize unsupported vector masked load
590 if (!TTI->isLegalMaskedLoad(CI->getType())) {
591 scalarizeMaskedLoad(CI);
596 case Intrinsic::masked_store:
597 if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) {
598 scalarizeMaskedStore(CI);
603 case Intrinsic::masked_gather:
604 if (!TTI->isLegalMaskedGather(CI->getType())) {
605 scalarizeMaskedGather(CI);
610 case Intrinsic::masked_scatter:
611 if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) {
612 scalarizeMaskedScatter(CI);