//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a DAG pattern matching instruction selector for X86,
// converting from a legalized DAG to an X86 DAG.
//
//===----------------------------------------------------------------------===//
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"

using namespace llvm;
#define DEBUG_TYPE "x86-isel"

STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
    cl::desc("Enable setting constant bits to reduce size of mask immediates"),
    cl::Hidden);
//===----------------------------------------------------------------------===//
//                      Pattern Matcher Implementation
//===----------------------------------------------------------------------===//

namespace {
  /// This corresponds to X86AddressMode, but uses SDValue's instead of register
  /// numbers for the leaves of the matched tree.
  struct X86ISelAddressMode {
    enum {
      RegBase,
      FrameIndexBase
    } BaseType;

    // This is really a union, discriminated by BaseType!
    SDValue Base_Reg;
    int Base_FrameIndex;

    unsigned Scale;
    SDValue IndexReg;
    int32_t Disp;
    SDValue Segment;
    const GlobalValue *GV;
    const Constant *CP;
    const BlockAddress *BlockAddr;
    const char *ES;
    MCSymbol *MCSym;
    int JT;
    unsigned Align;             // CP alignment.
    unsigned char SymbolFlags;  // X86II::MO_*
    bool NegateIndex = false;

    X86ISelAddressMode()
        : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
          Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
          MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}
    bool hasSymbolicDisplacement() const {
      return GV != nullptr || CP != nullptr || ES != nullptr ||
             MCSym != nullptr || JT != -1 || BlockAddr != nullptr;
    }

    bool hasBaseOrIndexReg() const {
      return BaseType == FrameIndexBase ||
             IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
    }

    /// Return true if this addressing mode is already RIP-relative.
    bool isRIPRelative() const {
      if (BaseType != RegBase) return false;
      if (RegisterSDNode *RegNode =
            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
        return RegNode->getReg() == X86::RIP;
      return false;
    }

    void setBaseReg(SDValue Reg) {
      BaseType = RegBase;
      Base_Reg = Reg;
    }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(SelectionDAG *DAG = nullptr) {
      dbgs() << "X86ISelAddressMode " << this << '\n';
      dbgs() << "Base_Reg ";
      if (Base_Reg.getNode())
        Base_Reg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      if (BaseType == FrameIndexBase)
        dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
      dbgs() << " Scale " << Scale << '\n'
             << "IndexReg ";
      if (IndexReg.getNode())
        IndexReg.getNode()->dump(DAG);
      else
        dbgs() << "nul\n";
      dbgs() << " Disp " << Disp << '\n';
      dbgs() << " JT" << JT << " Align" << Align << '\n';
    }
#endif
  };
} // end anonymous namespace
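// Illustrative note (not from the original source): an X86ISelAddressMode
// encodes the x86 memory operand [Base + Scale*Index + Disp]. For example,
// the memory operand of "movl 4(%rdi,%rcx,8), %eax" would match with
// Base_Reg = %rdi, Scale = 8, IndexReg = %rcx, and Disp = 4.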
//===--------------------------------------------------------------------===//
/// ISel - X86-specific code to select X86 machine instructions for
/// SelectionDAG operations.
///
namespace {
  class X86DAGToDAGISel final : public SelectionDAGISel {
    /// Keep a pointer to the X86Subtarget around so that we can
    /// make the right decision when generating code for different targets.
    const X86Subtarget *Subtarget;

    /// If true, selector should try to optimize for code size instead of
    /// performance.
    bool OptForSize;

    /// If true, selector should try to optimize for minimum code size.
    bool OptForMinSize;

    /// Disable direct TLS access through segment registers.
    bool IndirectTlsSegRefs;

  public:
    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
        : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
          OptForMinSize(false), IndirectTlsSegRefs(false) {}
    StringRef getPassName() const override {
      return "X86 DAG->DAG Instruction Selection";
    }

    bool runOnMachineFunction(MachineFunction &MF) override {
      // Reset the subtarget each time through.
      Subtarget = &MF.getSubtarget<X86Subtarget>();
      IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
                             "indirect-tls-seg-refs");

      // OptFor[Min]Size are used in pattern predicates that isel is matching.
      OptForSize = MF.getFunction().hasOptSize();
      OptForMinSize = MF.getFunction().hasMinSize();
      assert((!OptForMinSize || OptForSize) &&
             "OptForMinSize implies OptForSize");

      SelectionDAGISel::runOnMachineFunction(MF);
      return true;
    }
    void EmitFunctionEntryCode() override;

    bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;

    void PreprocessISelDAG() override;
    void PostprocessISelDAG() override;

// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"

  private:
    void Select(SDNode *N) override;

    bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
    bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
    bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
    bool matchAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
    bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
    bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                 unsigned Depth);
    bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
    bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                    SDValue &Scale, SDValue &Index, SDValue &Disp,
                    SDValue &Segment);
    bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
                          SDValue &Scale, SDValue &Index, SDValue &Disp,
                          SDValue &Segment);
    bool selectMOV64Imm32(SDValue N, SDValue &Imm);
    bool selectLEAAddr(SDValue N, SDValue &Base,
                       SDValue &Scale, SDValue &Index, SDValue &Disp,
                       SDValue &Segment);
    bool selectLEA64_32Addr(SDValue N, SDValue &Base,
                            SDValue &Scale, SDValue &Index, SDValue &Disp,
                            SDValue &Segment);
    bool selectTLSADDRAddr(SDValue N, SDValue &Base,
                           SDValue &Scale, SDValue &Index, SDValue &Disp,
                           SDValue &Segment);
    bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
                             SDValue &Base, SDValue &Scale,
                             SDValue &Index, SDValue &Disp,
                             SDValue &Segment,
                             SDValue &NodeWithChain);
    bool selectRelocImm(SDValue N, SDValue &Op);

    bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment);

    // Convenience method where P is also root.
    bool tryFoldLoad(SDNode *P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment) {
      return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
    }

    /// Implement addressing mode selection for inline asm expressions.
    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                      unsigned ConstraintID,
                                      std::vector<SDValue> &OutOps) override;

    void emitSpecialCodeForMain();
    inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
                                   MVT VT, SDValue &Base, SDValue &Scale,
                                   SDValue &Index, SDValue &Disp,
                                   SDValue &Segment) {
      if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
        Base = CurDAG->getTargetFrameIndex(
            AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
      else if (AM.Base_Reg.getNode())
        Base = AM.Base_Reg;
      else
        Base = CurDAG->getRegister(0, VT);

      Scale = getI8Imm(AM.Scale, DL);

      // Negate the index if needed.
      if (AM.NegateIndex) {
        unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r;
        SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
                                                     AM.IndexReg), 0);
        AM.IndexReg = Neg;
      }

      if (AM.IndexReg.getNode())
        Index = AM.IndexReg;
      else
        Index = CurDAG->getRegister(0, VT);

      // These are 32-bit even in 64-bit mode since RIP-relative offset
      // is 32-bit.
      if (AM.GV)
        Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
                                              MVT::i32, AM.Disp,
                                              AM.SymbolFlags);
      else if (AM.CP)
        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
                                             AM.Align, AM.Disp, AM.SymbolFlags);
      else if (AM.ES) {
        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
      } else if (AM.MCSym) {
        assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
        assert(AM.SymbolFlags == 0 && "Unexpected symbol flags with MCSym.");
        Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
      } else if (AM.JT != -1) {
        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
      } else if (AM.BlockAddr)
        Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
                                             AM.SymbolFlags);
      else
        Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);

      if (AM.Segment.getNode())
        Segment = AM.Segment;
      else
        Segment = CurDAG->getRegister(0, MVT::i16);
    }
    // Utility function to determine whether we should avoid selecting
    // immediate forms of instructions for better code size or not.
    // At a high level, we'd like to avoid such instructions when
    // we have similar constants used within the same basic block
    // that can be kept in a register.
    //
    bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
      uint32_t UseCount = 0;

      // Do not want to hoist if we're not optimizing for size.
      // TODO: We'd like to remove this restriction.
      // See the comment in X86InstrInfo.td for more info.
      if (!OptForSize)
        return false;

      // Walk all the users of the immediate.
      for (SDNode::use_iterator UI = N->use_begin(),
           UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) {
        SDNode *User = *UI;

        // This user is already selected. Count it as a legitimate use and
        // move on.
        if (User->isMachineOpcode()) {
          UseCount++;
          continue;
        }

        // We want to count stores of immediates as real uses.
        if (User->getOpcode() == ISD::STORE &&
            User->getOperand(1).getNode() == N) {
          UseCount++;
          continue;
        }

        // We don't currently match users that have > 2 operands (except
        // for stores, which are handled above).
        // Those instructions won't match in ISEL, for now, and would
        // be counted incorrectly.
        // This may change in the future as we add additional instruction
        // types.
        if (User->getNumOperands() != 2)
          continue;

        // Immediates that are used for offsets as part of stack
        // manipulation should be left alone. These are typically
        // used to indicate SP offsets for argument passing and
        // will get pulled into stores/pushes (implicitly).
        if (User->getOpcode() == X86ISD::ADD ||
            User->getOpcode() == ISD::ADD ||
            User->getOpcode() == X86ISD::SUB ||
            User->getOpcode() == ISD::SUB) {

          // Find the other operand of the add/sub.
          SDValue OtherOp = User->getOperand(0);
          if (OtherOp.getNode() == N)
            OtherOp = User->getOperand(1);

          // Don't count if the other operand is SP.
          RegisterSDNode *RegNode;
          if (OtherOp->getOpcode() == ISD::CopyFromReg &&
              (RegNode = dyn_cast_or_null<RegisterSDNode>(
                 OtherOp->getOperand(1).getNode())))
            if ((RegNode->getReg() == X86::ESP) ||
                (RegNode->getReg() == X86::RSP))
              continue;
        }

        // ... otherwise, count this and move on.
        UseCount++;
      }

      // If we have more than 1 use, then recommend for hoisting.
      return (UseCount > 1);
    }
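    // Illustrative example (not from the original source): with two users
    // such as
    //   addl $12345678, %eax
    //   addl $12345678, %ebx
    // hoisting the constant gives
    //   movl $12345678, %ecx
    //   addl %ecx, %eax
    //   addl %ecx, %ebx
    // which materializes the 4-byte immediate once instead of twice.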
    /// Return a target constant with the specified value of type i8.
    inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
    }

    /// Return a target constant with the specified value, of type i32.
    inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
    }

    /// Return a target constant with the specified value, of type i64.
    inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
      return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
    }

    SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
                                        const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(1);
      MVT VecVT = N->getOperand(0).getSimpleValueType();
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }

    SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
                                      const SDLoc &DL) {
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
      uint64_t Index = N->getConstantOperandVal(2);
      MVT VecVT = N->getSimpleValueType(0);
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
    }
    // Helper to detect unneeded and instructions on shift amounts. Called
    // from PatFrags in tablegen.
    bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
      assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
      const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();

      if (Val.countTrailingOnes() >= Width)
        return true;

      APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
      return Mask.countTrailingOnes() >= Width;
    }
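    // Illustrative example (not from the original source): x86 32-bit shifts
    // only read the low 5 bits of the shift amount, so with Width == 5 the
    // AND in
    //   (shl %x, (and %amt, 31))
    // has at least 5 trailing ones and can be dropped. A wider mask can also
    // be dropped when known-zero bits of the operand cover bits 5 and above.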
    /// Return an SDNode that returns the value of the global base register.
    /// Output instructions required to initialize the global base register,
    /// if necessary.
    SDNode *getGlobalBaseReg();

    /// Return a reference to the TargetMachine, casted to the target-specific
    /// type.
    const X86TargetMachine &getTargetMachine() const {
      return static_cast<const X86TargetMachine &>(TM);
    }

    /// Return a reference to the TargetInstrInfo, casted to the target-specific
    /// type.
    const X86InstrInfo *getInstrInfo() const {
      return Subtarget->getInstrInfo();
    }

    /// Address-mode matching performs shift-of-and to and-of-shift
    /// reassociation in order to expose more scaled addressing
    /// opportunities.
    bool ComplexPatternFuncMutatesDAG() const override {
      return true;
    }

    bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;

    /// Returns whether this is a relocatable immediate in the range
    /// [-2^Width .. 2^Width-1].
    template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
      if (auto *CN = dyn_cast<ConstantSDNode>(N))
        return isInt<Width>(CN->getSExtValue());
      return isSExtAbsoluteSymbolRef(Width, N);
    }
    // Indicates we should prefer to use a non-temporal load for this load.
    bool useNonTemporalLoad(LoadSDNode *N) const {
      if (!N->isNonTemporal())
        return false;

      unsigned StoreSize = N->getMemoryVT().getStoreSize();

      if (N->getAlignment() < StoreSize)
        return false;

      switch (StoreSize) {
      default: llvm_unreachable("Unsupported store size");
      case 4:
      case 8:
        return false;
      case 16:
        return Subtarget->hasSSE41();
      case 32:
        return Subtarget->hasAVX2();
      case 64:
        return Subtarget->hasAVX512();
      }
    }
    bool foldLoadStoreIntoMemOperand(SDNode *Node);
    MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
    bool matchBitExtract(SDNode *Node);
    bool shrinkAndImmediate(SDNode *N);
    bool isMaskZeroExtended(SDNode *N) const;
    bool tryShiftAmountMod(SDNode *N);
    bool tryShrinkShlLogicImm(SDNode *N);
    bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);

    MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node);
    MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                const SDLoc &dl, MVT VT, SDNode *Node,
                                SDValue &InFlag);

    bool tryOptimizeRem8Extend(SDNode *N);

    bool onlyUsesZeroFlag(SDValue Flags) const;
    bool hasNoSignFlagUses(SDValue Flags) const;
    bool hasNoCarryFlagUses(SDValue Flags) const;
  };
} // end anonymous namespace
// Returns true if this masked compare can be implemented legally with this
// type.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC ||
      Opcode == X86ISD::CMPM_SAE || Opcode == X86ISD::VFPCLASS) {
    // We can get 256-bit 8 element types here without VLX being enabled. When
    // this happens we will use 512-bit operations and the mask will not be
    // the correct size.
    EVT OpVT = N->getOperand(0).getValueType();
    if (OpVT.is256BitVector() || OpVT.is128BitVector())
      return Subtarget->hasVLX();

    return true;
  }
  // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
  if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
      Opcode == X86ISD::FSETCCM_SAE)
    return true;

  return false;
}
// Returns true if we can assume the writer of the mask has zero extended it
// for us.
bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
  // If this is an AND, check if we have a compare on either side. As long as
  // one side guarantees the mask is zero extended, the AND will preserve those
  // zeros.
  if (N->getOpcode() == ISD::AND)
    return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
           isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);

  return isLegalMaskCompare(N, Subtarget);
}
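// Illustrative example (not from the original source): for
//   (and (setcc v8i32 ...), (setcc v8i32 ...))
// each AVX-512 compare writes a mask register whose bits above the element
// count are zeroed by the hardware, so ANDing the two masks cannot introduce
// stray set bits in the upper portion of the result.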
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
  if (OptLevel == CodeGenOpt::None) return false;

  if (!N.hasOneUse())
    return false;

  if (N.getOpcode() != ISD::LOAD)
    return true;

  // Don't fold non-temporal loads if we have an instruction for them.
  if (useNonTemporalLoad(cast<LoadSDNode>(N)))
    return false;

  // If N is a load, do additional profitability checks.
  if (U == Root) {
    switch (U->getOpcode()) {
    default: break;
    case X86ISD::ADD:
    case X86ISD::SUB:
    case X86ISD::AND:
    case X86ISD::XOR:
    case X86ISD::OR:
    case ISD::ADD:
    case ISD::ADDCARRY:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR: {
      SDValue Op1 = U->getOperand(1);

      // If the other operand is an 8-bit immediate we should fold the
      // immediate instead. This reduces code size. e.g.
      //   movl 4(%esp), %eax
      //   addl $4, %eax
      // vs.
      //   movl $4, %eax
      //   addl 4(%esp), %eax
      // The former is 2 bytes shorter. In case where the increment is 1, then
      // the saving can be 4 bytes (by using incl %eax).
      if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
        if (Imm->getAPIntValue().isSignedIntN(8))
          return false;

        // If this is a 64-bit AND with an immediate that fits in 32-bits,
        // prefer using the smaller and over folding the load. This is needed to
        // make sure immediates created by shrinkAndImmediate are always folded.
        // Ideally we would narrow the load during DAG combine and get the
        // best of both worlds.
        if (U->getOpcode() == ISD::AND &&
            Imm->getAPIntValue().getBitWidth() == 64 &&
            Imm->getAPIntValue().isIntN(32))
          return false;

        // If this really is a zext_inreg that can be represented with a movzx
        // instruction, prefer that.
        // TODO: We could shrink the load and fold if it is non-volatile.
        if (U->getOpcode() == ISD::AND &&
            (Imm->getAPIntValue() == UINT8_MAX ||
             Imm->getAPIntValue() == UINT16_MAX ||
             Imm->getAPIntValue() == UINT32_MAX))
          return false;

        // For ADD/SUB we can negate the immediate and use the opposite
        // operation to fit 128 into a sign-extended 8-bit immediate.
        if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
            (-Imm->getAPIntValue()).isSignedIntN(8))
          return false;
      }

      // If the other operand is a TLS address, we should fold it instead.
      // This produces
      //   movl %gs:0, %eax
      //   leal i@NTPOFF(%eax), %eax
      // instead of
      //   movl $i@NTPOFF, %eax
      //   movl %gs:0, %ecx
      //   addl %ecx, %eax
      // if the block also has an access to a second TLS address this will save
      // a load.
      // FIXME: This is probably also true for non-TLS addresses.
      if (Op1.getOpcode() == X86ISD::Wrapper) {
        SDValue Val = Op1.getOperand(0);
        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
          return false;
      }

      // Don't fold load if this matches the BTS/BTR/BTC patterns.
      // BTS: (or X, (shl 1, n))
      // BTR: (and X, (rotl -2, n))
      // BTC: (xor X, (shl 1, n))
      if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
        if (U->getOperand(0).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(0).getOperand(0)))
          return false;

        if (U->getOperand(1).getOpcode() == ISD::SHL &&
            isOneConstant(U->getOperand(1).getOperand(0)))
          return false;
      }
      if (U->getOpcode() == ISD::AND) {
        SDValue U0 = U->getOperand(0);
        SDValue U1 = U->getOperand(1);
        if (U0.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
          if (C && C->getSExtValue() == -2)
            return false;
        }

        if (U1.getOpcode() == ISD::ROTL) {
          auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
          if (C && C->getSExtValue() == -2)
            return false;
        }
      }

      break;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL:
      // Don't fold a load into a shift by immediate. The BMI2 instructions
      // support folding a load, but not an immediate. The legacy instructions
      // support folding an immediate, but can't fold a load. Folding an
      // immediate is preferable to folding a load.
      if (isa<ConstantSDNode>(U->getOperand(1)))
        return false;

      break;
    }
  }

  // Prevent folding a load if this can be implemented with an insert_subreg or
  // a move that implicitly zeroes.
  if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
      isNullConstant(Root->getOperand(2)) &&
      (Root->getOperand(0).isUndef() ||
       ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
    return false;

  return true;
}
/// Replace the original chain operand of the call with
/// load's chain operand and move load below the call's chain operand.
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
                               SDValue Call, SDValue OrigChain) {
  SmallVector<SDValue, 8> Ops;
  SDValue Chain = OrigChain.getOperand(0);
  if (Chain.getNode() == Load.getNode())
    Ops.push_back(Load.getOperand(0));
  else {
    assert(Chain.getOpcode() == ISD::TokenFactor &&
           "Unexpected chain operand");
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
      if (Chain.getOperand(i).getNode() == Load.getNode())
        Ops.push_back(Load.getOperand(0));
      else
        Ops.push_back(Chain.getOperand(i));
    SDValue NewChain =
        CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
    Ops.clear();
    Ops.push_back(NewChain);
  }
  Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
                             Load.getOperand(1), Load.getOperand(2));

  Ops.clear();
  Ops.push_back(SDValue(Load.getNode(), 1));
  Ops.append(Call->op_begin() + 1, Call->op_end());
  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
}
/// Return true if call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
/// Return the CALLSEQ_START by reference as a second output.
/// In the case of a tail call, there isn't a callseq node between the call
/// chain and the load.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
  // The transformation is somewhat dangerous if the call's chain was glued to
  // the call. After MoveBelowOrigChain the load is moved between the call and
  // the chain, this can create a cycle if the load is not folded. So it is
  // *really* important that we are sure the load will be folded.
  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
    return false;
  LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
  if (!LD ||
      LD->getAddressingMode() != ISD::UNINDEXED ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return false;

  // Now let's find the callseq_start.
  while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
    if (!Chain.hasOneUse())
      return false;
    Chain = Chain.getOperand(0);
  }

  if (!Chain.getNumOperands())
    return false;
  // Since we are not checking for AA here, conservatively abort if the chain
  // writes to memory. It's not safe to move the callee (a load) across a store.
  if (isa<MemSDNode>(Chain.getNode()) &&
      cast<MemSDNode>(Chain.getNode())->writeMem())
    return false;
  if (Chain.getOperand(0).getNode() == Callee.getNode())
    return true;
  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
      Callee.getValue(1).hasOneUse())
    return true;
  return false;
}
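// Illustrative example (not from the original source): for an indirect call
//   %f = load (%rdi)
//   callq *%f
// moving the load below CALLSEQ_START lets isel fold it into the call as
//   callq *(%rdi)
// which is only safe because the checks above ensure the fold will happen.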
void X86DAGToDAGISel::PreprocessISelDAG() {
  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
       E = CurDAG->allnodes_end(); I != E; ) {
    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.

    // If this is a target specific AND node with no flag usages, turn it back
    // into ISD::AND to enable test instruction matching.
    if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
      SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }

    switch (N->getOpcode()) {
    case ISD::FP_TO_SINT:
    case ISD::FP_TO_UINT: {
      // Replace vector fp_to_s/uint with their X86 specific equivalent so we
      // don't need 2 sets of patterns.
      if (!N->getSimpleValueType(0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
      case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
      }
      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL: {
      // Replace vector shifts with their X86 specific equivalent so we don't
      // need 2 sets of patterns.
      if (!N->getValueType(0).isVector())
        break;

      unsigned NewOpc;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::SHL: NewOpc = X86ISD::VSHLV; break;
      case ISD::SRA: NewOpc = X86ISD::VSRAV; break;
      case ISD::SRL: NewOpc = X86ISD::VSRLV; break;
      }
      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0), N->getOperand(1));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case ISD::ANY_EXTEND:
    case ISD::ANY_EXTEND_VECTOR_INREG: {
      // Replace vector any extend with the zero extend equivalents so we don't
      // need 2 sets of patterns. Ignore vXi1 extensions.
      if (!N->getValueType(0).isVector() ||
          N->getOperand(0).getScalarValueSizeInBits() == 1)
        break;

      unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND
                            ? ISD::ZERO_EXTEND
                            : ISD::ZERO_EXTEND_VECTOR_INREG;

      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
                                    N->getOperand(0));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case ISD::FCEIL:
    case ISD::FFLOOR:
    case ISD::FTRUNC:
    case ISD::FNEARBYINT:
    case ISD::FRINT: {
      // Replace fp rounding with their X86 specific equivalent so we don't
      // need 2 sets of patterns.
      unsigned Imm;
      switch (N->getOpcode()) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::FCEIL: Imm = 0xA; break;
      case ISD::FFLOOR: Imm = 0x9; break;
      case ISD::FTRUNC: Imm = 0xB; break;
      case ISD::FNEARBYINT: Imm = 0xC; break;
      case ISD::FRINT: Imm = 0x4; break;
      }
      SDLoc dl(N);
      SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
                                    N->getValueType(0),
                                    N->getOperand(0),
                                    CurDAG->getConstant(Imm, dl, MVT::i8));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    case X86ISD::FANDN:
    case X86ISD::FAND:
    case X86ISD::FOR:
    case X86ISD::FXOR: {
      // Widen scalar fp logic ops to vector to reduce isel patterns.
      // FIXME: Can we do this during lowering/combine?
      MVT VT = N->getSimpleValueType(0);
      if (VT.isVector() || VT == MVT::f128)
        break;

      MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32;
      SDLoc dl(N);
      SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
                                    N->getOperand(0));
      SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
                                    N->getOperand(1));

      SDValue Res;
      if (Subtarget->hasSSE2()) {
        EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
        Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
        Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
        unsigned Opc;
        switch (N->getOpcode()) {
        default: llvm_unreachable("Unexpected opcode!");
        case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
        case X86ISD::FAND: Opc = ISD::AND; break;
        case X86ISD::FOR: Opc = ISD::OR; break;
        case X86ISD::FXOR: Opc = ISD::XOR; break;
        }
        Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
        Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
      } else {
        Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
      }
      Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
                            CurDAG->getIntPtrConstant(0, dl));
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
      ++I;
      CurDAG->DeleteNode(N);
      continue;
    }
    }

    if (OptLevel != CodeGenOpt::None &&
        // Only do this when the target can fold the load into the call or
        // jmp.
        !Subtarget->useRetpolineIndirectCalls() &&
        ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
         (N->getOpcode() == X86ISD::TC_RETURN &&
          (Subtarget->is64Bit() ||
           !getTargetMachine().isPositionIndependent())))) {
      /// Also try moving call address load from outside callseq_start to just
      /// before the call to allow it to be folded.
      bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
      SDValue Chain = N->getOperand(0);
      SDValue Load = N->getOperand(1);
      if (!isCalleeLoad(Load, Chain, HasCallSeq))
        continue;
      moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
      ++NumLoadMoved;
      continue;
    }
    // Lower fpround and fpextend nodes that target the FP stack to be store and
    // load to the stack. This is a gross hack. We would like to simply mark
    // these as being illegal, but when we do that, legalize produces these when
    // it expands calls, then expands these in the same legalize pass. We would
    // like dag combine to be able to hack on these between the call expansion
    // and the node legalization. As such this pass basically does "really
    // late" legalization of these inline with the X86 isel pass.
    // FIXME: This should only happen when not compiled with -O0.
    switch (N->getOpcode()) {
    default: continue;
    case ISD::FP_ROUND:
    case ISD::FP_EXTEND: {
      MVT SrcVT = N->getOperand(0).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())
        continue;

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
      if (SrcIsSSE && DstIsSSE)
        continue;

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::FP_EXTEND)
          continue;
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(1))
          continue;
      }

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore. SSE can fold direct loads into other
      // operations. Based on this, decide what we want to do.
      MVT MemVT;
      if (N->getOpcode() == ISD::FP_ROUND)
        MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
      else
        MemVT = SrcIsSSE ? SrcVT : DstVT;

      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
      SDLoc dl(N);

      // FIXME: optimize the case where the src/dest is a load or store?

      SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
                                            MemTmp, MachinePointerInfo(), MemVT);
      SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
                                          MachinePointerInfo(), MemVT);

      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havok on the dag because
      // anything below the conversion could be folded into other existing nodes.
      // To avoid invalidating 'I', back it up to the convert node.
      --I;
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
      break;
    }
    // The sequence of events for lowering STRICT_FP versions of these nodes
    // requires dealing with the chain differently, as there is already a
    // preexisting chain.
    case ISD::STRICT_FP_ROUND:
    case ISD::STRICT_FP_EXTEND: {
      MVT SrcVT = N->getOperand(1).getSimpleValueType();
      MVT DstVT = N->getSimpleValueType(0);

      // If any of the sources are vectors, no fp stack involved.
      if (SrcVT.isVector() || DstVT.isVector())
        continue;

      // If the source and destination are SSE registers, then this is a legal
      // conversion that should not be lowered.
      const X86TargetLowering *X86Lowering =
          static_cast<const X86TargetLowering *>(TLI);
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
      if (SrcIsSSE && DstIsSSE)
        continue;

      if (!SrcIsSSE && !DstIsSSE) {
        // If this is an FPStack extension, it is a noop.
        if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
          continue;
        // If this is a value-preserving FPStack truncation, it is a noop.
        if (N->getConstantOperandVal(2))
          continue;
      }

      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
      // FPStack has extload and truncstore. SSE can fold direct loads into other
      // operations. Based on this, decide what we want to do.
      MVT MemVT;
      if (N->getOpcode() == ISD::STRICT_FP_ROUND)
        MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
      else
        MemVT = SrcIsSSE ? SrcVT : DstVT;

      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
      SDLoc dl(N);

      // FIXME: optimize the case where the src/dest is a load or store?

      // Since the operation is StrictFP, use the preexisting chain.
      SDValue Store = CurDAG->getTruncStore(N->getOperand(0), dl, N->getOperand(1),
                                            MemTmp, MachinePointerInfo(), MemVT);
      SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
                                          MachinePointerInfo(), MemVT);

      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
      // extload we created. This will cause general havok on the dag because
      // anything below the conversion could be folded into other existing nodes.
      // To avoid invalidating 'I', back it up to the convert node.
      --I;
      CurDAG->ReplaceAllUsesWith(N, Result.getNode());
      break;
    }
    }

    // Now that we did that, the node is dead. Increment the iterator to the
    // next node to process, then delete N.
    ++I;
    CurDAG->DeleteNode(N);
  }
  // The load+call transform above can leave some dead nodes in the graph. Make
  // sure we remove them. It's possible some of the other transforms do too, so
  // just remove dead nodes unconditionally.
  CurDAG->RemoveDeadNodes();
}
// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
  unsigned Opc = N->getMachineOpcode();
  if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 &&
      Opc != X86::MOVSX64rr8)
    return false;

  SDValue N0 = N->getOperand(0);

  // We need to be extracting the lower byte of an extend.
  if (!N0.isMachineOpcode() ||
      N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG ||
      N0.getConstantOperandVal(1) != X86::sub_8bit)
    return false;

  // We're looking for either a movsx or movzx to match the original opcode.
  unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX
                                                : X86::MOVSX32rr8_NOREX;
  SDValue N00 = N0.getOperand(0);
  if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc)
    return false;

  if (Opc == X86::MOVSX64rr8) {
    // If we had a sign extend from 8 to 64 bits, we still need to go from 32
    // to 64 bits.
    MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
                                                   MVT::i64, N00);
    ReplaceUses(N, Extend);
  } else {
    // Ok we can drop this extend and just use the original extend.
    ReplaceUses(N, N00.getNode());
  }

  return true;
}
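// Illustrative example (not from the original source): the 8-bit divrem
// lowering copies AH out through a MOV*X32rr8_NOREX, so a pattern like
//   (MOVZX32rr8 (extract_subreg (MOVZX32rr8_NOREX ...), sub_8bit))
// contains a second zero-extend that is redundant; the NOREX extend already
// produced the full 32-bit value, and the peephole reuses it directly.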
void X86DAGToDAGISel::PostprocessISelDAG() {
  // Skip peepholes at -O0.
  if (TM.getOptLevel() == CodeGenOpt::None)
    return;

  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

  bool MadeChange = false;
  while (Position != CurDAG->allnodes_begin()) {
    SDNode *N = &*--Position;
    // Skip dead nodes and any non-machine opcodes.
    if (N->use_empty() || !N->isMachineOpcode())
      continue;

    if (tryOptimizeRem8Extend(N)) {
      MadeChange = true;
      continue;
    }

    // Look for a TESTrr+ANDrr pattern where both operands of the test are
    // the same. Rewrite to remove the AND.
    unsigned Opc = N->getMachineOpcode();
    if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr ||
         Opc == X86::TEST32rr || Opc == X86::TEST64rr) &&
        N->getOperand(0) == N->getOperand(1) &&
        N->isOnlyUserOf(N->getOperand(0).getNode()) &&
        N->getOperand(0).isMachineOpcode()) {
      SDValue And = N->getOperand(0);
      unsigned N0Opc = And.getMachineOpcode();
      if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr ||
          N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) {
        MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
                                                     MVT::i32,
                                                     And.getOperand(0),
                                                     And.getOperand(1));
        ReplaceUses(N, Test);
        MadeChange = true;
        continue;
      }
      if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm ||
          N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) {
        unsigned NewOpc;
        switch (N0Opc) {
        case X86::AND8rm:  NewOpc = X86::TEST8mr;  break;
        case X86::AND16rm: NewOpc = X86::TEST16mr; break;
        case X86::AND32rm: NewOpc = X86::TEST32mr; break;
        case X86::AND64rm: NewOpc = X86::TEST64mr; break;
        }
        // Need to swap the memory and register operand.
        SDValue Ops[] = { And.getOperand(1),
                          And.getOperand(2),
                          And.getOperand(3),
                          And.getOperand(4),
                          And.getOperand(5),
                          And.getOperand(0),
                          And.getOperand(6) /* Chain */ };
        MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
                                                     MVT::i32, MVT::Other, Ops);
        ReplaceUses(N, Test);
        MadeChange = true;
        continue;
      }
    }
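    // Illustrative example (not from the original source): this peephole
    // rewrites
    //   andl %esi, %edi
    //   testl %edi, %edi
    // into a single
    //   testl %esi, %edi
    // since TEST sets the same flags the AND would, and the AND result has
    // no other users.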
    // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
    // used. We're doing this late so we can prefer to fold the AND into masked
    // comparisons. Doing that can be better for the live range of the mask
    // register.
    if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr ||
         Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) &&
        N->getOperand(0) == N->getOperand(1) &&
        N->isOnlyUserOf(N->getOperand(0).getNode()) &&
        N->getOperand(0).isMachineOpcode() &&
        onlyUsesZeroFlag(SDValue(N, 0))) {
      SDValue And = N->getOperand(0);
      unsigned N0Opc = And.getMachineOpcode();
      // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
      // KAND instructions and KTEST use the same ISA feature.
      if (N0Opc == X86::KANDBrr ||
          (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) ||
          N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) {
        unsigned NewOpc;
        switch (Opc) {
        default: llvm_unreachable("Unexpected opcode!");
        case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break;
        case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break;
        case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break;
        case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break;
        }
        MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
                                                      MVT::i32,
                                                      And.getOperand(0),
                                                      And.getOperand(1));
        ReplaceUses(N, KTest);
        MadeChange = true;
        continue;
      }
    }
    // Attempt to remove vector moves that were inserted to zero upper bits.
    if (Opc != TargetOpcode::SUBREG_TO_REG)
      continue;

    unsigned SubRegIdx = N->getConstantOperandVal(2);
    if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
      continue;

    SDValue Move = N->getOperand(1);
    if (!Move.isMachineOpcode())
      continue;

    // Make sure it's one of the move opcodes we recognize.
    switch (Move.getMachineOpcode()) {
    default:
      continue;
    case X86::VMOVAPDrr:       case X86::VMOVUPDrr:
    case X86::VMOVAPSrr:       case X86::VMOVUPSrr:
    case X86::VMOVDQArr:       case X86::VMOVDQUrr:
    case X86::VMOVAPDYrr:      case X86::VMOVUPDYrr:
    case X86::VMOVAPSYrr:      case X86::VMOVUPSYrr:
    case X86::VMOVDQAYrr:      case X86::VMOVDQUYrr:
    case X86::VMOVAPDZ128rr:   case X86::VMOVUPDZ128rr:
    case X86::VMOVAPSZ128rr:   case X86::VMOVUPSZ128rr:
    case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
    case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
    case X86::VMOVAPDZ256rr:   case X86::VMOVUPDZ256rr:
    case X86::VMOVAPSZ256rr:   case X86::VMOVUPSZ256rr:
    case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
    case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
      break;
    }

    SDValue In = Move.getOperand(0);
    if (!In.isMachineOpcode() ||
        In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
      continue;

    // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
    // the SHA instructions which use a legacy encoding.
    uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
    if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
        (TSFlags & X86II::EncodingMask) != X86II::EVEX &&
        (TSFlags & X86II::EncodingMask) != X86II::XOP)
      continue;

    // Producing instruction is another vector instruction. We can drop the
    // move.
    CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
    MadeChange = true;
  }

  if (MadeChange)
    CurDAG->RemoveDeadNodes();
}
/// Emit any code that needs to be executed only in the main function.
void X86DAGToDAGISel::emitSpecialCodeForMain() {
  if (Subtarget->isTargetCygMing()) {
    TargetLowering::ArgListTy Args;
    auto &DL = CurDAG->getDataLayout();

    TargetLowering::CallLoweringInfo CLI(*CurDAG);
    CLI.setChain(CurDAG->getRoot())
        .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
                   CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
                   std::move(Args));
    const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
    std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
    CurDAG->setRoot(Result.second);
  }
}

void X86DAGToDAGISel::EmitFunctionEntryCode() {
  // If this is main, emit special code for main.
  const Function &F = MF->getFunction();
  if (F.hasExternalLinkage() && F.getName() == "main")
    emitSpecialCodeForMain();
}
static bool isDispSafeForFrameIndex(int64_t Val) {
  // On 64-bit platforms, we can run into an issue where a frame index
  // includes a displacement that, when added to the explicit displacement,
  // will overflow the displacement field. Assuming that the frame index
  // displacement fits into a 31-bit integer (which is only slightly more
  // aggressive than the current fundamental assumption that it fits into
  // a 32-bit integer), a 31-bit disp should always be safe.
  return isInt<31>(Val);
}
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
                                            X86ISelAddressMode &AM) {
  // If there's no offset to fold, we don't need to do any work.
  if (Offset == 0)
    return false;

  // Cannot combine ExternalSymbol displacements with integer offsets.
  if (AM.ES || AM.MCSym)
    return true;

  int64_t Val = AM.Disp + Offset;
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit()) {
    if (!X86::isOffsetSuitableForCodeModel(Val, M,
                                           AM.hasSymbolicDisplacement()))
      return true;
    // In addition to the checks required for a register base, check that
    // we do not try to use an unsafe Disp with a frame index.
    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
        !isDispSafeForFrameIndex(Val))
      return true;
  }
  AM.Disp = Val;
  return false;
}
bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
  SDValue Address = N->getOperand(1);

  // load gs:0 -> GS segment register.
  // load fs:0 -> FS segment register.
  //
  // This optimization is valid because the GNU TLS model defines that
  // gs:0 (or fs:0 on X86-64) contains its own address.
  // For more information see http://people.redhat.com/drepper/tls.pdf
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
    if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
        !IndirectTlsSegRefs &&
        (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
         Subtarget->isTargetFuchsia()))
      switch (N->getPointerInfo().getAddrSpace()) {
      case 256:
        AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
        return false;
      case 257:
        AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
        return false;
      // Address space 258 is not handled here, because it is not used to
      // address TLS areas.
      }

  return true;
}
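// Illustrative example (not from the original source): a zero-displacement
// load from address space 256 selects to
//   movl %gs:0, %eax
// i.e. the segment register carries the TLS block's self-pointer, and no
// base or index register is needed in the addressing mode.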
/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
/// mode. These wrap things that will resolve down into a symbol reference.
/// If no match is possible, this returns true, otherwise it returns false.
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
  // If the addressing mode already has a symbol as the displacement, we can
  // never match another symbol.
  if (AM.hasSymbolicDisplacement())
    return true;

  bool IsRIPRelTLS = false;
  bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
  if (IsRIPRel) {
    SDValue Val = N.getOperand(0);
    if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
      IsRIPRelTLS = true;
  }

  // We can't use an addressing mode in the 64-bit large code model.
  // Global TLS addressing is an exception. In the medium code model,
  // we can use such a mode when RIP wrappers are present.
  // That signifies access to globals that are known to be "near",
  // such as the GOT itself.
  CodeModel::Model M = TM.getCodeModel();
  if (Subtarget->is64Bit() &&
      ((M == CodeModel::Large && !IsRIPRelTLS) ||
       (M == CodeModel::Medium && !IsRIPRel)))
    return true;

  // Base and index reg must be 0 in order to use %rip as base.
  if (IsRIPRel && AM.hasBaseOrIndexReg())
    return true;

  // Make a local copy in case we can't do this fold.
  X86ISelAddressMode Backup = AM;

  int64_t Offset = 0;
  SDValue N0 = N.getOperand(0);
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
    AM.GV = G->getGlobal();
    AM.SymbolFlags = G->getTargetFlags();
    Offset = G->getOffset();
  } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
    AM.CP = CP->getConstVal();
    AM.Align = CP->getAlignment();
    AM.SymbolFlags = CP->getTargetFlags();
    Offset = CP->getOffset();
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
    AM.ES = S->getSymbol();
    AM.SymbolFlags = S->getTargetFlags();
  } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
    AM.MCSym = S->getMCSymbol();
  } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
    AM.JT = J->getIndex();
    AM.SymbolFlags = J->getTargetFlags();
  } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
    AM.BlockAddr = BA->getBlockAddress();
    AM.SymbolFlags = BA->getTargetFlags();
    Offset = BA->getOffset();
  } else
    llvm_unreachable("Unhandled symbol reference node.");

  if (foldOffsetIntoAddress(Offset, AM)) {
    AM = Backup;
    return true;
  }

  if (IsRIPRel)
    AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));

  // Commit the changes now that we know this fold is safe.
  return false;
}
/// Add the specified node to the specified addressing mode, returning true if
/// it cannot be done. This just pattern matches for the addressing mode.
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
  if (matchAddressRecursively(N, AM, 0))
    return true;

  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
  // a smaller encoding and avoids a scaled-index.
  if (AM.Scale == 2 &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base_Reg.getNode() == nullptr) {
    AM.Base_Reg = AM.IndexReg;
    AM.Scale = 1;
  }

  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
  // because it has a smaller encoding.
  // TODO: Which other code models can use this?
  switch (TM.getCodeModel()) {
  default: break;
  case CodeModel::Small:
  case CodeModel::Kernel:
    if (Subtarget->is64Bit() &&
        AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base_Reg.getNode() == nullptr &&
        AM.IndexReg.getNode() == nullptr &&
        AM.SymbolFlags == X86II::MO_NO_FLAG &&
        AM.hasSymbolicDisplacement())
      AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
    break;
  }

  return false;
}
bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
                               unsigned Depth) {
  // Add an artificial use to this node so that we can keep track of
  // it if it gets CSE'd with a different node.
  HandleSDNode Handle(N);

  X86ISelAddressMode Backup = AM;
  if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
      !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1))
    return false;
  AM = Backup;

  // Try again after commuting the operands.
  if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
      !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
    return false;
  AM = Backup;

  // If we couldn't fold both operands into the address at the same time,
  // see if we can just put each operand into a register and fold at least
  // the add.
  if (AM.BaseType == X86ISelAddressMode::RegBase &&
      !AM.Base_Reg.getNode() &&
      !AM.IndexReg.getNode()) {
    N = Handle.getValue();
    AM.Base_Reg = N.getOperand(0);
    AM.IndexReg = N.getOperand(1);
    return false;
  }
  N = Handle.getValue();
  return true;
}
// Insert a node into the DAG at least before the Pos node's position. This
// will reposition the node as needed, and will assign it a node ID that is <=
// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
// IDs! The selection DAG must no longer depend on their uniqueness when this
// function is used.
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
  if (N->getNodeId() == -1 ||
      (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
       SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
    DAG.RepositionNode(Pos->getIterator(), N.getNode());
    // Mark Node as invalid for pruning as after this it may be a successor to a
    // selected node but otherwise be in the same position of Pos.
    // Conservatively mark it with the same -abs(Id) to assure node id
    // invariant is preserved.
    N->setNodeId(Pos->getNodeId());
    SelectionDAGISel::InvalidateNodeId(N.getNode());
  }
}
// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
// safe. This allows us to convert the shift and and into an h-register
// extract and a scaled index. Returns false if the simplification is
// performed.
static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
                                      uint64_t Mask,
                                      SDValue Shift, SDValue X,
                                      X86ISelAddressMode &AM) {
  if (Shift.getOpcode() != ISD::SRL ||
      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
      !Shift.hasOneUse())
    return true;

  int ScaleLog = 8 - Shift.getConstantOperandVal(1);
  if (ScaleLog <= 0 || ScaleLog >= 4 ||
      Mask != (0xffu << ScaleLog))
    return true;

  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
  SDValue NewMask = DAG.getConstant(0xff, DL, VT);
  SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
  SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
  SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, Eight);
  insertDAGNode(DAG, N, Srl);
  insertDAGNode(DAG, N, NewMask);
  insertDAGNode(DAG, N, And);
  insertDAGNode(DAG, N, ShlCount);
  insertDAGNode(DAG, N, Shl);
  DAG.ReplaceAllUsesWith(N, Shl);
  DAG.RemoveDeadNode(N.getNode());

  AM.Scale = (1 << ScaleLog);
  AM.IndexReg = And;
  return false;
}
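// Illustrative worked example (not from the original source): with C1 == 2,
//   (x >> 6) & (0xff << 2)
// becomes ((x >> 8) & 0xff) << 2. The (x >> 8) & 0xff piece selects to an
// h-register read (e.g. movzbl %ah, %ecx) and the << 2 folds into the
// address as Scale = 4.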
// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
// allows us to fold the shift into this addressing mode. Returns false if the
// transform succeeded.
static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
                                        X86ISelAddressMode &AM) {
  SDValue Shift = N.getOperand(0);

  // Use a signed mask so that shifting right will insert sign bits. These
  // bits will be removed when we shift the result left so it doesn't matter
  // what we use. This might allow a smaller immediate encoding.
  int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(Mask)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(0);
  }

  if (Shift.getOpcode() != ISD::SHL ||
      !isa<ConstantSDNode>(Shift.getOperand(1)))
    return true;

  SDValue X = Shift.getOperand(0);

  // Not likely to be profitable if either the AND or SHIFT node has more
  // than one use (unless all uses are for address computation). Besides,
  // isel mechanism requires their node ids to be reused.
  if (!N.hasOneUse() || !Shift.hasOneUse())
    return true;

  // Verify that the shift amount is something we can fold.
  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
  if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
    return true;

  MVT VT = N.getSimpleValueType();
  SDLoc DL(N);
  if (FoundAnyExtend) {
    SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
    insertDAGNode(DAG, N, NewX);
    X = NewX;
  }

  SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
  SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));

  // Insert the new nodes into the topological ordering. We must do this in
  // a valid topological ordering as nothing is going to go back and re-sort
  // these nodes. We continually insert before 'N' in sequence as this is
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
  // hierarchy left to express.
  insertDAGNode(DAG, N, NewMask);
  insertDAGNode(DAG, N, NewAnd);
  insertDAGNode(DAG, N, NewShift);
  DAG.ReplaceAllUsesWith(N, NewShift);
  DAG.RemoveDeadNode(N.getNode());

  AM.Scale = 1 << ShiftAmt;
  AM.IndexReg = NewAnd;
  return false;
}
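// Illustrative worked example (not from the original source):
//   (x << 2) & 0x3fc  ==>  (x & 0xff) << 2
// The masked value becomes the index register and the << 2 is absorbed into
// the addressing mode as Scale = 4, e.g. (%rsi,%rcx,4).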
1664 // Implement some heroics to detect shifts of masked values where the mask can
1665 // be replaced by extending the shift and undoing that in the addressing mode
1666 // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
1667 // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
1668 // the addressing mode. This results in code such as:
1670 // int f(short *y, int *lookup_table) {
1672 // return *y + lookup_table[*y >> 11];
1676 // movzwl (%rdi), %eax
1679 // addl (%rsi,%rcx,4), %eax
1682 // movzwl (%rdi), %eax
1686 // addl (%rsi,%rcx), %eax
1688 // Note that this function assumes the mask is provided as a mask *after* the
1689 // value is shifted. The input chain may or may not match that, but computing
1690 // such a mask is trivial.
1691 static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
1693 SDValue Shift, SDValue X,
1694 X86ISelAddressMode &AM) {
1695 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
1696 !isa<ConstantSDNode>(Shift.getOperand(1)))
1699 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1700 unsigned MaskLZ = countLeadingZeros(Mask);
1701 unsigned MaskTZ = countTrailingZeros(Mask);
1703 // The amount of shift we're trying to fit into the addressing mode is taken
1704 // from the trailing zeros of the mask.
1705 unsigned AMShiftAmt = MaskTZ;
1707 // There is nothing we can do here unless the mask is removing some bits.
1708 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
1709 if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
1711 // We also need to ensure that mask is a continuous run of bits.
1712 if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
1714 // Scale the leading zero count down based on the actual size of the value.
1715 // Also scale it down based on the size of the shift.
1716 unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
1717 if (MaskLZ < ScaleDown)
1719 MaskLZ -= ScaleDown;
1721 // The final check is to ensure that any masked out high bits of X are
1722 // already known to be zero. Otherwise, the mask has a semantic impact
1723 // other than masking out a couple of low bits. Unfortunately, because of
1724 // the mask, zero extensions will be removed from operands in some cases.
1725 // This code works extra hard to look through extensions because we can
1726 // replace them with zero extensions cheaply if necessary.
1727 bool ReplacingAnyExtend = false;
1728 if (X.getOpcode() == ISD::ANY_EXTEND) {
1729 unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
1730 X.getOperand(0).getSimpleValueType().getSizeInBits();
1731 // Assume that we'll replace the any-extend with a zero-extend, and
1732 // narrow the search to the extended value.
1733 X = X.getOperand(0);
1734 MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
1735 ReplacingAnyExtend = true;
1737 APInt MaskedHighBits =
1738 APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
1739 KnownBits Known = DAG.computeKnownBits(X);
1740 if (MaskedHighBits != Known.Zero) return true;
1742 // We've identified a pattern that can be transformed into a single shift
1743 // and an addressing mode. Make it so.
1744 MVT VT = N.getSimpleValueType();
1745 if (ReplacingAnyExtend) {
1746 assert(X.getValueType() != VT);
1747 // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
1748 SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
1749 insertDAGNode(DAG, N, NewX);
1753 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
1754 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
1755 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
1756 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
1758 // Insert the new nodes into the topological ordering. We must do this in
1759 // a valid topological ordering as nothing is going to go back and re-sort
1760 // these nodes. We continually insert before 'N' in sequence as this is
1761 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1762 // hierarchy left to express.
1763 insertDAGNode(DAG, N, NewSRLAmt);
1764 insertDAGNode(DAG, N, NewSRL);
1765 insertDAGNode(DAG, N, NewSHLAmt);
1766 insertDAGNode(DAG, N, NewSHL);
1767 DAG.ReplaceAllUsesWith(N, NewSHL);
1768 DAG.RemoveDeadNode(N.getNode());
1770 AM.Scale = 1 << AMShiftAmt;
1771 AM.IndexReg = NewSRL;
1772 return false;
1773 }
1775 // Transform "(X >> SHIFT) & (MASK << C1)" to
1776 // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
1777 // matched to a BEXTR later. Returns false if the simplification is performed.
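// Editor's note (illustrative example, not from the original source): e.g.
// (x >> 4) & (0xFF << 2) becomes ((x >> 6) & 0xFF) << 2; the inner srl+and
// is a BEXTR candidate and the final shl folds into the addressing mode as
// Scale = 4.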
1778 static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
1779 uint64_t Mask,
1780 SDValue Shift, SDValue X,
1781 X86ISelAddressMode &AM,
1782 const X86Subtarget &Subtarget) {
1783 if (Shift.getOpcode() != ISD::SRL ||
1784 !isa<ConstantSDNode>(Shift.getOperand(1)) ||
1785 !Shift.hasOneUse() || !N.hasOneUse())
1786 return true;
1788 // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
1789 if (!Subtarget.hasTBM() &&
1790 !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR()))
1791 return true;
1793 // We need to ensure that mask is a contiguous run of bits.
1794 if (!isShiftedMask_64(Mask)) return true;
1796 unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1798 // The amount of shift we're trying to fit into the addressing mode is taken
1799 // from the trailing zeros of the mask.
1800 unsigned AMShiftAmt = countTrailingZeros(Mask);
1802 // There is nothing we can do here unless the mask is removing some bits.
1803 // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
1804 if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
1806 MVT VT = N.getSimpleValueType();
1807 SDLoc DL(N);
1808 SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
1809 SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
1810 SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
1811 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
1812 SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
1813 SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);
1815 // Insert the new nodes into the topological ordering. We must do this in
1816 // a valid topological ordering as nothing is going to go back and re-sort
1817 // these nodes. We continually insert before 'N' in sequence as this is
1818 // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1819 // hierarchy left to express.
1820 insertDAGNode(DAG, N, NewSRLAmt);
1821 insertDAGNode(DAG, N, NewSRL);
1822 insertDAGNode(DAG, N, NewMask);
1823 insertDAGNode(DAG, N, NewAnd);
1824 insertDAGNode(DAG, N, NewSHLAmt);
1825 insertDAGNode(DAG, N, NewSHL);
1826 DAG.ReplaceAllUsesWith(N, NewSHL);
1827 DAG.RemoveDeadNode(N.getNode());
1829 AM.Scale = 1 << AMShiftAmt;
1830 AM.IndexReg = NewAnd;
1831 return false;
1832 }
1834 bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
1835 unsigned Depth) {
1836 SDLoc dl(N);
1837 LLVM_DEBUG({
1838 dbgs() << "MatchAddress: ";
1839 AM.dump(CurDAG);
1840 });
1841 // Limit recursion.
1842 if (Depth > 5)
1843 return matchAddressBase(N, AM);
1845 // If this is already a %rip relative address, we can only merge immediates
1846 // into it. Instead of handling this in every case, we handle it here.
1847 // RIP relative addressing: %rip + 32-bit displacement!
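// Editor's note (illustrative, not from the original source): a RIP-relative
// reference like "leaq sym+8(%rip), %rax" has only the implicit %rip base
// plus a 32-bit displacement, so no base or index register can be added to
// it, only a constant offset.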
1848 if (AM.isRIPRelative()) {
1849 // FIXME: JumpTable and ExternalSymbol address currently don't like
1850 // displacements. It isn't very important, but this should be fixed for
1851 // consistency.
1852 if (!(AM.ES || AM.MCSym) && AM.JT != -1)
1853 return true;
1855 if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
1856 if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
1857 return false;
1858 return true;
1859 }
1861 switch (N.getOpcode()) {
1862 default: break;
1863 case ISD::LOCAL_RECOVER: {
1864 if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
1865 if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
1866 // Use the symbol and don't prefix it.
1867 AM.MCSym = ESNode->getMCSymbol();
1868 return false;
1869 }
1870 break;
1871 }
1872 case ISD::Constant: {
1873 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
1874 if (!foldOffsetIntoAddress(Val, AM))
1875 return false;
1876 break;
1877 }
1879 case X86ISD::Wrapper:
1880 case X86ISD::WrapperRIP:
1881 if (!matchWrapper(N, AM))
1882 return false;
1883 break;
1885 case ISD::LOAD:
1886 if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
1887 return false;
1888 break;
1890 case ISD::FrameIndex:
1891 if (AM.BaseType == X86ISelAddressMode::RegBase &&
1892 AM.Base_Reg.getNode() == nullptr &&
1893 (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
1894 AM.BaseType = X86ISelAddressMode::FrameIndexBase;
1895 AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
1896 return false;
1897 }
1898 break;
1900 case ISD::SHL:
1901 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
1902 break;
1904 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
1905 unsigned Val = CN->getZExtValue();
1906 // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
1907 // that the base operand remains free for further matching. If
1908 // the base doesn't end up getting used, a post-processing step
1909 // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
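// Editor's note (illustrative, not from the original source): (,x,2) with no
// base needs a SIB byte plus a four-byte zero displacement, while (x,x)
// encodes as plain base+index with no displacement, which is why the
// post-processing rewrite is cheaper.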
1910 if (Val == 1 || Val == 2 || Val == 3) {
1911 AM.Scale = 1 << Val;
1912 SDValue ShVal = N.getOperand(0);
1914 // Okay, we know that we have a scale by now. However, if the scaled
1915 // value is an add of something and a constant, we can fold the
1916 // constant into the disp field here.
1917 if (CurDAG->isBaseWithConstantOffset(ShVal)) {
1918 AM.IndexReg = ShVal.getOperand(0);
1919 ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
1920 uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
1921 if (!foldOffsetIntoAddress(Disp, AM))
1922 return false;
1923 }
1925 AM.IndexReg = ShVal;
1926 return false;
1927 }
1928 }
1929 break;
1931 case ISD::SRL: {
1932 // Scale must not be used already.
1933 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
1935 // We only handle up to 64-bit values here as those are what matter for
1936 // addressing mode optimizations.
1937 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
1938 "Unexpected value size!");
1940 SDValue And = N.getOperand(0);
1941 if (And.getOpcode() != ISD::AND) break;
1942 SDValue X = And.getOperand(0);
1944 // The mask used for the transform is expected to be post-shift, but we
1945 // found the shift first so just apply the shift to the mask before passing
1946 // it on.
1947 if (!isa<ConstantSDNode>(N.getOperand(1)) ||
1948 !isa<ConstantSDNode>(And.getOperand(1)))
1949 break;
1950 uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
1952 // Try to fold the mask and shift into the scale, and return false if we
1953 // succeed.
1954 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
1955 return false;
1956 break;
1957 }
1959 case ISD::SMUL_LOHI:
1960 case ISD::UMUL_LOHI:
1961 // A mul_lohi where we need the low part can be folded as a plain multiply.
1962 if (N.getResNo() != 0) break;
1963 LLVM_FALLTHROUGH;
1964 case ISD::MUL:
1965 case X86ISD::MUL_IMM:
1966 // X*[3,5,9] -> X+X*[2,4,8]
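// Editor's note (illustrative, not from the original source): x*9 becomes
// x + x*8, which a single LEA expresses as "leaq (%rax,%rax,8), %rcx";
// below, Scale is set to CN-1 and the same value serves as both base and
// index.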
1967 if (AM.BaseType == X86ISelAddressMode::RegBase &&
1968 AM.Base_Reg.getNode() == nullptr &&
1969 AM.IndexReg.getNode() == nullptr) {
1970 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
1971 if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
1972 CN->getZExtValue() == 9) {
1973 AM.Scale = unsigned(CN->getZExtValue())-1;
1975 SDValue MulVal = N.getOperand(0);
1976 SDValue Reg;
1978 // Okay, we know that we have a scale by now. However, if the scaled
1979 // value is an add of something and a constant, we can fold the
1980 // constant into the disp field here.
1981 if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
1982 isa<ConstantSDNode>(MulVal.getOperand(1))) {
1983 Reg = MulVal.getOperand(0);
1984 ConstantSDNode *AddVal =
1985 cast<ConstantSDNode>(MulVal.getOperand(1));
1986 uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
1987 if (foldOffsetIntoAddress(Disp, AM))
1988 Reg = N.getOperand(0);
1989 } else {
1990 Reg = N.getOperand(0);
1991 }
1993 AM.IndexReg = AM.Base_Reg = Reg;
1994 return false;
1995 }
1996 break;
1998 case ISD::SUB: {
2000 // Given A-B, if A can be completely folded into the address and the
2001 // index field is unused, use -B as the index. This is a win if A has
2002 // multiple parts that can be folded into the address. Also, this saves
2003 // a mov if the base register has other uses, since it avoids a
2004 // two-address sub instruction; however, it costs an additional mov if
2005 // the index register has other uses.
2007 // Add an artificial use to this node so that we can keep track of
2008 // it if it gets CSE'd with a different node.
2009 HandleSDNode Handle(N);
2011 // Test if the LHS of the sub can be folded.
2012 X86ISelAddressMode Backup = AM;
2013 if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2014 N = Handle.getValue();
2015 AM = Backup;
2016 break;
2017 }
2018 N = Handle.getValue();
2019 // Test if the index field is free for use.
2020 if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
2021 AM = Backup;
2022 break;
2023 }
2025 int Cost = 0;
2026 SDValue RHS = N.getOperand(1);
2027 // If the RHS involves a register with multiple uses, this
2028 // transformation incurs an extra mov, due to the neg instruction
2029 // clobbering its operand.
2030 if (!RHS.getNode()->hasOneUse() ||
2031 RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
2032 RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
2033 RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
2034 (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
2035 RHS.getOperand(0).getValueType() == MVT::i32))
2036 ++Cost;
2037 // If the base is a register with multiple uses, this
2038 // transformation may save a mov.
2039 if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
2040 !AM.Base_Reg.getNode()->hasOneUse()) ||
2041 AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2042 --Cost;
2043 // If the folded LHS was interesting, this transformation saves
2044 // address arithmetic.
2045 if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
2046 ((AM.Disp != 0) && (Backup.Disp == 0)) +
2047 (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
2048 --Cost;
2049 // If it doesn't look like it may be an overall win, don't do it.
2050 if (Cost >= 0) {
2051 AM = Backup;
2052 break;
2053 }
2055 // Ok, the transformation is legal and appears profitable. Go for it.
2056 // Negation will be emitted later to avoid creating dangling nodes if this
2057 // was an unprofitable LEA.
2058 AM.IndexReg = RHS;
2059 AM.NegateIndex = true;
2060 AM.Scale = 1;
2061 break;
2062 }
2064 case ISD::ADD:
2065 if (!matchAdd(N, AM, Depth))
2066 return false;
2067 break;
2069 case ISD::OR:
2070 // We want to look through a transform in InstCombine and DAGCombiner that
2071 // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
2072 // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
2073 // An 'lea' can then be used to match the shift (multiply) and add:
2074 // and $1, %esi
2075 // lea (%rsi, %rdi, 8), %rax
2076 if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
2077 !matchAdd(N, AM, Depth))
2078 return false;
2079 break;
2081 case ISD::AND: {
2082 // Perform some heroic transforms on an and of a constant-count shift
2083 // with a constant to enable use of the scaled offset field.
2085 // Scale must not be used already.
2086 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
2088 // We only handle up to 64-bit values here as those are what matter for
2089 // addressing mode optimizations.
2090 assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2091 "Unexpected value size!");
2093 if (!isa<ConstantSDNode>(N.getOperand(1)))
2094 break;
2096 if (N.getOperand(0).getOpcode() == ISD::SRL) {
2097 SDValue Shift = N.getOperand(0);
2098 SDValue X = Shift.getOperand(0);
2100 uint64_t Mask = N.getConstantOperandVal(1);
2102 // Try to fold the mask and shift into an extract and scale.
2103 if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2104 return false;
2106 // Try to fold the mask and shift directly into the scale.
2107 if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2108 return false;
2110 // Try to fold the mask and shift into BEXTR and scale.
2111 if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2112 return false;
2113 }
2115 // Try to swap the mask and shift to place shifts which can be done as
2116 // a scale on the outside of the mask.
2117 if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2118 return false;
2120 break;
2121 }
2122 case ISD::ZERO_EXTEND: {
2123 // Try to widen a zexted shift left to the same size as its use, so we can
2124 // match the shift as a scale factor.
2125 if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
2126 break;
2127 if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse())
2128 break;
2130 // Give up if the shift is not a valid scale factor [1,2,3].
2131 SDValue Shl = N.getOperand(0);
2132 auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
2133 if (!ShAmtC || ShAmtC->getZExtValue() > 3)
2134 break;
2136 // The narrow shift must only shift out zero bits (it must be 'nuw').
2137 // That makes it safe to widen to the destination type.
2138 APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(),
2139 ShAmtC->getZExtValue());
2140 if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros))
2141 break;
2143 // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C)
2144 MVT VT = N.getSimpleValueType();
2145 SDLoc DL(N);
2146 SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0));
2147 SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1));
2149 // Convert the shift to scale factor.
2150 AM.Scale = 1 << ShAmtC->getZExtValue();
2151 AM.IndexReg = Zext;
2153 insertDAGNode(*CurDAG, N, Zext);
2154 insertDAGNode(*CurDAG, N, NewShl);
2155 CurDAG->ReplaceAllUsesWith(N, NewShl);
2156 CurDAG->RemoveDeadNode(N.getNode());
2157 return false;
2158 }
2159 }
2161 return matchAddressBase(N, AM);
2162 }
2164 /// Helper for MatchAddress. Add the specified node to the
2165 /// specified addressing mode without any further recursion.
2166 bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2167 // Is the base register already occupied?
2168 if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
2169 // If so, check to see if the scale index register is set.
2170 if (!AM.IndexReg.getNode()) {
2171 AM.IndexReg = N;
2172 AM.Scale = 1;
2173 return false;
2174 }
2176 // Otherwise, we cannot select it.
2177 return true;
2178 }
2180 // Default, generate it as a register.
2181 AM.BaseType = X86ISelAddressMode::RegBase;
2182 AM.Base_Reg = N;
2183 return false;
2184 }
2186 /// Helper for selectVectorAddr. Handles things that can be folded into a
2187 /// gather scatter address. The index register and scale should have already
2188 /// been selected.
2189 bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2190 // TODO: Support other operations.
2191 switch (N.getOpcode()) {
2192 case ISD::Constant: {
2193 uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2194 if (!foldOffsetIntoAddress(Val, AM))
2195 return false;
2196 break;
2197 }
2198 case X86ISD::Wrapper:
2199 if (!matchWrapper(N, AM))
2200 return false;
2201 break;
2202 }
2204 return matchAddressBase(N, AM);
2205 }
2207 bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
2208 SDValue &Scale, SDValue &Index,
2209 SDValue &Disp, SDValue &Segment) {
2210 X86ISelAddressMode AM;
2211 auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent);
2212 AM.IndexReg = Mgs->getIndex();
2213 AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue();
2215 unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2216 // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
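// Editor's note (illustrative, not from the original source): e.g. a load
// through a pointer in addrspace(257) is selected as an %fs-relative access
// such as "movl %fs:(%rax), %ecx".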
2217 if (AddrSpace == 256)
2218 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2219 if (AddrSpace == 257)
2220 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2221 if (AddrSpace == 258)
2222 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2224 SDLoc DL(N);
2225 MVT VT = N.getSimpleValueType();
2227 // Try to match into the base and displacement fields.
2228 if (matchVectorAddress(N, AM))
2229 return false;
2231 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2232 return true;
2233 }
2235 /// Returns true if it is able to pattern match an addressing mode.
2236 /// It returns, by reference, the operands which make up the maximal
2237 /// addressing mode it can match.
2239 /// Parent is the parent node of the addr operand that is being matched. It
2240 /// is always a load, store, atomic node, or null. It is only null when
2241 /// checking memory operands for inline asm nodes.
2242 bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2243 SDValue &Scale, SDValue &Index,
2244 SDValue &Disp, SDValue &Segment) {
2245 X86ISelAddressMode AM;
2247 if (Parent &&
2248 // This list of opcodes covers all the nodes that have an "addr:$ptr" operand
2249 // that are not a MemSDNode, and thus don't have proper addrspace info.
2250 Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
2251 Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
2252 Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
2253 Parent->getOpcode() != X86ISD::ENQCMD && // Fixme
2254 Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme
2255 Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
2256 Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
2257 unsigned AddrSpace =
2258 cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2259 // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
2260 if (AddrSpace == 256)
2261 AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2262 if (AddrSpace == 257)
2263 AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2264 if (AddrSpace == 258)
2265 AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2268 // Save the DL and VT before calling matchAddress, it can invalidate N.
2269 SDLoc DL(N);
2270 MVT VT = N.getSimpleValueType();
2272 if (matchAddress(N, AM))
2273 return false;
2275 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2276 return true;
2277 }
2279 // We can only fold a load if all nodes between it and the root node have a
2280 // single use. If there are additional uses, we could end up duplicating the
2281 // load.
2282 static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
2283 while (User != Root) {
2284 if (!User->hasOneUse())
2285 return false;
2286 User = *User->use_begin();
2287 }
2288 return true;
2289 }
2292 /// Match a scalar SSE load. In particular, we want to match a load whose top
2293 /// elements are either undef or zeros. The load flavor is derived from the
2294 /// type of N, which is either v4f32 or v2f64.
2295 ///
2296 /// We also return:
2297 /// PatternChainNode: this is the matched node that has a chain input and
2298 /// chain output.
2299 bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
2300 SDValue N, SDValue &Base,
2301 SDValue &Scale, SDValue &Index,
2302 SDValue &Disp, SDValue &Segment,
2303 SDValue &PatternNodeWithChain) {
2304 if (!hasSingleUsesFromRoot(Root, Parent))
2305 return false;
2307 // We can allow a full vector load here since narrowing a load is ok unless
2308 // it's a volatile load.
2309 if (ISD::isNON_EXTLoad(N.getNode())) {
2310 LoadSDNode *LD = cast<LoadSDNode>(N);
2311 if (!LD->isVolatile() &&
2312 IsProfitableToFold(N, LD, Root) &&
2313 IsLegalToFold(N, Parent, Root, OptLevel)) {
2314 PatternNodeWithChain = N;
2315 return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
2316 Segment);
2317 }
2318 }
2320 // We can also match the special zero extended load opcode.
2321 if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
2322 PatternNodeWithChain = N;
2323 if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2324 IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
2325 auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
2326 return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
2327 Segment);
2328 }
2329 }
2331 // Need to make sure that the SCALAR_TO_VECTOR and load are both only used
2332 // once. Otherwise the load might get duplicated and the chain output of the
2333 // duplicate load will not be observed by all dependencies.
2334 if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
2335 PatternNodeWithChain = N.getOperand(0);
2336 if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
2337 IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2338 IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
2339 LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
2340 return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
2341 Segment);
2342 }
2343 }
2345 return false;
2346 }
2349 bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
2350 if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
2351 uint64_t ImmVal = CN->getZExtValue();
2352 if (!isUInt<32>(ImmVal))
2353 return false;
2355 Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
2356 return true;
2357 }
2359 // In static codegen with small code model, we can get the address of a label
2360 // into a register with 'movl'
2361 if (N->getOpcode() != X86ISD::Wrapper)
2362 return false;
2364 N = N.getOperand(0);
2366 // At least GNU as does not accept 'movl' for TPOFF relocations.
2367 // FIXME: We could use 'movl' when we know we are targeting MC.
2368 if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
2369 return false;
2371 Imm = N;
2372 if (N->getOpcode() != ISD::TargetGlobalAddress)
2373 return TM.getCodeModel() == CodeModel::Small;
2375 Optional<ConstantRange> CR =
2376 cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
2377 if (!CR)
2378 return TM.getCodeModel() == CodeModel::Small;
2380 return CR->getUnsignedMax().ult(1ull << 32);
2381 }
2383 bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
2384 SDValue &Scale, SDValue &Index,
2385 SDValue &Disp, SDValue &Segment) {
2386 // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
2387 SDLoc DL(N);
2389 if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
2390 return false;
2392 RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
2393 if (RN && RN->getReg() == 0)
2394 Base = CurDAG->getRegister(0, MVT::i64);
2395 else if (Base.getValueType() == MVT::i32 && !isa<FrameIndexSDNode>(Base)) {
2396 // Base could already be %rip, particularly in the x32 ABI.
2397 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2398 MVT::i64), 0);
2399 Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2400 Base);
2401 }
2403 RN = dyn_cast<RegisterSDNode>(Index);
2404 if (RN && RN->getReg() == 0)
2405 Index = CurDAG->getRegister(0, MVT::i64);
2406 else {
2407 assert(Index.getValueType() == MVT::i32 &&
2408 "Expect to be extending 32-bit registers for use in LEA");
2409 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2410 MVT::i64), 0);
2411 Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2412 Index);
2413 }
2415 return true;
2416 }
2418 /// Calls SelectAddr and determines if the maximal addressing
2419 /// mode it matches can be cost effectively emitted as an LEA instruction.
2420 bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
2421 SDValue &Base, SDValue &Scale,
2422 SDValue &Index, SDValue &Disp,
2423 SDValue &Segment) {
2424 X86ISelAddressMode AM;
2426 // Save the DL and VT before calling matchAddress, it can invalidate N.
2427 SDLoc DL(N);
2428 MVT VT = N.getSimpleValueType();
2430 // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
2431 // segments.
2432 SDValue Copy = AM.Segment;
2433 SDValue T = CurDAG->getRegister(0, MVT::i32);
2434 AM.Segment = T;
2435 if (matchAddress(N, AM))
2436 return false;
2437 assert (T == AM.Segment);
2438 AM.Segment = Copy;
2440 unsigned Complexity = 0;
2441 if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode())
2442 Complexity = 1;
2443 else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2444 Complexity = 4;
2446 if (AM.IndexReg.getNode())
2447 Complexity++;
2449 // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
2450 // a simple shift.
2451 if (AM.Scale > 1)
2452 Complexity++;
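// Editor's note (illustrative, not from the original source): "leal
// (,%ebx,2), %eax" needs a SIB byte plus a four-byte zero displacement,
// while "addl %ebx, %ebx" is two bytes, so a bare scale only adds one point
// of complexity here.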
2454 // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
2455 // to a LEA. This is determined with some experimentation but is by no means
2456 // optimal (especially for code size consideration). LEA is nice because of
2457 // its three-address nature. Tweak the cost function again when we can run
2458 // convertToThreeAddress() at register allocation time.
2459 if (AM.hasSymbolicDisplacement()) {
2460 // For X86-64, always use LEA to materialize RIP-relative addresses.
2461 if (Subtarget->is64Bit())
2462 Complexity = 4;
2463 else
2464 Complexity += 2;
2465 }
2467 // Heuristic: try harder to form an LEA from ADD if the operands set flags.
2468 // Unlike ADD, LEA does not affect flags, so we will be less likely to require
2469 // duplicating flag-producing instructions later in the pipeline.
2470 if (N.getOpcode() == ISD::ADD) {
2471 auto isMathWithFlags = [](SDValue V) {
2472 switch (V.getOpcode()) {
2473 case X86ISD::ADD:
2474 case X86ISD::SUB:
2475 case X86ISD::ADC:
2476 case X86ISD::SBB:
2477 /* TODO: These opcodes can be added safely, but we may want to justify
2478 their inclusion for different reasons (better for reg-alloc).
2479 case X86ISD::SMUL:
2480 case X86ISD::UMUL:
2481 case X86ISD::OR:
2482 case X86ISD::XOR:
2483 case X86ISD::AND:
2484 */
2485 // Value 1 is the flag output of the node - verify it's not dead.
2486 return !SDValue(V.getNode(), 1).use_empty();
2487 default:
2488 return false;
2489 }
2490 };
2491 // TODO: This could be an 'or' rather than 'and' to make the transform more
2492 // likely to happen. We might want to factor in whether there's a
2493 // load folding opportunity for the math op that disappears with LEA.
2494 if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1)))
2495 Complexity++;
2496 }
2498 if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode()))
2499 Complexity++;
2501 // If it isn't worth using an LEA, reject it.
2502 if (Complexity <= 2)
2503 return false;
2505 getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2506 return true;
2507 }
2509 /// This is only run on TargetGlobalTLSAddress nodes.
2510 bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
2511 SDValue &Scale, SDValue &Index,
2512 SDValue &Disp, SDValue &Segment) {
2513 assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
2514 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
2516 X86ISelAddressMode AM;
2517 AM.GV = GA->getGlobal();
2518 AM.Disp += GA->getOffset();
2519 AM.SymbolFlags = GA->getTargetFlags();
2521 MVT VT = N.getSimpleValueType();
2522 if (VT == MVT::i32) {
2523 AM.Scale = 1;
2524 AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
2525 }
2527 getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
2528 return true;
2529 }
2531 bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
2532 if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
2533 Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
2534 N.getValueType());
2535 return true;
2536 }
2538 // Keep track of the original value type and whether this value was
2539 // truncated. If we see a truncation from pointer type to VT that truncates
2540 // bits that are known to be zero, we can use a narrow reference.
2541 EVT VT = N.getValueType();
2542 bool WasTruncated = false;
2543 if (N.getOpcode() == ISD::TRUNCATE) {
2544 WasTruncated = true;
2545 N = N.getOperand(0);
2546 }
2548 if (N.getOpcode() != X86ISD::Wrapper)
2549 return false;
2551 // We can only use non-GlobalValues as immediates if they were not truncated,
2552 // as we do not have any range information. If we have a GlobalValue and the
2553 // address was not truncated, we can select it as an operand directly.
2554 unsigned Opc = N.getOperand(0)->getOpcode();
2555 if (Opc != ISD::TargetGlobalAddress || !WasTruncated) {
2556 Op = N.getOperand(0);
2557 // We can only select the operand directly if we didn't have to look past a
2558 // truncate.
2559 return !WasTruncated;
2560 }
2562 // Check that the global's range fits into VT.
2563 auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
2564 Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2565 if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits()))
2566 return false;
2568 // Okay, we can use a narrow reference.
2569 Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
2570 GA->getOffset(), GA->getTargetFlags());
2571 return true;
2572 }
2574 bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
2575 SDValue &Base, SDValue &Scale,
2576 SDValue &Index, SDValue &Disp,
2577 SDValue &Segment) {
2578 if (!ISD::isNON_EXTLoad(N.getNode()) ||
2579 !IsProfitableToFold(N, P, Root) ||
2580 !IsLegalToFold(N, P, Root, OptLevel))
2581 return false;
2583 return selectAddr(N.getNode(),
2584 N.getOperand(1), Base, Scale, Index, Disp, Segment);
2585 }
2587 /// Return an SDNode that returns the value of the global base register.
2588 /// Output instructions required to initialize the global base register,
2589 /// if necessary.
2590 SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
2591 unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
2592 auto &DL = MF->getDataLayout();
2593 return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
2594 }
2596 bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
2597 if (N->getOpcode() == ISD::TRUNCATE)
2598 N = N->getOperand(0).getNode();
2599 if (N->getOpcode() != X86ISD::Wrapper)
2600 return false;
2602 auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
2603 if (!GA)
2604 return false;
2606 Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2607 return CR && CR->getSignedMin().sge(-1ull << Width) &&
2608 CR->getSignedMax().slt(1ull << Width);
2609 }
2611 static X86::CondCode getCondFromNode(SDNode *N) {
2612 assert(N->isMachineOpcode() && "Unexpected node");
2613 X86::CondCode CC = X86::COND_INVALID;
2614 unsigned Opc = N->getMachineOpcode();
2615 if (Opc == X86::JCC_1)
2616 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1));
2617 else if (Opc == X86::SETCCr)
2618 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0));
2619 else if (Opc == X86::SETCCm)
2620 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5));
2621 else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr ||
2622 Opc == X86::CMOV64rr)
2623 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2));
2624 else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm ||
2625 Opc == X86::CMOV64rm)
2626 CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6));
2628 return CC;
2629 }
2631 /// Test whether the given X86ISD::CMP node has any users that use a flag
2632 /// other than ZF.
2633 bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
2634 // Examine each user of the node.
2635 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2636 UI != UE; ++UI) {
2637 // Only check things that use the flags.
2638 if (UI.getUse().getResNo() != Flags.getResNo())
2639 continue;
2640 // Only examine CopyToReg uses that copy to EFLAGS.
2641 if (UI->getOpcode() != ISD::CopyToReg ||
2642 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2643 return false;
2644 // Examine each user of the CopyToReg use.
2645 for (SDNode::use_iterator FlagUI = UI->use_begin(),
2646 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
2647 // Only examine the Flag result.
2648 if (FlagUI.getUse().getResNo() != 1) continue;
2649 // Anything unusual: assume conservatively.
2650 if (!FlagUI->isMachineOpcode()) return false;
2651 // Examine the condition code of the user.
2652 X86::CondCode CC = getCondFromNode(*FlagUI);
2653 switch (CC) {
2655 // Comparisons which only use the zero flag.
2656 case X86::COND_E: case X86::COND_NE:
2657 continue;
2658 // Anything else: assume conservatively.
2659 default:
2660 return false;
2661 }
2662 }
2663 }
2664 return true;
2665 }
2667 /// Test whether the given X86ISD::CMP node has any uses which require the SF
2668 /// flag to be accurate.
2669 bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
2670 // Examine each user of the node.
2671 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2672 UI != UE; ++UI) {
2673 // Only check things that use the flags.
2674 if (UI.getUse().getResNo() != Flags.getResNo())
2675 continue;
2676 // Only examine CopyToReg uses that copy to EFLAGS.
2677 if (UI->getOpcode() != ISD::CopyToReg ||
2678 cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2679 return false;
2680 // Examine each user of the CopyToReg use.
2681 for (SDNode::use_iterator FlagUI = UI->use_begin(),
2682 FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
2683 // Only examine the Flag result.
2684 if (FlagUI.getUse().getResNo() != 1) continue;
2685 // Anything unusual: assume conservatively.
2686 if (!FlagUI->isMachineOpcode()) return false;
2687 // Examine the condition code of the user.
2688 X86::CondCode CC = getCondFromNode(*FlagUI);
2689 switch (CC) {
2691 // Comparisons which don't examine the SF flag.
2692 case X86::COND_A: case X86::COND_AE:
2693 case X86::COND_B: case X86::COND_BE:
2694 case X86::COND_E: case X86::COND_NE:
2695 case X86::COND_O: case X86::COND_NO:
2696 case X86::COND_P: case X86::COND_NP:
2697 continue;
2698 // Anything else: assume conservatively.
2699 default:
2700 return false;
2701 }
2702 }
2703 }
2704 return true;
2705 }
2707 static bool mayUseCarryFlag(X86::CondCode CC) {
2708 switch (CC) {
2709 // Comparisons which don't examine the CF flag.
2710 case X86::COND_O: case X86::COND_NO:
2711 case X86::COND_E: case X86::COND_NE:
2712 case X86::COND_S: case X86::COND_NS:
2713 case X86::COND_P: case X86::COND_NP:
2714 case X86::COND_L: case X86::COND_GE:
2715 case X86::COND_G: case X86::COND_LE:
2716 return false;
2717 // Anything else: assume conservatively.
2718 default:
2719 return true;
2720 }
2721 }
2723 /// Test whether the given node which sets flags has any uses which require the
2724 /// CF flag to be accurate.
2725 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
2726 // Examine each user of the node.
2727 for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2728 UI != UE; ++UI) {
2729 // Only check things that use the flags.
2730 if (UI.getUse().getResNo() != Flags.getResNo())
2731 continue;
2733 unsigned UIOpc = UI->getOpcode();
2735 if (UIOpc == ISD::CopyToReg) {
2736 // Only examine CopyToReg uses that copy to EFLAGS.
2737 if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2738 continue;
2739 // Examine each user of the CopyToReg use.
2740 for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
2741 FlagUI != FlagUE; ++FlagUI) {
2742 // Only examine the Flag result.
2743 if (FlagUI.getUse().getResNo() != 1)
2744 continue;
2745 // Anything unusual: assume conservatively.
2746 if (!FlagUI->isMachineOpcode())
2747 return false;
2748 // Examine the condition code of the user.
2749 X86::CondCode CC = getCondFromNode(*FlagUI);
2751 if (mayUseCarryFlag(CC))
2752 return false;
2753 }
2755 // This CopyToReg is ok. Move on to the next user.
2756 continue;
2757 }
2759 // This might be an unselected node. So look for the pre-isel opcodes that
2760 // use flags.
2761 unsigned CCOpNo;
2762 switch (UIOpc) {
2763 default:
2764 // Something unusual. Be conservative.
2765 return false;
2766 case X86ISD::SETCC: CCOpNo = 0; break;
2767 case X86ISD::SETCC_CARRY: CCOpNo = 0; break;
2768 case X86ISD::CMOV: CCOpNo = 2; break;
2769 case X86ISD::BRCOND: CCOpNo = 2; break;
2770 }
2772 X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
2773 if (mayUseCarryFlag(CC))
2774 return false;
2775 }
2776 return true;
2777 }
2779 /// Check whether or not the chain ending in StoreNode is suitable for doing
2780 /// the {load; op; store} to modify transformation.
2781 static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
2782 SDValue StoredVal, SelectionDAG *CurDAG,
2783 unsigned LoadOpNo,
2784 LoadSDNode *&LoadNode,
2785 SDValue &InputChain) {
2786 // Is the stored value result 0 of the operation?
2787 if (StoredVal.getResNo() != 0) return false;
2789 // Are there other uses of the operation other than the store?
2790 if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false;
2792 // Is the store non-extending and non-indexed?
2793 if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
2794 return false;
2796 SDValue Load = StoredVal->getOperand(LoadOpNo);
2797 // Is the stored value a non-extending and non-indexed load?
2798 if (!ISD::isNormalLoad(Load.getNode())) return false;
2800 // Return LoadNode by reference.
2801 LoadNode = cast<LoadSDNode>(Load);
2803 // Is store the only read of the loaded value?
2804 if (!Load.hasOneUse())
2805 return false;
2807 // Is the address of the store the same as the load?
2808 if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
2809 LoadNode->getOffset() != StoreNode->getOffset())
2810 return false;
2812 bool FoundLoad = false;
2813 SmallVector<SDValue, 4> ChainOps;
2814 SmallVector<const SDNode *, 4> LoopWorklist;
2815 SmallPtrSet<const SDNode *, 16> Visited;
2816 const unsigned int Max = 1024;
2818 // Visualization of Load-Op-Store fusion:
2819 // -------------------------
2820 // Legend:
2821 // *-lines = Chain operand dependencies.
2822 // |-lines = Normal operand dependencies.
2823 // Dependencies flow down and right. n-suffix references multiple nodes.
2831 // [ASCII diagram elided in this copy: the chains around LD, OP and ST merge
2831 // into the fused node, ending "* * \ | => A--LD_OP_ST".]
2839 // This merge induced dependences from: #1: Xn -> LD, OP, Zn
2840 // #2: Yn -> LD
2841 // #3: ST -> Zn
2843 // Ensure the transform is safe by checking for the dual
2844 // dependencies to make sure we do not induce a loop.
2846 // As LD is a predecessor to both OP and ST we can do this by checking:
2847 // a). if LD is a predecessor to a member of Xn or Yn.
2848 // b). if a Zn is a predecessor to ST.
2850 // However, (b) can only occur through being a chain predecessor to
2851 // ST, which is the same as Zn being a member or predecessor of Xn,
2852 // which is a subset of LD being a predecessor of Xn. So it's
2853 // subsumed by check (a).
2855 SDValue Chain = StoreNode->getChain();
2857 // Gather X elements in ChainOps.
2858 if (Chain == Load.getValue(1)) {
2859 FoundLoad = true;
2860 ChainOps.push_back(Load.getOperand(0));
2861 } else if (Chain.getOpcode() == ISD::TokenFactor) {
2862 for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
2863 SDValue Op = Chain.getOperand(i);
2864 if (Op == Load.getValue(1)) {
2865 FoundLoad = true;
2866 // Drop Load, but keep its chain. No cycle check necessary.
2867 ChainOps.push_back(Load.getOperand(0));
2868 continue;
2869 }
2870 LoopWorklist.push_back(Op.getNode());
2871 ChainOps.push_back(Op);
2872 }
2873 }
2875 if (!FoundLoad)
2876 return false;
2878 // Worklist is currently Xn. Add Yn to worklist.
2879 for (SDValue Op : StoredVal->ops())
2880 if (Op.getNode() != LoadNode)
2881 LoopWorklist.push_back(Op.getNode());
2883 // Check (a) if Load is a predecessor to Xn + Yn
2884 if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
2885 true))
2886 return false;
2888 InputChain =
2889 CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
2890 return true;
2891 }
2893 // Change a chain of {load; op; store} of the same value into a simple op
2894 // through memory of that value, if the uses of the modified value and its
2895 // address are suitable.
2897 // The tablegen memory-operand pattern is currently not able to match
2898 // the case where the EFLAGS on the original operation are used.
2900 // To move this to tablegen, we'll need to improve tablegen to allow flags to
2901 // be transferred from a node in the pattern to the result node, probably with
2902 // a new keyword. For example, we have this
2903 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2904 // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2905 // (implicit EFLAGS)]>;
2906 // but maybe need something like this
2907 // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2908 // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2909 // (transferrable EFLAGS)]>;
2911 // Until then, we manually fold these and instruction select the operation
2912 // manually.
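// Editor's note (illustrative, not from the original source): the rewrite
// turns e.g.
//   movl (%rdi), %eax
//   addl $5, %eax
//   movl %eax, (%rdi)
// into the single read-modify-write form "addl $5, (%rdi)".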
2913 bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
2914 StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
2915 SDValue StoredVal = StoreNode->getOperand(1);
2916 unsigned Opc = StoredVal->getOpcode();
2918 // Before we try to select anything, make sure this is memory operand size
2919 // and opcode we can handle. Note that this must match the code below that
2920 // actually lowers the opcodes.
2921 EVT MemVT = StoreNode->getMemoryVT();
2922 if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
2923 MemVT != MVT::i8)
2924 return false;
2926 bool IsCommutable = false;
2927 bool IsNegate = false;
2928 switch (Opc) {
2929 default:
2930 return false;
2931 case X86ISD::SUB:
2932 IsNegate = isNullConstant(StoredVal.getOperand(0));
2933 LLVM_FALLTHROUGH;
2934 case X86ISD::SBB:
2935 break;
2936 case X86ISD::ADD:
2937 case X86ISD::ADC:
2938 case X86ISD::OR:
2939 case X86ISD::XOR:
2940 case X86ISD::AND:
2941 IsCommutable = true;
2942 break;
2943 }
2945 unsigned LoadOpNo = IsNegate ? 1 : 0;
2946 LoadSDNode *LoadNode = nullptr;
2947 SDValue InputChain;
2948 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
2949 LoadNode, InputChain)) {
2950 if (!IsCommutable)
2951 return false;
2953 // This operation is commutable, try the other operand.
2954 LoadOpNo = 1;
2955 if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
2956 LoadNode, InputChain))
2957 return false;
2958 }
2960 SDValue Base, Scale, Index, Disp, Segment;
2961 if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
2962 Segment))
2963 return false;
2965 auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
2966 unsigned Opc8) {
2967 switch (MemVT.getSimpleVT().SimpleTy) {
2968 case MVT::i64:
2969 return Opc64;
2970 case MVT::i32:
2971 return Opc32;
2972 case MVT::i16:
2973 return Opc16;
2974 case MVT::i8:
2975 return Opc8;
2976 default:
2977 llvm_unreachable("Invalid size!");
2978 }
2979 };
2981 MachineSDNode *Result;
2982 switch (Opc) {
2983 case X86ISD::SUB:
2984 // Handle negate.
2985 if (IsNegate) {
2986 unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
2987 X86::NEG8m);
2988 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
2989 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
2990 MVT::Other, Ops);
2991 break;
2992 }
2993 LLVM_FALLTHROUGH;
2994 case X86ISD::ADD:
2995 // Try to match inc/dec.
2996 if (!Subtarget->slowIncDec() || OptForSize) {
2997 bool IsOne = isOneConstant(StoredVal.getOperand(1));
2998 bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
2999 // ADD/SUB with 1/-1 can use inc/dec when the carry flag isn't used.
3000 if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) {
3001 unsigned NewOpc =
3002 ((Opc == X86ISD::ADD) == IsOne)
3003 ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
3004 : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
3005 const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3006 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3007 MVT::Other, Ops);
3008 break;
3009 }
3010 }
3011 LLVM_FALLTHROUGH;
3012 case X86ISD::ADC:
3013 case X86ISD::SBB:
3014 case X86ISD::AND:
3015 case X86ISD::OR:
3016 case X86ISD::XOR: {
3017 auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3018 switch (Opc) {
3019 case X86ISD::ADD:
3020 return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3021 X86::ADD8mr);
3022 case X86ISD::ADC:
3023 return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3024 X86::ADC8mr);
3025 case X86ISD::SUB:
3026 return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3027 X86::SUB8mr);
3028 case X86ISD::SBB:
3029 return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3030 X86::SBB8mr);
3031 case X86ISD::AND:
3032 return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3033 X86::AND8mr);
3034 case X86ISD::OR:
3035 return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3036 case X86ISD::XOR:
3037 return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3038 X86::XOR8mr);
3039 default:
3040 llvm_unreachable("Invalid opcode!");
3041 }
3042 };
3043 auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) {
3044 switch (Opc) {
3045 case X86ISD::ADD:
3046 return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
3047 case X86ISD::ADC:
3048 return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
3049 case X86ISD::SUB:
3050 return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
3051 case X86ISD::SBB:
3052 return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
3053 case X86ISD::AND:
3054 return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
3055 case X86ISD::OR:
3056 return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0);
3057 case X86ISD::XOR:
3058 return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0);
3059 default:
3060 llvm_unreachable("Invalid opcode!");
3061 }
3062 };
3063 auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3064 switch (Opc) {
3065 case X86ISD::ADD:
3066 return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3067 X86::ADD8mi);
3068 case X86ISD::ADC:
3069 return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3070 X86::ADC8mi);
3071 case X86ISD::SUB:
3072 return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3073 X86::SUB8mi);
3074 case X86ISD::SBB:
3075 return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3076 X86::SBB8mi);
3077 case X86ISD::AND:
3078 return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3079 X86::AND8mi);
3080 case X86ISD::OR:
3081 return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3082 X86::OR8mi);
3083 case X86ISD::XOR:
3084 return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3085 X86::XOR8mi);
3086 default:
3087 llvm_unreachable("Invalid opcode!");
3088 }
3089 };
3091 unsigned NewOpc = SelectRegOpcode(Opc);
3092 SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3094 // See if the operand is a constant that we can fold into an immediate
3095 // operand.
3096 if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3097 int64_t OperandV = OperandC->getSExtValue();
3099 // Check if we can shrink the operand enough to fit in an immediate (or
3100 // fit into a smaller immediate) by negating it and switching the
3101 // operation.
3102 if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
3103 ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) ||
3104 (MemVT == MVT::i64 && !isInt<32>(OperandV) &&
3105 isInt<32>(-OperandV))) &&
3106 hasNoCarryFlagUses(StoredVal.getValue(1))) {
3107 OperandV = -OperandV;
3108 Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
3109 }
3111 // First try to fit this into an Imm8 operand. If it doesn't fit, then try
3112 // the larger immediate operand.
3113 if (MemVT != MVT::i8 && isInt<8>(OperandV)) {
3114 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3115 NewOpc = SelectImm8Opcode(Opc);
3116 } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) {
3117 Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3118 NewOpc = SelectImmOpcode(Opc);
3119 }
3120 }
3122 if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
3123 SDValue CopyTo =
3124 CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3125 StoredVal.getOperand(2), SDValue());
3127 const SDValue Ops[] = {Base, Scale, Index, Disp,
3128 Segment, Operand, CopyTo, CopyTo.getValue(1)};
3129 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3130 Ops);
3131 } else {
3132 const SDValue Ops[] = {Base, Scale, Index, Disp,
3133 Segment, Operand, InputChain};
3134 Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3135 Ops);
3136 }
3137 break;
3138 }
3139 default:
3140 llvm_unreachable("Invalid opcode!");
3141 }
3143 MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3144 LoadNode->getMemOperand()};
3145 CurDAG->setNodeMemRefs(Result, MemOps);
3147 // Update Load Chain uses as well.
3148 ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3149 ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3150 ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3151 CurDAG->RemoveDeadNode(Node);
3152 return true;
3153 }
3155 // See if this is an X & Mask that we can match to BEXTR/BZHI.
3156 // Where Mask is one of the following patterns:
3157 // a) x & (1 << nbits) - 1
3158 // b) x & ~(-1 << nbits)
3159 // c) x & (-1 >> (32 - y))
3160 // d) x << (32 - y) >> (32 - y)
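// Editor's note (illustrative, not from the original source): every pattern
// keeps only the nbits low bits of x; e.g. for nbits = 5, pattern (a) is
// x & 0b11111, which BMI2 computes directly as "bzhil %ecx, %eax, %edx"
// with 5 in %ecx.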
3161 bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3162 assert(
3163 (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
3164 "Should be either an and-mask, or right-shift after clearing high bits.");
3166 // BEXTR is a BMI instruction, BZHI is a BMI2 instruction. We need at least one.
3167 if (!Subtarget->hasBMI() && !Subtarget->hasBMI2())
3168 return false;
3170 MVT NVT = Node->getSimpleValueType(0);
3172 // Only supported for 32 and 64 bits.
3173 if (NVT != MVT::i32 && NVT != MVT::i64)
3174 return false;
3176 SDValue NBits;
3178 // If we have BMI2's BZHI, we are ok with multi-use patterns.
3179 // Else, if we only have BMI1's BEXTR, we require one-use.
3180 const bool CanHaveExtraUses = Subtarget->hasBMI2();
3181 auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) {
3182 return CanHaveExtraUses ||
3183 Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo());
3184 };
3185 auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); };
3186 auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); };
3188 auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3189 if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) {
3190 assert(V.getSimpleValueType() == MVT::i32 &&
3191 V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3192 "Expected i64 -> i32 truncation");
3193 V = V.getOperand(0);
3194 }
3195 return V;
3196 };
3198 // a) x & ((1 << nbits) + (-1))
3199 auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation,
3200 &NBits](SDValue Mask) -> bool {
3201 // Match `add`. Must only have one use!
3202 if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask))
3203 return false;
3204 // We should be adding all-ones constant (i.e. subtracting one.)
3205 if (!isAllOnesConstant(Mask->getOperand(1)))
3206 return false;
3207 // Match `1 << nbits`. Might be truncated. Must only have one use!
3208 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3209 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3210 return false;
3211 if (!isOneConstant(M0->getOperand(0)))
3212 return false;
3213 NBits = M0->getOperand(1);
3214 return true;
3215 };
3217 auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3218 V = peekThroughOneUseTruncation(V);
3219 return CurDAG->MaskedValueIsAllOnes(
3220 V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3221 NVT.getSizeInBits()));
3222 };
3224 // b) x & ~(-1 << nbits)
3225 auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3226 &NBits](SDValue Mask) -> bool {
3227 // Match `~()`. Must only have one use!
3228 if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask))
3229 return false;
3230 // The -1 only has to be all-ones for the final Node's NVT.
3231 if (!isAllOnes(Mask->getOperand(1)))
3232 return false;
3233 // Match `-1 << nbits`. Might be truncated. Must only have one use!
3234 SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3235 if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0))
3236 return false;
3237 // The -1 only has to be all-ones for the final Node's NVT.
3238 if (!isAllOnes(M0->getOperand(0)))
3239 return false;
3240 NBits = M0->getOperand(1);
3241 return true;
3242 };
3244 // Match potentially-truncated (bitwidth - y)
3245 auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt,
3246 unsigned Bitwidth) {
3247 // Skip over a truncate of the shift amount.
3248 if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
3249 ShiftAmt = ShiftAmt.getOperand(0);
3250 // The trunc should have been the only user of the real shift amount.
3251 if (!checkOneUse(ShiftAmt))
3252 return false;
3253 }
3254 // Match the shift amount as: (bitwidth - y). It should go away, too.
3255 if (ShiftAmt.getOpcode() != ISD::SUB)
3256 return false;
3257 auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
3258 if (!V0 || V0->getZExtValue() != Bitwidth)
3259 return false;
3260 NBits = ShiftAmt.getOperand(1);
3261 return true;
3262 };
3264 // c) x & (-1 >> (32 - y))
3265 auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation,
3266 matchShiftAmt](SDValue Mask) -> bool {
3267 // The mask itself may be truncated.
3268 Mask = peekThroughOneUseTruncation(Mask);
3269 unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3270 // Match `l>>`. Must only have one use!
3271 if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask))
3272 return false;
3273 // We should be shifting truly all-ones constant.
3274 if (!isAllOnesConstant(Mask.getOperand(0)))
3275 return false;
3276 SDValue M1 = Mask.getOperand(1);
3277 // The shift amount should not be used externally.
3278 if (!checkOneUse(M1))
3279 return false;
3280 return matchShiftAmt(M1, Bitwidth);
3281 };
3283 SDValue X;
3285 // d) x << (32 - y) >> (32 - y)
3286 auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt,
3287 &X](SDNode *Node) -> bool {
3288 if (Node->getOpcode() != ISD::SRL)
3289 return false;
3290 SDValue N0 = Node->getOperand(0);
3291 if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0))
3292 return false;
3293 unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3294 SDValue N1 = Node->getOperand(1);
3295 SDValue N01 = N0->getOperand(1);
3296 // Both of the shifts must be by the exact same value.
3297 // There should not be any uses of the shift amount outside of the pattern.
3298 if (N1 != N01 || !checkTwoUse(N1))
3299 return false;
3300 if (!matchShiftAmt(N1, Bitwidth))
3301 return false;
3302 X = N0->getOperand(0);
3303 return true;
3304 };
3306 auto matchLowBitMask = [matchPatternA, matchPatternB,
3307 matchPatternC](SDValue Mask) -> bool {
3308 return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask);
3309 };
3311 if (Node->getOpcode() == ISD::AND) {
3312 X = Node->getOperand(0);
3313 SDValue Mask = Node->getOperand(1);
3315 if (matchLowBitMask(Mask)) {
3316 // Great.
3317 } else {
3318 std::swap(X, Mask);
3319 if (!matchLowBitMask(Mask))
3320 return false;
3321 }
3322 } else if (!matchPatternD(Node))
3323 return false;
3325 SDLoc DL(Node);
3327 // Truncate the shift amount.
3328 NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
3329 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3331 // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3332 // All the other bits are undefined, we do not care about them.
3333 SDValue ImplDef = SDValue(
3334 CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
3335 insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
3337 SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
3338 insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
3339 NBits = SDValue(
3340 CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32, ImplDef,
3341 NBits, SRIdxVal), 0);
3342 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3344 if (Subtarget->hasBMI2()) {
3345 // Great, just emit the BZHI.
3346 if (NVT != MVT::i32) {
3347 // But have to place the bit count into the wide-enough register first.
3348 NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
3349 insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3350 }
3352 SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
3353 ReplaceNode(Node, Extract.getNode());
3354 SelectCode(Extract.getNode());
3355 return true;
3356 }
3358 // Else, if we do *NOT* have BMI2, let's find out if 'X' is *logically*
3359 // shifted (potentially with a one-use trunc in between), and the
3360 // truncation was the only use of the shift; if so, look past the
3361 // one-use truncation.
3362 {
3363 SDValue RealX = peekThroughOneUseTruncation(X);
3364 // FIXME: only if the shift is one-use?
3365 if (RealX != X && RealX.getOpcode() == ISD::SRL)
3366 X = RealX;
3367 }
3369 MVT XVT = X.getSimpleValueType();
3371 // Else, emitting BEXTR requires one more step.
3372 // The 'control' of BEXTR has the pattern of:
3373 // [15...8 bit][ 7...0 bit] location
3374 // [ bit count][ shift] name
3375 // I.e. 0b00000010'00000001 means (x >> 0b1) & 0b11
3377 // Shift NBits left by 8 bits, thus producing 'control'.
3378 // This makes the low 8 bits to be zero.
3379 SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
3380 SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
3381 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3383 // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
3384 // FIXME: only if the shift is one-use?
3385 if (X.getOpcode() == ISD::SRL) {
3386 SDValue ShiftAmt = X.getOperand(1);
3387 X = X.getOperand(0);
3389 assert(ShiftAmt.getValueType() == MVT::i8 &&
3390 "Expected shift amount to be i8");
3392 // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
3393 // We could zext to i16 in some form, but we intentionally don't do that.
3394 SDValue OrigShiftAmt = ShiftAmt;
3395 ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
3396 insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
3398 // And now 'or' these low 8 bits of shift amount into the 'control'.
3399 Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
3400 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3401 }
3403 // But have to place the 'control' into the wide-enough register first.
3404 if (XVT != MVT::i32) {
3405 Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
3406 insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3407 }
3409 // And finally, form the BEXTR itself.
3410 SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
3412 // The 'X' was originally truncated. Do that now.
3413 if (XVT != NVT) {
3414 insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
3415 Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
3416 }
3418 ReplaceNode(Node, Extract.getNode());
3419 SelectCode(Extract.getNode());
3420 return true;
3421 }
3424 // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
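// Editor's note (illustrative, not from the original source): e.g.
// (x >> 8) & 0xFFF has Shift = 8 and MaskSize = 12, so the control
// immediate computed below is (8 | (12 << 8)) = 0xC08.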
3425 MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
3426 MVT NVT = Node->getSimpleValueType(0);
3427 SDLoc dl(Node);
3429 SDValue N0 = Node->getOperand(0);
3430 SDValue N1 = Node->getOperand(1);
3432 // If we have TBM we can use an immediate for the control. If we have BMI
3433 // we should only do this if the BEXTR instruction is implemented well.
3434 // Otherwise moving the control into a register makes this more costly.
3435 // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
3436 // hoisting the move immediate would make it worthwhile with a less optimal
3437 // BEXTR?
3438 if (!Subtarget->hasTBM() &&
3439 !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR()))
3440 return nullptr;
3442 // Must have a shift right.
3443 if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
3444 return nullptr;
3446 // Shift can't have additional users.
3447 if (!N0->hasOneUse())
3448 return nullptr;
3450 // Only supported for 32 and 64 bits.
3451 if (NVT != MVT::i32 && NVT != MVT::i64)
3452 return nullptr;
3454 // Shift amount and RHS of and must be constant.
3455 ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
3456 ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
3457 if (!MaskCst || !ShiftCst)
3458 return nullptr;
3460 // And RHS must be a mask.
3461 uint64_t Mask = MaskCst->getZExtValue();
3462 if (!isMask_64(Mask))
3463 return nullptr;
3465 uint64_t Shift = ShiftCst->getZExtValue();
3466 uint64_t MaskSize = countPopulation(Mask);
3468 // Don't interfere with something that can be handled by extracting AH.
3469 // TODO: If we are able to fold a load, BEXTR might still be better than AH.
3470 if (Shift == 8 && MaskSize == 8)
3471 return nullptr;
3473 // Make sure we are only using bits that were in the original value, not
3474 // shifted in.
3475 if (Shift + MaskSize > NVT.getSizeInBits())
3476 return nullptr;
3478 SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
3479 unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
3480 unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
3482 // BMI requires the immediate to be placed in a register.
3483 if (!Subtarget->hasTBM()) {
3484 ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
3485 MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
3486 unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
3487 New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0);
3488 }
3490 MachineSDNode *NewNode;
3491 SDValue Input = N0->getOperand(0);
3492 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3493 if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3494 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
3495 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
3496 NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3497 // Update the chain.
3498 ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
3499 // Record the mem-refs
3500 CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
3501 } else {
3502 NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, New);
3503 }
3505 return NewNode;
3506 }
3508 // Emit a PCMPISTR(I/M) instruction.
3509 MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
3510 bool MayFoldLoad, const SDLoc &dl,
3511 MVT VT, SDNode *Node) {
3512 SDValue N0 = Node->getOperand(0);
3513 SDValue N1 = Node->getOperand(1);
3514 SDValue Imm = Node->getOperand(2);
3515 const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3516 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3518 // Try to fold a load. No need to check alignment.
3519 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3520 if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3521 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3522 N1.getOperand(0) };
3523 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
3524 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3525 // Update the chain.
3526 ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
3527 // Record the mem-refs
3528 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
3529 return CNode;
3530 }
3532 SDValue Ops[] = { N0, N1, Imm };
3533 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
3534 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3535 return CNode;
3536 }
3538 // Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
3539 // to emit a second instruction after this one. This is needed since we have two
3540 // copyToReg nodes glued before this and we need to continue that glue through.
3541 MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
3542 bool MayFoldLoad, const SDLoc &dl,
3543 MVT VT, SDNode *Node,
3544 SDValue &InFlag) {
3545 SDValue N0 = Node->getOperand(0);
3546 SDValue N2 = Node->getOperand(2);
3547 SDValue Imm = Node->getOperand(4);
3548 const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3549 Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3551 // Try to fold a load. No need to check alignment.
3552 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3553 if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3554 SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3555 N2.getOperand(0), InFlag };
3556 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
3557 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3558 InFlag = SDValue(CNode, 3);
3559 // Update the chain.
3560 ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
3561 // Record the mem-refs
3562 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
3566 SDValue Ops[] = { N0, N2, Imm, InFlag };
3567 SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
3568 MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3569 InFlag = SDValue(CNode, 2);
3573 bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
3574 EVT VT = N->getValueType(0);
3576 // Only handle scalar shifts.
3580 // Narrower shifts only mask to 5 bits in hardware.
3581 unsigned Size = VT == MVT::i64 ? 64 : 32;
3583 SDValue OrigShiftAmt = N->getOperand(1);
3584 SDValue ShiftAmt = OrigShiftAmt;
3587 // Skip over a truncate of the shift amount.
3588 if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
3589 ShiftAmt = ShiftAmt->getOperand(0);
3591 // This function is called after X86DAGToDAGISel::matchBitExtract(),
// so there is no risk of breaking a BZHI/BEXTR pattern here.
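// Illustrative example of the rewrite below: x86 shifts mask the count in
// hardware (to 6 bits for 64-bit ops, 5 bits for narrower ones), so for i64
//   (shl x, (add y, 64)) selects identically to (shl x, y)
// and the ADD can simply be dropped.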
3594 SDValue NewShiftAmt;
3595 if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) {
3596 SDValue Add0 = ShiftAmt->getOperand(0);
3597 SDValue Add1 = ShiftAmt->getOperand(1);
3598 // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
3599 // to avoid the ADD/SUB.
3600 if (isa<ConstantSDNode>(Add1) &&
3601 cast<ConstantSDNode>(Add1)->getZExtValue() % Size == 0) {
3603 // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
3604 // generate a NEG instead of a SUB of a constant.
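// For example, with a 32-bit shift (Size == 32):
//   (srl x, (sub 32, y)) --> (srl x, (sub 0, y))
// since only the low 5 bits of the amount are read and 32 - y == -y (mod 32);
// the NEG encodes smaller than a SUB from an immediate.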
3605 } else if (ShiftAmt->getOpcode() == ISD::SUB &&
3606 isa<ConstantSDNode>(Add0) &&
3607 cast<ConstantSDNode>(Add0)->getZExtValue() != 0 &&
3608 cast<ConstantSDNode>(Add0)->getZExtValue() % Size == 0) {
3609 // Insert a negate op.
// TODO: This isn't guaranteed to replace the sub if the sub has another
// user that is not a shift.
3612 EVT SubVT = ShiftAmt.getValueType();
3613 SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
3614 SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1);
3617 // Insert these operands into a valid topological order so they can
3618 // get selected independently.
3619 insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
3620 insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
3626 if (NewShiftAmt.getValueType() != MVT::i8) {
3627 // Need to truncate the shift amount.
3628 NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
3629 // Add to a correct topological ordering.
3630 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
3633 // Insert a new mask to keep the shift amount legal. This should be removed
3634 // by isel patterns.
3635 NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
3636 CurDAG->getConstant(Size - 1, DL, MVT::i8));
3637 // Place in a correct topological ordering.
3638 insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0), NewShiftAmt);
3642 if (UpdatedNode != N) {
3643 // If we found an existing node, we should replace ourselves with that node
3644 // and wait for it to be selected after its other users.
3645 ReplaceNode(N, UpdatedNode);
// If the original shift amount is now dead, delete it so that we don't run
// it through isel.
3651 if (OrigShiftAmt.getNode()->use_empty())
3652 CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
3654 // Now that we've optimized the shift amount, defer to normal isel to get
3655 // load folding and legacy vs BMI2 selection without repeating it here.
3660 bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
3661 MVT NVT = N->getSimpleValueType(0);
3662 unsigned Opcode = N->getOpcode();
3665 // For operations of the form (x << C1) op C2, check if we can use a smaller
3666 // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
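// For example, (or (shl x, 8), 0x4100) becomes (shl (or x, 0x41), 8): the OR
// immediate now fits in a sign-extended 8-bit field instead of needing a full
// 32-bit immediate, so the instruction encodes smaller.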
3667 SDValue Shift = N->getOperand(0);
3668 SDValue N1 = N->getOperand(1);
3670 ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
3674 int64_t Val = Cst->getSExtValue();
3676 // If we have an any_extend feeding the AND, look through it to see if there
3677 // is a shift behind it. But only if the AND doesn't use the extended bits.
// FIXME: Generalize this to ANY_EXTENDs other than i32 to i64?
3679 bool FoundAnyExtend = false;
3680 if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
3681 Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
3683 FoundAnyExtend = true;
3684 Shift = Shift.getOperand(0);
3687 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
3690 // i8 is unshrinkable, i16 should be promoted to i32.
3691 if (NVT != MVT::i32 && NVT != MVT::i64)
3694 ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
3698 uint64_t ShAmt = ShlCst->getZExtValue();
3700 // Make sure that we don't change the operation by removing bits.
// This only matters for OR and XOR; AND is unaffected.
3702 uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
3703 if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
3706 // Check the minimum bitwidth for the new constant.
3707 // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
3708 auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
3709 if (Opcode == ISD::AND) {
3710 // AND32ri is the same as AND64ri32 with zext imm.
3711 // Try this before sign extended immediates below.
3712 ShiftedVal = (uint64_t)Val >> ShAmt;
3713 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
3715 // Also swap order when the AND can become MOVZX.
3716 if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
3719 ShiftedVal = Val >> ShAmt;
3720 if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
3721 (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
3723 if (Opcode != ISD::AND) {
3724 // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
3725 ShiftedVal = (uint64_t)Val >> ShAmt;
3726 if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
3733 if (!CanShrinkImmediate(ShiftedVal))
3736 // Ok, we can reorder to get a smaller immediate.
// But it's possible the original immediate allowed an AND to become MOVZX.
// Do this check late so the MaskedValueIsZero call happens as late as possible.
3741 if (Opcode == ISD::AND) {
3742 // Find the smallest zext this could possibly be.
3743 unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
3744 ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));
3746 // Figure out which bits need to be zero to achieve that mask.
APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(), ZExtWidth);
3749 NeededMask &= ~Cst->getAPIntValue();
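// For example, (and x, 0x7fff) where bit 15 of x is known zero behaves like
// (and x, 0xffff), which isel can implement as a single MOVZX; in that case
// prefer to keep the AND rather than reorder it with the shift.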
3751 if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
3755 SDValue X = Shift.getOperand(0);
3756 if (FoundAnyExtend) {
3757 SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
3758 insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
3762 SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
3763 insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
3764 SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
3765 insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
3766 SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
3767 Shift.getOperand(1));
3768 ReplaceNode(N, NewSHL.getNode());
3769 SelectCode(NewSHL.getNode());
3773 /// If the high bits of an 'and' operand are known zero, try setting the
3774 /// high bits of an 'and' constant operand to produce a smaller encoding by
3775 /// creating a small, sign-extended negative immediate rather than a large
3776 /// positive one. This reverses a transform in SimplifyDemandedBits that
3777 /// shrinks mask constants by clearing bits. There is also a possibility that
3778 /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
3779 /// case, just replace the 'and'. Return 'true' if the node is replaced.
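/// For example, if the operand is known zero in its top 24 bits, then
/// (and x, 0x000000f0) can become (and x, 0xfffffff0): the new mask is -16,
/// which fits a sign-extended imm8 (e.g. 'and eax, -16' is 3 bytes), while
/// the original 0xf0 requires a full 32-bit immediate.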
3780 bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
3781 // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
3782 // have immediate operands.
3783 MVT VT = And->getSimpleValueType(0);
3784 if (VT != MVT::i32 && VT != MVT::i64)
3787 auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
// Bail out if the mask constant is already negative. It can't shrink any more.
// If the upper 32 bits of a 64-bit mask are all zeros, we have special isel
// patterns to use a 32-bit 'and' instead of a 64-bit 'and' by relying on the
// implicit zeroing of 32-bit ops. So we should check whether the lower 32 bits
// are negative too.
3796 APInt MaskVal = And1C->getAPIntValue();
3797 unsigned MaskLZ = MaskVal.countLeadingZeros();
3798 if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
3801 // Don't extend into the upper 32 bits of a 64 bit mask.
3802 if (VT == MVT::i64 && MaskLZ >= 32) {
3804 MaskVal = MaskVal.trunc(32);
3807 SDValue And0 = And->getOperand(0);
3808 APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
3809 APInt NegMaskVal = MaskVal | HighZeros;
3811 // If a negative constant would not allow a smaller encoding, there's no need
3812 // to continue. Only change the constant when we know it's a win.
3813 unsigned MinWidth = NegMaskVal.getMinSignedBits();
3814 if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
3817 // Extend masks if we truncated above.
3818 if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
3819 NegMaskVal = NegMaskVal.zext(64);
3820 HighZeros = HighZeros.zext(64);
3823 // The variable operand must be all zeros in the top bits to allow using the
3824 // new, negative constant as the mask.
3825 if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
3828 // Check if the mask is -1. In that case, this is an unnecessary instruction
3829 // that escaped earlier analysis.
3830 if (NegMaskVal.isAllOnesValue()) {
3831 ReplaceNode(And, And0.getNode());
3835 // A negative mask allows a smaller encoding. Create a new 'and' node.
3836 SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
3837 SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
3838 ReplaceNode(And, NewAnd.getNode());
3839 SelectCode(NewAnd.getNode());
3843 static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
3844 bool FoldedBCast, bool Masked) {
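// Select the VPTESTM/VPTESTNM opcode whose suffix matches the operand form:
// rr for register-register, rm for a folded load, rmb for a folded broadcast
// load, with a trailing 'k' for the variants that AND the result with an
// input mask register.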
3847 switch (TestVT.SimpleTy) {
3848 default: llvm_unreachable("Unexpected VT!");
3850 return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk;
3852 return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk;
3854 return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk;
3856 return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk;
3858 return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk;
3860 return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk;
3862 return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk;
3864 return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk;
3866 return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk;
3868 return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk;
3870 return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk;
3872 return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk;
3877 switch (TestVT.SimpleTy) {
3878 default: llvm_unreachable("Unexpected VT!");
3880 return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk;
3882 return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk;
3884 return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk;
3886 return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk;
3888 return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk;
3890 return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk;
3894 switch (TestVT.SimpleTy) {
3895 default: llvm_unreachable("Unexpected VT!");
3897 return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk;
3899 return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk;
3901 return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk;
3903 return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk;
3905 return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk;
3907 return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk;
3909 return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk;
3911 return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk;
3913 return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk;
3915 return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk;
3917 return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk;
3919 return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk;
3924 switch (TestVT.SimpleTy) {
3925 default: llvm_unreachable("Unexpected VT!");
3927 return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm;
3929 return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm;
3931 return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm;
3933 return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm;
3935 return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm;
3937 return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm;
3939 return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm;
3941 return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm;
3943 return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm;
3945 return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm;
3947 return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm;
3949 return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm;
3954 switch (TestVT.SimpleTy) {
3955 default: llvm_unreachable("Unexpected VT!");
3957 return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb;
3959 return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb;
3961 return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb;
3963 return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb;
3965 return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb;
3967 return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb;
3971 switch (TestVT.SimpleTy) {
3972 default: llvm_unreachable("Unexpected VT!");
3974 return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr;
3976 return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr;
3978 return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr;
3980 return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr;
3982 return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr;
3984 return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr;
3986 return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr;
3988 return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr;
3990 return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr;
3992 return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr;
3994 return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr;
3996 return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr;
// Try to create a VPTESTM instruction. If InMask is not null, it will be used
4001 // to form a masked operation.
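// The shape matched here is, e.g.,
//   (setcc (and x, y), 0, ne) --> VPTESTM x, y    (k-register result)
//   (setcc (and x, y), 0, eq) --> VPTESTNM x, y
// i.e. a vector test-under-mask producing a vXi1 value in a mask register.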
4002 bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
4004 assert(Subtarget->hasAVX512() && "Expected AVX512!");
4005 assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4008 // Look for equal and not equal compares.
4009 ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4010 if (CC != ISD::SETEQ && CC != ISD::SETNE)
4013 // See if we're comparing against zero. This should have been canonicalized
4014 // to RHS during lowering.
4015 if (!ISD::isBuildVectorAllZeros(Setcc.getOperand(1).getNode()))
4018 SDValue N0 = Setcc.getOperand(0);
4020 MVT CmpVT = N0.getSimpleValueType();
4021 MVT CmpSVT = CmpVT.getVectorElementType();
4023 // Start with both operands the same. We'll try to refine this.
4028 // Look through single use bitcasts.
4029 SDValue N0Temp = N0;
4030 if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
4031 N0Temp = N0.getOperand(0);
4033 // Look for single use AND.
4034 if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
4035 Src0 = N0Temp.getOperand(0);
4036 Src1 = N0Temp.getOperand(1);
4040 // Without VLX we need to widen the load.
4041 bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
4043 // We can only fold loads if the sources are unique.
4044 bool CanFoldLoads = Src0 != Src1;
4046 // Try to fold loads unless we need to widen.
4047 bool FoldedLoad = false;
4048 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
4049 if (!Widen && CanFoldLoads) {
4051 FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
// AND is commutative.
4056 FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
4059 std::swap(Src0, Src1);
4063 auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
4064 // Look through single use bitcasts.
4065 if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse())
4066 Src = Src.getOperand(0);
4068 if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) {
4069 Parent = Src.getNode();
4070 Src = Src.getOperand(0);
4071 if (Src.getSimpleValueType() == CmpSVT)
// If we didn't fold a load, try to match a broadcast. There is no widening
// limitation for this, but only 32- and 64-bit element types are supported.
4080 bool FoldedBCast = false;
4081 if (!FoldedLoad && CanFoldLoads &&
4082 (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
4083 SDNode *ParentNode = nullptr;
4084 if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
4085 FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
4086 Tmp1, Tmp2, Tmp3, Tmp4);
4089 // Try the other operand.
4091 if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
4092 FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
4093 Tmp1, Tmp2, Tmp3, Tmp4);
4095 std::swap(Src0, Src1);
4100 auto getMaskRC = [](MVT MaskVT) {
4101 switch (MaskVT.SimpleTy) {
4102 default: llvm_unreachable("Unexpected VT!");
4103 case MVT::v2i1: return X86::VK2RegClassID;
4104 case MVT::v4i1: return X86::VK4RegClassID;
4105 case MVT::v8i1: return X86::VK8RegClassID;
4106 case MVT::v16i1: return X86::VK16RegClassID;
4107 case MVT::v32i1: return X86::VK32RegClassID;
4108 case MVT::v64i1: return X86::VK64RegClassID;
4112 bool IsMasked = InMask.getNode() != nullptr;
4116 MVT ResVT = Setcc.getSimpleValueType();
4119 // Widen the inputs using insert_subreg or copy_to_regclass.
4120 unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
4121 unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
4122 unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
4123 CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
4124 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
4125 SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
4127 Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
4129 assert(!FoldedLoad && "Shouldn't have folded the load");
4131 Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
4135 unsigned RegClass = getMaskRC(MaskVT);
4136 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4137 InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4138 dl, MaskVT, InMask, RC), 0);
4142 bool IsTestN = CC == ISD::SETEQ;
4143 unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
4146 MachineSDNode *CNode;
4147 if (FoldedLoad || FoldedBCast) {
4148 SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
4151 SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4152 Load.getOperand(0) };
4153 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4155 SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4156 Load.getOperand(0) };
4157 CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4160 // Update the chain.
4161 ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
4162 // Record the mem-refs
4163 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
4166 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
4168 CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
4171 // If we widened, we need to shrink the mask VT.
4173 unsigned RegClass = getMaskRC(ResVT);
4174 SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4175 CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4176 dl, ResVT, SDValue(CNode, 0), RC);
4179 ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
4180 CurDAG->RemoveDeadNode(Root);
4184 void X86DAGToDAGISel::Select(SDNode *Node) {
4185 MVT NVT = Node->getSimpleValueType(0);
4186 unsigned Opcode = Node->getOpcode();
4189 if (Node->isMachineOpcode()) {
4190 LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
4191 Node->setNodeId(-1);
4192 return; // Already selected.
4197 case ISD::INTRINSIC_VOID: {
4198 unsigned IntNo = Node->getConstantOperandVal(1);
4201 case Intrinsic::x86_sse3_monitor:
4202 case Intrinsic::x86_monitorx:
4203 case Intrinsic::x86_clzero: {
4204 bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
4208 case Intrinsic::x86_sse3_monitor:
4209 if (!Subtarget->hasSSE3())
4211 Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
4213 case Intrinsic::x86_monitorx:
4214 if (!Subtarget->hasMWAITX())
4216 Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
4218 case Intrinsic::x86_clzero:
4219 if (!Subtarget->hasCLZERO())
4221 Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
4226 unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
4227 SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
4228 Node->getOperand(2), SDValue());
4229 SDValue InFlag = Chain.getValue(1);
4231 if (IntNo == Intrinsic::x86_sse3_monitor ||
4232 IntNo == Intrinsic::x86_monitorx) {
4233 // Copy the other two operands to ECX and EDX.
4234 Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
4236 InFlag = Chain.getValue(1);
4237 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
4239 InFlag = Chain.getValue(1);
4242 MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
4244 ReplaceNode(Node, CNode);
4253 if (Subtarget->isTargetNaCl())
// NaCl has its own pass where jmp %r32 instructions are converted to
// jmp %r64; we leave the instruction alone.
4257 if (Subtarget->isTarget64BitILP32()) {
4258 // Converts a 32-bit register to a 64-bit, zero-extended version of
4259 // it. This is needed because x86-64 can do many things, but jmp %r32
4260 // ain't one of them.
4261 const SDValue &Target = Node->getOperand(1);
4262 assert(Target.getSimpleValueType() == llvm::MVT::i32);
4263 SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
4264 SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
4265 Node->getOperand(0), ZextTarget);
4266 ReplaceNode(Node, Brind.getNode());
4267 SelectCode(ZextTarget.getNode());
4268 SelectCode(Brind.getNode());
4273 case X86ISD::GlobalBaseReg:
4274 ReplaceNode(Node, getGlobalBaseReg());
4278 // Just drop all 128/256/512-bit bitcasts.
4279 if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
4281 ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
4282 CurDAG->RemoveDeadNode(Node);
4287 case ISD::VSELECT: {
// Replace VSELECT with non-mask conditions with BLENDV.
4289 if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
4292 assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
4293 SDValue Blendv = CurDAG->getNode(
4294 X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
4295 Node->getOperand(1), Node->getOperand(2));
4296 ReplaceNode(Node, Blendv.getNode());
4297 SelectCode(Blendv.getNode());
4298 // We already called ReplaceUses.
4303 if (matchBitExtract(Node))
4308 if (tryShiftAmountMod(Node))
4313 if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
4314 // Try to form a masked VPTESTM. Operands can be in either order.
4315 SDValue N0 = Node->getOperand(0);
4316 SDValue N1 = Node->getOperand(1);
4317 if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
4318 tryVPTESTM(Node, N0, N1))
4320 if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
4321 tryVPTESTM(Node, N1, N0))
4325 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
4326 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
4327 CurDAG->RemoveDeadNode(Node);
4330 if (matchBitExtract(Node))
4332 if (AndImmShrink && shrinkAndImmediate(Node))
4338 if (tryShrinkShlLogicImm(Node))
4344 // Try to avoid folding immediates with multiple uses for optsize.
// This code tries to select the register form directly, to avoid going
// through the isel table, which might fold the immediate. We can't change
// the add/sub/and/or/xor with-immediate patterns in the tablegen files to
// check the immediate use count without making those patterns unavailable
// to the fast-isel table.
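// Illustrative sketch: if the same 32-bit immediate feeds two adds,
//   add eax, 0x12345678
//   add ebx, 0x12345678
// then at minsize it can be smaller to materialize it once and use the
// register forms:
//   mov ecx, 0x12345678
//   add eax, ecx
//   add ebx, ecx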
4353 // Only handle i8/i16/i32/i64.
4354 if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
4357 SDValue N0 = Node->getOperand(0);
4358 SDValue N1 = Node->getOperand(1);
4360 ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
4364 int64_t Val = Cst->getSExtValue();
// Make sure it's an immediate that is considered foldable.
4367 // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
4368 if (!isInt<8>(Val) && !isInt<32>(Val))
4371 // Check if we should avoid folding this immediate.
4372 if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
4375 // We should not fold the immediate. So we need a register form instead.
4376 unsigned ROpc, MOpc;
4377 switch (NVT.SimpleTy) {
4378 default: llvm_unreachable("Unexpected VT!");
4381 default: llvm_unreachable("Unexpected opcode!");
4382 case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break;
4383 case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break;
4384 case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break;
4385 case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break;
4386 case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break;
4391 default: llvm_unreachable("Unexpected opcode!");
4392 case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break;
4393 case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break;
4394 case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break;
4395 case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break;
4396 case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break;
4401 default: llvm_unreachable("Unexpected opcode!");
4402 case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break;
4403 case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break;
4404 case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break;
4405 case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break;
4406 case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break;
4411 default: llvm_unreachable("Unexpected opcode!");
4412 case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break;
4413 case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break;
4414 case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break;
4415 case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break;
4416 case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break;
// OK, this is an AND/OR/XOR/ADD/SUB with a constant.
// If this is not a subtract, we can still try to fold a load.
4424 if (Opcode != ISD::SUB) {
4425 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4426 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4427 SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
4428 SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4429 MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4430 // Update the chain.
4431 ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
4432 // Record the mem-refs
4433 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
4434 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4435 CurDAG->RemoveDeadNode(Node);
4440 CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
4445 // i16/i32/i64 are handled with isel patterns.
4449 case X86ISD::UMUL: {
4450 SDValue N0 = Node->getOperand(0);
4451 SDValue N1 = Node->getOperand(1);
4453 unsigned LoReg, ROpc, MOpc;
4454 switch (NVT.SimpleTy) {
4455 default: llvm_unreachable("Unsupported VT!");
4458 ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
4459 MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
4478 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4479 bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
// Multiply is commutative.
4482 FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4487 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
4488 N0, SDValue()).getValue(1);
4490 MachineSDNode *CNode;
4492 // i16/i32/i64 use an instruction that produces a low and high result even
4493 // though only the low result is used.
4496 VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4498 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
4500 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4502 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4504 // Update the chain.
4505 ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
4506 // Record the mem-refs
4507 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4509 // i16/i32/i64 use an instruction that produces a low and high result even
4510 // though only the low result is used.
4513 VTs = CurDAG->getVTList(NVT, MVT::i32);
4515 VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
4517 CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
4520 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4521 ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
4522 CurDAG->RemoveDeadNode(Node);
4526 case ISD::SMUL_LOHI:
4527 case ISD::UMUL_LOHI: {
4528 SDValue N0 = Node->getOperand(0);
4529 SDValue N1 = Node->getOperand(1);
4532 bool isSigned = Opcode == ISD::SMUL_LOHI;
4534 switch (NVT.SimpleTy) {
4535 default: llvm_unreachable("Unsupported VT!");
4536 case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
4537 case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
4540 switch (NVT.SimpleTy) {
4541 default: llvm_unreachable("Unsupported VT!");
4542 case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
4543 case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
4547 unsigned SrcReg, LoReg, HiReg;
4549 default: llvm_unreachable("Unknown MUL opcode!");
4552 SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
4556 SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
4560 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4561 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
// Multiply is commutative.
4564 foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4569 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
4570 N0, SDValue()).getValue(1);
4573 MachineSDNode *CNode = nullptr;
4574 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4576 SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
4577 CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4578 Chain = SDValue(CNode, 0);
4579 InFlag = SDValue(CNode, 1);
4581 // Update the chain.
4582 ReplaceUses(N1.getValue(1), Chain);
4583 // Record the mem-refs
4584 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4586 SDValue Ops[] = { N1, InFlag };
4587 SDVTList VTs = CurDAG->getVTList(MVT::Glue);
4588 SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4589 InFlag = SDValue(CNode, 0);
4592 // Copy the low half of the result, if it is needed.
4593 if (!SDValue(Node, 0).use_empty()) {
4594 assert(LoReg && "Register for low half is not defined!");
4595 SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
4597 InFlag = ResLo.getValue(2);
4598 ReplaceUses(SDValue(Node, 0), ResLo);
4599 LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
4602 // Copy the high half of the result, if it is needed.
4603 if (!SDValue(Node, 1).use_empty()) {
4604 assert(HiReg && "Register for high half is not defined!");
4605 SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
4607 InFlag = ResHi.getValue(2);
4608 ReplaceUses(SDValue(Node, 1), ResHi);
4609 LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
4613 CurDAG->RemoveDeadNode(Node);
4618 case ISD::UDIVREM: {
4619 SDValue N0 = Node->getOperand(0);
4620 SDValue N1 = Node->getOperand(1);
4623 bool isSigned = Opcode == ISD::SDIVREM;
4625 switch (NVT.SimpleTy) {
4626 default: llvm_unreachable("Unsupported VT!");
4627 case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
4628 case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
4629 case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
4630 case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
4633 switch (NVT.SimpleTy) {
4634 default: llvm_unreachable("Unsupported VT!");
4635 case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
4636 case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
4637 case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
4638 case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
4642 unsigned LoReg, HiReg, ClrReg;
4643 unsigned SExtOpcode;
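// x86 division uses fixed registers: the dividend lives in {Hi,Lo} (AH:AL,
// DX:AX, EDX:EAX or RDX:RAX), the quotient comes back in Lo and the remainder
// in Hi. Signed division fills the high half with the CBW/CWD/CDQ/CQO
// sign-extension chosen below; unsigned division zeroes it via ClrReg.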
4644 switch (NVT.SimpleTy) {
4645 default: llvm_unreachable("Unsupported VT!");
4647 LoReg = X86::AL; ClrReg = HiReg = X86::AH;
4648 SExtOpcode = X86::CBW;
4651 LoReg = X86::AX; HiReg = X86::DX;
4653 SExtOpcode = X86::CWD;
4656 LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
4657 SExtOpcode = X86::CDQ;
4660 LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
4661 SExtOpcode = X86::CQO;
4665 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4666 bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4667 bool signBitIsZero = CurDAG->SignBitIsZero(N0);
4670 if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
4671 // Special case for div8, just use a move with zero extension to AX to
4672 // clear the upper 8 bits (AH).
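// For example (registers illustrative), an unsigned i8 divide becomes:
//   movzbl %cl, %eax   // zero-extend clears AH along with the rest of EAX
//   divb   %bl         // AL = AX / BL, AH = AX % BL
// so no separate instruction is needed to clear AH.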
4673 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
4674 MachineSDNode *Move;
4675 if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4676 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
4677 Move = CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
4679 Chain = SDValue(Move, 1);
4680 ReplaceUses(N0.getValue(1), Chain);
4681 // Record the mem-refs
4682 CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
4684 Move = CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0);
4685 Chain = CurDAG->getEntryNode();
4687 Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, SDValue(Move, 0),
4689 InFlag = Chain.getValue(1);
4692 CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
4693 LoReg, N0, SDValue()).getValue(1);
4694 if (isSigned && !signBitIsZero) {
4695 // Sign extend the low part into the high part.
InFlag = SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag), 0);
4699 // Zero out the high part, effectively zero extending the input.
4700 SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
4701 switch (NVT.SimpleTy) {
4704 SDValue(CurDAG->getMachineNode(
4705 TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
4706 CurDAG->getTargetConstant(X86::sub_16bit, dl,
4714 SDValue(CurDAG->getMachineNode(
4715 TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
4716 CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
4717 CurDAG->getTargetConstant(X86::sub_32bit, dl,
4722 llvm_unreachable("Unexpected division source");
4725 InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
4726 ClrNode, InFlag).getValue(1);
4731 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4733 MachineSDNode *CNode =
4734 CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
4735 InFlag = SDValue(CNode, 1);
4736 // Update the chain.
4737 ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
4738 // Record the mem-refs
4739 CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4742 SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
4745 // Prevent use of AH in a REX instruction by explicitly copying it to
4746 // an ABCD_L register.
4748 // The current assumption of the register allocator is that isel
4749 // won't generate explicit references to the GR8_ABCD_H registers. If
4750 // the allocator and/or the backend get enhanced to be more robust in
4751 // that regard, this can be, and should be, removed.
4752 if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
4753 SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
4754 unsigned AHExtOpcode =
4755 isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
4757 SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
4758 MVT::Glue, AHCopy, InFlag);
4759 SDValue Result(RNode, 0);
4760 InFlag = SDValue(RNode, 1);
4763 CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
4765 ReplaceUses(SDValue(Node, 1), Result);
4766 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4769 // Copy the division (low) result, if it is needed.
4770 if (!SDValue(Node, 0).use_empty()) {
4771 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
4772 LoReg, NVT, InFlag);
4773 InFlag = Result.getValue(2);
4774 ReplaceUses(SDValue(Node, 0), Result);
4775 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4778 // Copy the remainder (high) result, if it is needed.
4779 if (!SDValue(Node, 1).use_empty()) {
4780 SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
4781 HiReg, NVT, InFlag);
4782 InFlag = Result.getValue(2);
4783 ReplaceUses(SDValue(Node, 1), Result);
4784 LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4787 CurDAG->RemoveDeadNode(Node);
4792 SDValue N0 = Node->getOperand(0);
4793 SDValue N1 = Node->getOperand(1);
4795 // Optimizations for TEST compares.
4796 if (!isNullConstant(N1))
4799 // Save the original VT of the compare.
4800 MVT CmpVT = N0.getSimpleValueType();
// If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
4803 // by a test instruction. The test should be removed later by
4804 // analyzeCompare if we are using only the zero flag.
4805 // TODO: Should we check the users and use the BEXTR flags directly?
4806 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
4807 if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
4808 unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
4810 SDValue BEXTR = SDValue(NewNode, 0);
4811 NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
4812 ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
4813 CurDAG->RemoveDeadNode(Node);
4818 // We can peek through truncates, but we need to be careful below.
4819 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
4820 N0 = N0.getOperand(0);
4822 // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
4823 // use a smaller encoding.
4824 // Look past the truncate if CMP is the only use of it.
4825 if (N0.getOpcode() == ISD::AND &&
4826 N0.getNode()->hasOneUse() &&
4827 N0.getValueType() != MVT::i8) {
4828 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
4830 uint64_t Mask = C->getZExtValue();
// Check if we can replace AND+IMM64 with a shift. This is possible for
// masks like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
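// For example, Mask == 0xFFFFFFFF00000000 satisfies isMask_64(~Mask) and has
// 32 trailing zeros, so (and x, Mask) == 0 becomes SHR64ri of x by 32 followed
// by TEST64rr on the shifted value, avoiding a MOVABS of the imm64.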
4835 if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
4836 onlyUsesZeroFlag(SDValue(Node, 0))) {
4837 if (isMask_64(~Mask)) {
4838 unsigned TrailingZeros = countTrailingZeros(Mask);
4839 SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
SDValue Shift = SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
4842 N0.getOperand(0), Imm), 0);
4843 MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
4844 MVT::i32, Shift, Shift);
4845 ReplaceNode(Node, Test);
4848 if (isMask_64(Mask)) {
4849 unsigned LeadingZeros = countLeadingZeros(Mask);
4850 SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
SDValue Shift = SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
4853 N0.getOperand(0), Imm), 0);
4854 MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
4855 MVT::i32, Shift, Shift);
4856 ReplaceNode(Node, Test);
4863 unsigned ROpc, MOpc;
4865 // For each of these checks we need to be careful if the sign flag is
// being used. It is only safe to use the sign flag in two cases: either
// the sign bit of the shrunken mask is zero, or the final test
4868 // size is equal to the original compare size.
4870 if (isUInt<8>(Mask) &&
4871 (!(Mask & 0x80) || CmpVT == MVT::i8 ||
4872 hasNoSignFlagUses(SDValue(Node, 0)))) {
4873 // For example, convert "testl %eax, $8" to "testb %al, $8"
4875 SubRegOp = X86::sub_8bit;
4876 ROpc = X86::TEST8ri;
4877 MOpc = X86::TEST8mi;
4878 } else if (OptForMinSize && isUInt<16>(Mask) &&
4879 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
4880 hasNoSignFlagUses(SDValue(Node, 0)))) {
4881 // For example, "testl %eax, $32776" to "testw %ax, $32776".
4882 // NOTE: We only want to form TESTW instructions if optimizing for
// min size. Otherwise we only save one byte and possibly get a
// length-changing prefix penalty in the decoders.
4886 SubRegOp = X86::sub_16bit;
4887 ROpc = X86::TEST16ri;
4888 MOpc = X86::TEST16mi;
4889 } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
4890 ((!(Mask & 0x80000000) &&
4891 // Without minsize 16-bit Cmps can get here so we need to
4892 // be sure we calculate the correct sign flag if needed.
4893 (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
4894 CmpVT == MVT::i32 ||
4895 hasNoSignFlagUses(SDValue(Node, 0)))) {
4896 // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
4897 // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
// Otherwise, we find ourselves in a position where we have to do
4899 // promotion. If previous passes did not promote the and, we assume
4900 // they had a good reason not to and do not promote here.
4902 SubRegOp = X86::sub_32bit;
4903 ROpc = X86::TEST32ri;
4904 MOpc = X86::TEST32mi;
4906 // No eligible transformation was found.
4910 SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
4911 SDValue Reg = N0.getOperand(0);
// Emit the narrowed TEST (testb, testw, or testl).
4914 MachineSDNode *NewNode;
4915 SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4916 if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4917 SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4918 Reg.getOperand(0) };
4919 NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
4920 // Update the chain.
4921 ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
4922 // Record the mem-refs
4923 CurDAG->setNodeMemRefs(NewNode,
4924 {cast<LoadSDNode>(Reg)->getMemOperand()});
4926 // Extract the subregister if necessary.
4927 if (N0.getValueType() != VT)
4928 Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
4930 NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
4932 // Replace CMP with TEST.
4933 ReplaceNode(Node, NewNode);
4938 case X86ISD::PCMPISTR: {
4939 if (!Subtarget->hasSSE42())
4942 bool NeedIndex = !SDValue(Node, 0).use_empty();
4943 bool NeedMask = !SDValue(Node, 1).use_empty();
4944 // We can't fold a load if we are going to make two instructions.
4945 bool MayFoldLoad = !NeedIndex || !NeedMask;
4947 MachineSDNode *CNode;
4949 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
4950 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
4951 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
4952 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
4954 if (NeedIndex || !NeedMask) {
4955 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
4956 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
4957 CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
4958 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4961 // Connect the flag usage to the last instruction created.
4962 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
4963 CurDAG->RemoveDeadNode(Node);
4966 case X86ISD::PCMPESTR: {
4967 if (!Subtarget->hasSSE42())
4970 // Copy the two implicit register inputs.
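// PCMPESTRI/PCMPESTRM take the two explicit string lengths implicitly in
// EAX and EDX, so those operands must be copied into the physical registers
// and glued to the compare.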
4971 SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
4972 Node->getOperand(1),
4973 SDValue()).getValue(1);
4974 InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
4975 Node->getOperand(3), InFlag).getValue(1);
4977 bool NeedIndex = !SDValue(Node, 0).use_empty();
4978 bool NeedMask = !SDValue(Node, 1).use_empty();
4979 // We can't fold a load if we are going to make two instructions.
4980 bool MayFoldLoad = !NeedIndex || !NeedMask;
4982 MachineSDNode *CNode;
4984 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
4985 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
4986 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
4988 ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
4990 if (NeedIndex || !NeedMask) {
4991 unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
4992 unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
4993 CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
4994 ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4996 // Connect the flag usage to the last instruction created.
4997 ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
4998 CurDAG->RemoveDeadNode(Node);
5003 if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
5010 if (foldLoadStoreIntoMemOperand(Node))
5016 case ISD::FNEARBYINT:
// Replace fp rounding ops with their X86-specific equivalents so we don't
// need two sets of patterns.
5020 // FIXME: This can only happen when the nodes started as STRICT_* and have
5021 // been mutated into their non-STRICT equivalents. Eventually this
5022 // mutation will be removed and we should switch the STRICT_ nodes to a
5023 // strict version of RNDSCALE in PreProcessISelDAG.
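// The immediates follow the SSE4.1 ROUNDPS/VRNDSCALE control encoding:
// bits 1:0 pick the mode (00 nearest, 01 down, 10 up, 11 truncate), bit 2
// selects MXCSR.RC instead of the immediate mode, and bit 3 suppresses the
// precision (inexact) exception. E.g. FCEIL -> 0xA rounds up with inexact
// suppressed; FRINT -> 0x4 uses MXCSR and may raise inexact.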
5025 switch (Node->getOpcode()) {
5026 default: llvm_unreachable("Unexpected opcode!");
5027 case ISD::FCEIL: Imm = 0xA; break;
5028 case ISD::FFLOOR: Imm = 0x9; break;
5029 case ISD::FTRUNC: Imm = 0xB; break;
5030 case ISD::FNEARBYINT: Imm = 0xC; break;
5031 case ISD::FRINT: Imm = 0x4; break;
5034 SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
5035 Node->getValueType(0),
5036 Node->getOperand(0),
5037 CurDAG->getConstant(Imm, dl, MVT::i8));
5038 ReplaceNode(Node, Res.getNode());
5039 SelectCode(Res.getNode());
5047 bool X86DAGToDAGISel::
5048 SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
5049 std::vector<SDValue> &OutOps) {
5050 SDValue Op0, Op1, Op2, Op3, Op4;
5051 switch (ConstraintID) {
5053 llvm_unreachable("Unexpected asm memory constraint");
5054 case InlineAsm::Constraint_i:
5055 // FIXME: It seems strange that 'i' is needed here since it's supposed to
5056 // be an immediate and not a memory constraint.
5058 case InlineAsm::Constraint_o: // offsetable ??
5059 case InlineAsm::Constraint_v: // not offsetable ??
5060 case InlineAsm::Constraint_m: // memory
5061 case InlineAsm::Constraint_X:
5062 if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
5067 OutOps.push_back(Op0);
5068 OutOps.push_back(Op1);
5069 OutOps.push_back(Op2);
5070 OutOps.push_back(Op3);
5071 OutOps.push_back(Op4);
/// This pass converts a legalized DAG into an X86-specific DAG,
5076 /// ready for instruction scheduling.
5077 FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
5078 CodeGenOpt::Level OptLevel) {
5079 return new X86DAGToDAGISel(TM, OptLevel);