lib/Target/X86/X86ISelLowering.cpp

   1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the interfaces that X86 uses to lower LLVM code into a
  11 // selection DAG.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "X86ISelLowering.h"
  16 #include "Utils/X86ShuffleDecode.h"
  17 #include "X86CallingConv.h"
  18 #include "X86InstrBuilder.h"
  19 #include "X86MachineFunctionInfo.h"
  20 #include "X86TargetMachine.h"
  21 #include "X86TargetObjectFile.h"
  22 #include "llvm/ADT/SmallBitVector.h"
  23 #include "llvm/ADT/SmallSet.h"
  24 #include "llvm/ADT/Statistic.h"
  25 #include "llvm/ADT/StringExtras.h"
  26 #include "llvm/ADT/StringSwitch.h"
  27 #include "llvm/ADT/VariadicFunction.h"
  28 #include "llvm/CodeGen/IntrinsicLowering.h"
  29 #include "llvm/CodeGen/MachineFrameInfo.h"
  30 #include "llvm/CodeGen/MachineFunction.h"
  31 #include "llvm/CodeGen/MachineInstrBuilder.h"
  32 #include "llvm/CodeGen/MachineJumpTableInfo.h"
  33 #include "llvm/CodeGen/MachineModuleInfo.h"
  34 #include "llvm/CodeGen/MachineRegisterInfo.h"
  35 #include "llvm/IR/CallSite.h"
  36 #include "llvm/IR/CallingConv.h"
  37 #include "llvm/IR/Constants.h"
  38 #include "llvm/IR/DerivedTypes.h"
  39 #include "llvm/IR/Function.h"
  40 #include "llvm/IR/GlobalAlias.h"
  41 #include "llvm/IR/GlobalVariable.h"
  42 #include "llvm/IR/Instructions.h"
  43 #include "llvm/IR/Intrinsics.h"
  44 #include "llvm/MC/MCAsmInfo.h"
  45 #include "llvm/MC/MCContext.h"
  46 #include "llvm/MC/MCExpr.h"
  47 #include "llvm/MC/MCSymbol.h"
  48 #include "llvm/Support/CommandLine.h"
  49 #include "llvm/Support/Debug.h"
  50 #include "llvm/Support/ErrorHandling.h"
  51 #include "llvm/Support/MathExtras.h"
  52 #include "llvm/Target/TargetOptions.h"
  53 #include "X86IntrinsicsInfo.h"
  54 #include <bitset>
  55 #include <numeric>
  56 #include <cctype>
  57 using namespace llvm;
  58
  59 #define DEBUG_TYPE "x86-isel"
  60
  61 STATISTIC(NumTailCalls, "Number of tail calls");
  62
  63 static cl::opt<bool> ExperimentalVectorWideningLegalization(
  64     "x86-experimental-vector-widening-legalization", cl::init(false),
  65     cl::desc("Enable an experimental vector type legalization through widening "
  66              "rather than promotion."),
  67     cl::Hidden);
  68
  69 static cl::opt<bool> ExperimentalVectorShuffleLowering(
  70     "x86-experimental-vector-shuffle-lowering", cl::init(true),
  71     cl::desc("Enable an experimental vector shuffle lowering code path."),
  72     cl::Hidden);
  73
  74 static cl::opt<bool> ExperimentalVectorShuffleLegality(
  75     "x86-experimental-vector-shuffle-legality", cl::init(false),
  76     cl::desc("Enable experimental shuffle legality based on the experimental "
  77              "shuffle lowering. Should only be used with the experimental "
  78              "shuffle lowering."),
  79     cl::Hidden);
  80
  81 static cl::opt<int> ReciprocalEstimateRefinementSteps(
  82     "x86-recip-refinement-steps", cl::init(1),
  83     cl::desc("Specify the number of Newton-Raphson iterations applied to the "
  84              "result of the hardware reciprocal estimate instruction."),
  85     cl::NotHidden);
  86
  87 // Forward declarations.
  88 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
  89                        SDValue V2);
  90
  91 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
  92                                 SelectionDAG &DAG, SDLoc dl,
  93                                 unsigned vectorWidth) {
  94   assert((vectorWidth == 128 || vectorWidth == 256) &&
  95          "Unsupported vector width");
  96   EVT VT = Vec.getValueType();
  97   EVT ElVT = VT.getVectorElementType();
  98   unsigned Factor = VT.getSizeInBits()/vectorWidth;
  99   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
 100                                   VT.getVectorNumElements()/Factor);
 101
 102   // Extract from UNDEF is UNDEF.
 103   if (Vec.getOpcode() == ISD::UNDEF)
 104     return DAG.getUNDEF(ResultVT);
 105
 106   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
 107   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
 108
 109   // This is the index of the first element of the vectorWidth-bit chunk
 110   // we want.
 111   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
 112                                * ElemsPerChunk);
 113
 114   // If the input is a buildvector just emit a smaller one.
 115   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
 116     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
 117                        makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
 118                                     ElemsPerChunk));
 119
 120   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 121   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
 122 }
 123
 124 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
 125 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
 126 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
 127 /// instructions or a simple subregister reference. Idx is an index in the
 128 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
 129 /// lowering EXTRACT_VECTOR_ELT operations easier.
 130 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
 131                                    SelectionDAG &DAG, SDLoc dl) {
 132   assert((Vec.getValueType().is256BitVector() ||
 133           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
 134   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
 135 }
 136
 137 /// Generate a DAG to grab 256-bits from a 512-bit vector.
 138 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
 139                                    SelectionDAG &DAG, SDLoc dl) {
 140   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
 141   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
 142 }
 143
 144 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
 145                                unsigned IdxVal, SelectionDAG &DAG,
 146                                SDLoc dl, unsigned vectorWidth) {
 147   assert((vectorWidth == 128 || vectorWidth == 256) &&
 148          "Unsupported vector width");
 149   // Inserting UNDEF is Result
 150   if (Vec.getOpcode() == ISD::UNDEF)
 151     return Result;
 152   EVT VT = Vec.getValueType();
 153   EVT ElVT = VT.getVectorElementType();
 154   EVT ResultVT = Result.getValueType();
 155
 156   // Insert the relevant vectorWidth bits.
 157   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
 158
 159   // This is the index of the first element of the vectorWidth-bit chunk
 160   // we want.
 161   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
 162                                * ElemsPerChunk);
 163
 164   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 165   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
 166 }
 167
 168 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
 169 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
 170 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
 171 /// simple superregister reference.  Idx is an index in the 128 bits
 172 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
 173 /// lowering INSERT_VECTOR_ELT operations easier.
 174 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 175                                   SelectionDAG &DAG,SDLoc dl) {
 176   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
 177   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
 178 }
 179
 180 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 181                                   SelectionDAG &DAG, SDLoc dl) {
 182   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
 183   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
 184 }
 185
 186 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
 187 /// instructions. This is used because creating CONCAT_VECTOR nodes of
 188 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
 189 /// large BUILD_VECTORS.
 190 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
 191                                    unsigned NumElems, SelectionDAG &DAG,
 192                                    SDLoc dl) {
 193   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
 194   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
 195 }
 196
 197 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
 198                                    unsigned NumElems, SelectionDAG &DAG,
 199                                    SDLoc dl) {
 200   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
 201   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
 202 }
 203
 204 // FIXME: This should stop caching the target machine as soon as
 205 // we can remove resetOperationActions et al.
 206 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
 207     : TargetLowering(TM) {
 208   Subtarget = &TM.getSubtarget<X86Subtarget>();
 209   X86ScalarSSEf64 = Subtarget->hasSSE2();
 210   X86ScalarSSEf32 = Subtarget->hasSSE1();
 211   TD = getDataLayout();
 212
 213   resetOperationActions();
 214 }
 215
 216 void X86TargetLowering::resetOperationActions() {
 217   const TargetMachine &TM = getTargetMachine();
 218   static bool FirstTimeThrough = true;
 219
 220   // If none of the target options have changed, then we don't need to reset the
 221   // operation actions.
 222   if (!FirstTimeThrough && TO == TM.Options) return;
 223
 224   if (!FirstTimeThrough) {
 225     // Reinitialize the actions.
 226     initActions();
 227     FirstTimeThrough = false;
 228   }
 229
 230   TO = TM.Options;
 231
 232   // Set up the TargetLowering object.
 233   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
 234
 235   // X86 is weird. It always uses i8 for shift amounts and setcc results.
 236   setBooleanContents(ZeroOrOneBooleanContent);
 237   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
 238   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 239
 240   // For 64-bit, since we have so many registers, use the ILP scheduler.
 241   // For 32-bit, use the register pressure specific scheduling.
 242   // For Atom, always use ILP scheduling.
 243   if (Subtarget->isAtom())
 244     setSchedulingPreference(Sched::ILP);
 245   else if (Subtarget->is64Bit())
 246     setSchedulingPreference(Sched::ILP);
 247   else
 248     setSchedulingPreference(Sched::RegPressure);
 249   const X86RegisterInfo *RegInfo =
 250       TM.getSubtarget<X86Subtarget>().getRegisterInfo();
 251   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
 252
 253   // Bypass expensive divides on Atom when compiling with O2.
 254   if (TM.getOptLevel() >= CodeGenOpt::Default) {
 255     if (Subtarget->hasSlowDivide32())
 256       addBypassSlowDiv(32, 8);
 257     if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
 258       addBypassSlowDiv(64, 16);
 259   }
 260
 261   if (Subtarget->isTargetKnownWindowsMSVC()) {
 262     // Setup Windows compiler runtime calls.
 263     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
 264     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
 265     setLibcallName(RTLIB::SREM_I64, "_allrem");
 266     setLibcallName(RTLIB::UREM_I64, "_aullrem");
 267     setLibcallName(RTLIB::MUL_I64, "_allmul");
 268     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
 269     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
 270     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
 271     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
 272     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
 273
 274     // The _ftol2 runtime function has an unusual calling conv, which
 275     // is modeled by a special pseudo-instruction.
 276     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
 277     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
 278     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
 279     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
 280   }
 281
 282   if (Subtarget->isTargetDarwin()) {
 283     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
 284     setUseUnderscoreSetJmp(false);
 285     setUseUnderscoreLongJmp(false);
 286   } else if (Subtarget->isTargetWindowsGNU()) {
 287     // MS runtime is weird: it exports _setjmp, but longjmp!
 288     setUseUnderscoreSetJmp(true);
 289     setUseUnderscoreLongJmp(false);
 290   } else {
 291     setUseUnderscoreSetJmp(true);
 292     setUseUnderscoreLongJmp(true);
 293   }
 294
 295   // Set up the register classes.
 296   addRegisterClass(MVT::i8, &X86::GR8RegClass);
 297   addRegisterClass(MVT::i16, &X86::GR16RegClass);
 298   addRegisterClass(MVT::i32, &X86::GR32RegClass);
 299   if (Subtarget->is64Bit())
 300     addRegisterClass(MVT::i64, &X86::GR64RegClass);
 301
 302   for (MVT VT : MVT::integer_valuetypes())
 303     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 304
 305   // We don't accept any truncstore of integer registers.
 306   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 307   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 308   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
 309   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
 310   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
 311   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
 312
 313   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 314
 315   // SETOEQ and SETUNE require checking two conditions.
 316   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
 317   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
 318   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
 319   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
 320   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
 321   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
 322
 323   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
 324   // operation.
 325   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
 326   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
 327   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
 328
 329   if (Subtarget->is64Bit()) {
 330     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
 331     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 332   } else if (!TM.Options.UseSoftFloat) {
 333     // We have an algorithm for SSE2->double, and we turn this into a
 334     // 64-bit FILD followed by conditional FADD for other targets.
 335     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 336     // We have an algorithm for SSE2, and we turn this into a 64-bit
 337     // FILD for other targets.
 338     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
 339   }
 340
 341   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
 342   // this operation.
 343   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
 344   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
 345
 346   if (!TM.Options.UseSoftFloat) {
 347     // SSE has no i16 to fp conversion, only i32
 348     if (X86ScalarSSEf32) {
 349       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 350       // f32 and f64 cases are Legal, f80 case is not
 351       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 352     } else {
 353       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
 354       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 355     }
 356   } else {
 357     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 358     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
 359   }
 360
 361   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
 362   // are Legal, f80 is custom lowered.
 363   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
 364   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
 365
 366   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
 367   // this operation.
 368   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
 369   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
 370
 371   if (X86ScalarSSEf32) {
 372     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
 373     // f32 and f64 cases are Legal, f80 case is not
 374     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 375   } else {
 376     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
 377     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 378   }
 379
 380   // Handle FP_TO_UINT by promoting the destination to a larger signed
 381   // conversion.
 382   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
 383   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
 384   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
 385
 386   if (Subtarget->is64Bit()) {
 387     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
 388     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
 389   } else if (!TM.Options.UseSoftFloat) {
 390     // Since AVX is a superset of SSE3, only check for SSE here.
 391     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
 392       // Expand FP_TO_UINT into a select.
 393       // FIXME: We would like to use a Custom expander here eventually to do
 394       // the optimal thing for SSE vs. the default expansion in the legalizer.
 395       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
 396     else
 397       // With SSE3 we can use fisttpll to convert to a signed i64; without
 398       // SSE, we're stuck with a fistpll.
 399       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
 400   }
 401
 402   if (isTargetFTOL()) {
 403     // Use the _ftol2 runtime function, which has a pseudo-instruction
 404     // to handle its weird calling convention.
 405     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
 406   }
 407
 408   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
 409   if (!X86ScalarSSEf64) {
 410     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
 411     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
 412     if (Subtarget->is64Bit()) {
 413       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
 414       // Without SSE, i64->f64 goes through memory.
 415       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
 416     }
 417   }
 418
 419   // Scalar integer divide and remainder are lowered to use operations that
 420   // produce two results, to match the available instructions. This exposes
 421   // the two-result form to trivial CSE, which is able to combine x/y and x%y
 422   // into a single instruction.
 423   //
 424   // Scalar integer multiply-high is also lowered to use two-result
 425   // operations, to match the available instructions. However, plain multiply
 426   // (low) operations are left as Legal, as there are single-result
 427   // instructions for this in x86. Using the two-result multiply instructions
 428   // when both high and low results are needed must be arranged by dagcombine.
 429   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 430     MVT VT = IntVTs[i];
 431     setOperationAction(ISD::MULHS, VT, Expand);
 432     setOperationAction(ISD::MULHU, VT, Expand);
 433     setOperationAction(ISD::SDIV, VT, Expand);
 434     setOperationAction(ISD::UDIV, VT, Expand);
 435     setOperationAction(ISD::SREM, VT, Expand);
 436     setOperationAction(ISD::UREM, VT, Expand);
 437
 438     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
 439     setOperationAction(ISD::ADDC, VT, Custom);
 440     setOperationAction(ISD::ADDE, VT, Custom);
 441     setOperationAction(ISD::SUBC, VT, Custom);
 442     setOperationAction(ISD::SUBE, VT, Custom);
 443   }
 444
 445   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
 446   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
 447   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
 448   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
 449   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
 450   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
 451   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
 452   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
 453   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
 454   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
 455   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
 456   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
 457   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
 458   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
 459   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
 460   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
 461   if (Subtarget->is64Bit())
 462     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 463   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
 464   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
 465   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
 466   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
 467   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
 468   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
 469   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
 470   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
 471
 472   // Promote the i8 variants and force them on up to i32 which has a shorter
 473   // encoding.
 474   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
 475   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
 476   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
 477   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
 478   if (Subtarget->hasBMI()) {
 479     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
 480     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
 481     if (Subtarget->is64Bit())
 482       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
 483   } else {
 484     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
 485     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
 486     if (Subtarget->is64Bit())
 487       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
 488   }
 489
 490   if (Subtarget->hasLZCNT()) {
 491     // When promoting the i8 variants, force them to i32 for a shorter
 492     // encoding.
 493     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
 494     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
 495     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
 496     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
 497     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
 498     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
 499     if (Subtarget->is64Bit())
 500       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
 501   } else {
 502     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
 503     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
 504     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
 505     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
 506     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
 507     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
 508     if (Subtarget->is64Bit()) {
 509       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
 510       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 511     }
 512   }
 513
 514   // Special handling for half-precision floating point conversions.
 515   // If we don't have F16C support, then lower half float conversions
 516   // into library calls.
 517   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
 518     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
 519     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
 520   }
 521
 522   // There's never any support for operations beyond MVT::f32.
 523   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 524   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
 525   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
 526   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
 527
 528   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
 529   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
 530   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
 531   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 532   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 533   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
 534
 535   if (Subtarget->hasPOPCNT()) {
 536     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
 537   } else {
 538     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
 539     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
 540     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
 541     if (Subtarget->is64Bit())
 542       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
 543   }
 544
 545   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
 546
 547   if (!Subtarget->hasMOVBE())
 548     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
 549
 550   // These should be promoted to a larger select which is supported.
 551   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
 552   // X86 wants to expand cmov itself.
 553   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
 554   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
 555   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
 556   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
 557   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
 558   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
 559   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
 560   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
 561   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
 562   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
 563   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
 564   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
 565   if (Subtarget->is64Bit()) {
 566     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
 567     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
 568   }
 569   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
 570   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
 571   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
 572   // support continuation, user-level threading, and etc.. As a result, no
 573   // other SjLj exception interfaces are implemented and please don't build
 574   // your own exception handling based on them.
 575   // LLVM/Clang supports zero-cost DWARF exception handling.
 576   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
 577   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
 578
 579   // Darwin ABI issue.
 580   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
 581   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
 582   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
 583   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
 584   if (Subtarget->is64Bit())
 585     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
 586   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
 587   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
 588   if (Subtarget->is64Bit()) {
 589     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
 590     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
 591     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
 592     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
 593     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
 594   }
 595   // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
 596   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
 597   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
 598   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
 599   if (Subtarget->is64Bit()) {
 600     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
 601     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
 602     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
 603   }
 604
 605   if (Subtarget->hasSSE1())
 606     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
 607
 608   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
 609
 610   // Expand certain atomics
 611   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 612     MVT VT = IntVTs[i];
 613     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
 614     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
 615     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
 616   }
 617
 618   if (Subtarget->hasCmpxchg16b()) {
 619     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
 620   }
 621
 622   // FIXME - use subtarget debug flags
 623   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
 624       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
 625     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
 626   }
 627
 628   if (Subtarget->is64Bit()) {
 629     setExceptionPointerRegister(X86::RAX);
 630     setExceptionSelectorRegister(X86::RDX);
 631   } else {
 632     setExceptionPointerRegister(X86::EAX);
 633     setExceptionSelectorRegister(X86::EDX);
 634   }
 635   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
 636   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
 637
 638   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
 639   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
 640
 641   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 642   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 643
 644   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 645   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 646   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 647   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
 648     // TargetInfo::X86_64ABIBuiltinVaList
 649     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
 650     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
 651   } else {
 652     // TargetInfo::CharPtrBuiltinVaList
 653     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
 654     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
 655   }
 656
 657   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
 658   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 659
 660   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
 661
 662   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
 663     // f32 and f64 use SSE.
 664     // Set up the FP register classes.
 665     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 666     addRegisterClass(MVT::f64, &X86::FR64RegClass);
 667
 668     // Use ANDPD to simulate FABS.
 669     setOperationAction(ISD::FABS , MVT::f64, Custom);
 670     setOperationAction(ISD::FABS , MVT::f32, Custom);
 671
 672     // Use XORP to simulate FNEG.
 673     setOperationAction(ISD::FNEG , MVT::f64, Custom);
 674     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 675
 676     // Use ANDPD and ORPD to simulate FCOPYSIGN.
 677     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
 678     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 679
 680     // Lower this to FGETSIGNx86 plus an AND.
 681     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
 682     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
 683
 684     // We don't support sin/cos/fmod
 685     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 686     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 687     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 688     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 689     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 690     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 691
 692     // Expand FP immediates into loads from the stack, except for the special
 693     // cases we handle.
 694     addLegalFPImmediate(APFloat(+0.0)); // xorpd
 695     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 696   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
 697     // Use SSE for f32, x87 for f64.
 698     // Set up the FP register classes.
 699     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 700     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 701
 702     // Use ANDPS to simulate FABS.
 703     setOperationAction(ISD::FABS , MVT::f32, Custom);
 704
 705     // Use XORP to simulate FNEG.
 706     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 707
 708     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 709
 710     // Use ANDPS and ORPS to simulate FCOPYSIGN.
 711     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 712     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 713
 714     // We don't support sin/cos/fmod
 715     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 716     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 717     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 718
 719     // Special cases we handle for FP constants.
 720     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 721     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 722     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 723     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 724     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 725
 726     if (!TM.Options.UnsafeFPMath) {
 727       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 728       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 729       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 730     }
 731   } else if (!TM.Options.UseSoftFloat) {
 732     // f32 and f64 in x87.
 733     // Set up the FP register classes.
 734     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 735     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
 736
 737     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 738     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
 739     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 740     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 741
 742     if (!TM.Options.UnsafeFPMath) {
 743       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 744       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 745       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 746       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 747       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 748       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 749     }
 750     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 751     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 752     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 753     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 754     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
 755     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
 756     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
 757     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
 758   }
 759
 760   // We don't support FMA.
 761   setOperationAction(ISD::FMA, MVT::f64, Expand);
 762   setOperationAction(ISD::FMA, MVT::f32, Expand);
 763
 764   // Long double always uses X87.
 765   if (!TM.Options.UseSoftFloat) {
 766     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
 767     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
 768     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
 769     {
 770       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
 771       addLegalFPImmediate(TmpFlt);  // FLD0
 772       TmpFlt.changeSign();
 773       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
 774
 775       bool ignored;
 776       APFloat TmpFlt2(+1.0);
 777       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
 778                       &ignored);
 779       addLegalFPImmediate(TmpFlt2);  // FLD1
 780       TmpFlt2.changeSign();
 781       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
 782     }
 783
 784     if (!TM.Options.UnsafeFPMath) {
 785       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
 786       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
 787       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
 788     }
 789
 790     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
 791     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
 792     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
 793     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
 794     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
 795     setOperationAction(ISD::FMA, MVT::f80, Expand);
 796   }
 797
 798   // Always use a library call for pow.
 799   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
 800   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
 801   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
 802
 803   setOperationAction(ISD::FLOG, MVT::f80, Expand);
 804   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
 805   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
 806   setOperationAction(ISD::FEXP, MVT::f80, Expand);
 807   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
 808   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
 809   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
 810
 811   // First set operation action for all vector types to either promote
 812   // (for widening) or expand (for scalarization). Then we will selectively
 813   // turn on ones that can be effectively codegen'd.
 814   for (MVT VT : MVT::vector_valuetypes()) {
 815     setOperationAction(ISD::ADD , VT, Expand);
 816     setOperationAction(ISD::SUB , VT, Expand);
 817     setOperationAction(ISD::FADD, VT, Expand);
 818     setOperationAction(ISD::FNEG, VT, Expand);
 819     setOperationAction(ISD::FSUB, VT, Expand);
 820     setOperationAction(ISD::MUL , VT, Expand);
 821     setOperationAction(ISD::FMUL, VT, Expand);
 822     setOperationAction(ISD::SDIV, VT, Expand);
 823     setOperationAction(ISD::UDIV, VT, Expand);
 824     setOperationAction(ISD::FDIV, VT, Expand);
 825     setOperationAction(ISD::SREM, VT, Expand);
 826     setOperationAction(ISD::UREM, VT, Expand);
 827     setOperationAction(ISD::LOAD, VT, Expand);
 828     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
 829     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
 830     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
 831     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
 832     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
 833     setOperationAction(ISD::FABS, VT, Expand);
 834     setOperationAction(ISD::FSIN, VT, Expand);
 835     setOperationAction(ISD::FSINCOS, VT, Expand);
 836     setOperationAction(ISD::FCOS, VT, Expand);
 837     setOperationAction(ISD::FSINCOS, VT, Expand);
 838     setOperationAction(ISD::FREM, VT, Expand);
 839     setOperationAction(ISD::FMA,  VT, Expand);
 840     setOperationAction(ISD::FPOWI, VT, Expand);
 841     setOperationAction(ISD::FSQRT, VT, Expand);
 842     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 843     setOperationAction(ISD::FFLOOR, VT, Expand);
 844     setOperationAction(ISD::FCEIL, VT, Expand);
 845     setOperationAction(ISD::FTRUNC, VT, Expand);
 846     setOperationAction(ISD::FRINT, VT, Expand);
 847     setOperationAction(ISD::FNEARBYINT, VT, Expand);
 848     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 849     setOperationAction(ISD::MULHS, VT, Expand);
 850     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 851     setOperationAction(ISD::MULHU, VT, Expand);
 852     setOperationAction(ISD::SDIVREM, VT, Expand);
 853     setOperationAction(ISD::UDIVREM, VT, Expand);
 854     setOperationAction(ISD::FPOW, VT, Expand);
 855     setOperationAction(ISD::CTPOP, VT, Expand);
 856     setOperationAction(ISD::CTTZ, VT, Expand);
 857     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
 858     setOperationAction(ISD::CTLZ, VT, Expand);
 859     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
 860     setOperationAction(ISD::SHL, VT, Expand);
 861     setOperationAction(ISD::SRA, VT, Expand);
 862     setOperationAction(ISD::SRL, VT, Expand);
 863     setOperationAction(ISD::ROTL, VT, Expand);
 864     setOperationAction(ISD::ROTR, VT, Expand);
 865     setOperationAction(ISD::BSWAP, VT, Expand);
 866     setOperationAction(ISD::SETCC, VT, Expand);
 867     setOperationAction(ISD::FLOG, VT, Expand);
 868     setOperationAction(ISD::FLOG2, VT, Expand);
 869     setOperationAction(ISD::FLOG10, VT, Expand);
 870     setOperationAction(ISD::FEXP, VT, Expand);
 871     setOperationAction(ISD::FEXP2, VT, Expand);
 872     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 873     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 874     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 875     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 876     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
 877     setOperationAction(ISD::TRUNCATE, VT, Expand);
 878     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
 879     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
 880     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
 881     setOperationAction(ISD::VSELECT, VT, Expand);
 882     setOperationAction(ISD::SELECT_CC, VT, Expand);
 883     for (MVT InnerVT : MVT::vector_valuetypes()) {
 884       setTruncStoreAction(InnerVT, VT, Expand);
 885
 886       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
 887       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
 888
 889       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
 890       // types, we have to deal with them whether we ask for Expansion or not.
 891       // Setting Expand causes its own optimisation problems though, so leave
 892       // them legal.
 893       if (VT.getVectorElementType() == MVT::i1)
 894         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 895     }
 896   }
 897
 898   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
 899   // with -msoft-float, disable use of MMX as well.
 900   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
 901     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
 902     // No operations on x86mmx supported, everything uses intrinsics.
 903   }
 904
 905   // MMX-sized vectors (other than x86mmx) are expected to be expanded
 906   // into smaller operations.
 907   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
 908   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
 909   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
 910   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
 911   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
 912   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
 913   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
 914   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
 915   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
 916   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
 917   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
 918   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
 919   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
 920   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
 921   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
 922   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
 923   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
 924   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
 925   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
 926   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
 927   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
 928   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
 929   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
 930   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
 931   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
 932   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
 933   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
 934   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
 935   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
 936
 937   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
 938     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
 939
 940     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
 941     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
 942     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
 943     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
 944     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
 945     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
 946     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
 947     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
 948     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
 949     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
 950     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 951     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
 952     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
 953   }
 954
 955   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
 956     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
 957
 958     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
 959     // registers cannot be used even for integer operations.
 960     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
 961     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
 962     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
 963     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
 964
 965     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
 966     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
 967     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
 968     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
 969     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
 970     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
 971     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
 972     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
 973     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
 974     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
 975     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
 976     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
 977     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
 978     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
 979     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
 980     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
 981     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
 982     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
 983     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
 984     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
 985     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
 986     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
 987
 988     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
 989     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
 990     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
 991     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
 992
 993     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
 994     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
 995     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
 996     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
 997     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 998
 999     // Only provide customized ctpop vector bit twiddling for vector types we
1000     // know to perform better than using the popcnt instructions on each vector
1001     // element. If popcnt isn't supported, always provide the custom version.
1002     if (!Subtarget->hasPOPCNT()) {
1003       setOperationAction(ISD::CTPOP,            MVT::v4i32, Custom);
1004       setOperationAction(ISD::CTPOP,            MVT::v2i64, Custom);
1005     }
1006
1007     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
1008     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
1009       MVT VT = (MVT::SimpleValueType)i;
1010       // Do not attempt to custom lower non-power-of-2 vectors
1011       if (!isPowerOf2_32(VT.getVectorNumElements()))
1012         continue;
1013       // Do not attempt to custom lower non-128-bit vectors
1014       if (!VT.is128BitVector())
1015         continue;
1016       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1017       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1018       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1019     }
1020
1021     // We support custom legalizing of sext and anyext loads for specific
1022     // memory vector types which we can load as a scalar (or sequence of
1023     // scalars) and extend in-register to a legal 128-bit vector type. For sext
1024     // loads these must work with a single scalar load.
1025     for (MVT VT : MVT::integer_vector_valuetypes()) {
1026       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
1027       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
1028       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
1029       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
1030       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
1031       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
1032       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
1033       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
1034       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
1035     }
1036
1037     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
1038     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
1039     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
1040     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
1041     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
1042     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
1043
1044     if (Subtarget->is64Bit()) {
1045       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1046       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1047     }
1048
1049     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
1050     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
1051       MVT VT = (MVT::SimpleValueType)i;
1052
1053       // Do not attempt to promote non-128-bit vectors
1054       if (!VT.is128BitVector())
1055         continue;
1056
1057       setOperationAction(ISD::AND,    VT, Promote);
1058       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
1059       setOperationAction(ISD::OR,     VT, Promote);
1060       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
1061       setOperationAction(ISD::XOR,    VT, Promote);
1062       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
1063       setOperationAction(ISD::LOAD,   VT, Promote);
1064       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
1065       setOperationAction(ISD::SELECT, VT, Promote);
1066       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
1067     }
1068
1069     // v2f64 and v2i64 loads are legal; custom lower v2f64 and v2i64 selects.
1070     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
1071     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
1072     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
1073     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
1074
1075     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
1076     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
1077
1078     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
1079     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
1080     // As there is no 64-bit GPR available, we need to build a special custom
1081     // sequence to convert from v2i32 to v2f32.
1082     if (!Subtarget->is64Bit())
1083       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
1084
1085     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
1086     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
1087
1088     for (MVT VT : MVT::fp_vector_valuetypes())
1089       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
1090
1091     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
1092     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
1093     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
1094   }
1095
1096   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
1097     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
1098     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
1099     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
1100     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
1101     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
1102     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
1103     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
1104     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
1105     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
1106     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
1107
1108     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
1109     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
1110     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
1111     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
1112     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
1113     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
1114     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
1115     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
1116     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
1117     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
1118
1119     // FIXME: Do we need to handle scalar-to-vector here?
1120     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
1121
1122     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
1123     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
1124     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
1125     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
1126     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
1127     // There is no BLENDI for byte vectors. We don't need to custom lower
1128     // some vselects for now.
1129     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
1130
1131     // SSE41 brings specific instructions for doing vector sign extend even in
1132     // cases where we don't have SRA.
1133     for (MVT VT : MVT::integer_vector_valuetypes()) {
1134       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
1135       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
1136       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
1137     }
1138
1139     // i8 and i16 vectors are custom because the source register and source
1140     // memory operand types are not the same width.  f32 vectors are
1141     // custom since the immediate controlling the insert encodes additional
1142     // information.
1143     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1144     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
1145     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
1146     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
1147
1148     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
1149     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
1150     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
1151     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1152
1153     // FIXME: these should be Legal, but that's only for the case where
1154     // the index is constant.  For now custom expand to deal with that.
1155     if (Subtarget->is64Bit()) {
1156       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1157       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1158     }
1159   }
1160
1161   if (Subtarget->hasSSE2()) {
1162     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
1163     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
1164
1165     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
1166     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
1167
1168     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
1169     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
1170
1171     // In the customized shift lowering, the legal cases in AVX2 will be
1172     // recognized.
1173     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
1174     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
1175
1176     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
1177     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
1178
1179     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
1180   }
1181
1182   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
1183     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
1184     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1185     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
1186     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
1187     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
1188     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
1189
1190     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
1191     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
1192     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
1193
1194     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
1195     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
1196     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
1197     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
1198     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
1199     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
1200     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
1201     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
1202     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
1203     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
1204     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
1205     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
1206
1207     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
1208     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
1209     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
1210     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
1211     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
1212     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
1213     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
1214     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
1215     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
1216     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
1217     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
1218     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
1219
1220     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1221     // even though v8i16 is a legal type.
1222     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
1223     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
1224     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1225
1226     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
1227     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1228     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1229
1230     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
1231     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
1232
1233     for (MVT VT : MVT::fp_vector_valuetypes())
1234       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1235
1236     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
1237     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
1238
1239     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
1240     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
1241
1242     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
1243     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
1244
1245     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
1246     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
1247     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
1248     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
1249
1250     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1251     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1252     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1253
1254     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
1255     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
1256     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
1257     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
1258
1259     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
1260     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
1261     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
1262     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
1263     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
1264     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
1265     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
1266     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
1267     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
1268     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1269     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1270     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1271
1272     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1273       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
1274       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
1275       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
1276       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
1277       setOperationAction(ISD::FMA,             MVT::f32, Legal);
1278       setOperationAction(ISD::FMA,             MVT::f64, Legal);
1279     }
1280
1281     if (Subtarget->hasInt256()) {
1282       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
1283       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
1284       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
1285       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
1286
1287       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
1288       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
1289       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
1290       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
1291
1292       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1293       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
1294       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
1295       // Don't lower v32i8 because there is no 128-bit byte mul
1296
1297       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
1298       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
1299       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
1300       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
1301
1302       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
1303       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1304
1305       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1306       // when we have a 256bit-wide blend with immediate.
1307       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1308
1309       // Only provide customized ctpop vector bit twiddling for vector types we
1310       // know to perform better than using the popcnt instructions on each
1311       // vector element. If popcnt isn't supported, always provide the custom
1312       // version.
1313       if (!Subtarget->hasPOPCNT())
1314         setOperationAction(ISD::CTPOP,           MVT::v4i64, Custom);
1315
1316       // Custom CTPOP always performs better on natively supported v8i32
1317       setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
1318     } else {
1319       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
1320       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
1321       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
1322       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
1323
1324       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
1325       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
1326       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
1327       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
1328
1329       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1330       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
1331       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
1332       // Don't lower v32i8 because there is no 128-bit byte mul
1333     }
1334
1335     // In the customized shift lowering, the legal cases in AVX2 will be
1336     // recognized.
1337     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
1338     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
1339
1340     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
1341     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
1342
1343     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
1344
1345     // Custom lower several nodes for 256-bit types.
1346     for (MVT VT : MVT::vector_valuetypes()) {
1347       if (VT.getScalarSizeInBits() >= 32) {
1348         setOperationAction(ISD::MLOAD,  VT, Legal);
1349         setOperationAction(ISD::MSTORE, VT, Legal);
1350       }
1351       // Extract subvector is special because the value type
1352       // (result) is 128-bit but the source is 256-bit wide.
1353       if (VT.is128BitVector()) {
1354         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1355       }
1356       // Do not attempt to custom lower other non-256-bit vectors
1357       if (!VT.is256BitVector())
1358         continue;
1359
1360       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1361       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1362       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1363       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1364       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1365       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1366       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1367     }
1368
1369     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1370     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1371       MVT VT = (MVT::SimpleValueType)i;
1372
1373       // Do not attempt to promote non-256-bit vectors
1374       if (!VT.is256BitVector())
1375         continue;
1376
1377       setOperationAction(ISD::AND,    VT, Promote);
1378       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
1379       setOperationAction(ISD::OR,     VT, Promote);
1380       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
1381       setOperationAction(ISD::XOR,    VT, Promote);
1382       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
1383       setOperationAction(ISD::LOAD,   VT, Promote);
1384       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
1385       setOperationAction(ISD::SELECT, VT, Promote);
1386       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1387     }
1388   }
1389
1390   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
1391     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1392     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1393     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1394     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1395
1396     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
1397     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1398     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1399
1400     for (MVT VT : MVT::fp_vector_valuetypes())
1401       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1402
1403     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
1404     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
1405     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
1406     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
1407     setOperationAction(ISD::AND,                MVT::i1,    Legal);
1408     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
1409     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
1410     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
1411     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
1412     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
1413
1414     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
1415     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
1416     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
1417     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
1418     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
1419     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
1420
1421     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
1422     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
1423     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
1424     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
1425     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
1426     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
1427     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
1428     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
1429
1430     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
1431     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
1432     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
1433     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
1434     if (Subtarget->is64Bit()) {
1435       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
1436       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
1437       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
1438       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
1439     }
1440     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
1441     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
1442     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
1443     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
1444     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
1445     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
1446     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
1447     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
1448     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
1449     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
1450     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
1451     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
1452     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
1453     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
1454
1455     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
1456     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
1457     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
1458     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
1459     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
1460     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
1461     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
1462     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
1463     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
1464     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
1465     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
1466     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
1467     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
1468
1469     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
1470     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
1471     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
1472     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
1473     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
1474     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
1475
1476     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
1477     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
1478
1479     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
1480
1481     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
1482     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1483     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
1484     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
1485     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
1486     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
1487     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
1488     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
1489     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
1490
1491     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
1492     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
1493
1494     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
1495     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
1496
1497     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
1498
1499     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
1500     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
1501
1502     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
1503     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
1504
1505     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
1506     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
1507
1508     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
1509     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
1510     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
1511     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
1512     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
1513     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
1514
1515     if (Subtarget->hasCDI()) {
1516       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
1517       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
1518     }
1519
1520     // Custom lower several nodes.
1521     for (MVT VT : MVT::vector_valuetypes()) {
1522       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1523       // Extract subvector is special because the value type
1524       // (result) is 256/128-bit but the source is 512-bit wide.
1525       if (VT.is128BitVector() || VT.is256BitVector()) {
1526         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1527       }
1528       if (VT.getVectorElementType() == MVT::i1)
1529         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1530
1531       // Do not attempt to custom lower other non-512-bit vectors
1532       if (!VT.is512BitVector())
1533         continue;
1534
1535       if ( EltSize >= 32) {
1536         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
1537         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
1538         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1539         setOperationAction(ISD::VSELECT,             VT, Legal);
1540         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
1541         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
1542         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
1543         setOperationAction(ISD::MLOAD,               VT, Legal);
1544         setOperationAction(ISD::MSTORE,              VT, Legal);
1545       }
1546     }
1547     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1548       MVT VT = (MVT::SimpleValueType)i;
1549
1550       // Do not attempt to promote non-512-bit vectors.
1551       if (!VT.is512BitVector())
1552         continue;
1553
1554       setOperationAction(ISD::SELECT, VT, Promote);
1555       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
1556     }
1557   }// has  AVX-512
1558
1559   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
1560     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1561     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1562
1563     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1564     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1565
1566     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
1567     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
1568     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
1569     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
1570     setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
1571     setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
1572     setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
1573     setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
1574     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
1575
1576     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1577       const MVT VT = (MVT::SimpleValueType)i;
1578
1579       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1580
1581       // Do not attempt to promote non-512-bit vectors.
1582       if (!VT.is512BitVector())
1583         continue;
1584
1585       if (EltSize < 32) {
1586         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1587         setOperationAction(ISD::VSELECT,             VT, Legal);
1588       }
1589     }
1590   }
1591
1592   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
1593     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1594     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1595
1596     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
1597     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
1598     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Legal);
1599
1600     setOperationAction(ISD::AND,                MVT::v8i32, Legal);
1601     setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
1602     setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
1603     setOperationAction(ISD::AND,                MVT::v4i32, Legal);
1604     setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
1605     setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
1606   }
1607
1608   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1609   // of this type with custom code.
1610   for (MVT VT : MVT::vector_valuetypes())
1611     setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1612
1613   // We want to custom lower some of our intrinsics.
1614   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1615   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1616   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1617   if (!Subtarget->is64Bit())
1618     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1619
1620   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1621   // handle type legalization for these operations here.
1622   //
1623   // FIXME: We really should do custom legalization for addition and
1624   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1625   // than generic legalization for 64-bit multiplication-with-overflow, though.
1626   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1627     // Add/Sub/Mul with overflow operations are custom lowered.
1628     MVT VT = IntVTs[i];
1629     setOperationAction(ISD::SADDO, VT, Custom);
1630     setOperationAction(ISD::UADDO, VT, Custom);
1631     setOperationAction(ISD::SSUBO, VT, Custom);
1632     setOperationAction(ISD::USUBO, VT, Custom);
1633     setOperationAction(ISD::SMULO, VT, Custom);
1634     setOperationAction(ISD::UMULO, VT, Custom);
1635   }
1636
1637
1638   if (!Subtarget->is64Bit()) {
1639     // These libcalls are not available in 32-bit.
1640     setLibcallName(RTLIB::SHL_I128, nullptr);
1641     setLibcallName(RTLIB::SRL_I128, nullptr);
1642     setLibcallName(RTLIB::SRA_I128, nullptr);
1643   }
1644
1645   // Combine sin / cos into one node or libcall if possible.
1646   if (Subtarget->hasSinCos()) {
1647     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1648     setLibcallName(RTLIB::SINCOS_F64, "sincos");
1649     if (Subtarget->isTargetDarwin()) {
1650       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1651       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1652       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1653       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1654     }
1655   }
1656
1657   if (Subtarget->isTargetWin64()) {
1658     setOperationAction(ISD::SDIV, MVT::i128, Custom);
1659     setOperationAction(ISD::UDIV, MVT::i128, Custom);
1660     setOperationAction(ISD::SREM, MVT::i128, Custom);
1661     setOperationAction(ISD::UREM, MVT::i128, Custom);
1662     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1663     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1664   }
1665
1666   // We have target-specific dag combine patterns for the following nodes:
1667   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1668   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1669   setTargetDAGCombine(ISD::VSELECT);
1670   setTargetDAGCombine(ISD::SELECT);
1671   setTargetDAGCombine(ISD::SHL);
1672   setTargetDAGCombine(ISD::SRA);
1673   setTargetDAGCombine(ISD::SRL);
1674   setTargetDAGCombine(ISD::OR);
1675   setTargetDAGCombine(ISD::AND);
1676   setTargetDAGCombine(ISD::ADD);
1677   setTargetDAGCombine(ISD::FADD);
1678   setTargetDAGCombine(ISD::FSUB);
1679   setTargetDAGCombine(ISD::FMA);
1680   setTargetDAGCombine(ISD::SUB);
1681   setTargetDAGCombine(ISD::LOAD);
1682   setTargetDAGCombine(ISD::STORE);
1683   setTargetDAGCombine(ISD::ZERO_EXTEND);
1684   setTargetDAGCombine(ISD::ANY_EXTEND);
1685   setTargetDAGCombine(ISD::SIGN_EXTEND);
1686   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1687   setTargetDAGCombine(ISD::TRUNCATE);
1688   setTargetDAGCombine(ISD::SINT_TO_FP);
1689   setTargetDAGCombine(ISD::SETCC);
1690   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1691   setTargetDAGCombine(ISD::BUILD_VECTOR);
1692   if (Subtarget->is64Bit())
1693     setTargetDAGCombine(ISD::MUL);
1694   setTargetDAGCombine(ISD::XOR);
1695
1696   computeRegisterProperties();
1697
1698   // On Darwin, -Os means optimize for size without hurting performance,
1699   // do not reduce the limit.
1700   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1701   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1702   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1703   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1704   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1705   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1706   setPrefLoopAlignment(4); // 2^4 bytes.
1707
1708   // Predictable cmov don't hurt on atom because it's in-order.
1709   PredictableSelectIsExpensive = !Subtarget->isAtom();
1710   EnableExtLdPromotion = true;
1711   setPrefFunctionAlignment(4); // 2^4 bytes.
1712
1713   verifyIntrinsicTables();
1714 }
1715
1716 // This has so far only been implemented for 64-bit MachO.
1717 bool X86TargetLowering::useLoadStackGuardNode() const {
1718   return Subtarget->isTargetMachO() && Subtarget->is64Bit();
1719 }
1720
1721 TargetLoweringBase::LegalizeTypeAction
1722 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1723   if (ExperimentalVectorWideningLegalization &&
1724       VT.getVectorNumElements() != 1 &&
1725       VT.getVectorElementType().getSimpleVT() != MVT::i1)
1726     return TypeWidenVector;
1727
1728   return TargetLoweringBase::getPreferredVectorAction(VT);
1729 }
1730
1731 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1732   if (!VT.isVector())
1733     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
1734
1735   const unsigned NumElts = VT.getVectorNumElements();
1736   const EVT EltVT = VT.getVectorElementType();
1737   if (VT.is512BitVector()) {
1738     if (Subtarget->hasAVX512())
1739       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1740           EltVT == MVT::f32 || EltVT == MVT::f64)
1741         switch(NumElts) {
1742         case  8: return MVT::v8i1;
1743         case 16: return MVT::v16i1;
1744       }
1745     if (Subtarget->hasBWI())
1746       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1747         switch(NumElts) {
1748         case 32: return MVT::v32i1;
1749         case 64: return MVT::v64i1;
1750       }
1751   }
1752
1753   if (VT.is256BitVector() || VT.is128BitVector()) {
1754     if (Subtarget->hasVLX())
1755       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1756           EltVT == MVT::f32 || EltVT == MVT::f64)
1757         switch(NumElts) {
1758         case 2: return MVT::v2i1;
1759         case 4: return MVT::v4i1;
1760         case 8: return MVT::v8i1;
1761       }
1762     if (Subtarget->hasBWI() && Subtarget->hasVLX())
1763       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1764         switch(NumElts) {
1765         case  8: return MVT::v8i1;
1766         case 16: return MVT::v16i1;
1767         case 32: return MVT::v32i1;
1768       }
1769   }
1770
1771   return VT.changeVectorElementTypeToInteger();
1772 }
1773
1774 /// Helper for getByValTypeAlignment to determine
1775 /// the desired ByVal argument alignment.
1776 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1777   if (MaxAlign == 16)
1778     return;
1779   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1780     if (VTy->getBitWidth() == 128)
1781       MaxAlign = 16;
1782   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1783     unsigned EltAlign = 0;
1784     getMaxByValAlign(ATy->getElementType(), EltAlign);
1785     if (EltAlign > MaxAlign)
1786       MaxAlign = EltAlign;
1787   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1788     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1789       unsigned EltAlign = 0;
1790       getMaxByValAlign(STy->getElementType(i), EltAlign);
1791       if (EltAlign > MaxAlign)
1792         MaxAlign = EltAlign;
1793       if (MaxAlign == 16)
1794         break;
1795     }
1796   }
1797 }
1798
1799 /// Return the desired alignment for ByVal aggregate
1800 /// function arguments in the caller parameter area. For X86, aggregates
1801 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1802 /// are at 4-byte boundaries.
1803 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1804   if (Subtarget->is64Bit()) {
1805     // Max of 8 and alignment of type.
1806     unsigned TyAlign = TD->getABITypeAlignment(Ty);
1807     if (TyAlign > 8)
1808       return TyAlign;
1809     return 8;
1810   }
1811
1812   unsigned Align = 4;
1813   if (Subtarget->hasSSE1())
1814     getMaxByValAlign(Ty, Align);
1815   return Align;
1816 }
1817
1818 /// Returns the target specific optimal type for load
1819 /// and store operations as a result of memset, memcpy, and memmove
1820 /// lowering. If DstAlign is zero that means it's safe to destination
1821 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1822 /// means there isn't a need to check it against alignment requirement,
1823 /// probably because the source does not need to be loaded. If 'IsMemset' is
1824 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1825 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1826 /// source is constant so it does not need to be loaded.
1827 /// It returns EVT::Other if the type should be determined using generic
1828 /// target-independent logic.
1829 EVT
1830 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1831                                        unsigned DstAlign, unsigned SrcAlign,
1832                                        bool IsMemset, bool ZeroMemset,
1833                                        bool MemcpyStrSrc,
1834                                        MachineFunction &MF) const {
1835   const Function *F = MF.getFunction();
1836   if ((!IsMemset || ZeroMemset) &&
1837       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
1838                                        Attribute::NoImplicitFloat)) {
1839     if (Size >= 16 &&
1840         (Subtarget->isUnalignedMemAccessFast() ||
1841          ((DstAlign == 0 || DstAlign >= 16) &&
1842           (SrcAlign == 0 || SrcAlign >= 16)))) {
1843       if (Size >= 32) {
1844         if (Subtarget->hasInt256())
1845           return MVT::v8i32;
1846         if (Subtarget->hasFp256())
1847           return MVT::v8f32;
1848       }
1849       if (Subtarget->hasSSE2())
1850         return MVT::v4i32;
1851       if (Subtarget->hasSSE1())
1852         return MVT::v4f32;
1853     } else if (!MemcpyStrSrc && Size >= 8 &&
1854                !Subtarget->is64Bit() &&
1855                Subtarget->hasSSE2()) {
1856       // Do not use f64 to lower memcpy if source is string constant. It's
1857       // better to use i32 to avoid the loads.
1858       return MVT::f64;
1859     }
1860   }
1861   if (Subtarget->is64Bit() && Size >= 8)
1862     return MVT::i64;
1863   return MVT::i32;
1864 }
1865
1866 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1867   if (VT == MVT::f32)
1868     return X86ScalarSSEf32;
1869   else if (VT == MVT::f64)
1870     return X86ScalarSSEf64;
1871   return true;
1872 }
1873
1874 bool
1875 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1876                                                   unsigned,
1877                                                   unsigned,
1878                                                   bool *Fast) const {
1879   if (Fast)
1880     *Fast = Subtarget->isUnalignedMemAccessFast();
1881   return true;
1882 }
1883
1884 /// Return the entry encoding for a jump table in the
1885 /// current function.  The returned value is a member of the
1886 /// MachineJumpTableInfo::JTEntryKind enum.
1887 unsigned X86TargetLowering::getJumpTableEncoding() const {
1888   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1889   // symbol.
1890   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1891       Subtarget->isPICStyleGOT())
1892     return MachineJumpTableInfo::EK_Custom32;
1893
1894   // Otherwise, use the normal jump table encoding heuristics.
1895   return TargetLowering::getJumpTableEncoding();
1896 }
1897
1898 const MCExpr *
1899 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1900                                              const MachineBasicBlock *MBB,
1901                                              unsigned uid,MCContext &Ctx) const{
1902   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
1903          Subtarget->isPICStyleGOT());
1904   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1905   // entries.
1906   return MCSymbolRefExpr::Create(MBB->getSymbol(),
1907                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
1908 }
1909
1910 /// Returns relocation base for the given PIC jumptable.
1911 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1912                                                     SelectionDAG &DAG) const {
1913   if (!Subtarget->is64Bit())
1914     // This doesn't have SDLoc associated with it, but is not really the
1915     // same as a Register.
1916     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1917   return Table;
1918 }
1919
1920 /// This returns the relocation base for the given PIC jumptable,
1921 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1922 const MCExpr *X86TargetLowering::
1923 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1924                              MCContext &Ctx) const {
1925   // X86-64 uses RIP relative addressing based on the jump table label.
1926   if (Subtarget->isPICStyleRIPRel())
1927     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1928
1929   // Otherwise, the reference is relative to the PIC base.
1930   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1931 }
1932
1933 // FIXME: Why this routine is here? Move to RegInfo!
1934 std::pair<const TargetRegisterClass*, uint8_t>
1935 X86TargetLowering::findRepresentativeClass(MVT VT) const{
1936   const TargetRegisterClass *RRC = nullptr;
1937   uint8_t Cost = 1;
1938   switch (VT.SimpleTy) {
1939   default:
1940     return TargetLowering::findRepresentativeClass(VT);
1941   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1942     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1943     break;
1944   case MVT::x86mmx:
1945     RRC = &X86::VR64RegClass;
1946     break;
1947   case MVT::f32: case MVT::f64:
1948   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1949   case MVT::v4f32: case MVT::v2f64:
1950   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1951   case MVT::v4f64:
1952     RRC = &X86::VR128RegClass;
1953     break;
1954   }
1955   return std::make_pair(RRC, Cost);
1956 }
1957
1958 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1959                                                unsigned &Offset) const {
1960   if (!Subtarget->isTargetLinux())
1961     return false;
1962
1963   if (Subtarget->is64Bit()) {
1964     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1965     Offset = 0x28;
1966     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1967       AddressSpace = 256;
1968     else
1969       AddressSpace = 257;
1970   } else {
1971     // %gs:0x14 on i386
1972     Offset = 0x14;
1973     AddressSpace = 256;
1974   }
1975   return true;
1976 }
1977
1978 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1979                                             unsigned DestAS) const {
1980   assert(SrcAS != DestAS && "Expected different address spaces!");
1981
1982   return SrcAS < 256 && DestAS < 256;
1983 }
1984
1985 //===----------------------------------------------------------------------===//
1986 //               Return Value Calling Convention Implementation
1987 //===----------------------------------------------------------------------===//
1988
1989 #include "X86GenCallingConv.inc"
1990
1991 bool
1992 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
1993                                   MachineFunction &MF, bool isVarArg,
1994                         const SmallVectorImpl<ISD::OutputArg> &Outs,
1995                         LLVMContext &Context) const {
1996   SmallVector<CCValAssign, 16> RVLocs;
1997   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
1998   return CCInfo.CheckReturn(Outs, RetCC_X86);
1999 }
2000
2001 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2002   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2003   return ScratchRegs;
2004 }
2005
2006 SDValue
2007 X86TargetLowering::LowerReturn(SDValue Chain,
2008                                CallingConv::ID CallConv, bool isVarArg,
2009                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2010                                const SmallVectorImpl<SDValue> &OutVals,
2011                                SDLoc dl, SelectionDAG &DAG) const {
2012   MachineFunction &MF = DAG.getMachineFunction();
2013   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2014
2015   SmallVector<CCValAssign, 16> RVLocs;
2016   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2017   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2018
2019   SDValue Flag;
2020   SmallVector<SDValue, 6> RetOps;
2021   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2022   // Operand #1 = Bytes To Pop
2023   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
2024                    MVT::i16));
2025
2026   // Copy the result values into the output registers.
2027   for (unsigned i = 0; i != RVLocs.size(); ++i) {
2028     CCValAssign &VA = RVLocs[i];
2029     assert(VA.isRegLoc() && "Can only return in registers!");
2030     SDValue ValToCopy = OutVals[i];
2031     EVT ValVT = ValToCopy.getValueType();
2032
2033     // Promote values to the appropriate types.
2034     if (VA.getLocInfo() == CCValAssign::SExt)
2035       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2036     else if (VA.getLocInfo() == CCValAssign::ZExt)
2037       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2038     else if (VA.getLocInfo() == CCValAssign::AExt)
2039       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2040     else if (VA.getLocInfo() == CCValAssign::BCvt)
2041       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
2042
2043     assert(VA.getLocInfo() != CCValAssign::FPExt &&
2044            "Unexpected FP-extend for return value.");
2045
2046     // If this is x86-64, and we disabled SSE, we can't return FP values,
2047     // or SSE or MMX vectors.
2048     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2049          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2050           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
2051       report_fatal_error("SSE register return with SSE disabled");
2052     }
2053     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
2054     // llvm-gcc has never done it right and no one has noticed, so this
2055     // should be OK for now.
2056     if (ValVT == MVT::f64 &&
2057         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
2058       report_fatal_error("SSE2 register return with SSE2 disabled");
2059
2060     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2061     // the RET instruction and handled by the FP Stackifier.
2062     if (VA.getLocReg() == X86::FP0 ||
2063         VA.getLocReg() == X86::FP1) {
2064       // If this is a copy from an xmm register to ST(0), use an FPExtend to
2065       // change the value to the FP stack register class.
2066       if (isScalarFPTypeInSSEReg(VA.getValVT()))
2067         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2068       RetOps.push_back(ValToCopy);
2069       // Don't emit a copytoreg.
2070       continue;
2071     }
2072
2073     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2074     // which is returned in RAX / RDX.
2075     if (Subtarget->is64Bit()) {
2076       if (ValVT == MVT::x86mmx) {
2077         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2078           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
2079           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2080                                   ValToCopy);
2081           // If we don't have SSE2 available, convert to v4f32 so the generated
2082           // register is legal.
2083           if (!Subtarget->hasSSE2())
2084             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
2085         }
2086       }
2087     }
2088
2089     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2090     Flag = Chain.getValue(1);
2091     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2092   }
2093
2094   // The x86-64 ABIs require that for returning structs by value we copy
2095   // the sret argument into %rax/%eax (depending on ABI) for the return.
2096   // Win32 requires us to put the sret argument to %eax as well.
2097   // We saved the argument into a virtual register in the entry block,
2098   // so now we copy the value out and into %rax/%eax.
2099   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
2100       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
2101     MachineFunction &MF = DAG.getMachineFunction();
2102     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2103     unsigned Reg = FuncInfo->getSRetReturnReg();
2104     assert(Reg &&
2105            "SRetReturnReg should have been set in LowerFormalArguments().");
2106     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
2107
2108     unsigned RetValReg
2109         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
2110           X86::RAX : X86::EAX;
2111     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2112     Flag = Chain.getValue(1);
2113
2114     // RAX/EAX now acts like a return value.
2115     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
2116   }
2117
2118   RetOps[0] = Chain;  // Update chain.
2119
2120   // Add the flag if we have it.
2121   if (Flag.getNode())
2122     RetOps.push_back(Flag);
2123
2124   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
2125 }
2126
2127 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2128   if (N->getNumValues() != 1)
2129     return false;
2130   if (!N->hasNUsesOfValue(1, 0))
2131     return false;
2132
2133   SDValue TCChain = Chain;
2134   SDNode *Copy = *N->use_begin();
2135   if (Copy->getOpcode() == ISD::CopyToReg) {
2136     // If the copy has a glue operand, we conservatively assume it isn't safe to
2137     // perform a tail call.
2138     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2139       return false;
2140     TCChain = Copy->getOperand(0);
2141   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2142     return false;
2143
2144   bool HasRet = false;
2145   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2146        UI != UE; ++UI) {
2147     if (UI->getOpcode() != X86ISD::RET_FLAG)
2148       return false;
2149     // If we are returning more than one value, we can definitely
2150     // not make a tail call see PR19530
2151     if (UI->getNumOperands() > 4)
2152       return false;
2153     if (UI->getNumOperands() == 4 &&
2154         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2155       return false;
2156     HasRet = true;
2157   }
2158
2159   if (!HasRet)
2160     return false;
2161
2162   Chain = TCChain;
2163   return true;
2164 }
2165
2166 EVT
2167 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
2168                                             ISD::NodeType ExtendKind) const {
2169   MVT ReturnMVT;
2170   // TODO: Is this also valid on 32-bit?
2171   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
2172     ReturnMVT = MVT::i8;
2173   else
2174     ReturnMVT = MVT::i32;
2175
2176   EVT MinVT = getRegisterType(Context, ReturnMVT);
2177   return VT.bitsLT(MinVT) ? MinVT : VT;
2178 }
2179
2180 /// Lower the result values of a call into the
2181 /// appropriate copies out of appropriate physical registers.
2182 ///
2183 SDValue
2184 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
2185                                    CallingConv::ID CallConv, bool isVarArg,
2186                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2187                                    SDLoc dl, SelectionDAG &DAG,
2188                                    SmallVectorImpl<SDValue> &InVals) const {
2189
2190   // Assign locations to each value returned by this call.
2191   SmallVector<CCValAssign, 16> RVLocs;
2192   bool Is64Bit = Subtarget->is64Bit();
2193   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2194                  *DAG.getContext());
2195   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2196
2197   // Copy all of the result registers out of their specified physreg.
2198   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2199     CCValAssign &VA = RVLocs[i];
2200     EVT CopyVT = VA.getValVT();
2201
2202     // If this is x86-64, and we disabled SSE, we can't return FP values
2203     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
2204         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
2205       report_fatal_error("SSE register return with SSE disabled");
2206     }
2207
2208     // If we prefer to use the value in xmm registers, copy it out as f80 and
2209     // use a truncate to move it from fp stack reg to xmm reg.
2210     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2211         isScalarFPTypeInSSEReg(VA.getValVT()))
2212       CopyVT = MVT::f80;
2213
2214     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2215                                CopyVT, InFlag).getValue(1);
2216     SDValue Val = Chain.getValue(0);
2217
2218     if (CopyVT != VA.getValVT())
2219       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2220                         // This truncation won't change the value.
2221                         DAG.getIntPtrConstant(1));
2222
2223     InFlag = Chain.getValue(2);
2224     InVals.push_back(Val);
2225   }
2226
2227   return Chain;
2228 }
2229
2230 //===----------------------------------------------------------------------===//
2231 //                C & StdCall & Fast Calling Convention implementation
2232 //===----------------------------------------------------------------------===//
//  The StdCall calling convention seems to be standard for many Windows API
//  routines. It differs from the C calling convention just a little: the
//  callee should clean up the stack, not the caller. Symbols should also be
//  decorated in some fancy way :) It doesn't support any vector arguments.
//  For info on the fast calling convention see the Fast Calling Convention
//  (tail call) implementation, LowerX86_32FastCCCallTo.
2239
/// Result type of callIsStructReturn and argsAreStructReturn, which determine
/// whether a call (respectively a function) uses struct return semantics.
/// Both classify by the flags of the first argument only.
enum StructReturnType {
  /// There are no arguments, or the first one is not marked sret.
  NotStructReturn,
  /// The first argument is marked sret and also inreg.
  RegStructReturn,
  /// The first argument is marked sret but not inreg.
  StackStructReturn
};
2247 static StructReturnType
2248 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
2249   if (Outs.empty())
2250     return NotStructReturn;
2251
2252   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2253   if (!Flags.isSRet())
2254     return NotStructReturn;
2255   if (Flags.isInReg())
2256     return RegStructReturn;
2257   return StackStructReturn;
2258 }
2259
2260 /// Determines whether a function uses struct return semantics.
2261 static StructReturnType
2262 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
2263   if (Ins.empty())
2264     return NotStructReturn;
2265
2266   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2267   if (!Flags.isSRet())
2268     return NotStructReturn;
2269   if (Flags.isInReg())
2270     return RegStructReturn;
2271   return StackStructReturn;
2272 }
2273
2274 /// Make a copy of an aggregate at address specified by "Src" to address
2275 /// "Dst" with size and alignment information specified by the specific
2276 /// parameter attribute. The copy will be passed as a byval function parameter.
2277 static SDValue
2278 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
2279                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
2280                           SDLoc dl) {
2281   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
2282
2283   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2284                        /*isVolatile*/false, /*AlwaysInline=*/true,
2285                        MachinePointerInfo(), MachinePointerInfo());
2286 }
2287
2288 /// Return true if the calling convention is one that
2289 /// supports tail call optimization.
2290 static bool IsTailCallConvention(CallingConv::ID CC) {
2291   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2292           CC == CallingConv::HiPE);
2293 }
2294
2295 /// \brief Return true if the calling convention is a C calling convention.
2296 static bool IsCCallConvention(CallingConv::ID CC) {
2297   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
2298           CC == CallingConv::X86_64_SysV);
2299 }
2300
2301 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2302   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
2303     return false;
2304
2305   CallSite CS(CI);
2306   CallingConv::ID CalleeCC = CS.getCallingConv();
2307   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2308     return false;
2309
2310   return true;
2311 }
2312
2313 /// Return true if the function is being made into
2314 /// a tailcall target by changing its ABI.
2315 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
2316                                    bool GuaranteedTailCallOpt) {
2317   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
2318 }
2319
2320 SDValue
2321 X86TargetLowering::LowerMemArgument(SDValue Chain,
2322                                     CallingConv::ID CallConv,
2323                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2324                                     SDLoc dl, SelectionDAG &DAG,
2325                                     const CCValAssign &VA,
2326                                     MachineFrameInfo *MFI,
2327                                     unsigned i) const {
2328   // Create the nodes corresponding to a load from this parameter slot.
2329   ISD::ArgFlagsTy Flags = Ins[i].Flags;
2330   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
2331       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2332   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2333   EVT ValVT;
2334
2335   // If value is passed by pointer we have address passed instead of the value
2336   // itself.
2337   if (VA.getLocInfo() == CCValAssign::Indirect)
2338     ValVT = VA.getLocVT();
2339   else
2340     ValVT = VA.getValVT();
2341
2342   // FIXME: For now, all byval parameter objects are marked mutable. This can be
2343   // changed with more analysis.
2344   // In case of tail call optimization mark all arguments mutable. Since they
2345   // could be overwritten by lowering of arguments in case of a tail call.
2346   if (Flags.isByVal()) {
2347     unsigned Bytes = Flags.getByValSize();
2348     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2349     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2350     return DAG.getFrameIndex(FI, getPointerTy());
2351   } else {
2352     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2353                                     VA.getLocMemOffset(), isImmutable);
2354     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2355     return DAG.getLoad(ValVT, dl, Chain, FIN,
2356                        MachinePointerInfo::getFixedStack(FI),
2357                        false, false, false, 0);
2358   }
2359 }
2360
2361 // FIXME: Get this from tablegen.
2362 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2363                                                 const X86Subtarget *Subtarget) {
2364   assert(Subtarget->is64Bit());
2365
2366   if (Subtarget->isCallingConvWin64(CallConv)) {
2367     static const MCPhysReg GPR64ArgRegsWin64[] = {
2368       X86::RCX, X86::RDX, X86::R8,  X86::R9
2369     };
2370     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2371   }
2372
2373   static const MCPhysReg GPR64ArgRegs64Bit[] = {
2374     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2375   };
2376   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2377 }
2378
2379 // FIXME: Get this from tablegen.
2380 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2381                                                 CallingConv::ID CallConv,
2382                                                 const X86Subtarget *Subtarget) {
2383   assert(Subtarget->is64Bit());
2384   if (Subtarget->isCallingConvWin64(CallConv)) {
2385     // The XMM registers which might contain var arg parameters are shadowed
2386     // in their paired GPR.  So we only need to save the GPR to their home
2387     // slots.
2388     // TODO: __vectorcall will change this.
2389     return None;
2390   }
2391
2392   const Function *Fn = MF.getFunction();
2393   bool NoImplicitFloatOps = Fn->getAttributes().
2394       hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
2395   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
2396          "SSE register cannot be used when SSE is disabled!");
2397   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2398       !Subtarget->hasSSE1())
2399     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2400     // registers.
2401     return None;
2402
2403   static const MCPhysReg XMMArgRegs64Bit[] = {
2404     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2405     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2406   };
2407   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2408 }
2409
2410 SDValue
2411 X86TargetLowering::LowerFormalArguments(SDValue Chain,
2412                                         CallingConv::ID CallConv,
2413                                         bool isVarArg,
2414                                       const SmallVectorImpl<ISD::InputArg> &Ins,
2415                                         SDLoc dl,
2416                                         SelectionDAG &DAG,
2417                                         SmallVectorImpl<SDValue> &InVals)
2418                                           const {
2419   MachineFunction &MF = DAG.getMachineFunction();
2420   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2421
2422   const Function* Fn = MF.getFunction();
2423   if (Fn->hasExternalLinkage() &&
2424       Subtarget->isTargetCygMing() &&
2425       Fn->getName() == "main")
2426     FuncInfo->setForceFramePointer(true);
2427
2428   MachineFrameInfo *MFI = MF.getFrameInfo();
2429   bool Is64Bit = Subtarget->is64Bit();
2430   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2431
2432   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2433          "Var args not supported with calling convention fastcc, ghc or hipe");
2434
2435   // Assign locations to all of the incoming arguments.
2436   SmallVector<CCValAssign, 16> ArgLocs;
2437   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2438
2439   // Allocate shadow area for Win64
2440   if (IsWin64)
2441     CCInfo.AllocateStack(32, 8);
2442
2443   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2444
2445   unsigned LastVal = ~0U;
2446   SDValue ArgValue;
2447   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2448     CCValAssign &VA = ArgLocs[i];
2449     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2450     // places.
2451     assert(VA.getValNo() != LastVal &&
2452            "Don't support value assigned to multiple locs yet");
2453     (void)LastVal;
2454     LastVal = VA.getValNo();
2455
2456     if (VA.isRegLoc()) {
2457       EVT RegVT = VA.getLocVT();
2458       const TargetRegisterClass *RC;
2459       if (RegVT == MVT::i32)
2460         RC = &X86::GR32RegClass;
2461       else if (Is64Bit && RegVT == MVT::i64)
2462         RC = &X86::GR64RegClass;
2463       else if (RegVT == MVT::f32)
2464         RC = &X86::FR32RegClass;
2465       else if (RegVT == MVT::f64)
2466         RC = &X86::FR64RegClass;
2467       else if (RegVT.is512BitVector())
2468         RC = &X86::VR512RegClass;
2469       else if (RegVT.is256BitVector())
2470         RC = &X86::VR256RegClass;
2471       else if (RegVT.is128BitVector())
2472         RC = &X86::VR128RegClass;
2473       else if (RegVT == MVT::x86mmx)
2474         RC = &X86::VR64RegClass;
2475       else if (RegVT == MVT::i1)
2476         RC = &X86::VK1RegClass;
2477       else if (RegVT == MVT::v8i1)
2478         RC = &X86::VK8RegClass;
2479       else if (RegVT == MVT::v16i1)
2480         RC = &X86::VK16RegClass;
2481       else if (RegVT == MVT::v32i1)
2482         RC = &X86::VK32RegClass;
2483       else if (RegVT == MVT::v64i1)
2484         RC = &X86::VK64RegClass;
2485       else
2486         llvm_unreachable("Unknown argument type!");
2487
2488       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2489       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2490
2491       // If this is an 8 or 16-bit value, it is really passed promoted to 32
2492       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
2493       // right size.
2494       if (VA.getLocInfo() == CCValAssign::SExt)
2495         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2496                                DAG.getValueType(VA.getValVT()));
2497       else if (VA.getLocInfo() == CCValAssign::ZExt)
2498         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2499                                DAG.getValueType(VA.getValVT()));
2500       else if (VA.getLocInfo() == CCValAssign::BCvt)
2501         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2502
2503       if (VA.isExtInLoc()) {
2504         // Handle MMX values passed in XMM regs.
2505         if (RegVT.isVector())
2506           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2507         else
2508           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2509       }
2510     } else {
2511       assert(VA.isMemLoc());
2512       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2513     }
2514
2515     // If value is passed via pointer - do a load.
2516     if (VA.getLocInfo() == CCValAssign::Indirect)
2517       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2518                              MachinePointerInfo(), false, false, false, 0);
2519
2520     InVals.push_back(ArgValue);
2521   }
2522
2523   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
2524     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2525       // The x86-64 ABIs require that for returning structs by value we copy
2526       // the sret argument into %rax/%eax (depending on ABI) for the return.
2527       // Win32 requires us to put the sret argument to %eax as well.
2528       // Save the argument into a virtual register so that we can access it
2529       // from the return points.
2530       if (Ins[i].Flags.isSRet()) {
2531         unsigned Reg = FuncInfo->getSRetReturnReg();
2532         if (!Reg) {
2533           MVT PtrTy = getPointerTy();
2534           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2535           FuncInfo->setSRetReturnReg(Reg);
2536         }
2537         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2538         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2539         break;
2540       }
2541     }
2542   }
2543
2544   unsigned StackSize = CCInfo.getNextStackOffset();
2545   // Align stack specially for tail calls.
2546   if (FuncIsMadeTailCallSafe(CallConv,
2547                              MF.getTarget().Options.GuaranteedTailCallOpt))
2548     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2549
2550   // If the function takes variable number of arguments, make a frame index for
2551   // the start of the first vararg value... for expansion of llvm.va_start. We
2552   // can skip this if there are no va_start calls.
2553   if (MFI->hasVAStart() &&
2554       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2555                    CallConv != CallingConv::X86_ThisCall))) {
2556     FuncInfo->setVarArgsFrameIndex(
2557         MFI->CreateFixedObject(1, StackSize, true));
2558   }
2559
2560   // Figure out if XMM registers are in use.
2561   assert(!(MF.getTarget().Options.UseSoftFloat &&
2562            Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
2563                                             Attribute::NoImplicitFloat)) &&
2564          "SSE register cannot be used when SSE is disabled!");
2565
2566   // 64-bit calling conventions support varargs and register parameters, so we
2567   // have to do extra work to spill them in the prologue.
2568   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2569     // Find the first unallocated argument registers.
2570     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2571     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2572     unsigned NumIntRegs =
2573         CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
2574     unsigned NumXMMRegs =
2575         CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
2576     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2577            "SSE register cannot be used when SSE is disabled!");
2578
2579     // Gather all the live in physical registers.
2580     SmallVector<SDValue, 6> LiveGPRs;
2581     SmallVector<SDValue, 8> LiveXMMRegs;
2582     SDValue ALVal;
2583     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2584       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2585       LiveGPRs.push_back(
2586           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2587     }
2588     if (!ArgXMMs.empty()) {
2589       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2590       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2591       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2592         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2593         LiveXMMRegs.push_back(
2594             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2595       }
2596     }
2597
2598     if (IsWin64) {
2599       const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
2600       // Get to the caller-allocated home save location.  Add 8 to account
2601       // for the return address.
2602       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2603       FuncInfo->setRegSaveFrameIndex(
2604           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2605       // Fixup to set vararg frame on shadow area (4 x i64).
2606       if (NumIntRegs < 4)
2607         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2608     } else {
2609       // For X86-64, if there are vararg parameters that are passed via
2610       // registers, then we must store them to their spots on the stack so
2611       // they may be loaded by deferencing the result of va_next.
2612       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2613       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2614       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2615           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2616     }
2617
2618     // Store the integer parameter registers.
2619     SmallVector<SDValue, 8> MemOps;
2620     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2621                                       getPointerTy());
2622     unsigned Offset = FuncInfo->getVarArgsGPOffset();
2623     for (SDValue Val : LiveGPRs) {
2624       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2625                                 DAG.getIntPtrConstant(Offset));
2626       SDValue Store =
2627         DAG.getStore(Val.getValue(1), dl, Val, FIN,
2628                      MachinePointerInfo::getFixedStack(
2629                        FuncInfo->getRegSaveFrameIndex(), Offset),
2630                      false, false, 0);
2631       MemOps.push_back(Store);
2632       Offset += 8;
2633     }
2634
2635     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2636       // Now store the XMM (fp + vector) parameter registers.
2637       SmallVector<SDValue, 12> SaveXMMOps;
2638       SaveXMMOps.push_back(Chain);
2639       SaveXMMOps.push_back(ALVal);
2640       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2641                              FuncInfo->getRegSaveFrameIndex()));
2642       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2643                              FuncInfo->getVarArgsFPOffset()));
2644       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2645                         LiveXMMRegs.end());
2646       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2647                                    MVT::Other, SaveXMMOps));
2648     }
2649
2650     if (!MemOps.empty())
2651       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2652   }
2653
2654   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2655     // Find the largest legal vector type.
2656     MVT VecVT = MVT::Other;
2657     // FIXME: Only some x86_32 calling conventions support AVX512.
2658     if (Subtarget->hasAVX512() &&
2659         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2660                      CallConv == CallingConv::Intel_OCL_BI)))
2661       VecVT = MVT::v16f32;
2662     else if (Subtarget->hasAVX())
2663       VecVT = MVT::v8f32;
2664     else if (Subtarget->hasSSE2())
2665       VecVT = MVT::v4f32;
2666
2667     // We forward some GPRs and some vector types.
2668     SmallVector<MVT, 2> RegParmTypes;
2669     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2670     RegParmTypes.push_back(IntVT);
2671     if (VecVT != MVT::Other)
2672       RegParmTypes.push_back(VecVT);
2673
2674     // Compute the set of forwarded registers. The rest are scratch.
2675     SmallVectorImpl<ForwardedRegister> &Forwards =
2676         FuncInfo->getForwardedMustTailRegParms();
2677     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2678
2679     // Conservatively forward AL on x86_64, since it might be used for varargs.
2680     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2681       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2682       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2683     }
2684
2685     // Copy all forwards from physical to virtual registers.
2686     for (ForwardedRegister &F : Forwards) {
2687       // FIXME: Can we use a less constrained schedule?
2688       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2689       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2690       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2691     }
2692   }
2693
2694   // Some CCs need callee pop.
2695   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2696                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
2697     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2698   } else {
2699     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2700     // If this is an sret function, the return should pop the hidden pointer.
2701     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
2702         !Subtarget->getTargetTriple().isOSMSVCRT() &&
2703         argsAreStructReturn(Ins) == StackStructReturn)
2704       FuncInfo->setBytesToPopOnReturn(4);
2705   }
2706
2707   if (!Is64Bit) {
2708     // RegSaveFrameIndex is X86-64 only.
2709     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2710     if (CallConv == CallingConv::X86_FastCall ||
2711         CallConv == CallingConv::X86_ThisCall)
2712       // fastcc functions can't have varargs.
2713       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2714   }
2715
2716   FuncInfo->setArgumentStackSize(StackSize);
2717
2718   return Chain;
2719 }
2720
2721 SDValue
2722 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2723                                     SDValue StackPtr, SDValue Arg,
2724                                     SDLoc dl, SelectionDAG &DAG,
2725                                     const CCValAssign &VA,
2726                                     ISD::ArgFlagsTy Flags) const {
2727   unsigned LocMemOffset = VA.getLocMemOffset();
2728   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2729   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2730   if (Flags.isByVal())
2731     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2732
2733   return DAG.getStore(Chain, dl, Arg, PtrOff,
2734                       MachinePointerInfo::getStack(LocMemOffset),
2735                       false, false, 0);
2736 }
2737
2738 /// Emit a load of return address if tail call
2739 /// optimization is performed and it is required.
2740 SDValue
2741 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2742                                            SDValue &OutRetAddr, SDValue Chain,
2743                                            bool IsTailCall, bool Is64Bit,
2744                                            int FPDiff, SDLoc dl) const {
2745   // Adjust the Return address stack slot.
2746   EVT VT = getPointerTy();
2747   OutRetAddr = getReturnAddressFrameIndex(DAG);
2748
2749   // Load the "old" Return address.
2750   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2751                            false, false, false, 0);
2752   return SDValue(OutRetAddr.getNode(), 1);
2753 }
2754
2755 /// Emit a store of the return address if tail call
2756 /// optimization is performed and it is required (FPDiff!=0).
2757 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2758                                         SDValue Chain, SDValue RetAddrFrIdx,
2759                                         EVT PtrVT, unsigned SlotSize,
2760                                         int FPDiff, SDLoc dl) {
2761   // Store the return address to the appropriate stack slot.
2762   if (!FPDiff) return Chain;
2763   // Calculate the new stack slot for the return address.
2764   int NewReturnAddrFI =
2765     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2766                                          false);
2767   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2768   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2769                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2770                        false, false, 0);
2771   return Chain;
2772 }
2773
/// Lower the outgoing call described by \p CLI into SelectionDAG nodes.
///
/// The steps, in order, are:
///  1. Decide whether the call is emitted as a tail call (musttail, a call
///     accepted by IsEligibleForTailCallOptimization, or a sibcall). The
///     decision is written back through CLI.IsTailCall.
///  2. Assign a location to every outgoing operand with CC_X86.
///  3. Emit CALLSEQ_START (skipped for sibcalls), promote/spill the argument
///     values and queue them as register copies or stack stores.
///  4. Add the implicit register operands (EBX for PIC/GOT calls, AL for
///     x86-64 varargs calls, forwarded registers for varargs musttail calls).
///  5. Rewrite a direct callee into a target global address / external symbol
///     carrying the appropriate operand flags.
///  6. Build either an X86ISD::TC_RETURN node (tail calls, returned directly)
///     or an X86ISD::CALL node followed by CALLSEQ_END (the latter skipped
///     for sibcalls).
///
/// \param CLI    Description of the call; CLI.IsTailCall may be modified.
/// \param InVals Passed on to LowerCallResult for non-tail calls.
/// \returns the TC_RETURN node for tail calls, otherwise whatever
///          LowerCallResult returns.
SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG                     = CLI.DAG;
  SDLoc &dl                             = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
  SDValue Chain                         = CLI.Chain;
  SDValue Callee                        = CLI.Callee;
  CallingConv::ID CallConv              = CLI.CallConv;
  // Note: a reference, so updates below are visible through CLI.IsTailCall.
  bool &isTailCall                      = CLI.IsTailCall;
  bool isVarArg                         = CLI.IsVarArg;

  MachineFunction &MF = DAG.getMachineFunction();
  bool Is64Bit        = Subtarget->is64Bit();
  bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
  StructReturnType SR = callIsStructReturn(Outs);
  bool IsSibcall      = false;
  X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();

  if (MF.getTarget().Options.DisableTailCalls)
    isTailCall = false;

  bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
  if (IsMustTail) {
    // Force this to be a tail call.  The verifier rules are enough to ensure
    // that we can lower this successfully without moving the return address
    // around.
    isTailCall = true;
  } else if (isTailCall) {
    // Check if it's really possible to do a tail call.
    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                    isVarArg, SR != NotStructReturn,
                    MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
                    Outs, OutVals, Ins, DAG);

    // Sibcalls are automatically detected tailcalls which do not require
    // ABI changes.
    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
      IsSibcall = true;

    if (isTailCall)
      ++NumTailCalls;
  }

  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
         "Var args not supported with calling convention fastcc, ghc or hipe");

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64
  if (IsWin64)
    CCInfo.AllocateStack(32, 8);

  CCInfo.AnalyzeCallOperands(Outs, CC_X86);

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();
  if (IsSibcall)
    // This is a sibcall. The memory operands are available in caller's
    // own caller's stack.
    NumBytes = 0;
  else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
           IsTailCallConvention(CallConv))
    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

  // FPDiff stays 0 for sibcalls, musttail calls and ordinary calls.
  int FPDiff = 0;
  if (isTailCall && !IsSibcall && !IsMustTail) {
    // Lower arguments at fp - stackoffset + fpdiff.
    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();

    FPDiff = NumBytesCallerPushed - NumBytes;

    // Set the delta of movement of the returnaddr stackslot.
    // Only record it when it is smaller than the previously recorded delta
    // (i.e. the function info keeps the minimum delta seen so far).
    if (FPDiff < X86Info->getTCReturnAddrDelta())
      X86Info->setTCReturnAddrDelta(FPDiff);
  }

  unsigned NumBytesToPush = NumBytes;
  unsigned NumBytesToPop = NumBytes;

  // If we have an inalloca argument, all stack space has already been allocated
  // for us and be right at the top of the stack.  We don't support multiple
  // arguments passed in memory when using inalloca.
  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
    NumBytesToPush = 0;
    if (!ArgLocs.back().isMemLoc())
      report_fatal_error("cannot use inalloca attribute on a register "
                         "parameter");
    if (ArgLocs.back().getLocMemOffset() != 0)
      report_fatal_error("any parameter with the inalloca attribute must be "
                         "the only memory argument");
  }

  // Sibcalls get no CALLSEQ_START/CALLSEQ_END pair at all.
  if (!IsSibcall)
    Chain = DAG.getCALLSEQ_START(
        Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);

  SDValue RetAddrFrIdx;
  // Load return address for tail calls.
  if (isTailCall && FPDiff)
    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
                                    Is64Bit, FPDiff, dl);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  SDValue StackPtr;

  // Walk the register/memloc assignments, inserting copies/loads.  In the case
  // of tail call optimization arguments are handled later.
  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
      DAG.getSubtarget().getRegisterInfo());
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    // Skip inalloca arguments, they have already been written.
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    if (Flags.isInAlloca())
      continue;

    CCValAssign &VA = ArgLocs[i];
    EVT RegVT = VA.getLocVT();
    SDValue Arg = OutVals[i];
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::AExt:
      if (RegVT.is128BitVector()) {
        // Special case: passing MMX values in XMM registers.
        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
      } else
        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
      break;
    case CCValAssign::Indirect: {
      // Store the argument to a stack temporary and pass the address of that
      // temporary in its place.
      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
                           MachinePointerInfo::getFixedStack(FI),
                           false, false, 0);
      Arg = SpillSlot;
      break;
    }
    }

    if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      if (isVarArg && IsWin64) {
        // Win64 ABI requires argument XMM reg to be copied to the corresponding
        // shadow reg if callee is a varargs function.
        unsigned ShadowReg = 0;
        switch (VA.getLocReg()) {
        case X86::XMM0: ShadowReg = X86::RCX; break;
        case X86::XMM1: ShadowReg = X86::RDX; break;
        case X86::XMM2: ShadowReg = X86::R8; break;
        case X86::XMM3: ShadowReg = X86::R9; break;
        }
        if (ShadowReg)
          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
      }
    } else if (!IsSibcall && (!isTailCall || isByVal)) {
      // Memory argument of a non-sibcall: stored here for ordinary calls, and
      // for byval arguments even when tail calling.
      assert(VA.isMemLoc());
      if (!StackPtr.getNode())
        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
                                      getPointerTy());
      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                             dl, DAG, VA, Flags));
    }
  }

  // Join all argument stores into a single chain.
  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  if (Subtarget->isPICStyleGOT()) {
    // ELF / PIC requires GOT in the EBX register before function calls via PLT
    // GOT pointer.
    if (!isTailCall) {
      RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
               DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
    } else {
      // If we are tail calling and generating PIC/GOT style code load the
      // address of the callee into ECX. The value in ecx is used as target of
      // the tail jump. This is done to circumvent the ebx/callee-saved problem
      // for tail calls on PIC/GOT architectures. Normally we would just put the
      // address of GOT into ebx and then call target@PLT. But for tail calls
      // ebx would be restored (since ebx is callee saved) before jumping to the
      // target@PLT.

      // Note: The actual moving to ECX is done further down.
      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
      if (G && !G->getGlobal()->hasHiddenVisibility() &&
          !G->getGlobal()->hasProtectedVisibility())
        Callee = LowerGlobalAddress(Callee, DAG);
      else if (isa<ExternalSymbolSDNode>(Callee))
        Callee = LowerExternalSymbol(Callee, DAG);
    }
  }

  if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
    // From AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
    // registers used and is in the range 0 - 8 inclusive.

    // Count the number of XMM registers allocated.
    static const MCPhysReg XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
    assert((Subtarget->hasSSE1() || !NumXMMRegs)
           && "SSE registers cannot be used when SSE is disabled");

    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
                                        DAG.getConstant(NumXMMRegs, MVT::i8)));
  }

  if (isVarArg && IsMustTail) {
    // Pass along every register recorded in the function info's list of
    // forwarded musttail register parameters, copying each value from its
    // virtual register back into its physical register.
    const auto &Forwards = X86Info->getForwardedMustTailRegParms();
    for (const auto &F : Forwards) {
      SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
      RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
    }
  }

  // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
  // don't need this because the eligibility check rejects calls that require
  // shuffling arguments passed in memory.
  if (!IsSibcall && isTailCall) {
    // Force all the incoming stack arguments to be loaded from the stack
    // before any new outgoing arguments are stored to the stack, because the
    // outgoing stack slots may alias the incoming argument stack slots, and
    // the alias isn't otherwise explicit. This is slightly more conservative
    // than necessary, because it means that each store effectively depends
    // on every argument instead of just those arguments it would clobber.
    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

    SmallVector<SDValue, 8> MemOpChains2;
    SDValue FIN;
    int FI = 0;
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      CCValAssign &VA = ArgLocs[i];
      if (VA.isRegLoc())
        continue;
      assert(VA.isMemLoc());
      SDValue Arg = OutVals[i];
      ISD::ArgFlagsTy Flags = Outs[i].Flags;
      // Skip inalloca arguments.  They don't require any work.
      if (Flags.isInAlloca())
        continue;
      // Create frame index.
      int32_t Offset = VA.getLocMemOffset()+FPDiff;
      uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
      FIN = DAG.getFrameIndex(FI, getPointerTy());

      if (Flags.isByVal()) {
        // Copy relative to framepointer.
        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
        if (!StackPtr.getNode())
          StackPtr = DAG.getCopyFromReg(Chain, dl,
                                        RegInfo->getStackRegister(),
                                        getPointerTy());
        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);

        MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
                                                         ArgChain,
                                                         Flags, DAG, dl));
      } else {
        // Store relative to framepointer.
        MemOpChains2.push_back(
          DAG.getStore(ArgChain, dl, Arg, FIN,
                       MachinePointerInfo::getFixedStack(FI),
                       false, false, 0));
      }
    }

    if (!MemOpChains2.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);

    // Store the return address to the appropriate stack slot.
    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
                                     getPointerTy(), RegInfo->getSlotSize(),
                                     FPDiff, dl);
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into registers.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
    // In the 64-bit large code model, we have to make all calls
    // through a register, since the call instruction's 32-bit
    // pc-relative offset may not be large enough to hold the whole
    // address.  Callee is therefore deliberately left untouched here.
  } else if (Callee->getOpcode() == ISD::GlobalAddress) {
    // If the callee is a GlobalAddress node (quite common, every direct call
    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
    // it.
    GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);

    // We should use extra load for direct calls to dllimported functions in
    // non-JIT mode.
    const GlobalValue *GV = G->getGlobal();
    if (!GV->hasDLLImportStorageClass()) {
      unsigned char OpFlags = 0;
      bool ExtraLoad = false;
      unsigned WrapperKind = ISD::DELETED_NODE;

      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
      // external symbols must go through the PLT in PIC mode.  If the symbol
      // has hidden or protected visibility, or if it is static or local, then
      // we don't need to use the PLT - we can directly call it.
      if (Subtarget->isTargetELF() &&
          DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
        OpFlags = X86II::MO_PLT;
      } else if (Subtarget->isPICStyleStubAny() &&
                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
                 (!Subtarget->getTargetTriple().isMacOSX() ||
                  Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
        // PC-relative references to external symbols should go through $stub,
        // unless we're building with the leopard linker or later, which
        // automatically synthesizes these stubs.
        OpFlags = X86II::MO_DARWIN_STUB;
      } else if (Subtarget->isPICStyleRIPRel() &&
                 isa<Function>(GV) &&
                 cast<Function>(GV)->getAttributes().
                   hasAttribute(AttributeSet::FunctionIndex,
                                Attribute::NonLazyBind)) {
        // If the function is marked as non-lazy, generate an indirect call
        // which loads from the GOT directly. This avoids runtime overhead
        // at the cost of eager binding (and one extra byte of encoding).
        OpFlags = X86II::MO_GOTPCREL;
        WrapperKind = X86ISD::WrapperRIP;
        ExtraLoad = true;
      }

      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
                                          G->getOffset(), OpFlags);

      // Add a wrapper if needed.  The opcode is spelled out rather than taken
      // from WrapperKind; the only value WrapperKind is ever given above is
      // X86ISD::WrapperRIP, so the two are equivalent here.
      if (WrapperKind != ISD::DELETED_NODE)
        Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
      // Add extra indirection if needed.
      if (ExtraLoad)
        Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
                             MachinePointerInfo::getGOT(),
                             false, false, false, 0);
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    unsigned char OpFlags = 0;

    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
    // external symbols should go through the PLT.
    if (Subtarget->isTargetELF() &&
        DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
      OpFlags = X86II::MO_PLT;
    } else if (Subtarget->isPICStyleStubAny() &&
               (!Subtarget->getTargetTriple().isMacOSX() ||
                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
      // PC-relative references to external symbols should go through $stub,
      // unless we're building with the leopard linker or later, which
      // automatically synthesizes these stubs.
      OpFlags = X86II::MO_DARWIN_STUB;
    }

    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
                                         OpFlags);
  } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) {
    // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
    Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
  }

  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SmallVector<SDValue, 8> Ops;

  // For non-sibcall tail calls the call sequence is closed before the
  // TC_RETURN node is created.
  if (!IsSibcall && isTailCall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytesToPop, true),
                               DAG.getIntPtrConstant(0, true), InFlag, dl);
    InFlag = Chain.getValue(1);
  }

  // Operand order: chain, callee, [FPDiff for tail calls], argument
  // registers, register mask, [glue].
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (isTailCall)
    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  if (isTailCall) {
    // We used to do:
    //// If this is the first return lowered for this function, add the regs
    //// to the liveout set for the function.
    // This isn't right, although it's probably harmless on x86; liveouts
    // should be computed from returns not tail calls.  Consider a void
    // function making a tail call to a function returning int.
    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
  }

  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
  InFlag = Chain.getValue(1);

  // Create the CALLSEQ_END node.
  unsigned NumBytesForCalleeToPop;
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                       DAG.getTarget().Options.GuaranteedTailCallOpt))
    NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
  else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
           !Subtarget->getTargetTriple().isOSMSVCRT() &&
           SR == StackStructReturn)
    // If this is a call to a struct-return function, the callee
    // pops the hidden struct pointer, so we have to push it back.
    // This is common for Darwin/X86, Linux & Mingw32 targets.
    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
    NumBytesForCalleeToPop = 4;
  else
    NumBytesForCalleeToPop = 0;  // Callee pops nothing.

  // Returns a flag for retval copy to use.
  if (!IsSibcall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytesToPop, true),
                               DAG.getIntPtrConstant(NumBytesForCalleeToPop,
                                                     true),
                               InFlag, dl);
    InFlag = Chain.getValue(1);
  }

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
                         Ins, dl, DAG, InVals);
}
3249
3250 //===----------------------------------------------------------------------===//
3251 //                Fast Calling Convention (tail call) implementation
3252 //===----------------------------------------------------------------------===//
3253
//  Like the stdcall convention, the callee cleans up the arguments, except
//  that ECX is reserved for storing the tail called function address. Only 2
//  registers are free for argument passing (inreg). Tail call optimization is
//  performed provided:
3258 //                * tailcallopt is enabled
3259 //                * caller/callee are fastcc
3260 //  On X86_64 architecture with GOT-style position independent code only local
3261 //  (within module) calls are supported at the moment.
3262 //  To keep the stack aligned according to platform abi the function
3263 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
3264 //  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3265 //  If a tail called function callee has more arguments than the caller the
3266 //  caller needs to make sure that there is room to move the RETADDR to. This is
3267 //  achieved by reserving an area the size of the argument delta right after the
3268 //  original RETADDR, but before the saved framepointer or the spilled registers
3269 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3270 //  stack layout:
3271 //    arg1
3272 //    arg2
3273 //    RETADDR
3274 //    [ new RETADDR
3275 //      move area ]
3276 //    (possible EBP)
3277 //    ESI
3278 //    EDI
3279 //    local1 ..
3280
3281 /// GetAlignedArgumentStackSize - Round StackSize up to the smallest value of
3282 /// the form StackAlignment * n + (StackAlignment - SlotSize), e.g. 16n + 12
     /// for a 16 byte alignment requirement with 4 byte slots. A StackSize that
     /// already has this form is returned unchanged. The mask arithmetic below
     /// assumes the stack alignment is a power of two.
3283 unsigned
3284 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3285                                                SelectionDAG& DAG) const {
3286   MachineFunction &MF = DAG.getMachineFunction();
3287   const TargetMachine &TM = MF.getTarget();
3288   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
3289       TM.getSubtargetImpl()->getRegisterInfo());
3290   const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
3291   unsigned StackAlignment = TFI.getStackAlignment();
3292   uint64_t AlignMask = StackAlignment - 1;
3293   int64_t Offset = StackSize;
3294   unsigned SlotSize = RegInfo->getSlotSize();
3295   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3296     // The low bits do not exceed StackAlignment - SlotSize (e.g. 12), so just
         // add the difference.
3297     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3298   } else {
3299     // Mask out the low bits, then add the stack alignment once plus
         // StackAlignment - SlotSize (e.g. 12) bytes.
3300     Offset = ((~AlignMask) & Offset) + StackAlignment +
3301       (StackAlignment-SlotSize);
3302   }
3303   return Offset;
3304 }
3305
3306 /// MatchingStackOffset - Return true if the given stack call argument is
3307 /// already available in the same position (relatively) of the caller's
3308 /// incoming argument stack.
     ///
     /// The argument is traced back to a frame index through one of three
     /// shapes (a CopyFromReg of a virtual register, a load, or a byval frame
     /// index); any other shape yields false. The result is true only when that
     /// frame index is a fixed object whose offset equals Offset and whose size
     /// equals the argument's byte size (the byval size for byval arguments).
3309 static
3310 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3311                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3312                          const X86InstrInfo *TII) {
3313   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
       // INT_MAX is the "no frame index found" sentinel checked by the assert
       // below.
3314   int FI = INT_MAX;
3315   if (Arg.getOpcode() == ISD::CopyFromReg) {
         // The argument was copied out of a register: only a virtual register
         // with a known defining instruction can be traced further.
3316     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3317     if (!TargetRegisterInfo::isVirtualRegister(VR))
3318       return false;
3319     MachineInstr *Def = MRI->getVRegDef(VR);
3320     if (!Def)
3321       return false;
3322     if (!Flags.isByVal()) {
           // NOTE(review): isLoadFromStackSlot presumably writes the slot's
           // frame index into FI on success - confirm against X86InstrInfo.
3323       if (!TII->isLoadFromStackSlot(Def, FI))
3324         return false;
3325     } else {
           // A byval argument must be the address of a frame object, formed by
           // an LEA whose operand 1 is a frame index.
3326       unsigned Opcode = Def->getOpcode();
3327       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
3328           Def->getOperand(1).isFI()) {
3329         FI = Def->getOperand(1).getIndex();
3330         Bytes = Flags.getByValSize();
3331       } else
3332         return false;
3333     }
3334   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3335     if (Flags.isByVal())
3336       // ByVal argument is passed in as a pointer but it's now being
3337       // dereferenced. e.g.
3338       // define @foo(%struct.X* %A) {
3339       //   tail call @bar(%struct.X* byval %A)
3340       // }
3341       return false;
         // Only a load whose base pointer is directly a frame index qualifies.
3342     SDValue Ptr = Ld->getBasePtr();
3343     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3344     if (!FINode)
3345       return false;
3346     FI = FINode->getIndex();
3347   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
         // A byval argument given directly as a frame index.
3348     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3349     FI = FINode->getIndex();
3350     Bytes = Flags.getByValSize();
3351   } else
3352     return false;
3353
3354   assert(FI != INT_MAX);
       // Both the offset and the size of the fixed object must match.
3355   if (!MFI->isFixedObjectIndex(FI))
3356     return false;
3357   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
3358 }
3359
3360 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
3361 /// for tail call optimization. Targets which want to do tail call
3362 /// optimization should implement this function.
     ///
     /// When the target option GuaranteedTailCallOpt is set, the answer depends
     /// only on the callee using a tail call convention that matches the
     /// caller's. Otherwise the call is checked against a series of sibcall
     /// restrictions (stack realignment, struct return, calling convention
     /// mismatch, varargs, x87 return values, stack-passed arguments and, on
     /// 32-bit, register pressure for the call address); failing any of them
     /// returns false.
3363 bool
3364 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
3365                                                      CallingConv::ID CalleeCC,
3366                                                      bool isVarArg,
3367                                                      bool isCalleeStructRet,
3368                                                      bool isCallerStructRet,
3369                                                      Type *RetTy,
3370                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
3371                                     const SmallVectorImpl<SDValue> &OutVals,
3372                                     const SmallVectorImpl<ISD::InputArg> &Ins,
3373                                                      SelectionDAG &DAG) const {
       // Only tail call conventions and C calling conventions are considered.
3374   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
3375     return false;
3376
3377   // If -tailcallopt is specified, make fastcc functions tail-callable.
3378   const MachineFunction &MF = DAG.getMachineFunction();
3379   const Function *CallerF = MF.getFunction();
3380
3381   // If the function return type is x86_fp80 and the callee return type is not,
3382   // then the FP_EXTEND of the call result is not a nop. It's not safe to
3383   // perform a tailcall optimization here.
3384   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3385     return false;
3386
3387   CallingConv::ID CallerCC = CallerF->getCallingConv();
3388   bool CCMatch = CallerCC == CalleeCC;
3389   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
3390   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
3391
       // With guaranteed tail call optimization the decision is made here: the
       // callee must use a tail call convention identical to the caller's, and
       // none of the sibcall checks below are run.
3392   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3393     if (IsTailCallConvention(CalleeCC) && CCMatch)
3394       return true;
3395     return false;
3396   }
3397
3398   // Look for obvious safe cases to perform tail call optimization that do not
3399   // require ABI changes. This is what gcc calls sibcall.
3400
3401   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3402   // emit a special epilogue.
3403   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
3404       DAG.getSubtarget().getRegisterInfo());
3405   if (RegInfo->needsStackRealignment(MF))
3406     return false;
3407
3408   // Also avoid sibcall optimization if either caller or callee uses struct
3409   // return semantics.
3410   if (isCalleeStructRet || isCallerStructRet)
3411     return false;
3412
3413   // An stdcall/thiscall caller is expected to clean up its arguments; the
3414   // callee isn't going to do that.
3415   // FIXME: this is more restrictive than needed. We could produce a tailcall
3416   // when the stack adjustment matches. For example, with a thiscall that takes
3417   // only one argument.
3418   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
3419                    CallerCC == CallingConv::X86_ThisCall))
3420     return false;
3421
3422   // Do not sibcall optimize vararg calls unless all arguments are passed via
3423   // registers.
3424   if (isVarArg && !Outs.empty()) {
3425
3426     // Optimizing for varargs on Win64 is unlikely to be safe without
3427     // additional testing.
3428     if (IsCalleeWin64 || IsCallerWin64)
3429       return false;
3430
3431     SmallVector<CCValAssign, 16> ArgLocs;
3432     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
3433                    *DAG.getContext());
3434
3435     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3436     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3437       if (!ArgLocs[i].isRegLoc())
3438         return false;
3439   }
3440
3441   // If the call result is in ST0 / ST1, it needs to be popped off the x87
3442   // stack.  Therefore, if it's not used by the call it is not safe to optimize
3443   // this into a sibcall.
3444   bool Unused = false;
3445   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3446     if (!Ins[i].Used) {
3447       Unused = true;
3448       break;
3449     }
3450   }
3451   if (Unused) {
         // At least one result is unused: reject the call if any result is
         // assigned to FP0 or FP1.
3452     SmallVector<CCValAssign, 16> RVLocs;
3453     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
3454                    *DAG.getContext());
3455     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3456     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3457       CCValAssign &VA = RVLocs[i];
3458       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3459         return false;
3460     }
3461   }
3462
3463   // If the calling conventions do not match, then we'd better make sure the
3464   // results are returned in the same way as what the caller expects.
3465   if (!CCMatch) {
         // Analyze the results once under the callee's convention (RVLocs1) and
         // once under the caller's (RVLocs2), then compare them entry by entry.
3466     SmallVector<CCValAssign, 16> RVLocs1;
3467     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
3468                     *DAG.getContext());
3469     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
3470
3471     SmallVector<CCValAssign, 16> RVLocs2;
3472     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
3473                     *DAG.getContext());
3474     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
3475
3476     if (RVLocs1.size() != RVLocs2.size())
3477       return false;
3478     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
3479       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
3480         return false;
3481       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
3482         return false;
3483       if (RVLocs1[i].isRegLoc()) {
3484         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
3485           return false;
3486       } else {
3487         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
3488           return false;
3489       }
3490     }
3491   }
3492
3493   // If the callee takes no arguments then go on to check the results of the
3494   // call.
3495   if (!Outs.empty()) {
3496     // Check if stack adjustment is needed. For now, do not do this if any
3497     // argument is passed on the stack.
3498     SmallVector<CCValAssign, 16> ArgLocs;
3499     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
3500                    *DAG.getContext());
3501
3502     // Allocate shadow area for Win64
3503     if (IsCalleeWin64)
3504       CCInfo.AllocateStack(32, 8);
3505
3506     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3507     if (CCInfo.getNextStackOffset()) {
           // Some stack space is in use. Give up if the caller itself pops
           // bytes on return.
3508       MachineFunction &MF = DAG.getMachineFunction();
3509       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
3510         return false;
3511
3512       // Check if the arguments are already laid out in the right way as
3513       // the caller's fixed stack objects.
3514       MachineFrameInfo *MFI = MF.getFrameInfo();
3515       const MachineRegisterInfo *MRI = &MF.getRegInfo();
3516       const X86InstrInfo *TII =
3517           static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
3518       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3519         CCValAssign &VA = ArgLocs[i];
3520         SDValue Arg = OutVals[i];
3521         ISD::ArgFlagsTy Flags = Outs[i].Flags;
             // Indirectly passed arguments are never accepted here.
3522         if (VA.getLocInfo() == CCValAssign::Indirect)
3523           return false;
3524         if (!VA.isRegLoc()) {
3525           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3526                                    MFI, MRI, TII))
3527             return false;
3528         }
3529       }
3530     }
3531
3532     // If the tailcall address may be in a register, then make sure it's
3533     // possible to register allocate for it. In 32-bit, the call address can
3534     // only target EAX, EDX, or ECX since the tail call must be scheduled after
3535     // callee-saved registers are restored. These happen to be the same
3536     // registers used to pass 'inreg' arguments so watch out for those.
3537     if (!Subtarget->is64Bit() &&
3538         ((!isa<GlobalAddressSDNode>(Callee) &&
3539           !isa<ExternalSymbolSDNode>(Callee)) ||
3540          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
3541       unsigned NumInRegs = 0;
3542       // In PIC we need an extra register to formulate the address computation
3543       // for the callee.
3544       unsigned MaxInRegs =
3545         (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
3546
           // Count the arguments assigned to EAX, EDX or ECX and give up as soon
           // as that count reaches MaxInRegs.
3547       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3548         CCValAssign &VA = ArgLocs[i];
3549         if (!VA.isRegLoc())
3550           continue;
3551         unsigned Reg = VA.getLocReg();
3552         switch (Reg) {
3553         default: break;
3554         case X86::EAX: case X86::EDX: case X86::ECX:
3555           if (++NumInRegs == MaxInRegs)
3556             return false;
3557           break;
3558         }
3559       }
3560     }
3561   }
3562
3563   return true;
3564 }
3565
3566 FastISel *
3567 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3568                                   const TargetLibraryInfo *libInfo) const {
       // Forward both arguments unchanged to the X86 namespace factory and
       // return whatever it produces.
3569   return X86::createFastISel(funcInfo, libInfo);
3570 }
3571
3572 //===----------------------------------------------------------------------===//
3573 //                           Other Lowering Hooks
3574 //===----------------------------------------------------------------------===//
3575
3576 static bool MayFoldLoad(SDValue Op) {
       // Op is a folding candidate only if it has exactly one use and its node
       // satisfies ISD::isNormalLoad.
3577   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3578 }
3579
3580 static bool MayFoldIntoStore(SDValue Op) {
       // Op must have exactly one use, and the first entry of its node's use
       // list must satisfy ISD::isNormalStore.
3581   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3582 }
3583
3584 static bool isTargetShuffle(unsigned Opcode) {
       // Return true exactly for the X86ISD opcodes listed below; every other
       // opcode takes the default case and yields false.
3585   switch(Opcode) {
3586   default: return false;
3587   case X86ISD::BLENDI:
3588   case X86ISD::PSHUFB:
3589   case X86ISD::PSHUFD:
3590   case X86ISD::PSHUFHW:
3591   case X86ISD::PSHUFLW:
3592   case X86ISD::SHUFP:
3593   case X86ISD::PALIGNR:
3594   case X86ISD::MOVLHPS:
3595   case X86ISD::MOVLHPD:
3596   case X86ISD::MOVHLPS:
3597   case X86ISD::MOVLPS:
3598   case X86ISD::MOVLPD:
3599   case X86ISD::MOVSHDUP:
3600   case X86ISD::MOVSLDUP:
3601   case X86ISD::MOVDDUP:
3602   case X86ISD::MOVSS:
3603   case X86ISD::MOVSD:
3604   case X86ISD::UNPCKL:
3605   case X86ISD::UNPCKH:
3606   case X86ISD::VPERMILPI:
3607   case X86ISD::VPERM2X128:
3608   case X86ISD::VPERMI:
3609     return true;
3610   }
3611 }
3612
3613 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3614                                     SDValue V1, SelectionDAG &DAG) {
       // Single-operand form: build a node of opcode Opc and type VT from V1.
       // Only the opcodes listed below are accepted; anything else reaches
       // llvm_unreachable.
3615   switch(Opc) {
3616   default: llvm_unreachable("Unknown x86 shuffle node");
3617   case X86ISD::MOVSHDUP:
3618   case X86ISD::MOVSLDUP:
3619   case X86ISD::MOVDDUP:
3620     return DAG.getNode(Opc, dl, VT, V1);
3621   }
3622 }
3623
3624 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3625                                     SDValue V1, unsigned TargetMask,
3626                                     SelectionDAG &DAG) {
       // One operand plus immediate: build a node of opcode Opc and type VT from
       // V1 and TargetMask, where TargetMask is emitted as an MVT::i8 constant.
       // Only the opcodes listed below are accepted; anything else reaches
       // llvm_unreachable.
3627   switch(Opc) {
3628   default: llvm_unreachable("Unknown x86 shuffle node");
3629   case X86ISD::PSHUFD:
3630   case X86ISD::PSHUFHW:
3631   case X86ISD::PSHUFLW:
3632   case X86ISD::VPERMILPI:
3633   case X86ISD::VPERMI:
3634     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
3635   }
3636 }
3637
3638 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3639                                     SDValue V1, SDValue V2, unsigned TargetMask,
3640                                     SelectionDAG &DAG) {
       // Two operands plus immediate: build a node of opcode Opc and type VT
       // from V1, V2 and TargetMask, where TargetMask is emitted as an MVT::i8
       // constant. Only the opcodes listed below are accepted; anything else
       // reaches llvm_unreachable.
3641   switch(Opc) {
3642   default: llvm_unreachable("Unknown x86 shuffle node");
3643   case X86ISD::PALIGNR:
3644   case X86ISD::VALIGN:
3645   case X86ISD::SHUFP:
3646   case X86ISD::VPERM2X128:
3647     return DAG.getNode(Opc, dl, VT, V1, V2,
3648                        DAG.getConstant(TargetMask, MVT::i8));
3649   }
3650 }
3651
3652 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3653                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
       // Two-operand form without an immediate: build a node of opcode Opc and
       // type VT from V1 and V2. Only the opcodes listed below are accepted;
       // anything else reaches llvm_unreachable.
3654   switch(Opc) {
3655   default: llvm_unreachable("Unknown x86 shuffle node");
3656   case X86ISD::MOVLHPS:
3657   case X86ISD::MOVLHPD:
3658   case X86ISD::MOVHLPS:
3659   case X86ISD::MOVLPS:
3660   case X86ISD::MOVLPD:
3661   case X86ISD::MOVSS:
3662   case X86ISD::MOVSD:
3663   case X86ISD::UNPCKL:
3664   case X86ISD::UNPCKH:
3665     return DAG.getNode(Opc, dl, VT, V1, V2);
3666   }
3667 }
3668
3669 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
       // Return a frame index node (of pointer type) for the return address
       // slot. The fixed object is created on first use and its index is cached
       // in the function's X86MachineFunctionInfo.
3670   MachineFunction &MF = DAG.getMachineFunction();
3671   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
3672       DAG.getSubtarget().getRegisterInfo());
3673   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3674   int ReturnAddrIndex = FuncInfo->getRAIndex();
3675
       // A cached index of 0 is treated as "not created yet".
3676   if (ReturnAddrIndex == 0) {
3677     // Set up a frame object for the return address.
         // The object is one slot in size and sits at offset -SlotSize.
3678     unsigned SlotSize = RegInfo->getSlotSize();
3679     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3680                                                            -(int64_t)SlotSize,
3681                                                            false);
3682     FuncInfo->setRAIndex(ReturnAddrIndex);
3683   }
3684
3685   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3686 }
3687
3688 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3689                                        bool hasSymbolicDisplacement) {
       // Decide whether Offset may be used as a displacement under code model M.
       // Without a symbolic displacement any signed 32-bit offset is accepted;
       // with one, only the small and kernel code models are handled.
3690   // Offset should fit into 32 bit immediate field.
3691   if (!isInt<32>(Offset))
3692     return false;
3693
3694   // If we don't have a symbolic displacement - we don't have any extra
3695   // restrictions.
3696   if (!hasSymbolicDisplacement)
3697     return true;
3698
3699   // FIXME: Some tweaks might be needed for medium code model.
3700   if (M != CodeModel::Small && M != CodeModel::Kernel)
3701     return false;
3702
3703   // For small code model we assume that latest object is 16MB before end of 31
3704   // bits boundary. We may also accept pretty large negative constants knowing
3705   // that all objects are in the positive half of address space.
3706   if (M == CodeModel::Small && Offset < 16*1024*1024)
3707     return true;
3708
3709   // For kernel code model we know that all objects reside in the negative half
3710   // of 32bits address space. We may not accept negative offsets, since they may
3711   // be just off and we may accept pretty large positive ones.
3712   if (M == CodeModel::Kernel && Offset >= 0)
3713     return true;
3714
3715   return false;
3716 }
3717
3718 /// isCalleePop - Determines whether the callee is required to pop its
3719 /// own arguments. Callee pop is necessary to support tail calls.
     ///
     /// StdCall, FastCall and ThisCall pop their arguments unless is64Bit is
     /// set. Fast, GHC and HiPE pop only when TailCallOpt is set and the
     /// function is not variadic. Every other convention returns false.
3720 bool X86::isCalleePop(CallingConv::ID CallingConv,
3721                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3722   switch (CallingConv) {
3723   default:
3724     return false;
3725   case CallingConv::X86_StdCall:
3726   case CallingConv::X86_FastCall:
3727   case CallingConv::X86_ThisCall:
3728     return !is64Bit;
3729   case CallingConv::Fast:
3730   case CallingConv::GHC:
3731   case CallingConv::HiPE:
3732     if (IsVarArg)
3733       return false;
3734     return TailCallOpt;
3735   }
3736 }
3737
3738 /// \brief Return true if the condition is an unsigned comparison operation.
     ///
     /// The equality conditions COND_E and COND_NE are reported as unsigned.
     /// Only the ten conditions listed below are accepted; any other value
     /// reaches llvm_unreachable.
3739 static bool isX86CCUnsigned(unsigned X86CC) {
3740   switch (X86CC) {
3741   default: llvm_unreachable("Invalid integer condition!");
3742   case X86::COND_E:     return true;
3743   case X86::COND_G:     return false;
3744   case X86::COND_GE:    return false;
3745   case X86::COND_L:     return false;
3746   case X86::COND_LE:    return false;
3747   case X86::COND_NE:    return true;
3748   case X86::COND_B:     return true;
3749   case X86::COND_A:     return true;
3750   case X86::COND_BE:    return true;
3751   case X86::COND_AE:    return true;
3752   }
3753   llvm_unreachable("covered switch fell through?!");
3754 }
3755
3756 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
3757 /// specific condition code, returning the condition code and the LHS/RHS of the
3758 /// comparison to make.
     ///
     /// LHS and RHS are in/out parameters: RHS may be replaced by a zero
     /// constant (integer case) and the two may be swapped (floating point
     /// case). For floating point, SETOEQ and SETUNE yield X86::COND_INVALID.
3759 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3760                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3761   if (!isFP) {
         // Integer comparisons against a few special constants are rewritten
         // into comparisons against zero.
3762     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3763       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3764         // X > -1   -> compare X with 0, jump on !sign.
3765         RHS = DAG.getConstant(0, RHS.getValueType());
3766         return X86::COND_NS;
3767       }
3768       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3769         // X < 0   -> compare X with 0 (RHS is already 0), jump on sign.
3770         return X86::COND_S;
3771       }
3772       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3773         // X < 1   -> X <= 0
3774         RHS = DAG.getConstant(0, RHS.getValueType());
3775         return X86::COND_LE;
3776       }
3777     }
3778
         // Plain one to one mapping for the remaining integer conditions.
3779     switch (SetCCOpcode) {
3780     default: llvm_unreachable("Invalid integer condition!");
3781     case ISD::SETEQ:  return X86::COND_E;
3782     case ISD::SETGT:  return X86::COND_G;
3783     case ISD::SETGE:  return X86::COND_GE;
3784     case ISD::SETLT:  return X86::COND_L;
3785     case ISD::SETLE:  return X86::COND_LE;
3786     case ISD::SETNE:  return X86::COND_NE;
3787     case ISD::SETULT: return X86::COND_B;
3788     case ISD::SETUGT: return X86::COND_A;
3789     case ISD::SETULE: return X86::COND_BE;
3790     case ISD::SETUGE: return X86::COND_AE;
3791     }
3792   }
3793
3794   // First determine if it is required or is profitable to flip the operands.
3795
3796   // If LHS is a foldable load, but RHS is not, flip the condition.
3797   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3798       !ISD::isNON_EXTLoad(RHS.getNode())) {
3799     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3800     std::swap(LHS, RHS);
3801   }
3802
       // These four conditions are handled by swapping the operands; they are
       // the cases marked "flipped" in the mapping below.
3803   switch (SetCCOpcode) {
3804   default: break;
3805   case ISD::SETOLT:
3806   case ISD::SETOLE:
3807   case ISD::SETUGT:
3808   case ISD::SETUGE:
3809     std::swap(LHS, RHS);
3810     break;
3811   }
3812
3813   // On a floating point condition, the flags are set as follows:
3814   // ZF  PF  CF   op
3815   //  0 | 0 | 0 | X > Y
3816   //  0 | 0 | 1 | X < Y
3817   //  1 | 0 | 0 | X == Y
3818   //  1 | 1 | 1 | unordered
3819   switch (SetCCOpcode) {
3820   default: llvm_unreachable("Condcode should be pre-legalized away");
3821   case ISD::SETUEQ:
3822   case ISD::SETEQ:   return X86::COND_E;
3823   case ISD::SETOLT:              // flipped
3824   case ISD::SETOGT:
3825   case ISD::SETGT:   return X86::COND_A;
3826   case ISD::SETOLE:              // flipped
3827   case ISD::SETOGE:
3828   case ISD::SETGE:   return X86::COND_AE;
3829   case ISD::SETUGT:              // flipped
3830   case ISD::SETULT:
3831   case ISD::SETLT:   return X86::COND_B;
3832   case ISD::SETUGE:              // flipped
3833   case ISD::SETULE:
3834   case ISD::SETLE:   return X86::COND_BE;
3835   case ISD::SETONE:
3836   case ISD::SETNE:   return X86::COND_NE;
3837   case ISD::SETUO:   return X86::COND_P;
3838   case ISD::SETO:    return X86::COND_NP;
       // No single condition code is returned for these two.
3839   case ISD::SETOEQ:
3840   case ISD::SETUNE:  return X86::COND_INVALID;
3841   }
3842 }
3843
3844 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
3845 /// code. Current x86 isa includes the following FP cmov instructions:
3846 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
     ///
     /// Returns true for COND_B, COND_BE, COND_E, COND_P, COND_A, COND_AE,
     /// COND_NE and COND_NP, and false for every other condition code.
3847 static bool hasFPCMov(unsigned X86CC) {
3848   switch (X86CC) {
3849   default:
3850     return false;
3851   case X86::COND_B:
3852   case X86::COND_BE:
3853   case X86::COND_E:
3854   case X86::COND_P:
3855   case X86::COND_A:
3856   case X86::COND_AE:
3857   case X86::COND_NE:
3858   case X86::COND_NP:
3859     return true;
3860   }
3861 }
3862
3863 /// isFPImmLegal - Returns true if the target can instruction select the
3864 /// specified FP immediate natively. If false, the legalizer will
3865 /// materialize the FP immediate as a load from a constant pool.
     ///
     /// The check is a linear search of LegalFPImmediates for an entry that is
     /// bitwise equal to Imm; VT is not consulted.
3866 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3867   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3868     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3869       return true;
3870   }
3871   return false;
3872 }
3873
3874 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
3875                                               ISD::LoadExtType ExtTy,
3876                                               EVT NewVT) const {
       // Returns false only when the load's base pointer is an
       // X86ISD::WrapperRIP around a global address flagged MO_GOTTPOFF; every
       // other load may be narrowed. Load must be a LoadSDNode (it is cast
       // unconditionally); ExtTy and NewVT are not consulted.
3877   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3878   // relocation target a movq or addq instruction: don't let the load shrink.
3879   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3880   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3881     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3882       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3883   return true;
3884 }
3885
3886 /// \brief Returns true if it is beneficial to convert a load of a constant
3887 /// to just the constant itself.
     ///
     /// Ty must be an integer type (asserted). The answer is true exactly when
     /// its primitive size is between 1 and 64 bits inclusive; the value of Imm
     /// is not consulted.
3888 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
3889                                                           Type *Ty) const {
3890   assert(Ty->isIntegerTy());
3891
3892   unsigned BitSize = Ty->getPrimitiveSizeInBits();
3893   if (BitSize == 0 || BitSize > 64)
3894     return false;
3895   return true;
3896 }
3897
3898 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
3899                                                 unsigned Index) const {
       // Extraction is considered cheap only when EXTRACT_SUBVECTOR is legal or
       // custom for ResVT and Index is either 0 or equal to the number of
       // elements in ResVT.
3900   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
3901     return false;
3902
3903   return (Index == 0 || Index == ResVT.getVectorNumElements());
3904 }
3905
3906 bool X86TargetLowering::isCheapToSpeculateCttz() const {
3907   // Speculate cttz only if we can directly use TZCNT.
       // Availability of TZCNT is taken from the subtarget's BMI feature query.
3908   return Subtarget->hasBMI();
3909 }
3910
3911 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
3912   // Speculate ctlz only if we can directly use LZCNT.
       // Availability of LZCNT is taken from the subtarget's LZCNT feature query.
3913   return Subtarget->hasLZCNT();
3914 }
3915
3916 /// isUndefOrInRange - Return true if Val is undef (negative) or if its value
3917 /// falls within the half-open range [Low, Hi).
3918 static bool isUndefOrInRange(int Val, int Low, int Hi) {
3919   return (Val < 0) || (Val >= Low && Val < Hi);
3920 }
3921
3922 /// isUndefOrEqual - Return true if Val is either less than zero (undef) or
3923 /// equal to the specified value CmpVal.
3924 static bool isUndefOrEqual(int Val, int CmpVal) {
3925   return (Val < 0 || Val == CmpVal);
3926 }
3927
3928 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3929 /// from position Pos and ending before Pos+Size, is either undef or equal to
3930 /// the matching value of the sequence Low, Low+1, ..., Low+Size-1.
3931 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3932                                        unsigned Pos, unsigned Size, int Low) {
       // Low advances together with i, so it always holds the value expected at
       // the current position.
3933   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3934     if (!isUndefOrEqual(Mask[i], Low))
3935       return false;
3936   return true;
3937 }
3938
3939 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3940 /// is suitable for input to PSHUFD. That is, it doesn't reference the other
3941 /// operand - by default will match for first operand.
     ///
     /// Only v4f32, v4i32, v2f64 and v2i64 are accepted. With TestSecondOperand
     /// set, every defined mask element must instead refer to the second
     /// operand.
3942 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
3943                          bool TestSecondOperand = false) {
3944   if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
3945       VT != MVT::v2f64 && VT != MVT::v2i64)
3946     return false;
3947
       // [Lo, Hi) is the index range of the operand being tested: the first
       // operand covers [0, NumElems), the second [NumElems, 2 * NumElems).
3948   unsigned NumElems = VT.getVectorNumElements();
3949   unsigned Lo = TestSecondOperand ? NumElems : 0;
3950   unsigned Hi = Lo + NumElems;
3951
3952   for (unsigned i = 0; i < NumElems; ++i)
3953     if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
3954       return false;
3955
3956   return true;
3957 }
3958
3959 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3960 /// is suitable for input to PSHUFHW.
     ///
     /// Accepts v8i16, and v16i16 when HasInt256 is set. In each group of eight
     /// elements the low four must stay in place (or be undef) and the high
     /// four must be undef or chosen from the high four of the same group.
3961 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3962   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3963     return false;
3964
3965   // Lower quadword copied in order or undef.
3966   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3967     return false;
3968
3969   // Upper quadword shuffled.
3970   for (unsigned i = 4; i != 8; ++i)
3971     if (!isUndefOrInRange(Mask[i], 4, 8))
3972       return false;
3973
       // For v16i16, repeat the same two checks on elements 8..15.
3974   if (VT == MVT::v16i16) {
3975     // Lower quadword copied in order or undef.
3976     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3977       return false;
3978
3979     // Upper quadword shuffled.
3980     for (unsigned i = 12; i != 16; ++i)
3981       if (!isUndefOrInRange(Mask[i], 12, 16))
3982         return false;
3983   }
3984
3985   return true;
3986 }
3987
3988 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
3989 /// is suitable for input to PSHUFLW.
     ///
     /// Accepts v8i16, and v16i16 when HasInt256 is set. In each group of eight
     /// elements the high four must stay in place (or be undef) and the low
     /// four must be undef or chosen from the low four of the same group.
3990 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3991   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3992     return false;
3993
3994   // Upper quadword copied in order or undef.
3995   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
3996     return false;
3997
3998   // Lower quadword shuffled.
3999   for (unsigned i = 0; i != 4; ++i)
4000     if (!isUndefOrInRange(Mask[i], 0, 4))
4001       return false;
4002
       // For v16i16, repeat the same two checks on elements 8..15.
4003   if (VT == MVT::v16i16) {
4004     // Upper quadword copied in order or undef.
4005     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
4006       return false;
4007
4008     // Lower quadword shuffled.
4009     for (unsigned i = 8; i != 12; ++i)
4010       if (!isUndefOrInRange(Mask[i], 8, 12))
4011         return false;
4012   }
4013
4014   return true;
4015 }
4016
4017 /// \brief Return true if the mask specifies a shuffle of elements that is
4018 /// suitable for input to intralane (palignr) or interlane (valign) vector
4019 /// right-shift.
     ///
     /// With InterLane set the whole vector is treated as a single lane;
     /// otherwise the mask is checked per 128-bit lane and every lane must
     /// repeat the pattern of lane 0.
4020 static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
4021   unsigned NumElts = VT.getVectorNumElements();
4022   unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
4023   unsigned NumLaneElts = NumElts/NumLanes;
4024
4025   // Do not handle 64-bit element shuffles with palignr.
4026   if (NumLaneElts == 2)
4027     return false;
4028
       // l is the index of the first element of the lane being examined.
4029   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
         // Find the first defined (non-negative) element of this lane.
4030     unsigned i;
4031     for (i = 0; i != NumLaneElts; ++i) {
4032       if (Mask[i+l] >= 0)
4033         break;
4034     }
4035
4036     // Lane is all undef, go to next lane
4037     if (i == NumLaneElts)
4038       continue;
4039
4040     int Start = Mask[i+l];
4041
4042     // Make sure its in this lane in one of the sources
4043     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
4044         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
4045       return false;
4046
4047     // If not lane 0, then we must match lane 0
4048     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
4049       return false;
4050
4051     // Correct second source to be contiguous with first source
4052     if (Start >= (int)NumElts)
4053       Start -= NumElts - NumLaneElts;
4054
4055     // Make sure we're shifting in the right direction.
4056     if (Start <= (int)(i+l))
4057       return false;
4058
         // Rebase Start to the value expected at position 0 of the lane, so the
         // element at position i is expected to be Start + i.
4059     Start -= i;
4060
4061     // Check the rest of the elements to see if they are consecutive.
4062     for (++i; i != NumLaneElts; ++i) {
4063       int Idx = Mask[i+l];
4064
4065       // Make sure its in this lane
4066       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
4067           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
4068         return false;
4069
4070       // If not lane 0, then we must match lane 0
4071       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
4072         return false;
4073
           // Same second-source correction as applied to Start above.
4074       if (Idx >= (int)NumElts)
4075         Idx -= NumElts - NumLaneElts;
4076
4077       if (!isUndefOrEqual(Idx, Start+i))
4078         return false;
4079
4080     }
4081   }
4082
4083   return true;
4084 }
4085
4086 /// \brief Return true if the node specifies a shuffle of elements that is
4087 /// suitable for input to PALIGNR.
4088 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
4089                           const X86Subtarget *Subtarget) {
4090   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
4091       (VT.is256BitVector() && !Subtarget->hasInt256()) ||
4092       VT.is512BitVector())
4093     // FIXME: Add AVX512BW.
4094     return false;
4095
4096   return isAlignrMask(Mask, VT, false);
4097 }
4098
4099 /// \brief Return true if the node specifies a shuffle of elements that is
4100 /// suitable for input to VALIGN.
4101 static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
4102                           const X86Subtarget *Subtarget) {
4103   // FIXME: Add AVX512VL.
4104   if (!VT.is512BitVector() || !Subtarget->hasAVX512())
4105     return false;
4106   return isAlignrMask(Mask, VT, true);
4107 }
4108
4109 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
4110 /// the two vector operands have swapped position.
4111 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
4112                                      unsigned NumElems) {
4113   for (unsigned i = 0; i != NumElems; ++i) {
4114     int idx = Mask[i];
4115     if (idx < 0)
4116       continue;
4117     else if (idx < (int)NumElems)
4118       Mask[i] = idx + NumElems;
4119     else
4120       Mask[i] = idx - NumElems;
4121   }
4122 }
4123
4124 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
4125 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
4126 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
4127 /// reverse of what x86 shuffles want.
4128 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
4129
4130   unsigned NumElems = VT.getVectorNumElements();
4131   unsigned NumLanes = VT.getSizeInBits()/128;
4132   unsigned NumLaneElems = NumElems/NumLanes;
4133
4134   if (NumLaneElems != 2 && NumLaneElems != 4)
4135     return false;
4136
4137   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4138   bool symetricMaskRequired =
4139     (VT.getSizeInBits() >= 256) && (EltSize == 32);
4140
4141   // VSHUFPSY divides the resulting vector into 4 chunks.
4142   // The sources are also splitted into 4 chunks, and each destination
4143   // chunk must come from a different source chunk.
4144   //
4145   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
4146   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
4147   //
4148   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
4149   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
4150   //
4151   // VSHUFPDY divides the resulting vector into 4 chunks.
4152   // The sources are also splitted into 4 chunks, and each destination
4153   // chunk must come from a different source chunk.
4154   //
4155   //  SRC1 =>      X3       X2       X1       X0
4156   //  SRC2 =>      Y3       Y2       Y1       Y0
4157   //
4158   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
4159   //
4160   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
4161   unsigned HalfLaneElems = NumLaneElems/2;
4162   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
4163     for (unsigned i = 0; i != NumLaneElems; ++i) {
4164       int Idx = Mask[i+l];
4165       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
4166       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
4167         return false;
4168       // For VSHUFPSY, the mask of the second half must be the same as the
4169       // first but with the appropriate offsets. This works in the same way as
4170       // VPERMILPS works with masks.
4171       if (!symetricMaskRequired || Idx < 0)
4172         continue;
4173       if (MaskVal[i] < 0) {
4174         MaskVal[i] = Idx - l;
4175         continue;
4176       }
4177       if ((signed)(Idx - l) != MaskVal[i])
4178         return false;
4179     }
4180   }
4181
4182   return true;
4183 }
4184
4185 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
4186 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
4187 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
4188   if (!VT.is128BitVector())
4189     return false;
4190
4191   unsigned NumElems = VT.getVectorNumElements();
4192
4193   if (NumElems != 4)
4194     return false;
4195
4196   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
4197   return isUndefOrEqual(Mask[0], 6) &&
4198          isUndefOrEqual(Mask[1], 7) &&
4199          isUndefOrEqual(Mask[2], 2) &&
4200          isUndefOrEqual(Mask[3], 3);
4201 }
4202
4203 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
4204 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
4205 /// <2, 3, 2, 3>
4206 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
4207   if (!VT.is128BitVector())
4208     return false;
4209
4210   unsigned NumElems = VT.getVectorNumElements();
4211
4212   if (NumElems != 4)
4213     return false;
4214
4215   return isUndefOrEqual(Mask[0], 2) &&
4216          isUndefOrEqual(Mask[1], 3) &&
4217          isUndefOrEqual(Mask[2], 2) &&
4218          isUndefOrEqual(Mask[3], 3);
4219 }
4220
4221 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
4222 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
4223 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
4224   if (!VT.is128BitVector())
4225     return false;
4226
4227   unsigned NumElems = VT.getVectorNumElements();
4228
4229   if (NumElems != 2 && NumElems != 4)
4230     return false;
4231
4232   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4233     if (!isUndefOrEqual(Mask[i], i + NumElems))
4234       return false;
4235
4236   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4237     if (!isUndefOrEqual(Mask[i], i))
4238       return false;
4239
4240   return true;
4241 }
4242
4243 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
4244 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
4245 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
4246   if (!VT.is128BitVector())
4247     return false;
4248
4249   unsigned NumElems = VT.getVectorNumElements();
4250
4251   if (NumElems != 2 && NumElems != 4)
4252     return false;
4253
4254   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4255     if (!isUndefOrEqual(Mask[i], i))
4256       return false;
4257
4258   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4259     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
4260       return false;
4261
4262   return true;
4263 }
4264
4265 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
4266 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
4267 /// i. e: If all but one element come from the same vector.
4268 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
4269   // TODO: Deal with AVX's VINSERTPS
4270   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
4271     return false;
4272
4273   unsigned CorrectPosV1 = 0;
4274   unsigned CorrectPosV2 = 0;
4275   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
4276     if (Mask[i] == -1) {
4277       ++CorrectPosV1;
4278       ++CorrectPosV2;
4279       continue;
4280     }
4281
4282     if (Mask[i] == i)
4283       ++CorrectPosV1;
4284     else if (Mask[i] == i + 4)
4285       ++CorrectPosV2;
4286   }
4287
4288   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
4289     // We have 3 elements (undefs count as elements from any vector) from one
4290     // vector, and one from another.
4291     return true;
4292
4293   return false;
4294 }
4295
4296 //
4297 // Some special combinations that can be optimized.
4298 //
4299 static
4300 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
4301                                SelectionDAG &DAG) {
4302   MVT VT = SVOp->getSimpleValueType(0);
4303   SDLoc dl(SVOp);
4304
4305   if (VT != MVT::v8i32 && VT != MVT::v8f32)
4306     return SDValue();
4307
4308   ArrayRef<int> Mask = SVOp->getMask();
4309
4310   // These are the special masks that may be optimized.
4311   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
4312   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
4313   bool MatchEvenMask = true;
4314   bool MatchOddMask  = true;
4315   for (int i=0; i<8; ++i) {
4316     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
4317       MatchEvenMask = false;
4318     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
4319       MatchOddMask = false;
4320   }
4321
4322   if (!MatchEvenMask && !MatchOddMask)
4323     return SDValue();
4324
4325   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
4326
4327   SDValue Op0 = SVOp->getOperand(0);
4328   SDValue Op1 = SVOp->getOperand(1);
4329
4330   if (MatchEvenMask) {
4331     // Shift the second operand right to 32 bits.
4332     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
4333     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
4334   } else {
4335     // Shift the first operand left to 32 bits.
4336     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
4337     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
4338   }
4339   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
4340   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
4341 }
4342
4343 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
4344 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
4345 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
4346                          bool HasInt256, bool V2IsSplat = false) {
4347
4348   assert(VT.getSizeInBits() >= 128 &&
4349          "Unsupported vector type for unpckl");
4350
4351   unsigned NumElts = VT.getVectorNumElements();
4352   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4353       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4354     return false;
4355
4356   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4357          "Unsupported vector type for unpckh");
4358
4359   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4360   unsigned NumLanes = VT.getSizeInBits()/128;
4361   unsigned NumLaneElts = NumElts/NumLanes;
4362
4363   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4364     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4365       int BitI  = Mask[l+i];
4366       int BitI1 = Mask[l+i+1];
4367       if (!isUndefOrEqual(BitI, j))
4368         return false;
4369       if (V2IsSplat) {
4370         if (!isUndefOrEqual(BitI1, NumElts))
4371           return false;
4372       } else {
4373         if (!isUndefOrEqual(BitI1, j + NumElts))
4374           return false;
4375       }
4376     }
4377   }
4378
4379   return true;
4380 }
4381
4382 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
4383 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
4384 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
4385                          bool HasInt256, bool V2IsSplat = false) {
4386   assert(VT.getSizeInBits() >= 128 &&
4387          "Unsupported vector type for unpckh");
4388
4389   unsigned NumElts = VT.getVectorNumElements();
4390   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4391       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4392     return false;
4393
4394   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4395          "Unsupported vector type for unpckh");
4396
4397   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4398   unsigned NumLanes = VT.getSizeInBits()/128;
4399   unsigned NumLaneElts = NumElts/NumLanes;
4400
4401   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4402     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4403       int BitI  = Mask[l+i];
4404       int BitI1 = Mask[l+i+1];
4405       if (!isUndefOrEqual(BitI, j))
4406         return false;
4407       if (V2IsSplat) {
4408         if (isUndefOrEqual(BitI1, NumElts))
4409           return false;
4410       } else {
4411         if (!isUndefOrEqual(BitI1, j+NumElts))
4412           return false;
4413       }
4414     }
4415   }
4416   return true;
4417 }
4418
4419 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
4420 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
4421 /// <0, 0, 1, 1>
4422 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4423   unsigned NumElts = VT.getVectorNumElements();
4424   bool Is256BitVec = VT.is256BitVector();
4425
4426   if (VT.is512BitVector())
4427     return false;
4428   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4429          "Unsupported vector type for unpckh");
4430
4431   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
4432       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4433     return false;
4434
4435   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
4436   // FIXME: Need a better way to get rid of this, there's no latency difference
4437   // between UNPCKLPD and MOVDDUP, the later should always be checked first and
4438   // the former later. We should also remove the "_undef" special mask.
4439   if (NumElts == 4 && Is256BitVec)
4440     return false;
4441
4442   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4443   // independently on 128-bit lanes.
4444   unsigned NumLanes = VT.getSizeInBits()/128;
4445   unsigned NumLaneElts = NumElts/NumLanes;
4446
4447   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4448     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4449       int BitI  = Mask[l+i];
4450       int BitI1 = Mask[l+i+1];
4451
4452       if (!isUndefOrEqual(BitI, j))
4453         return false;
4454       if (!isUndefOrEqual(BitI1, j))
4455         return false;
4456     }
4457   }
4458
4459   return true;
4460 }
4461
4462 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
4463 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
4464 /// <2, 2, 3, 3>
4465 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4466   unsigned NumElts = VT.getVectorNumElements();
4467
4468   if (VT.is512BitVector())
4469     return false;
4470
4471   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4472          "Unsupported vector type for unpckh");
4473
4474   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4475       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4476     return false;
4477
4478   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4479   // independently on 128-bit lanes.
4480   unsigned NumLanes = VT.getSizeInBits()/128;
4481   unsigned NumLaneElts = NumElts/NumLanes;
4482
4483   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4484     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4485       int BitI  = Mask[l+i];
4486       int BitI1 = Mask[l+i+1];
4487       if (!isUndefOrEqual(BitI, j))
4488         return false;
4489       if (!isUndefOrEqual(BitI1, j))
4490         return false;
4491     }
4492   }
4493   return true;
4494 }
4495
4496 // Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
4497 // (src1[0], src0[1]), manipulation with 256-bit sub-vectors
4498 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
4499   if (!VT.is512BitVector())
4500     return false;
4501
4502   unsigned NumElts = VT.getVectorNumElements();
4503   unsigned HalfSize = NumElts/2;
4504   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
4505     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
4506       *Imm = 1;
4507       return true;
4508     }
4509   }
4510   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
4511     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
4512       *Imm = 0;
4513       return true;
4514     }
4515   }
4516   return false;
4517 }
4518
4519 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
4520 /// specifies a shuffle of elements that is suitable for input to MOVSS,
4521 /// MOVSD, and MOVD, i.e. setting the lowest element.
4522 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
4523   if (VT.getVectorElementType().getSizeInBits() < 32)
4524     return false;
4525   if (!VT.is128BitVector())
4526     return false;
4527
4528   unsigned NumElts = VT.getVectorNumElements();
4529
4530   if (!isUndefOrEqual(Mask[0], NumElts))
4531     return false;
4532
4533   for (unsigned i = 1; i != NumElts; ++i)
4534     if (!isUndefOrEqual(Mask[i], i))
4535       return false;
4536
4537   return true;
4538 }
4539
4540 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
4541 /// as permutations between 128-bit chunks or halves. As an example: this
4542 /// shuffle bellow:
4543 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
4544 /// The first half comes from the second half of V1 and the second half from the
4545 /// the second half of V2.
4546 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4547   if (!HasFp256 || !VT.is256BitVector())
4548     return false;
4549
4550   // The shuffle result is divided into half A and half B. In total the two
4551   // sources have 4 halves, namely: C, D, E, F. The final values of A and
4552   // B must come from C, D, E or F.
4553   unsigned HalfSize = VT.getVectorNumElements()/2;
4554   bool MatchA = false, MatchB = false;
4555
4556   // Check if A comes from one of C, D, E, F.
4557   for (unsigned Half = 0; Half != 4; ++Half) {
4558     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
4559       MatchA = true;
4560       break;
4561     }
4562   }
4563
4564   // Check if B comes from one of C, D, E, F.
4565   for (unsigned Half = 0; Half != 4; ++Half) {
4566     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
4567       MatchB = true;
4568       break;
4569     }
4570   }
4571
4572   return MatchA && MatchB;
4573 }
4574
4575 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
4576 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
4577 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
4578   MVT VT = SVOp->getSimpleValueType(0);
4579
4580   unsigned HalfSize = VT.getVectorNumElements()/2;
4581
4582   unsigned FstHalf = 0, SndHalf = 0;
4583   for (unsigned i = 0; i < HalfSize; ++i) {
4584     if (SVOp->getMaskElt(i) > 0) {
4585       FstHalf = SVOp->getMaskElt(i)/HalfSize;
4586       break;
4587     }
4588   }
4589   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
4590     if (SVOp->getMaskElt(i) > 0) {
4591       SndHalf = SVOp->getMaskElt(i)/HalfSize;
4592       break;
4593     }
4594   }
4595
4596   return (FstHalf | (SndHalf << 4));
4597 }
4598
4599 // Symetric in-lane mask. Each lane has 4 elements (for imm8)
4600 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
4601   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4602   if (EltSize < 32)
4603     return false;
4604
4605   unsigned NumElts = VT.getVectorNumElements();
4606   Imm8 = 0;
4607   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
4608     for (unsigned i = 0; i != NumElts; ++i) {
4609       if (Mask[i] < 0)
4610         continue;
4611       Imm8 |= Mask[i] << (i*2);
4612     }
4613     return true;
4614   }
4615
4616   unsigned LaneSize = 4;
4617   SmallVector<int, 4> MaskVal(LaneSize, -1);
4618
4619   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4620     for (unsigned i = 0; i != LaneSize; ++i) {
4621       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4622         return false;
4623       if (Mask[i+l] < 0)
4624         continue;
4625       if (MaskVal[i] < 0) {
4626         MaskVal[i] = Mask[i+l] - l;
4627         Imm8 |= MaskVal[i] << (i*2);
4628         continue;
4629       }
4630       if (Mask[i+l] != (signed)(MaskVal[i]+l))
4631         return false;
4632     }
4633   }
4634   return true;
4635 }
4636
4637 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
4638 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
4639 /// Note that VPERMIL mask matching is different depending whether theunderlying
4640 /// type is 32 or 64. In the VPERMILPS the high half of the mask should point
4641 /// to the same elements of the low, but to the higher half of the source.
4642 /// In VPERMILPD the two lanes could be shuffled independently of each other
4643 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
4644 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
4645   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4646   if (VT.getSizeInBits() < 256 || EltSize < 32)
4647     return false;
4648   bool symetricMaskRequired = (EltSize == 32);
4649   unsigned NumElts = VT.getVectorNumElements();
4650
4651   unsigned NumLanes = VT.getSizeInBits()/128;
4652   unsigned LaneSize = NumElts/NumLanes;
4653   // 2 or 4 elements in one lane
4654
4655   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
4656   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4657     for (unsigned i = 0; i != LaneSize; ++i) {
4658       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4659         return false;
4660       if (symetricMaskRequired) {
4661         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
4662           ExpectedMaskVal[i] = Mask[i+l] - l;
4663           continue;
4664         }
4665         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
4666           return false;
4667       }
4668     }
4669   }
4670   return true;
4671 }
4672
4673 /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
4674 /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
4675 /// element of vector 2 and the other elements to come from vector 1 in order.
4676 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
4677                                bool V2IsSplat = false, bool V2IsUndef = false) {
4678   if (!VT.is128BitVector())
4679     return false;
4680
4681   unsigned NumOps = VT.getVectorNumElements();
4682   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
4683     return false;
4684
4685   if (!isUndefOrEqual(Mask[0], 0))
4686     return false;
4687
4688   for (unsigned i = 1; i != NumOps; ++i)
4689     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
4690           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
4691           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
4692       return false;
4693
4694   return true;
4695 }
4696
4697 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4698 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
4699 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
4700 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
4701                            const X86Subtarget *Subtarget) {
4702   if (!Subtarget->hasSSE3())
4703     return false;
4704
4705   unsigned NumElems = VT.getVectorNumElements();
4706
4707   if ((VT.is128BitVector() && NumElems != 4) ||
4708       (VT.is256BitVector() && NumElems != 8) ||
4709       (VT.is512BitVector() && NumElems != 16))
4710     return false;
4711
4712   // "i+1" is the value the indexed mask element must have
4713   for (unsigned i = 0; i != NumElems; i += 2)
4714     if (!isUndefOrEqual(Mask[i], i+1) ||
4715         !isUndefOrEqual(Mask[i+1], i+1))
4716       return false;
4717
4718   return true;
4719 }
4720
4721 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4722 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
4723 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
4724 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
4725                            const X86Subtarget *Subtarget) {
4726   if (!Subtarget->hasSSE3())
4727     return false;
4728
4729   unsigned NumElems = VT.getVectorNumElements();
4730
4731   if ((VT.is128BitVector() && NumElems != 4) ||
4732       (VT.is256BitVector() && NumElems != 8) ||
4733       (VT.is512BitVector() && NumElems != 16))
4734     return false;
4735
4736   // "i" is the value the indexed mask element must have
4737   for (unsigned i = 0; i != NumElems; i += 2)
4738     if (!isUndefOrEqual(Mask[i], i) ||
4739         !isUndefOrEqual(Mask[i+1], i))
4740       return false;
4741
4742   return true;
4743 }
4744
4745 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
4746 /// specifies a shuffle of elements that is suitable for input to 256-bit
4747 /// version of MOVDDUP.
4748 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4749   if (!HasFp256 || !VT.is256BitVector())
4750     return false;
4751
4752   unsigned NumElts = VT.getVectorNumElements();
4753   if (NumElts != 4)
4754     return false;
4755
4756   for (unsigned i = 0; i != NumElts/2; ++i)
4757     if (!isUndefOrEqual(Mask[i], 0))
4758       return false;
4759   for (unsigned i = NumElts/2; i != NumElts; ++i)
4760     if (!isUndefOrEqual(Mask[i], NumElts/2))
4761       return false;
4762   return true;
4763 }
4764
4765 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4766 /// specifies a shuffle of elements that is suitable for input to 128-bit
4767 /// version of MOVDDUP.
4768 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
4769   if (!VT.is128BitVector())
4770     return false;
4771
4772   unsigned e = VT.getVectorNumElements() / 2;
4773   for (unsigned i = 0; i != e; ++i)
4774     if (!isUndefOrEqual(Mask[i], i))
4775       return false;
4776   for (unsigned i = 0; i != e; ++i)
4777     if (!isUndefOrEqual(Mask[e+i], i))
4778       return false;
4779   return true;
4780 }
4781
4782 /// isVEXTRACTIndex - Return true if the specified
4783 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
4784 /// suitable for instruction that extract 128 or 256 bit vectors
4785 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4786   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4787   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4788     return false;
4789
4790   // The index should be aligned on a vecWidth-bit boundary.
4791   uint64_t Index =
4792     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4793
4794   MVT VT = N->getSimpleValueType(0);
4795   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4796   bool Result = (Index * ElSize) % vecWidth == 0;
4797
4798   return Result;
4799 }
4800
4801 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
4802 /// operand specifies a subvector insert that is suitable for input to
4803 /// insertion of 128 or 256-bit subvectors
4804 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4805   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4806   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4807     return false;
4808   // The index should be aligned on a vecWidth-bit boundary.
4809   uint64_t Index =
4810     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4811
4812   MVT VT = N->getSimpleValueType(0);
4813   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4814   bool Result = (Index * ElSize) % vecWidth == 0;
4815
4816   return Result;
4817 }
4818
4819 bool X86::isVINSERT128Index(SDNode *N) {
4820   return isVINSERTIndex(N, 128);
4821 }
4822
4823 bool X86::isVINSERT256Index(SDNode *N) {
4824   return isVINSERTIndex(N, 256);
4825 }
4826
4827 bool X86::isVEXTRACT128Index(SDNode *N) {
4828   return isVEXTRACTIndex(N, 128);
4829 }
4830
4831 bool X86::isVEXTRACT256Index(SDNode *N) {
4832   return isVEXTRACTIndex(N, 256);
4833 }
4834
4835 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
4836 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
4837 /// Handles 128-bit and 256-bit.
4838 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
4839   MVT VT = N->getSimpleValueType(0);
4840
4841   assert((VT.getSizeInBits() >= 128) &&
4842          "Unsupported vector type for PSHUF/SHUFP");
4843
4844   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
4845   // independently on 128-bit lanes.
4846   unsigned NumElts = VT.getVectorNumElements();
4847   unsigned NumLanes = VT.getSizeInBits()/128;
4848   unsigned NumLaneElts = NumElts/NumLanes;
4849
4850   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
4851          "Only supports 2, 4 or 8 elements per lane");
4852
4853   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
4854   unsigned Mask = 0;
4855   for (unsigned i = 0; i != NumElts; ++i) {
4856     int Elt = N->getMaskElt(i);
4857     if (Elt < 0) continue;
4858     Elt &= NumLaneElts - 1;
4859     unsigned ShAmt = (i << Shift) % 8;
4860     Mask |= Elt << ShAmt;
4861   }
4862
4863   return Mask;
4864 }
4865
4866 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4867 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
4868 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4869   MVT VT = N->getSimpleValueType(0);
4870
4871   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4872          "Unsupported vector type for PSHUFHW");
4873
4874   unsigned NumElts = VT.getVectorNumElements();
4875
4876   unsigned Mask = 0;
4877   for (unsigned l = 0; l != NumElts; l += 8) {
4878     // 8 nodes per lane, but we only care about the last 4.
4879     for (unsigned i = 0; i < 4; ++i) {
4880       int Elt = N->getMaskElt(l+i+4);
4881       if (Elt < 0) continue;
4882       Elt &= 0x3; // only 2-bits.
4883       Mask |= Elt << (i * 2);
4884     }
4885   }
4886
4887   return Mask;
4888 }
4889
4890 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4891 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
4892 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4893   MVT VT = N->getSimpleValueType(0);
4894
4895   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4896          "Unsupported vector type for PSHUFHW");
4897
4898   unsigned NumElts = VT.getVectorNumElements();
4899
4900   unsigned Mask = 0;
4901   for (unsigned l = 0; l != NumElts; l += 8) {
4902     // 8 nodes per lane, but we only care about the first 4.
4903     for (unsigned i = 0; i < 4; ++i) {
4904       int Elt = N->getMaskElt(l+i);
4905       if (Elt < 0) continue;
4906       Elt &= 0x3; // only 2-bits
4907       Mask |= Elt << (i * 2);
4908     }
4909   }
4910
4911   return Mask;
4912 }
4913
4914 /// \brief Return the appropriate immediate to shuffle the specified
4915 /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
4916 /// VALIGN (if Interlane is true) instructions.
4917 static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
4918                                            bool InterLane) {
4919   MVT VT = SVOp->getSimpleValueType(0);
4920   unsigned EltSize = InterLane ? 1 :
4921     VT.getVectorElementType().getSizeInBits() >> 3;
4922
4923   unsigned NumElts = VT.getVectorNumElements();
4924   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
4925   unsigned NumLaneElts = NumElts/NumLanes;
4926
4927   int Val = 0;
4928   unsigned i;
4929   for (i = 0; i != NumElts; ++i) {
4930     Val = SVOp->getMaskElt(i);
4931     if (Val >= 0)
4932       break;
4933   }
4934   if (Val >= (int)NumElts)
4935     Val -= NumElts - NumLaneElts;
4936
4937   assert(Val - i > 0 && "PALIGNR imm should be positive");
4938   return (Val - i) * EltSize;
4939 }
4940
4941 /// \brief Return the appropriate immediate to shuffle the specified
4942 /// VECTOR_SHUFFLE mask with the PALIGNR instruction.
4943 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
4944   return getShuffleAlignrImmediate(SVOp, false);
4945 }
4946
4947 /// \brief Return the appropriate immediate to shuffle the specified
4948 /// VECTOR_SHUFFLE mask with the VALIGN instruction.
4949 static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
4950   return getShuffleAlignrImmediate(SVOp, true);
4951 }
4952
4953
4954 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4955   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4956   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4957     llvm_unreachable("Illegal extract subvector for VEXTRACT");
4958
4959   uint64_t Index =
4960     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4961
4962   MVT VecVT = N->getOperand(0).getSimpleValueType();
4963   MVT ElVT = VecVT.getVectorElementType();
4964
4965   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4966   return Index / NumElemsPerChunk;
4967 }
4968
4969 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4970   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4971   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4972     llvm_unreachable("Illegal insert subvector for VINSERT");
4973
4974   uint64_t Index =
4975     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4976
4977   MVT VecVT = N->getSimpleValueType(0);
4978   MVT ElVT = VecVT.getVectorElementType();
4979
4980   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4981   return Index / NumElemsPerChunk;
4982 }
4983
4984 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
4985 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4986 /// and VINSERTI128 instructions.
4987 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4988   return getExtractVEXTRACTImmediate(N, 128);
4989 }
4990
4991 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
4992 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
4993 /// and VINSERTI64x4 instructions.
4994 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4995   return getExtractVEXTRACTImmediate(N, 256);
4996 }
4997
4998 /// getInsertVINSERT128Immediate - Return the appropriate immediate
4999 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
5000 /// and VINSERTI128 instructions.
5001 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
5002   return getInsertVINSERTImmediate(N, 128);
5003 }
5004
5005 /// getInsertVINSERT256Immediate - Return the appropriate immediate
5006 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4
5007 /// and VINSERTI64x4 instructions.
5008 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
5009   return getInsertVINSERTImmediate(N, 256);
5010 }
5011
5012 /// isZero - Returns true if Elt is a constant integer zero
5013 static bool isZero(SDValue V) {
5014   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
5015   return C && C->isNullValue();
5016 }
5017
5018 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
5019 /// constant +0.0.
5020 bool X86::isZeroNode(SDValue Elt) {
5021   if (isZero(Elt))
5022     return true;
5023   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
5024     return CFP->getValueAPF().isPosZero();
5025   return false;
5026 }
5027
5028 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
5029 /// match movhlps. The lower half elements should come from upper half of
5030 /// V1 (and in order), and the upper half elements should come from the upper
5031 /// half of V2 (and in order).
5032 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
5033   if (!VT.is128BitVector())
5034     return false;
5035   if (VT.getVectorNumElements() != 4)
5036     return false;
5037   for (unsigned i = 0, e = 2; i != e; ++i)
5038     if (!isUndefOrEqual(Mask[i], i+2))
5039       return false;
5040   for (unsigned i = 2; i != 4; ++i)
5041     if (!isUndefOrEqual(Mask[i], i+4))
5042       return false;
5043   return true;
5044 }
5045
5046 /// isScalarLoadToVector - Returns true if the node is a scalar load that
5047 /// is promoted to a vector. It also returns the LoadSDNode by reference if
5048 /// required.
5049 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
5050   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
5051     return false;
5052   N = N->getOperand(0).getNode();
5053   if (!ISD::isNON_EXTLoad(N))
5054     return false;
5055   if (LD)
5056     *LD = cast<LoadSDNode>(N);
5057   return true;
5058 }
5059
5060 // Test whether the given value is a vector value which will be legalized
5061 // into a load.
5062 static bool WillBeConstantPoolLoad(SDNode *N) {
5063   if (N->getOpcode() != ISD::BUILD_VECTOR)
5064     return false;
5065
5066   // Check for any non-constant elements.
5067   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
5068     switch (N->getOperand(i).getNode()->getOpcode()) {
5069     case ISD::UNDEF:
5070     case ISD::ConstantFP:
5071     case ISD::Constant:
5072       break;
5073     default:
5074       return false;
5075     }
5076
5077   // Vectors of all-zeros and all-ones are materialized with special
5078   // instructions rather than being loaded.
5079   return !ISD::isBuildVectorAllZeros(N) &&
5080          !ISD::isBuildVectorAllOnes(N);
5081 }
5082
5083 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
5084 /// match movlp{s|d}. The lower half elements should come from lower half of
5085 /// V1 (and in order), and the upper half elements should come from the upper
5086 /// half of V2 (and in order). And since V1 will become the source of the
5087 /// MOVLP, it must be either a vector load or a scalar load to vector.
5088 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
5089                                ArrayRef<int> Mask, MVT VT) {
5090   if (!VT.is128BitVector())
5091     return false;
5092
5093   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
5094     return false;
5095   // Is V2 is a vector load, don't do this transformation. We will try to use
5096   // load folding shufps op.
5097   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
5098     return false;
5099
5100   unsigned NumElems = VT.getVectorNumElements();
5101
5102   if (NumElems != 2 && NumElems != 4)
5103     return false;
5104   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
5105     if (!isUndefOrEqual(Mask[i], i))
5106       return false;
5107   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
5108     if (!isUndefOrEqual(Mask[i], i+NumElems))
5109       return false;
5110   return true;
5111 }
5112
5113 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
5114 /// to an zero vector.
5115 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
5116 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
5117   SDValue V1 = N->getOperand(0);
5118   SDValue V2 = N->getOperand(1);
5119   unsigned NumElems = N->getValueType(0).getVectorNumElements();
5120   for (unsigned i = 0; i != NumElems; ++i) {
5121     int Idx = N->getMaskElt(i);
5122     if (Idx >= (int)NumElems) {
5123       unsigned Opc = V2.getOpcode();
5124       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
5125         continue;
5126       if (Opc != ISD::BUILD_VECTOR ||
5127           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
5128         return false;
5129     } else if (Idx >= 0) {
5130       unsigned Opc = V1.getOpcode();
5131       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
5132         continue;
5133       if (Opc != ISD::BUILD_VECTOR ||
5134           !X86::isZeroNode(V1.getOperand(Idx)))
5135         return false;
5136     }
5137   }
5138   return true;
5139 }
5140
5141 /// getZeroVector - Returns a vector of specified type with all zero elements.
5142 ///
5143 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
5144                              SelectionDAG &DAG, SDLoc dl) {
5145   assert(VT.isVector() && "Expected a vector type");
5146
5147   // Always build SSE zero vectors as <4 x i32> bitcasted
5148   // to their dest type. This ensures they get CSE'd.
5149   SDValue Vec;
5150   if (VT.is128BitVector()) {  // SSE
5151     if (Subtarget->hasSSE2()) {  // SSE2
5152       SDValue Cst = DAG.getConstant(0, MVT::i32);
5153       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5154     } else { // SSE1
5155       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5156       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
5157     }
5158   } else if (VT.is256BitVector()) { // AVX
5159     if (Subtarget->hasInt256()) { // AVX2
5160       SDValue Cst = DAG.getConstant(0, MVT::i32);
5161       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5162       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5163     } else {
5164       // 256-bit logic and arithmetic instructions in AVX are all
5165       // floating-point, no support for integer ops. Emit fp zeroed vectors.
5166       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5167       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5168       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
5169     }
5170   } else if (VT.is512BitVector()) { // AVX-512
5171       SDValue Cst = DAG.getConstant(0, MVT::i32);
5172       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5173                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5174       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
5175   } else if (VT.getScalarType() == MVT::i1) {
5176     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
5177     SDValue Cst = DAG.getConstant(0, MVT::i1);
5178     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
5179     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
5180   } else
5181     llvm_unreachable("Unexpected vector type");
5182
5183   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5184 }
5185
5186 /// getOnesVector - Returns a vector of specified type with all bits set.
5187 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
5188 /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
5189 /// Then bitcast to their original type, ensuring they get CSE'd.
5190 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
5191                              SDLoc dl) {
5192   assert(VT.isVector() && "Expected a vector type");
5193
5194   SDValue Cst = DAG.getConstant(~0U, MVT::i32);
5195   SDValue Vec;
5196   if (VT.is256BitVector()) {
5197     if (HasInt256) { // AVX2
5198       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5199       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5200     } else { // AVX
5201       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5202       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
5203     }
5204   } else if (VT.is128BitVector()) {
5205     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5206   } else
5207     llvm_unreachable("Unexpected vector type");
5208
5209   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5210 }
5211
5212 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
5213 /// that point to V2 points to its first element.
5214 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
5215   for (unsigned i = 0; i != NumElems; ++i) {
5216     if (Mask[i] > (int)NumElems) {
5217       Mask[i] = NumElems;
5218     }
5219   }
5220 }
5221
5222 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
5223 /// operation of specified width.
5224 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
5225                        SDValue V2) {
5226   unsigned NumElems = VT.getVectorNumElements();
5227   SmallVector<int, 8> Mask;
5228   Mask.push_back(NumElems);
5229   for (unsigned i = 1; i != NumElems; ++i)
5230     Mask.push_back(i);
5231   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5232 }
5233
5234 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
5235 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5236                           SDValue V2) {
5237   unsigned NumElems = VT.getVectorNumElements();
5238   SmallVector<int, 8> Mask;
5239   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
5240     Mask.push_back(i);
5241     Mask.push_back(i + NumElems);
5242   }
5243   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5244 }
5245
5246 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
5247 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5248                           SDValue V2) {
5249   unsigned NumElems = VT.getVectorNumElements();
5250   SmallVector<int, 8> Mask;
5251   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
5252     Mask.push_back(i + Half);
5253     Mask.push_back(i + NumElems + Half);
5254   }
5255   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5256 }
5257
5258 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
5259 // a generic shuffle instruction because the target has no such instructions.
5260 // Generate shuffles which repeat i16 and i8 several times until they can be
5261 // represented by v4f32 and then be manipulated by target suported shuffles.
5262 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
5263   MVT VT = V.getSimpleValueType();
5264   int NumElems = VT.getVectorNumElements();
5265   SDLoc dl(V);
5266
5267   while (NumElems > 4) {
5268     if (EltNo < NumElems/2) {
5269       V = getUnpackl(DAG, dl, VT, V, V);
5270     } else {
5271       V = getUnpackh(DAG, dl, VT, V, V);
5272       EltNo -= NumElems/2;
5273     }
5274     NumElems >>= 1;
5275   }
5276   return V;
5277 }
5278
5279 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
5280 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
5281   MVT VT = V.getSimpleValueType();
5282   SDLoc dl(V);
5283
5284   if (VT.is128BitVector()) {
5285     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
5286     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
5287     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
5288                              &SplatMask[0]);
5289   } else if (VT.is256BitVector()) {
5290     // To use VPERMILPS to splat scalars, the second half of indicies must
5291     // refer to the higher part, which is a duplication of the lower one,
5292     // because VPERMILPS can only handle in-lane permutations.
5293     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
5294                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
5295
5296     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
5297     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
5298                              &SplatMask[0]);
5299   } else
5300     llvm_unreachable("Vector size not supported");
5301
5302   return DAG.getNode(ISD::BITCAST, dl, VT, V);
5303 }
5304
5305 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
5306 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
5307   MVT SrcVT = SV->getSimpleValueType(0);
5308   SDValue V1 = SV->getOperand(0);
5309   SDLoc dl(SV);
5310
5311   int EltNo = SV->getSplatIndex();
5312   int NumElems = SrcVT.getVectorNumElements();
5313   bool Is256BitVec = SrcVT.is256BitVector();
5314
5315   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
5316          "Unknown how to promote splat for type");
5317
5318   // Extract the 128-bit part containing the splat element and update
5319   // the splat element index when it refers to the higher register.
5320   if (Is256BitVec) {
5321     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
5322     if (EltNo >= NumElems/2)
5323       EltNo -= NumElems/2;
5324   }
5325
5326   // All i16 and i8 vector types can't be used directly by a generic shuffle
5327   // instruction because the target has no such instruction. Generate shuffles
5328   // which repeat i16 and i8 several times until they fit in i32, and then can
5329   // be manipulated by target suported shuffles.
5330   MVT EltVT = SrcVT.getVectorElementType();
5331   if (EltVT == MVT::i8 || EltVT == MVT::i16)
5332     V1 = PromoteSplati8i16(V1, DAG, EltNo);
5333
5334   // Recreate the 256-bit vector and place the same 128-bit vector
5335   // into the low and high part. This is necessary because we want
5336   // to use VPERM* to shuffle the vectors
5337   if (Is256BitVec) {
5338     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
5339   }
5340
5341   return getLegalSplat(DAG, V1, EltNo);
5342 }
5343
5344 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
5345 /// vector of zero or undef vector.  This produces a shuffle where the low
5346 /// element of V2 is swizzled into the zero/undef vector, landing at element
5347 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
5348 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
5349                                            bool IsZero,
5350                                            const X86Subtarget *Subtarget,
5351                                            SelectionDAG &DAG) {
5352   MVT VT = V2.getSimpleValueType();
5353   SDValue V1 = IsZero
5354     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5355   unsigned NumElems = VT.getVectorNumElements();
5356   SmallVector<int, 16> MaskVec;
5357   for (unsigned i = 0; i != NumElems; ++i)
5358     // If this is the insertion idx, put the low elt of V2 here.
5359     MaskVec.push_back(i == Idx ? NumElems : i);
5360   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
5361 }
5362
5363 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
5364 /// target specific opcode. Returns true if the Mask could be calculated. Sets
5365 /// IsUnary to true if only uses one source. Note that this will set IsUnary for
5366 /// shuffles which use a single input multiple times, and in those cases it will
5367 /// adjust the mask to only have indices within that single input.
5368 static bool getTargetShuffleMask(SDNode *N, MVT VT,
5369                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
5370   unsigned NumElems = VT.getVectorNumElements();
5371   SDValue ImmN;
5372
5373   IsUnary = false;
5374   bool IsFakeUnary = false;
5375   switch(N->getOpcode()) {
5376   case X86ISD::BLENDI:
5377     ImmN = N->getOperand(N->getNumOperands()-1);
5378     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5379     break;
5380   case X86ISD::SHUFP:
5381     ImmN = N->getOperand(N->getNumOperands()-1);
5382     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5383     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5384     break;
5385   case X86ISD::UNPCKH:
5386     DecodeUNPCKHMask(VT, Mask);
5387     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5388     break;
5389   case X86ISD::UNPCKL:
5390     DecodeUNPCKLMask(VT, Mask);
5391     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5392     break;
5393   case X86ISD::MOVHLPS:
5394     DecodeMOVHLPSMask(NumElems, Mask);
5395     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5396     break;
5397   case X86ISD::MOVLHPS:
5398     DecodeMOVLHPSMask(NumElems, Mask);
5399     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5400     break;
5401   case X86ISD::PALIGNR:
5402     ImmN = N->getOperand(N->getNumOperands()-1);
5403     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5404     break;
5405   case X86ISD::PSHUFD:
5406   case X86ISD::VPERMILPI:
5407     ImmN = N->getOperand(N->getNumOperands()-1);
5408     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5409     IsUnary = true;
5410     break;
5411   case X86ISD::PSHUFHW:
5412     ImmN = N->getOperand(N->getNumOperands()-1);
5413     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5414     IsUnary = true;
5415     break;
5416   case X86ISD::PSHUFLW:
5417     ImmN = N->getOperand(N->getNumOperands()-1);
5418     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5419     IsUnary = true;
5420     break;
5421   case X86ISD::PSHUFB: {
5422     IsUnary = true;
5423     SDValue MaskNode = N->getOperand(1);
5424     while (MaskNode->getOpcode() == ISD::BITCAST)
5425       MaskNode = MaskNode->getOperand(0);
5426
5427     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
5428       // If we have a build-vector, then things are easy.
5429       EVT VT = MaskNode.getValueType();
5430       assert(VT.isVector() &&
5431              "Can't produce a non-vector with a build_vector!");
5432       if (!VT.isInteger())
5433         return false;
5434
5435       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
5436
5437       SmallVector<uint64_t, 32> RawMask;
5438       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
5439         SDValue Op = MaskNode->getOperand(i);
5440         if (Op->getOpcode() == ISD::UNDEF) {
5441           RawMask.push_back((uint64_t)SM_SentinelUndef);
5442           continue;
5443         }
5444         auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
5445         if (!CN)
5446           return false;
5447         APInt MaskElement = CN->getAPIntValue();
5448
5449         // We now have to decode the element which could be any integer size and
5450         // extract each byte of it.
5451         for (int j = 0; j < NumBytesPerElement; ++j) {
5452           // Note that this is x86 and so always little endian: the low byte is
5453           // the first byte of the mask.
5454           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
5455           MaskElement = MaskElement.lshr(8);
5456         }
5457       }
5458       DecodePSHUFBMask(RawMask, Mask);
5459       break;
5460     }
5461
5462     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
5463     if (!MaskLoad)
5464       return false;
5465
5466     SDValue Ptr = MaskLoad->getBasePtr();
5467     if (Ptr->getOpcode() == X86ISD::Wrapper)
5468       Ptr = Ptr->getOperand(0);
5469
5470     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
5471     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
5472       return false;
5473
5474     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
5475       DecodePSHUFBMask(C, Mask);
5476       break;
5477     }
5478
5479     return false;
5480   }
5481   case X86ISD::VPERMI:
5482     ImmN = N->getOperand(N->getNumOperands()-1);
5483     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5484     IsUnary = true;
5485     break;
5486   case X86ISD::MOVSS:
5487   case X86ISD::MOVSD: {
5488     // The index 0 always comes from the first element of the second source,
5489     // this is why MOVSS and MOVSD are used in the first place. The other
5490     // elements come from the other positions of the first source vector
5491     Mask.push_back(NumElems);
5492     for (unsigned i = 1; i != NumElems; ++i) {
5493       Mask.push_back(i);
5494     }
5495     break;
5496   }
5497   case X86ISD::VPERM2X128:
5498     ImmN = N->getOperand(N->getNumOperands()-1);
5499     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5500     if (Mask.empty()) return false;
5501     break;
5502   case X86ISD::MOVSLDUP:
5503     DecodeMOVSLDUPMask(VT, Mask);
5504     break;
5505   case X86ISD::MOVSHDUP:
5506     DecodeMOVSHDUPMask(VT, Mask);
5507     break;
5508   case X86ISD::MOVDDUP:
5509   case X86ISD::MOVLHPD:
5510   case X86ISD::MOVLPD:
5511   case X86ISD::MOVLPS:
5512     // Not yet implemented
5513     return false;
5514   default: llvm_unreachable("unknown target shuffle node");
5515   }
5516
5517   // If we have a fake unary shuffle, the shuffle mask is spread across two
5518   // inputs that are actually the same node. Re-map the mask to always point
5519   // into the first input.
5520   if (IsFakeUnary)
5521     for (int &M : Mask)
5522       if (M >= (int)Mask.size())
5523         M -= Mask.size();
5524
5525   return true;
5526 }
5527
5528 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
5529 /// element of the result of the vector shuffle.
5530 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5531                                    unsigned Depth) {
5532   if (Depth == 6)
5533     return SDValue();  // Limit search depth.
5534
5535   SDValue V = SDValue(N, 0);
5536   EVT VT = V.getValueType();
5537   unsigned Opcode = V.getOpcode();
5538
5539   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5540   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5541     int Elt = SV->getMaskElt(Index);
5542
5543     if (Elt < 0)
5544       return DAG.getUNDEF(VT.getVectorElementType());
5545
5546     unsigned NumElems = VT.getVectorNumElements();
5547     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5548                                          : SV->getOperand(1);
5549     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5550   }
5551
5552   // Recurse into target specific vector shuffles to find scalars.
5553   if (isTargetShuffle(Opcode)) {
5554     MVT ShufVT = V.getSimpleValueType();
5555     unsigned NumElems = ShufVT.getVectorNumElements();
5556     SmallVector<int, 16> ShuffleMask;
5557     bool IsUnary;
5558
5559     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
5560       return SDValue();
5561
5562     int Elt = ShuffleMask[Index];
5563     if (Elt < 0)
5564       return DAG.getUNDEF(ShufVT.getVectorElementType());
5565
5566     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
5567                                          : N->getOperand(1);
5568     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5569                                Depth+1);
5570   }
5571
5572   // Actual nodes that may contain scalar elements
5573   if (Opcode == ISD::BITCAST) {
5574     V = V.getOperand(0);
5575     EVT SrcVT = V.getValueType();
5576     unsigned NumElems = VT.getVectorNumElements();
5577
5578     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5579       return SDValue();
5580   }
5581
5582   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5583     return (Index == 0) ? V.getOperand(0)
5584                         : DAG.getUNDEF(VT.getVectorElementType());
5585
5586   if (V.getOpcode() == ISD::BUILD_VECTOR)
5587     return V.getOperand(Index);
5588
5589   return SDValue();
5590 }
5591
5592 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
5593 /// shuffle operation which come from a consecutively from a zero. The
5594 /// search can start in two different directions, from left or right.
5595 /// We count undefs as zeros until PreferredNum is reached.
5596 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
5597                                          unsigned NumElems, bool ZerosFromLeft,
5598                                          SelectionDAG &DAG,
5599                                          unsigned PreferredNum = -1U) {
5600   unsigned NumZeros = 0;
5601   for (unsigned i = 0; i != NumElems; ++i) {
5602     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
5603     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
5604     if (!Elt.getNode())
5605       break;
5606
5607     if (X86::isZeroNode(Elt))
5608       ++NumZeros;
5609     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
5610       NumZeros = std::min(NumZeros + 1, PreferredNum);
5611     else
5612       break;
5613   }
5614
5615   return NumZeros;
5616 }
5617
5618 /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
5619 /// correspond consecutively to elements from one of the vector operands,
5620 /// starting from its index OpIdx. Also tell OpNum which source vector operand.
5621 static
5622 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
5623                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
5624                               unsigned NumElems, unsigned &OpNum) {
5625   bool SeenV1 = false;
5626   bool SeenV2 = false;
5627
5628   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
5629     int Idx = SVOp->getMaskElt(i);
5630     // Ignore undef indicies
5631     if (Idx < 0)
5632       continue;
5633
5634     if (Idx < (int)NumElems)
5635       SeenV1 = true;
5636     else
5637       SeenV2 = true;
5638
5639     // Only accept consecutive elements from the same vector
5640     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
5641       return false;
5642   }
5643
5644   OpNum = SeenV1 ? 0 : 1;
5645   return true;
5646 }
5647
5648 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
5649 /// logical left shift of a vector.
5650 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5651                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5652   unsigned NumElems =
5653     SVOp->getSimpleValueType(0).getVectorNumElements();
5654   unsigned NumZeros = getNumOfConsecutiveZeros(
5655       SVOp, NumElems, false /* check zeros from right */, DAG,
5656       SVOp->getMaskElt(0));
5657   unsigned OpSrc;
5658
5659   if (!NumZeros)
5660     return false;
5661
5662   // Considering the elements in the mask that are not consecutive zeros,
5663   // check if they consecutively come from only one of the source vectors.
5664   //
5665   //               V1 = {X, A, B, C}     0
5666   //                         \  \  \    /
5667   //   vector_shuffle V1, V2 <1, 2, 3, X>
5668   //
5669   if (!isShuffleMaskConsecutive(SVOp,
5670             0,                   // Mask Start Index
5671             NumElems-NumZeros,   // Mask End Index(exclusive)
5672             NumZeros,            // Where to start looking in the src vector
5673             NumElems,            // Number of elements in vector
5674             OpSrc))              // Which source operand ?
5675     return false;
5676
5677   isLeft = false;
5678   ShAmt = NumZeros;
5679   ShVal = SVOp->getOperand(OpSrc);
5680   return true;
5681 }
5682
5683 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
5684 /// logical left shift of a vector.
5685 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5686                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5687   unsigned NumElems =
5688     SVOp->getSimpleValueType(0).getVectorNumElements();
5689   unsigned NumZeros = getNumOfConsecutiveZeros(
5690       SVOp, NumElems, true /* check zeros from left */, DAG,
5691       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
5692   unsigned OpSrc;
5693
5694   if (!NumZeros)
5695     return false;
5696
5697   // Considering the elements in the mask that are not consecutive zeros,
5698   // check if they consecutively come from only one of the source vectors.
5699   //
5700   //                           0    { A, B, X, X } = V2
5701   //                          / \    /  /
5702   //   vector_shuffle V1, V2 <X, X, 4, 5>
5703   //
5704   if (!isShuffleMaskConsecutive(SVOp,
5705             NumZeros,     // Mask Start Index
5706             NumElems,     // Mask End Index(exclusive)
5707             0,            // Where to start looking in the src vector
5708             NumElems,     // Number of elements in vector
5709             OpSrc))       // Which source operand ?
5710     return false;
5711
5712   isLeft = true;
5713   ShAmt = NumZeros;
5714   ShVal = SVOp->getOperand(OpSrc);
5715   return true;
5716 }
5717
5718 /// isVectorShift - Returns true if the shuffle can be implemented as a
5719 /// logical left or right shift of a vector.
5720 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5721                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5722   // Although the logic below support any bitwidth size, there are no
5723   // shift instructions which handle more than 128-bit vectors.
5724   if (!SVOp->getSimpleValueType(0).is128BitVector())
5725     return false;
5726
5727   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
5728       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
5729     return true;
5730
5731   return false;
5732 }
5733
5734 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
5735 ///
5736 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5737                                        unsigned NumNonZero, unsigned NumZero,
5738                                        SelectionDAG &DAG,
5739                                        const X86Subtarget* Subtarget,
5740                                        const TargetLowering &TLI) {
5741   if (NumNonZero > 8)
5742     return SDValue();
5743
5744   SDLoc dl(Op);
5745   SDValue V;
5746   bool First = true;
5747   for (unsigned i = 0; i < 16; ++i) {
5748     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5749     if (ThisIsNonZero && First) {
5750       if (NumZero)
5751         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5752       else
5753         V = DAG.getUNDEF(MVT::v8i16);
5754       First = false;
5755     }
5756
5757     if ((i & 1) != 0) {
5758       SDValue ThisElt, LastElt;
5759       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5760       if (LastIsNonZero) {
5761         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5762                               MVT::i16, Op.getOperand(i-1));
5763       }
5764       if (ThisIsNonZero) {
5765         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5766         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5767                               ThisElt, DAG.getConstant(8, MVT::i8));
5768         if (LastIsNonZero)
5769           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5770       } else
5771         ThisElt = LastElt;
5772
5773       if (ThisElt.getNode())
5774         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5775                         DAG.getIntPtrConstant(i/2));
5776     }
5777   }
5778
5779   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
5780 }
5781
5782 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
5783 ///
5784 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5785                                      unsigned NumNonZero, unsigned NumZero,
5786                                      SelectionDAG &DAG,
5787                                      const X86Subtarget* Subtarget,
5788                                      const TargetLowering &TLI) {
5789   if (NumNonZero > 4)
5790     return SDValue();
5791
5792   SDLoc dl(Op);
5793   SDValue V;
5794   bool First = true;
5795   for (unsigned i = 0; i < 8; ++i) {
5796     bool isNonZero = (NonZeros & (1 << i)) != 0;
5797     if (isNonZero) {
5798       if (First) {
5799         if (NumZero)
5800           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5801         else
5802           V = DAG.getUNDEF(MVT::v8i16);
5803         First = false;
5804       }
5805       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5806                       MVT::v8i16, V, Op.getOperand(i),
5807                       DAG.getIntPtrConstant(i));
5808     }
5809   }
5810
5811   return V;
5812 }
5813
5814 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
5815 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
5816                                      const X86Subtarget *Subtarget,
5817                                      const TargetLowering &TLI) {
5818   // Find all zeroable elements.
5819   bool Zeroable[4];
5820   for (int i=0; i < 4; ++i) {
5821     SDValue Elt = Op->getOperand(i);
5822     Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
5823   }
5824   assert(std::count_if(&Zeroable[0], &Zeroable[4],
5825                        [](bool M) { return !M; }) > 1 &&
5826          "We expect at least two non-zero elements!");
5827
5828   // We only know how to deal with build_vector nodes where elements are either
5829   // zeroable or extract_vector_elt with constant index.
5830   SDValue FirstNonZero;
5831   unsigned FirstNonZeroIdx;
5832   for (unsigned i=0; i < 4; ++i) {
5833     if (Zeroable[i])
5834       continue;
5835     SDValue Elt = Op->getOperand(i);
5836     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5837         !isa<ConstantSDNode>(Elt.getOperand(1)))
5838       return SDValue();
5839     // Make sure that this node is extracting from a 128-bit vector.
5840     MVT VT = Elt.getOperand(0).getSimpleValueType();
5841     if (!VT.is128BitVector())
5842       return SDValue();
5843     if (!FirstNonZero.getNode()) {
5844       FirstNonZero = Elt;
5845       FirstNonZeroIdx = i;
5846     }
5847   }
5848
5849   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
5850   SDValue V1 = FirstNonZero.getOperand(0);
5851   MVT VT = V1.getSimpleValueType();
5852
5853   // See if this build_vector can be lowered as a blend with zero.
5854   SDValue Elt;
5855   unsigned EltMaskIdx, EltIdx;
5856   int Mask[4];
5857   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
5858     if (Zeroable[EltIdx]) {
5859       // The zero vector will be on the right hand side.
5860       Mask[EltIdx] = EltIdx+4;
5861       continue;
5862     }
5863
5864     Elt = Op->getOperand(EltIdx);
5865     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
5866     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
5867     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
5868       break;
5869     Mask[EltIdx] = EltIdx;
5870   }
5871
5872   if (EltIdx == 4) {
5873     // Let the shuffle legalizer deal with blend operations.
5874     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
5875     if (V1.getSimpleValueType() != VT)
5876       V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
5877     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
5878   }
5879
5880   // See if we can lower this build_vector to a INSERTPS.
5881   if (!Subtarget->hasSSE41())
5882     return SDValue();
5883
5884   SDValue V2 = Elt.getOperand(0);
5885   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
5886     V1 = SDValue();
5887
5888   bool CanFold = true;
5889   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
5890     if (Zeroable[i])
5891       continue;
5892
5893     SDValue Current = Op->getOperand(i);
5894     SDValue SrcVector = Current->getOperand(0);
5895     if (!V1.getNode())
5896       V1 = SrcVector;
5897     CanFold = SrcVector == V1 &&
5898       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
5899   }
5900
5901   if (!CanFold)
5902     return SDValue();
5903
5904   assert(V1.getNode() && "Expected at least two non-zero elements!");
5905   if (V1.getSimpleValueType() != MVT::v4f32)
5906     V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
5907   if (V2.getSimpleValueType() != MVT::v4f32)
5908     V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
5909
5910   // Ok, we can emit an INSERTPS instruction.
5911   unsigned ZMask = 0;
5912   for (int i = 0; i < 4; ++i)
5913     if (Zeroable[i])
5914       ZMask |= 1 << i;
5915
5916   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
5917   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
5918   SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
5919                                DAG.getIntPtrConstant(InsertPSMask));
5920   return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
5921 }
5922
5923 /// getVShift - Return a vector logical shift node.
5924 ///
5925 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
5926                          unsigned NumBits, SelectionDAG &DAG,
5927                          const TargetLowering &TLI, SDLoc dl) {
5928   assert(VT.is128BitVector() && "Unknown type for VShift");
5929   EVT ShVT = MVT::v2i64;
5930   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5931   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
5932   return DAG.getNode(ISD::BITCAST, dl, VT,
5933                      DAG.getNode(Opc, dl, ShVT, SrcOp,
5934                              DAG.getConstant(NumBits,
5935                                   TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
5936 }
5937
5938 static SDValue
5939 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
5940
5941   // Check if the scalar load can be widened into a vector load. And if
5942   // the address is "base + cst" see if the cst can be "absorbed" into
5943   // the shuffle mask.
5944   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5945     SDValue Ptr = LD->getBasePtr();
5946     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5947       return SDValue();
5948     EVT PVT = LD->getValueType(0);
5949     if (PVT != MVT::i32 && PVT != MVT::f32)
5950       return SDValue();
5951
5952     int FI = -1;
5953     int64_t Offset = 0;
5954     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5955       FI = FINode->getIndex();
5956       Offset = 0;
5957     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5958                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5959       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5960       Offset = Ptr.getConstantOperandVal(1);
5961       Ptr = Ptr.getOperand(0);
5962     } else {
5963       return SDValue();
5964     }
5965
5966     // FIXME: 256-bit vector instructions don't require a strict alignment,
5967     // improve this code to support it better.
5968     unsigned RequiredAlign = VT.getSizeInBits()/8;
5969     SDValue Chain = LD->getChain();
5970     // Make sure the stack object alignment is at least 16 or 32.
5971     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5972     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5973       if (MFI->isFixedObjectIndex(FI)) {
5974         // Can't change the alignment. FIXME: It's possible to compute
5975         // the exact stack offset and reference FI + adjust offset instead.
5976         // If someone *really* cares about this. That's the way to implement it.
5977         return SDValue();
5978       } else {
5979         MFI->setObjectAlignment(FI, RequiredAlign);
5980       }
5981     }
5982
5983     // (Offset % 16 or 32) must be multiple of 4. Then address is then
5984     // Ptr + (Offset & ~15).
5985     if (Offset < 0)
5986       return SDValue();
5987     if ((Offset % RequiredAlign) & 3)
5988       return SDValue();
5989     int64_t StartOffset = Offset & ~(RequiredAlign-1);
5990     if (StartOffset)
5991       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
5992                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
5993
5994     int EltNo = (Offset - StartOffset) >> 2;
5995     unsigned NumElems = VT.getVectorNumElements();
5996
5997     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
5998     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
5999                              LD->getPointerInfo().getWithOffset(StartOffset),
6000                              false, false, false, 0);
6001
6002     SmallVector<int, 8> Mask;
6003     for (unsigned i = 0; i != NumElems; ++i)
6004       Mask.push_back(EltNo);
6005
6006     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
6007   }
6008
6009   return SDValue();
6010 }
6011
6012 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
6013 /// vector of type 'VT', see if the elements can be replaced by a single large
6014 /// load which has the same value as a build_vector whose operands are 'elts'.
6015 ///
6016 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
6017 ///
6018 /// FIXME: we'd also like to handle the case where the last elements are zero
6019 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
6020 /// There's even a handy isZeroNode for that purpose.
6021 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
6022                                         SDLoc &DL, SelectionDAG &DAG,
6023                                         bool isAfterLegalize) {
6024   EVT EltVT = VT.getVectorElementType();
6025   unsigned NumElems = Elts.size();
6026
6027   LoadSDNode *LDBase = nullptr;
6028   unsigned LastLoadedElt = -1U;
6029
6030   // For each element in the initializer, see if we've found a load or an undef.
6031   // If we don't find an initial load element, or later load elements are
6032   // non-consecutive, bail out.
6033   for (unsigned i = 0; i < NumElems; ++i) {
6034     SDValue Elt = Elts[i];
6035
6036     if (!Elt.getNode() ||
6037         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
6038       return SDValue();
6039     if (!LDBase) {
6040       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
6041         return SDValue();
6042       LDBase = cast<LoadSDNode>(Elt.getNode());
6043       LastLoadedElt = i;
6044       continue;
6045     }
6046     if (Elt.getOpcode() == ISD::UNDEF)
6047       continue;
6048
6049     LoadSDNode *LD = cast<LoadSDNode>(Elt);
6050     if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
6051       return SDValue();
6052     LastLoadedElt = i;
6053   }
6054
6055   // If we have found an entire vector of loads and undefs, then return a large
6056   // load of the entire vector width starting at the base pointer.  If we found
6057   // consecutive loads for the low half, generate a vzext_load node.
6058   if (LastLoadedElt == NumElems - 1) {
6059
6060     if (isAfterLegalize &&
6061         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
6062       return SDValue();
6063
6064     SDValue NewLd = SDValue();
6065
6066     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6067                         LDBase->getPointerInfo(), LDBase->isVolatile(),
6068                         LDBase->isNonTemporal(), LDBase->isInvariant(),
6069                         LDBase->getAlignment());
6070
6071     if (LDBase->hasAnyUseOfValue(1)) {
6072       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6073                                      SDValue(LDBase, 1),
6074                                      SDValue(NewLd.getNode(), 1));
6075       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6076       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6077                              SDValue(NewLd.getNode(), 1));
6078     }
6079
6080     return NewLd;
6081   }
6082
6083   //TODO: The code below fires only for for loading the low v2i32 / v2f32
6084   //of a v4i32 / v4f32. It's probably worth generalizing.
6085   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
6086       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
6087     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
6088     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6089     SDValue ResNode =
6090         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
6091                                 LDBase->getPointerInfo(),
6092                                 LDBase->getAlignment(),
6093                                 false/*isVolatile*/, true/*ReadMem*/,
6094                                 false/*WriteMem*/);
6095
6096     // Make sure the newly-created LOAD is in the same position as LDBase in
6097     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
6098     // update uses of LDBase's output chain to use the TokenFactor.
6099     if (LDBase->hasAnyUseOfValue(1)) {
6100       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6101                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
6102       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6103       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6104                              SDValue(ResNode.getNode(), 1));
6105     }
6106
6107     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
6108   }
6109   return SDValue();
6110 }
6111
6112 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
6113 /// to generate a splat value for the following cases:
6114 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
6115 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
6116 /// a scalar load, or a constant.
6117 /// The VBROADCAST node is returned when a pattern is found,
6118 /// or SDValue() otherwise.
6119 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
6120                                     SelectionDAG &DAG) {
6121   // VBROADCAST requires AVX.
6122   // TODO: Splats could be generated for non-AVX CPUs using SSE
6123   // instructions, but there's less potential gain for only 128-bit vectors.
6124   if (!Subtarget->hasAVX())
6125     return SDValue();
6126
6127   MVT VT = Op.getSimpleValueType();
6128   SDLoc dl(Op);
6129
6130   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6131          "Unsupported vector type for broadcast.");
6132
6133   SDValue Ld;
6134   bool ConstSplatVal;
6135
6136   switch (Op.getOpcode()) {
6137     default:
6138       // Unknown pattern found.
6139       return SDValue();
6140
6141     case ISD::BUILD_VECTOR: {
6142       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
6143       BitVector UndefElements;
6144       SDValue Splat = BVOp->getSplatValue(&UndefElements);
6145
6146       // We need a splat of a single value to use broadcast, and it doesn't
6147       // make any sense if the value is only in one element of the vector.
6148       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
6149         return SDValue();
6150
6151       Ld = Splat;
6152       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6153                        Ld.getOpcode() == ISD::ConstantFP);
6154
6155       // Make sure that all of the users of a non-constant load are from the
6156       // BUILD_VECTOR node.
6157       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6158         return SDValue();
6159       break;
6160     }
6161
6162     case ISD::VECTOR_SHUFFLE: {
6163       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6164
6165       // Shuffles must have a splat mask where the first element is
6166       // broadcasted.
6167       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
6168         return SDValue();
6169
6170       SDValue Sc = Op.getOperand(0);
6171       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
6172           Sc.getOpcode() != ISD::BUILD_VECTOR) {
6173
6174         if (!Subtarget->hasInt256())
6175           return SDValue();
6176
6177         // Use the register form of the broadcast instruction available on AVX2.
6178         if (VT.getSizeInBits() >= 256)
6179           Sc = Extract128BitVector(Sc, 0, DAG, dl);
6180         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
6181       }
6182
6183       Ld = Sc.getOperand(0);
6184       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6185                        Ld.getOpcode() == ISD::ConstantFP);
6186
6187       // The scalar_to_vector node and the suspected
6188       // load node must have exactly one user.
6189       // Constants may have multiple users.
6190
6191       // AVX-512 has register version of the broadcast
6192       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
6193         Ld.getValueType().getSizeInBits() >= 32;
6194       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
6195           !hasRegVer))
6196         return SDValue();
6197       break;
6198     }
6199   }
6200
6201   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
6202   bool IsGE256 = (VT.getSizeInBits() >= 256);
6203
6204   // When optimizing for size, generate up to 5 extra bytes for a broadcast
6205   // instruction to save 8 or more bytes of constant pool data.
6206   // TODO: If multiple splats are generated to load the same constant,
6207   // it may be detrimental to overall size. There needs to be a way to detect
6208   // that condition to know if this is truly a size win.
6209   const Function *F = DAG.getMachineFunction().getFunction();
6210   bool OptForSize = F->getAttributes().
6211     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
6212
6213   // Handle broadcasting a single constant scalar from the constant pool
6214   // into a vector.
6215   // On Sandybridge (no AVX2), it is still better to load a constant vector
6216   // from the constant pool and not to broadcast it from a scalar.
6217   // But override that restriction when optimizing for size.
6218   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6219   if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
6220     EVT CVT = Ld.getValueType();
6221     assert(!CVT.isVector() && "Must not broadcast a vector type");
6222
6223     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6224     // For size optimization, also splat v2f64 and v2i64, and for size opt
6225     // with AVX2, also splat i8 and i16.
6226     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6227     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6228         (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
6229       const Constant *C = nullptr;
6230       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6231         C = CI->getConstantIntValue();
6232       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6233         C = CF->getConstantFPValue();
6234
6235       assert(C && "Invalid constant type");
6236
6237       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6238       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
6239       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6240       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
6241                        MachinePointerInfo::getConstantPool(),
6242                        false, false, false, Alignment);
6243
6244       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6245     }
6246   }
6247
6248   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6249
6250   // Handle AVX2 in-register broadcasts.
6251   if (!IsLoad && Subtarget->hasInt256() &&
6252       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6253     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6254
6255   // The scalar source must be a normal load.
6256   if (!IsLoad)
6257     return SDValue();
6258
6259   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6260       (Subtarget->hasVLX() && ScalarSize == 64))
6261     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6262
6263   // The integer check is needed for the 64-bit into 128-bit so it doesn't match
6264   // double since there is no vbroadcastsd xmm
6265   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
6266     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6267       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6268   }
6269
6270   // Unsupported broadcast.
6271   return SDValue();
6272 }
6273
6274 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6275 /// underlying vector and index.
6276 ///
6277 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6278 /// index.
6279 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6280                                          SDValue ExtIdx) {
6281   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6282   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6283     return Idx;
6284
6285   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6286   // lowered this:
6287   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6288   // to:
6289   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
6290   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
6291   //                           undef)
6292   //                       Constant<0>)
6293   // In this case the vector is the extract_subvector expression and the index
6294   // is 2, as specified by the shuffle.
6295   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6296   SDValue ShuffleVec = SVOp->getOperand(0);
6297   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6298   assert(ShuffleVecVT.getVectorElementType() ==
6299          ExtractedFromVec.getSimpleValueType().getVectorElementType());
6300
6301   int ShuffleIdx = SVOp->getMaskElt(Idx);
6302   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6303     ExtractedFromVec = ShuffleVec;
6304     return ShuffleIdx;
6305   }
6306   return Idx;
6307 }
6308
6309 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6310   MVT VT = Op.getSimpleValueType();
6311
6312   // Skip if insert_vec_elt is not supported.
6313   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6314   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6315     return SDValue();
6316
6317   SDLoc DL(Op);
6318   unsigned NumElems = Op.getNumOperands();
6319
6320   SDValue VecIn1;
6321   SDValue VecIn2;
6322   SmallVector<unsigned, 4> InsertIndices;
6323   SmallVector<int, 8> Mask(NumElems, -1);
6324
6325   for (unsigned i = 0; i != NumElems; ++i) {
6326     unsigned Opc = Op.getOperand(i).getOpcode();
6327
6328     if (Opc == ISD::UNDEF)
6329       continue;
6330
6331     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6332       // Quit if more than 1 elements need inserting.
6333       if (InsertIndices.size() > 1)
6334         return SDValue();
6335
6336       InsertIndices.push_back(i);
6337       continue;
6338     }
6339
6340     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6341     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6342     // Quit if non-constant index.
6343     if (!isa<ConstantSDNode>(ExtIdx))
6344       return SDValue();
6345     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6346
6347     // Quit if extracted from vector of different type.
6348     if (ExtractedFromVec.getValueType() != VT)
6349       return SDValue();
6350
6351     if (!VecIn1.getNode())
6352       VecIn1 = ExtractedFromVec;
6353     else if (VecIn1 != ExtractedFromVec) {
6354       if (!VecIn2.getNode())
6355         VecIn2 = ExtractedFromVec;
6356       else if (VecIn2 != ExtractedFromVec)
6357         // Quit if more than 2 vectors to shuffle
6358         return SDValue();
6359     }
6360
6361     if (ExtractedFromVec == VecIn1)
6362       Mask[i] = Idx;
6363     else if (ExtractedFromVec == VecIn2)
6364       Mask[i] = Idx + NumElems;
6365   }
6366
6367   if (!VecIn1.getNode())
6368     return SDValue();
6369
6370   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6371   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
6372   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6373     unsigned Idx = InsertIndices[i];
6374     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6375                      DAG.getIntPtrConstant(Idx));
6376   }
6377
6378   return NV;
6379 }
6380
6381 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6382 SDValue
6383 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6384
6385   MVT VT = Op.getSimpleValueType();
6386   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
6387          "Unexpected type in LowerBUILD_VECTORvXi1!");
6388
6389   SDLoc dl(Op);
6390   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6391     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
6392     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6393     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6394   }
6395
6396   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
6397     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
6398     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6399     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6400   }
6401
6402   bool AllContants = true;
6403   uint64_t Immediate = 0;
6404   int NonConstIdx = -1;
6405   bool IsSplat = true;
6406   unsigned NumNonConsts = 0;
6407   unsigned NumConsts = 0;
6408   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6409     SDValue In = Op.getOperand(idx);
6410     if (In.getOpcode() == ISD::UNDEF)
6411       continue;
6412     if (!isa<ConstantSDNode>(In)) {
6413       AllContants = false;
6414       NonConstIdx = idx;
6415       NumNonConsts++;
6416     } else {
6417       NumConsts++;
6418       if (cast<ConstantSDNode>(In)->getZExtValue())
6419       Immediate |= (1ULL << idx);
6420     }
6421     if (In != Op.getOperand(0))
6422       IsSplat = false;
6423   }
6424
6425   if (AllContants) {
6426     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
6427       DAG.getConstant(Immediate, MVT::i16));
6428     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
6429                        DAG.getIntPtrConstant(0));
6430   }
6431
6432   if (NumNonConsts == 1 && NonConstIdx != 0) {
6433     SDValue DstVec;
6434     if (NumConsts) {
6435       SDValue VecAsImm = DAG.getConstant(Immediate,
6436                                          MVT::getIntegerVT(VT.getSizeInBits()));
6437       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
6438     }
6439     else
6440       DstVec = DAG.getUNDEF(VT);
6441     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6442                        Op.getOperand(NonConstIdx),
6443                        DAG.getIntPtrConstant(NonConstIdx));
6444   }
6445   if (!IsSplat && (NonConstIdx != 0))
6446     llvm_unreachable("Unsupported BUILD_VECTOR operation");
6447   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
6448   SDValue Select;
6449   if (IsSplat)
6450     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6451                           DAG.getConstant(-1, SelectVT),
6452                           DAG.getConstant(0, SelectVT));
6453   else
6454     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6455                          DAG.getConstant((Immediate | 1), SelectVT),
6456                          DAG.getConstant(Immediate, SelectVT));
6457   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
6458 }
6459
6460 /// \brief Return true if \p N implements a horizontal binop and return the
6461 /// operands for the horizontal binop into V0 and V1.
6462 ///
6463 /// This is a helper function of PerformBUILD_VECTORCombine.
6464 /// This function checks that the build_vector \p N in input implements a
6465 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6466 /// operation to match.
6467 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6468 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6469 /// is equal to ISD::SUB, then this function checks if this is a horizontal
6470 /// arithmetic sub.
6471 ///
6472 /// This function only analyzes elements of \p N whose indices are
6473 /// in range [BaseIdx, LastIdx).
6474 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6475                               SelectionDAG &DAG,
6476                               unsigned BaseIdx, unsigned LastIdx,
6477                               SDValue &V0, SDValue &V1) {
6478   EVT VT = N->getValueType(0);
6479
6480   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6481   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6482          "Invalid Vector in input!");
6483
6484   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6485   bool CanFold = true;
6486   unsigned ExpectedVExtractIdx = BaseIdx;
6487   unsigned NumElts = LastIdx - BaseIdx;
6488   V0 = DAG.getUNDEF(VT);
6489   V1 = DAG.getUNDEF(VT);
6490
6491   // Check if N implements a horizontal binop.
6492   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6493     SDValue Op = N->getOperand(i + BaseIdx);
6494
6495     // Skip UNDEFs.
6496     if (Op->getOpcode() == ISD::UNDEF) {
6497       // Update the expected vector extract index.
6498       if (i * 2 == NumElts)
6499         ExpectedVExtractIdx = BaseIdx;
6500       ExpectedVExtractIdx += 2;
6501       continue;
6502     }
6503
6504     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6505
6506     if (!CanFold)
6507       break;
6508
6509     SDValue Op0 = Op.getOperand(0);
6510     SDValue Op1 = Op.getOperand(1);
6511
6512     // Try to match the following pattern:
6513     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6514     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6515         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6516         Op0.getOperand(0) == Op1.getOperand(0) &&
6517         isa<ConstantSDNode>(Op0.getOperand(1)) &&
6518         isa<ConstantSDNode>(Op1.getOperand(1)));
6519     if (!CanFold)
6520       break;
6521
6522     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6523     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6524
6525     if (i * 2 < NumElts) {
6526       if (V0.getOpcode() == ISD::UNDEF)
6527         V0 = Op0.getOperand(0);
6528     } else {
6529       if (V1.getOpcode() == ISD::UNDEF)
6530         V1 = Op0.getOperand(0);
6531       if (i * 2 == NumElts)
6532         ExpectedVExtractIdx = BaseIdx;
6533     }
6534
6535     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6536     if (I0 == ExpectedVExtractIdx)
6537       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6538     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6539       // Try to match the following dag sequence:
6540       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6541       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6542     } else
6543       CanFold = false;
6544
6545     ExpectedVExtractIdx += 2;
6546   }
6547
6548   return CanFold;
6549 }
6550
6551 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6552 /// a concat_vector.
6553 ///
6554 /// This is a helper function of PerformBUILD_VECTORCombine.
6555 /// This function expects two 256-bit vectors called V0 and V1.
6556 /// At first, each vector is split into two separate 128-bit vectors.
6557 /// Then, the resulting 128-bit vectors are used to implement two
6558 /// horizontal binary operations.
6559 ///
6560 /// The kind of horizontal binary operation is defined by \p X86Opcode.
6561 ///
6562 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
6563 /// the two new horizontal binop.
6564 /// When Mode is set, the first horizontal binop dag node would take as input
6565 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
6566 /// horizontal binop dag node would take as input the lower 128-bit of V1
6567 /// and the upper 128-bit of V1.
6568 ///   Example:
6569 ///     HADD V0_LO, V0_HI
6570 ///     HADD V1_LO, V1_HI
6571 ///
6572 /// Otherwise, the first horizontal binop dag node takes as input the lower
6573 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6574 /// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1.
6575 ///   Example:
6576 ///     HADD V0_LO, V1_LO
6577 ///     HADD V0_HI, V1_HI
6578 ///
6579 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6580 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6581 /// the upper 128-bits of the result.
6582 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6583                                      SDLoc DL, SelectionDAG &DAG,
6584                                      unsigned X86Opcode, bool Mode,
6585                                      bool isUndefLO, bool isUndefHI) {
6586   EVT VT = V0.getValueType();
6587   assert(VT.is256BitVector() && VT == V1.getValueType() &&
6588          "Invalid nodes in input!");
6589
6590   unsigned NumElts = VT.getVectorNumElements();
6591   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
6592   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
6593   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
6594   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
6595   EVT NewVT = V0_LO.getValueType();
6596
6597   SDValue LO = DAG.getUNDEF(NewVT);
6598   SDValue HI = DAG.getUNDEF(NewVT);
6599
6600   if (Mode) {
6601     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6602     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
6603       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6604     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
6605       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
6606   } else {
6607     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6608     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
6609                        V1_LO->getOpcode() != ISD::UNDEF))
6610       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6611
6612     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
6613                        V1_HI->getOpcode() != ISD::UNDEF))
6614       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6615   }
6616
6617   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6618 }
6619
6620 /// \brief Try to fold a build_vector that performs an 'addsub' into the
6621 /// sequence of 'vadd + vsub + blendi'.
6622 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
6623                            const X86Subtarget *Subtarget) {
6624   SDLoc DL(BV);
6625   EVT VT = BV->getValueType(0);
6626   unsigned NumElts = VT.getVectorNumElements();
6627   SDValue InVec0 = DAG.getUNDEF(VT);
6628   SDValue InVec1 = DAG.getUNDEF(VT);
6629
6630   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6631           VT == MVT::v2f64) && "build_vector with an invalid type found!");
6632
6633   // Odd-numbered elements in the input build vector are obtained from
6634   // adding two integer/float elements.
6635   // Even-numbered elements in the input build vector are obtained from
6636   // subtracting two integer/float elements.
6637   unsigned ExpectedOpcode = ISD::FSUB;
6638   unsigned NextExpectedOpcode = ISD::FADD;
6639   bool AddFound = false;
6640   bool SubFound = false;
6641
6642   for (unsigned i = 0, e = NumElts; i != e; i++) {
6643     SDValue Op = BV->getOperand(i);
6644
6645     // Skip 'undef' values.
6646     unsigned Opcode = Op.getOpcode();
6647     if (Opcode == ISD::UNDEF) {
6648       std::swap(ExpectedOpcode, NextExpectedOpcode);
6649       continue;
6650     }
6651
6652     // Early exit if we found an unexpected opcode.
6653     if (Opcode != ExpectedOpcode)
6654       return SDValue();
6655
6656     SDValue Op0 = Op.getOperand(0);
6657     SDValue Op1 = Op.getOperand(1);
6658
6659     // Try to match the following pattern:
6660     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
6661     // Early exit if we cannot match that sequence.
6662     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6663         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6664         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
6665         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
6666         Op0.getOperand(1) != Op1.getOperand(1))
6667       return SDValue();
6668
6669     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6670     if (I0 != i)
6671       return SDValue();
6672
6673     // We found a valid add/sub node. Update the information accordingly.
6674     if (i & 1)
6675       AddFound = true;
6676     else
6677       SubFound = true;
6678
6679     // Update InVec0 and InVec1.
6680     if (InVec0.getOpcode() == ISD::UNDEF)
6681       InVec0 = Op0.getOperand(0);
6682     if (InVec1.getOpcode() == ISD::UNDEF)
6683       InVec1 = Op1.getOperand(0);
6684
6685     // Make sure that operands in input to each add/sub node always
6686     // come from a same pair of vectors.
6687     if (InVec0 != Op0.getOperand(0)) {
6688       if (ExpectedOpcode == ISD::FSUB)
6689         return SDValue();
6690
6691       // FADD is commutable. Try to commute the operands
6692       // and then test again.
6693       std::swap(Op0, Op1);
6694       if (InVec0 != Op0.getOperand(0))
6695         return SDValue();
6696     }
6697
6698     if (InVec1 != Op1.getOperand(0))
6699       return SDValue();
6700
6701     // Update the pair of expected opcodes.
6702     std::swap(ExpectedOpcode, NextExpectedOpcode);
6703   }
6704
6705   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
6706   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
6707       InVec1.getOpcode() != ISD::UNDEF)
6708     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
6709
6710   return SDValue();
6711 }
6712
6713 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
6714                                           const X86Subtarget *Subtarget) {
6715   SDLoc DL(N);
6716   EVT VT = N->getValueType(0);
6717   unsigned NumElts = VT.getVectorNumElements();
6718   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
6719   SDValue InVec0, InVec1;
6720
6721   // Try to match an ADDSUB.
6722   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
6723       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
6724     SDValue Value = matchAddSub(BV, DAG, Subtarget);
6725     if (Value.getNode())
6726       return Value;
6727   }
6728
6729   // Try to match horizontal ADD/SUB.
6730   unsigned NumUndefsLO = 0;
6731   unsigned NumUndefsHI = 0;
6732   unsigned Half = NumElts/2;
6733
6734   // Count the number of UNDEF operands in the build_vector in input.
6735   for (unsigned i = 0, e = Half; i != e; ++i)
6736     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6737       NumUndefsLO++;
6738
6739   for (unsigned i = Half, e = NumElts; i != e; ++i)
6740     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6741       NumUndefsHI++;
6742
6743   // Early exit if this is either a build_vector of all UNDEFs or all the
6744   // operands but one are UNDEF.
6745   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
6746     return SDValue();
6747
6748   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
6749     // Try to match an SSE3 float HADD/HSUB.
6750     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6751       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6752
6753     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6754       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6755   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
6756     // Try to match an SSSE3 integer HADD/HSUB.
6757     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6758       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
6759
6760     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6761       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
6762   }
6763
6764   if (!Subtarget->hasAVX())
6765     return SDValue();
6766
6767   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
6768     // Try to match an AVX horizontal add/sub of packed single/double
6769     // precision floating point values from 256-bit vectors.
6770     SDValue InVec2, InVec3;
6771     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
6772         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
6773         ((InVec0.getOpcode() == ISD::UNDEF ||
6774           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6775         ((InVec1.getOpcode() == ISD::UNDEF ||
6776           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6777       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6778
6779     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
6780         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
6781         ((InVec0.getOpcode() == ISD::UNDEF ||
6782           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6783         ((InVec1.getOpcode() == ISD::UNDEF ||
6784           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6785       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6786   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
6787     // Try to match an AVX2 horizontal add/sub of signed integers.
6788     SDValue InVec2, InVec3;
6789     unsigned X86Opcode;
6790     bool CanFold = true;
6791
6792     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
6793         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
6794         ((InVec0.getOpcode() == ISD::UNDEF ||
6795           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6796         ((InVec1.getOpcode() == ISD::UNDEF ||
6797           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6798       X86Opcode = X86ISD::HADD;
6799     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
6800         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
6801         ((InVec0.getOpcode() == ISD::UNDEF ||
6802           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6803         ((InVec1.getOpcode() == ISD::UNDEF ||
6804           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6805       X86Opcode = X86ISD::HSUB;
6806     else
6807       CanFold = false;
6808
6809     if (CanFold) {
6810       // Fold this build_vector into a single horizontal add/sub.
6811       // Do this only if the target has AVX2.
6812       if (Subtarget->hasAVX2())
6813         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
6814
6815       // Do not try to expand this build_vector into a pair of horizontal
6816       // add/sub if we can emit a pair of scalar add/sub.
6817       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6818         return SDValue();
6819
6820       // Convert this build_vector into a pair of horizontal binop followed by
6821       // a concat vector.
6822       bool isUndefLO = NumUndefsLO == Half;
6823       bool isUndefHI = NumUndefsHI == Half;
6824       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
6825                                    isUndefLO, isUndefHI);
6826     }
6827   }
6828
6829   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
6830        VT == MVT::v16i16) && Subtarget->hasAVX()) {
6831     unsigned X86Opcode;
6832     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6833       X86Opcode = X86ISD::HADD;
6834     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6835       X86Opcode = X86ISD::HSUB;
6836     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6837       X86Opcode = X86ISD::FHADD;
6838     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6839       X86Opcode = X86ISD::FHSUB;
6840     else
6841       return SDValue();
6842
6843     // Don't try to expand this build_vector into a pair of horizontal add/sub
6844     // if we can simply emit a pair of scalar add/sub.
6845     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6846       return SDValue();
6847
6848     // Convert this build_vector into two horizontal add/sub followed by
6849     // a concat vector.
6850     bool isUndefLO = NumUndefsLO == Half;
6851     bool isUndefHI = NumUndefsHI == Half;
6852     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
6853                                  isUndefLO, isUndefHI);
6854   }
6855
6856   return SDValue();
6857 }
6858
6859 SDValue
6860 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6861   SDLoc dl(Op);
6862
6863   MVT VT = Op.getSimpleValueType();
6864   MVT ExtVT = VT.getVectorElementType();
6865   unsigned NumElems = Op.getNumOperands();
6866
6867   // Generate vectors for predicate vectors.
6868   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
6869     return LowerBUILD_VECTORvXi1(Op, DAG);
6870
6871   // Vectors containing all zeros can be matched by pxor and xorps later
6872   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6873     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
6874     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
6875     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
6876       return Op;
6877
6878     return getZeroVector(VT, Subtarget, DAG, dl);
6879   }
6880
6881   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
6882   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
6883   // vpcmpeqd on 256-bit vectors.
6884   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
6885     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
6886       return Op;
6887
6888     if (!VT.is512BitVector())
6889       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
6890   }
6891
6892   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
6893   if (Broadcast.getNode())
6894     return Broadcast;
6895
6896   unsigned EVTBits = ExtVT.getSizeInBits();
6897
6898   unsigned NumZero  = 0;
6899   unsigned NumNonZero = 0;
6900   unsigned NonZeros = 0;
6901   bool IsAllConstants = true;
6902   SmallSet<SDValue, 8> Values;
6903   for (unsigned i = 0; i < NumElems; ++i) {
6904     SDValue Elt = Op.getOperand(i);
6905     if (Elt.getOpcode() == ISD::UNDEF)
6906       continue;
6907     Values.insert(Elt);
6908     if (Elt.getOpcode() != ISD::Constant &&
6909         Elt.getOpcode() != ISD::ConstantFP)
6910       IsAllConstants = false;
6911     if (X86::isZeroNode(Elt))
6912       NumZero++;
6913     else {
6914       NonZeros |= (1 << i);
6915       NumNonZero++;
6916     }
6917   }
6918
6919   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
6920   if (NumNonZero == 0)
6921     return DAG.getUNDEF(VT);
6922
6923   // Special case for single non-zero, non-undef, element.
6924   if (NumNonZero == 1) {
6925     unsigned Idx = countTrailingZeros(NonZeros);
6926     SDValue Item = Op.getOperand(Idx);
6927
6928     // If this is an insertion of an i64 value on x86-32, and if the top bits of
6929     // the value are obviously zero, truncate the value to i32 and do the
6930     // insertion that way.  Only do this if the value is non-constant or if the
6931     // value is a constant being inserted into element 0.  It is cheaper to do
6932     // a constant pool load than it is to do a movd + shuffle.
6933     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
6934         (!IsAllConstants || Idx == 0)) {
6935       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
6936         // Handle SSE only.
6937         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
6938         EVT VecVT = MVT::v4i32;
6939         unsigned VecElts = 4;
6940
6941         // Truncate the value (which may itself be a constant) to i32, and
6942         // convert it to a vector with movd (S2V+shuffle to zero extend).
6943         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
6944         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
6945
6946         // If using the new shuffle lowering, just directly insert this.
6947         if (ExperimentalVectorShuffleLowering)
6948           return DAG.getNode(
6949               ISD::BITCAST, dl, VT,
6950               getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
6951
6952         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6953
6954         // Now we have our 32-bit value zero extended in the low element of
6955         // a vector.  If Idx != 0, swizzle it into place.
6956         if (Idx != 0) {
6957           SmallVector<int, 4> Mask;
6958           Mask.push_back(Idx);
6959           for (unsigned i = 1; i != VecElts; ++i)
6960             Mask.push_back(i);
6961           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
6962                                       &Mask[0]);
6963         }
6964         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
6965       }
6966     }
6967
6968     // If we have a constant or non-constant insertion into the low element of
6969     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
6970     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
6971     // depending on what the source datatype is.
6972     if (Idx == 0) {
6973       if (NumZero == 0)
6974         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6975
6976       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
6977           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
6978         if (VT.is256BitVector() || VT.is512BitVector()) {
6979           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
6980           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
6981                              Item, DAG.getIntPtrConstant(0));
6982         }
6983         assert(VT.is128BitVector() && "Expected an SSE value type!");
6984         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6985         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
6986         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6987       }
6988
6989       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
6990         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
6991         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
6992         if (VT.is256BitVector()) {
6993           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
6994           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
6995         } else {
6996           assert(VT.is128BitVector() && "Expected an SSE value type!");
6997           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6998         }
6999         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
7000       }
7001     }
7002
7003     // Is it a vector logical left shift?
7004     if (NumElems == 2 && Idx == 1 &&
7005         X86::isZeroNode(Op.getOperand(0)) &&
7006         !X86::isZeroNode(Op.getOperand(1))) {
7007       unsigned NumBits = VT.getSizeInBits();
7008       return getVShift(true, VT,
7009                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7010                                    VT, Op.getOperand(1)),
7011                        NumBits/2, DAG, *this, dl);
7012     }
7013
7014     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7015       return SDValue();
7016
7017     // Otherwise, if this is a vector with i32 or f32 elements, and the element
7018     // is a non-constant being inserted into an element other than the low one,
7019     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
7020     // movd/movss) to move this into the low element, then shuffle it into
7021     // place.
7022     if (EVTBits == 32) {
7023       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7024
7025       // If using the new shuffle lowering, just directly insert this.
7026       if (ExperimentalVectorShuffleLowering)
7027         return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7028
7029       // Turn it into a shuffle of zero and zero-extended scalar to vector.
7030       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
7031       SmallVector<int, 8> MaskVec;
7032       for (unsigned i = 0; i != NumElems; ++i)
7033         MaskVec.push_back(i == Idx ? 0 : 1);
7034       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
7035     }
7036   }
7037
7038   // Splat is obviously ok. Let legalizer expand it to a shuffle.
7039   if (Values.size() == 1) {
7040     if (EVTBits == 32) {
7041       // Instead of a shuffle like this:
7042       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7043       // Check if it's possible to issue this instead.
7044       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7045       unsigned Idx = countTrailingZeros(NonZeros);
7046       SDValue Item = Op.getOperand(Idx);
7047       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7048         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7049     }
7050     return SDValue();
7051   }
7052
7053   // A vector full of immediates; various special cases are already
7054   // handled, so this is best done with a single constant-pool load.
7055   if (IsAllConstants)
7056     return SDValue();
7057
7058   // For AVX-length vectors, see if we can use a vector load to get all of the
7059   // elements, otherwise build the individual 128-bit pieces and use
7060   // shuffles to put them in place.
7061   if (VT.is256BitVector() || VT.is512BitVector()) {
7062     SmallVector<SDValue, 64> V;
7063     for (unsigned i = 0; i != NumElems; ++i)
7064       V.push_back(Op.getOperand(i));
7065
7066     // Check for a build vector of consecutive loads.
7067     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
7068       return LD;
7069
7070     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7071
7072     // Build both the lower and upper subvector.
7073     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7074                                 makeArrayRef(&V[0], NumElems/2));
7075     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7076                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
7077
7078     // Recreate the wider vector with the lower and upper part.
7079     if (VT.is256BitVector())
7080       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7081     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7082   }
7083
7084   // Let legalizer expand 2-wide build_vectors.
7085   if (EVTBits == 64) {
7086     if (NumNonZero == 1) {
7087       // One half is zero or undef.
7088       unsigned Idx = countTrailingZeros(NonZeros);
7089       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7090                                  Op.getOperand(Idx));
7091       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7092     }
7093     return SDValue();
7094   }
7095
7096   // If element VT is < 32 bits, convert it to inserts into a zero vector.
7097   if (EVTBits == 8 && NumElems == 16) {
7098     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
7099                                         Subtarget, *this);
7100     if (V.getNode()) return V;
7101   }
7102
7103   if (EVTBits == 16 && NumElems == 8) {
7104     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
7105                                       Subtarget, *this);
7106     if (V.getNode()) return V;
7107   }
7108
7109   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7110   if (EVTBits == 32 && NumElems == 4) {
7111     SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
7112     if (V.getNode())
7113       return V;
7114   }
7115
7116   // If element VT is == 32 bits, turn it into a number of shuffles.
7117   SmallVector<SDValue, 8> V(NumElems);
7118   if (NumElems == 4 && NumZero > 0) {
7119     for (unsigned i = 0; i < 4; ++i) {
7120       bool isZero = !(NonZeros & (1 << i));
7121       if (isZero)
7122         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
7123       else
7124         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7125     }
7126
7127     for (unsigned i = 0; i < 2; ++i) {
7128       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7129         default: break;
7130         case 0:
7131           V[i] = V[i*2];  // Must be a zero vector.
7132           break;
7133         case 1:
7134           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
7135           break;
7136         case 2:
7137           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
7138           break;
7139         case 3:
7140           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
7141           break;
7142       }
7143     }
7144
7145     bool Reverse1 = (NonZeros & 0x3) == 2;
7146     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7147     int MaskVec[] = {
7148       Reverse1 ? 1 : 0,
7149       Reverse1 ? 0 : 1,
7150       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7151       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
7152     };
7153     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
7154   }
7155
7156   if (Values.size() > 1 && VT.is128BitVector()) {
7157     // Check for a build vector of consecutive loads.
7158     for (unsigned i = 0; i < NumElems; ++i)
7159       V[i] = Op.getOperand(i);
7160
7161     // Check for elements which are consecutive loads.
7162     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
7163     if (LD.getNode())
7164       return LD;
7165
7166     // Check for a build vector from mostly shuffle plus few inserting.
7167     SDValue Sh = buildFromShuffleMostly(Op, DAG);
7168     if (Sh.getNode())
7169       return Sh;
7170
7171     // For SSE 4.1, use insertps to put the high elements into the low element.
7172     if (getSubtarget()->hasSSE41()) {
7173       SDValue Result;
7174       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
7175         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7176       else
7177         Result = DAG.getUNDEF(VT);
7178
7179       for (unsigned i = 1; i < NumElems; ++i) {
7180         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
7181         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7182                              Op.getOperand(i), DAG.getIntPtrConstant(i));
7183       }
7184       return Result;
7185     }
7186
7187     // Otherwise, expand into a number of unpckl*, start by extending each of
7188     // our (non-undef) elements to the full vector width with the element in the
7189     // bottom slot of the vector (which generates no code for SSE).
7190     for (unsigned i = 0; i < NumElems; ++i) {
7191       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
7192         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7193       else
7194         V[i] = DAG.getUNDEF(VT);
7195     }
7196
7197     // Next, we iteratively mix elements, e.g. for v4f32:
7198     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7199     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7200     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
7201     unsigned EltStride = NumElems >> 1;
7202     while (EltStride != 0) {
7203       for (unsigned i = 0; i < EltStride; ++i) {
7204         // If V[i+EltStride] is undef and this is the first round of mixing,
7205         // then it is safe to just drop this shuffle: V[i] is already in the
7206         // right place, the one element (since it's the first round) being
7207         // inserted as undef can be dropped.  This isn't safe for successive
7208         // rounds because they will permute elements within both vectors.
7209         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
7210             EltStride == NumElems/2)
7211           continue;
7212
7213         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
7214       }
7215       EltStride >>= 1;
7216     }
7217     return V[0];
7218   }
7219   return SDValue();
7220 }
7221
7222 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
7223 // to create 256-bit vectors from two other 128-bit ones.
7224 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7225   SDLoc dl(Op);
7226   MVT ResVT = Op.getSimpleValueType();
7227
7228   assert((ResVT.is256BitVector() ||
7229           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7230
7231   SDValue V1 = Op.getOperand(0);
7232   SDValue V2 = Op.getOperand(1);
7233   unsigned NumElems = ResVT.getVectorNumElements();
7234   if(ResVT.is256BitVector())
7235     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7236
7237   if (Op.getNumOperands() == 4) {
7238     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
7239                                 ResVT.getVectorNumElements()/2);
7240     SDValue V3 = Op.getOperand(2);
7241     SDValue V4 = Op.getOperand(3);
7242     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
7243       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
7244   }
7245   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7246 }
7247
7248 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7249   MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
7250   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7251          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7252           Op.getNumOperands() == 4)));
7253
7254   // AVX can use the vinsertf128 instruction to create 256-bit vectors
7255   // from two other 128-bit ones.
7256
7257   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7258   return LowerAVXCONCAT_VECTORS(Op, DAG);
7259 }
7260
7261
7262 //===----------------------------------------------------------------------===//
7263 // Vector shuffle lowering
7264 //
7265 // This is an experimental code path for lowering vector shuffles on x86. It is
7266 // designed to handle arbitrary vector shuffles and blends, gracefully
7267 // degrading performance as necessary. It works hard to recognize idiomatic
7268 // shuffles and lower them to optimal instruction patterns without leaving
7269 // a framework that allows reasonably efficient handling of all vector shuffle
7270 // patterns.
7271 //===----------------------------------------------------------------------===//
7272
7273 /// \brief Tiny helper function to identify a no-op mask.
7274 ///
7275 /// This is a somewhat boring predicate function. It checks whether the mask
7276 /// array input, which is assumed to be a single-input shuffle mask of the kind
7277 /// used by the X86 shuffle instructions (not a fully general
7278 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7279 /// in-place shuffle are 'no-op's.
7280 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7281   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7282     if (Mask[i] != -1 && Mask[i] != i)
7283       return false;
7284   return true;
7285 }
7286
7287 /// \brief Helper function to classify a mask as a single-input mask.
7288 ///
7289 /// This isn't a generic single-input test because in the vector shuffle
7290 /// lowering we canonicalize single inputs to be the first input operand. This
7291 /// means we can more quickly test for a single input by only checking whether
7292 /// an input from the second operand exists. We also assume that the size of
7293 /// mask corresponds to the size of the input vectors which isn't true in the
7294 /// fully general case.
7295 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
7296   for (int M : Mask)
7297     if (M >= (int)Mask.size())
7298       return false;
7299   return true;
7300 }
7301
7302 /// \brief Test whether there are elements crossing 128-bit lanes in this
7303 /// shuffle mask.
7304 ///
7305 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7306 /// and we routinely test for these.
7307 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7308   int LaneSize = 128 / VT.getScalarSizeInBits();
7309   int Size = Mask.size();
7310   for (int i = 0; i < Size; ++i)
7311     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7312       return true;
7313   return false;
7314 }
7315
7316 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
7317 ///
7318 /// This checks a shuffle mask to see if it is performing the same
7319 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
7320 /// that it is also not lane-crossing. It may however involve a blend from the
7321 /// same lane of a second vector.
7322 ///
7323 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7324 /// non-trivial to compute in the face of undef lanes. The representation is
7325 /// *not* suitable for use with existing 128-bit shuffles as it will contain
7326 /// entries from both V1 and V2 inputs to the wider mask.
7327 static bool
7328 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7329                                 SmallVectorImpl<int> &RepeatedMask) {
7330   int LaneSize = 128 / VT.getScalarSizeInBits();
7331   RepeatedMask.resize(LaneSize, -1);
7332   int Size = Mask.size();
7333   for (int i = 0; i < Size; ++i) {
7334     if (Mask[i] < 0)
7335       continue;
7336     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7337       // This entry crosses lanes, so there is no way to model this shuffle.
7338       return false;
7339
7340     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7341     if (RepeatedMask[i % LaneSize] == -1)
7342       // This is the first non-undef entry in this slot of a 128-bit lane.
7343       RepeatedMask[i % LaneSize] =
7344           Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
7345     else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
7346       // Found a mismatch with the repeated mask.
7347       return false;
7348   }
7349   return true;
7350 }
7351
7352 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
7353 // 2013 will allow us to use it as a non-type template parameter.
7354 namespace {
7355
7356 /// \brief Implementation of the \c isShuffleEquivalent variadic functor.
7357 ///
7358 /// See its documentation for details.
7359 bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
7360   if (Mask.size() != Args.size())
7361     return false;
7362   for (int i = 0, e = Mask.size(); i < e; ++i) {
7363     assert(*Args[i] >= 0 && "Arguments must be positive integers!");
7364     if (Mask[i] != -1 && Mask[i] != *Args[i])
7365       return false;
7366   }
7367   return true;
7368 }
7369
7370 } // namespace
7371
7372 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
7373 /// arguments.
7374 ///
7375 /// This is a fast way to test a shuffle mask against a fixed pattern:
7376 ///
7377 ///   if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
7378 ///
7379 /// It returns true if the mask is exactly as wide as the argument list, and
7380 /// each element of the mask is either -1 (signifying undef) or the value given
7381 /// in the argument.
7382 static const VariadicFunction1<
7383     bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
7384
7385 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7386 ///
7387 /// This helper function produces an 8-bit shuffle immediate corresponding to
7388 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
7389 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7390 /// example.
7391 ///
7392 /// NB: We rely heavily on "undef" masks preserving the input lane.
7393 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
7394                                           SelectionDAG &DAG) {
7395   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7396   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7397   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7398   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7399   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7400
7401   unsigned Imm = 0;
7402   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
7403   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
7404   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
7405   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
7406   return DAG.getConstant(Imm, MVT::i8);
7407 }
7408
7409 /// \brief Try to emit a blend instruction for a shuffle.
7410 ///
7411 /// This doesn't do any checks for the availability of instructions for blending
7412 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
7413 /// be matched in the backend with the type given. What it does check for is
7414 /// that the shuffle mask is in fact a blend.
7415 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
7416                                          SDValue V2, ArrayRef<int> Mask,
7417                                          const X86Subtarget *Subtarget,
7418                                          SelectionDAG &DAG) {
7419
7420   unsigned BlendMask = 0;
7421   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7422     if (Mask[i] >= Size) {
7423       if (Mask[i] != i + Size)
7424         return SDValue(); // Shuffled V2 input!
7425       BlendMask |= 1u << i;
7426       continue;
7427     }
7428     if (Mask[i] >= 0 && Mask[i] != i)
7429       return SDValue(); // Shuffled V1 input!
7430   }
7431   switch (VT.SimpleTy) {
7432   case MVT::v2f64:
7433   case MVT::v4f32:
7434   case MVT::v4f64:
7435   case MVT::v8f32:
7436     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
7437                        DAG.getConstant(BlendMask, MVT::i8));
7438
7439   case MVT::v4i64:
7440   case MVT::v8i32:
7441     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7442     // FALLTHROUGH
7443   case MVT::v2i64:
7444   case MVT::v4i32:
7445     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
7446     // that instruction.
7447     if (Subtarget->hasAVX2()) {
7448       // Scale the blend by the number of 32-bit dwords per element.
7449       int Scale =  VT.getScalarSizeInBits() / 32;
7450       BlendMask = 0;
7451       for (int i = 0, Size = Mask.size(); i < Size; ++i)
7452         if (Mask[i] >= Size)
7453           for (int j = 0; j < Scale; ++j)
7454             BlendMask |= 1u << (i * Scale + j);
7455
7456       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
7457       V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
7458       V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
7459       return DAG.getNode(ISD::BITCAST, DL, VT,
7460                          DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
7461                                      DAG.getConstant(BlendMask, MVT::i8)));
7462     }
7463     // FALLTHROUGH
7464   case MVT::v8i16: {
7465     // For integer shuffles we need to expand the mask and cast the inputs to
7466     // v8i16s prior to blending.
7467     int Scale = 8 / VT.getVectorNumElements();
7468     BlendMask = 0;
7469     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7470       if (Mask[i] >= Size)
7471         for (int j = 0; j < Scale; ++j)
7472           BlendMask |= 1u << (i * Scale + j);
7473
7474     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
7475     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
7476     return DAG.getNode(ISD::BITCAST, DL, VT,
7477                        DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
7478                                    DAG.getConstant(BlendMask, MVT::i8)));
7479   }
7480
7481   case MVT::v16i16: {
7482     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7483     SmallVector<int, 8> RepeatedMask;
7484     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
7485       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
7486       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
7487       BlendMask = 0;
7488       for (int i = 0; i < 8; ++i)
7489         if (RepeatedMask[i] >= 16)
7490           BlendMask |= 1u << i;
7491       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
7492                          DAG.getConstant(BlendMask, MVT::i8));
7493     }
7494   }
7495     // FALLTHROUGH
7496   case MVT::v32i8: {
7497     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7498     // Scale the blend by the number of bytes per element.
7499     int Scale =  VT.getScalarSizeInBits() / 8;
7500     assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
7501
7502     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
7503     // mix of LLVM's code generator and the x86 backend. We tell the code
7504     // generator that boolean values in the elements of an x86 vector register
7505     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
7506     // mapping a select to operand #1, and 'false' mapping to operand #2. The
7507     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
7508     // of the element (the remaining are ignored) and 0 in that high bit would
7509     // mean operand #1 while 1 in the high bit would mean operand #2. So while
7510     // the LLVM model for boolean values in vector elements gets the relevant
7511     // bit set, it is set backwards and over constrained relative to x86's
7512     // actual model.
7513     SDValue VSELECTMask[32];
7514     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7515       for (int j = 0; j < Scale; ++j)
7516         VSELECTMask[Scale * i + j] =
7517             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7518                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
7519
7520     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
7521     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
7522     return DAG.getNode(
7523         ISD::BITCAST, DL, VT,
7524         DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
7525                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
7526                     V1, V2));
7527   }
7528
7529   default:
7530     llvm_unreachable("Not a supported integer vector type!");
7531   }
7532 }
7533
7534 /// \brief Generic routine to lower a shuffle and blend as a decomposed set of
7535 /// unblended shuffles followed by an unshuffled blend.
7536 ///
7537 /// This matches the extremely common pattern for handling combined
7538 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
7539 /// operations.
7540 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
7541                                                           SDValue V1,
7542                                                           SDValue V2,
7543                                                           ArrayRef<int> Mask,
7544                                                           SelectionDAG &DAG) {
7545   // Shuffle the input elements into the desired positions in V1 and V2 and
7546   // blend them together.
7547   SmallVector<int, 32> V1Mask(Mask.size(), -1);
7548   SmallVector<int, 32> V2Mask(Mask.size(), -1);
7549   SmallVector<int, 32> BlendMask(Mask.size(), -1);
7550   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7551     if (Mask[i] >= 0 && Mask[i] < Size) {
7552       V1Mask[i] = Mask[i];
7553       BlendMask[i] = i;
7554     } else if (Mask[i] >= Size) {
7555       V2Mask[i] = Mask[i] - Size;
7556       BlendMask[i] = i + Size;
7557     }
7558
7559   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7560   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7561   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7562 }
7563
7564 /// \brief Try to lower a vector shuffle as a byte rotation.
7565 ///
7566 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7567 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7568 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
7569 /// try to generically lower a vector shuffle through such an pattern. It
7570 /// does not check for the profitability of lowering either as PALIGNR or
7571 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7572 /// This matches shuffle vectors that look like:
7573 ///
7574 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7575 ///
7576 /// Essentially it concatenates V1 and V2, shifts right by some number of
7577 /// elements, and takes the low elements as the result. Note that while this is
7578 /// specified as a *right shift* because x86 is little-endian, it is a *left
7579 /// rotate* of the vector lanes.
7580 ///
7581 /// Note that this only handles 128-bit vector widths currently.
7582 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
7583                                               SDValue V2,
7584                                               ArrayRef<int> Mask,
7585                                               const X86Subtarget *Subtarget,
7586                                               SelectionDAG &DAG) {
7587   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7588
7589   // We need to detect various ways of spelling a rotation:
7590   //   [11, 12, 13, 14, 15,  0,  1,  2]
7591   //   [-1, 12, 13, 14, -1, -1,  1, -1]
7592   //   [-1, -1, -1, -1, -1, -1,  1,  2]
7593   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
7594   //   [-1,  4,  5,  6, -1, -1,  9, -1]
7595   //   [-1,  4,  5,  6, -1, -1, -1, -1]
7596   int Rotation = 0;
7597   SDValue Lo, Hi;
7598   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7599     if (Mask[i] == -1)
7600       continue;
7601     assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!");
7602
7603     // Based on the mod-Size value of this mask element determine where
7604     // a rotated vector would have started.
7605     int StartIdx = i - (Mask[i] % Size);
7606     if (StartIdx == 0)
7607       // The identity rotation isn't interesting, stop.
7608       return SDValue();
7609
7610     // If we found the tail of a vector the rotation must be the missing
7611     // front. If we found the head of a vector, it must be how much of the head.
7612     int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
7613
7614     if (Rotation == 0)
7615       Rotation = CandidateRotation;
7616     else if (Rotation != CandidateRotation)
7617       // The rotations don't match, so we can't match this mask.
7618       return SDValue();
7619
7620     // Compute which value this mask is pointing at.
7621     SDValue MaskV = Mask[i] < Size ? V1 : V2;
7622
7623     // Compute which of the two target values this index should be assigned to.
7624     // This reflects whether the high elements are remaining or the low elements
7625     // are remaining.
7626     SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7627
7628     // Either set up this value if we've not encountered it before, or check
7629     // that it remains consistent.
7630     if (!TargetV)
7631       TargetV = MaskV;
7632     else if (TargetV != MaskV)
7633       // This may be a rotation, but it pulls from the inputs in some
7634       // unsupported interleaving.
7635       return SDValue();
7636   }
7637
7638   // Check that we successfully analyzed the mask, and normalize the results.
7639   assert(Rotation != 0 && "Failed to locate a viable rotation!");
7640   assert((Lo || Hi) && "Failed to find a rotated input vector!");
7641   if (!Lo)
7642     Lo = Hi;
7643   else if (!Hi)
7644     Hi = Lo;
7645
7646   assert(VT.getSizeInBits() == 128 &&
7647          "Rotate-based lowering only supports 128-bit lowering!");
7648   assert(Mask.size() <= 16 &&
7649          "Can shuffle at most 16 bytes in a 128-bit vector!");
7650
7651   // The actual rotate instruction rotates bytes, so we need to scale the
7652   // rotation based on how many bytes are in the vector.
7653   int Scale = 16 / Mask.size();
7654
7655   // SSSE3 targets can use the palignr instruction
7656   if (Subtarget->hasSSSE3()) {
7657     // Cast the inputs to v16i8 to match PALIGNR.
7658     Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
7659     Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
7660
7661     return DAG.getNode(ISD::BITCAST, DL, VT,
7662                        DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
7663                                    DAG.getConstant(Rotation * Scale, MVT::i8)));
7664   }
7665
7666   // Default SSE2 implementation
7667   int LoByteShift = 16 - Rotation * Scale;
7668   int HiByteShift = Rotation * Scale;
7669
7670   // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
7671   Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
7672   Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
7673
7674   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
7675                                 DAG.getConstant(8 * LoByteShift, MVT::i8));
7676   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
7677                                 DAG.getConstant(8 * HiByteShift, MVT::i8));
7678   return DAG.getNode(ISD::BITCAST, DL, VT,
7679                      DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
7680 }
7681
7682 /// \brief Compute whether each element of a shuffle is zeroable.
7683 ///
7684 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7685 /// Either it is an undef element in the shuffle mask, the element of the input
7686 /// referenced is undef, or the element of the input referenced is known to be
7687 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7688 /// as many lanes with this technique as possible to simplify the remaining
7689 /// shuffle.
7690 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7691                                                      SDValue V1, SDValue V2) {
7692   SmallBitVector Zeroable(Mask.size(), false);
7693
7694   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7695   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7696
7697   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7698     int M = Mask[i];
7699     // Handle the easy cases.
7700     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7701       Zeroable[i] = true;
7702       continue;
7703     }
7704
7705     // If this is an index into a build_vector node, dig out the input value and
7706     // use it.
7707     SDValue V = M < Size ? V1 : V2;
7708     if (V.getOpcode() != ISD::BUILD_VECTOR)
7709       continue;
7710
7711     SDValue Input = V.getOperand(M % Size);
7712     // The UNDEF opcode check really should be dead code here, but not quite
7713     // worth asserting on (it isn't invalid, just unexpected).
7714     if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
7715       Zeroable[i] = true;
7716   }
7717
7718   return Zeroable;
7719 }
7720
7721 /// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
7722 ///
7723 /// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
7724 /// byte-shift instructions. The mask must consist of a shifted sequential
7725 /// shuffle from one of the input vectors and zeroable elements for the
7726 /// remaining 'shifted in' elements.
7727 ///
7728 /// Note that this only handles 128-bit vector widths currently.
7729 static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
7730                                              SDValue V2, ArrayRef<int> Mask,
7731                                              SelectionDAG &DAG) {
7732   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7733
7734   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7735
7736   int Size = Mask.size();
7737   int Scale = 16 / Size;
7738
7739   for (int Shift = 1; Shift < Size; Shift++) {
7740     int ByteShift = Shift * Scale;
7741
7742     // PSRLDQ : (little-endian) right byte shift
7743     // [ 5,  6,  7, zz, zz, zz, zz, zz]
7744     // [ -1, 5,  6,  7, zz, zz, zz, zz]
7745     // [  1, 2, -1, -1, -1, -1, zz, zz]
7746     bool ZeroableRight = true;
7747     for (int i = Size - Shift; i < Size; i++) {
7748       ZeroableRight &= Zeroable[i];
7749     }
7750
7751     if (ZeroableRight) {
7752       bool ValidShiftRight1 =
7753           isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
7754       bool ValidShiftRight2 =
7755           isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
7756
7757       if (ValidShiftRight1 || ValidShiftRight2) {
7758         // Cast the inputs to v2i64 to match PSRLDQ.
7759         SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
7760         SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7761         SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
7762                                       DAG.getConstant(ByteShift * 8, MVT::i8));
7763         return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7764       }
7765     }
7766
7767     // PSLLDQ : (little-endian) left byte shift
7768     // [ zz,  0,  1,  2,  3,  4,  5,  6]
7769     // [ zz, zz, -1, -1,  2,  3,  4, -1]
7770     // [ zz, zz, zz, zz, zz, zz, -1,  1]
7771     bool ZeroableLeft = true;
7772     for (int i = 0; i < Shift; i++) {
7773       ZeroableLeft &= Zeroable[i];
7774     }
7775
7776     if (ZeroableLeft) {
7777       bool ValidShiftLeft1 =
7778           isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
7779       bool ValidShiftLeft2 =
7780           isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
7781
7782       if (ValidShiftLeft1 || ValidShiftLeft2) {
7783         // Cast the inputs to v2i64 to match PSLLDQ.
7784         SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
7785         SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7786         SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
7787                                       DAG.getConstant(ByteShift * 8, MVT::i8));
7788         return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7789       }
7790     }
7791   }
7792
7793   return SDValue();
7794 }
7795
7796 /// \brief Lower a vector shuffle as a zero or any extension.
7797 ///
7798 /// Given a specific number of elements, element bit width, and extension
7799 /// stride, produce either a zero or any extension based on the available
7800 /// features of the subtarget.
7801 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7802     SDLoc DL, MVT VT, int NumElements, int Scale, bool AnyExt, SDValue InputV,
7803     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7804   assert(Scale > 1 && "Need a scale to extend.");
7805   int EltBits = VT.getSizeInBits() / NumElements;
7806   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
7807          "Only 8, 16, and 32 bit elements can be extended.");
7808   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
7809
7810   // Found a valid zext mask! Try various lowering strategies based on the
7811   // input type and available ISA extensions.
7812   if (Subtarget->hasSSE41()) {
7813     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
7814     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
7815                                  NumElements / Scale);
7816     InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
7817     return DAG.getNode(ISD::BITCAST, DL, VT,
7818                        DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
7819   }
7820
7821   // For any extends we can cheat for larger element sizes and use shuffle
7822   // instructions that can fold with a load and/or copy.
7823   if (AnyExt && EltBits == 32) {
7824     int PSHUFDMask[4] = {0, -1, 1, -1};
7825     return DAG.getNode(
7826         ISD::BITCAST, DL, VT,
7827         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7828                     DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7829                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
7830   }
7831   if (AnyExt && EltBits == 16 && Scale > 2) {
7832     int PSHUFDMask[4] = {0, -1, 0, -1};
7833     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7834                          DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7835                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
7836     int PSHUFHWMask[4] = {1, -1, -1, -1};
7837     return DAG.getNode(
7838         ISD::BITCAST, DL, VT,
7839         DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
7840                     DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
7841                     getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
7842   }
7843
7844   // If this would require more than 2 unpack instructions to expand, use
7845   // pshufb when available. We can only use more than 2 unpack instructions
7846   // when zero extending i8 elements which also makes it easier to use pshufb.
7847   if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
7848     assert(NumElements == 16 && "Unexpected byte vector width!");
7849     SDValue PSHUFBMask[16];
7850     for (int i = 0; i < 16; ++i)
7851       PSHUFBMask[i] =
7852           DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
7853     InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
7854     return DAG.getNode(ISD::BITCAST, DL, VT,
7855                        DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
7856                                    DAG.getNode(ISD::BUILD_VECTOR, DL,
7857                                                MVT::v16i8, PSHUFBMask)));
7858   }
7859
7860   // Otherwise emit a sequence of unpacks.
7861   do {
7862     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
7863     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
7864                          : getZeroVector(InputVT, Subtarget, DAG, DL);
7865     InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
7866     InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
7867     Scale /= 2;
7868     EltBits *= 2;
7869     NumElements /= 2;
7870   } while (Scale > 1);
7871   return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
7872 }
7873
/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
7875 ///
7876 /// This routine will try to do everything in its power to cleverly lower
7877 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
7878 /// check for the profitability of this lowering,  it tries to aggressively
7879 /// match this pattern. It will use all of the micro-architectural details it
7880 /// can to emit an efficient lowering. It handles both blends with all-zero
7881 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
7882 /// masking out later).
7883 ///
7884 /// The reason we have dedicated lowering for zext-style shuffles is that they
7885 /// are both incredibly common and often quite performance sensitive.
7886 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
7887     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
7888     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7889   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7890
7891   int Bits = VT.getSizeInBits();
7892   int NumElements = Mask.size();
7893
7894   // Define a helper function to check a particular ext-scale and lower to it if
7895   // valid.
7896   auto Lower = [&](int Scale) -> SDValue {
7897     SDValue InputV;
7898     bool AnyExt = true;
7899     for (int i = 0; i < NumElements; ++i) {
7900       if (Mask[i] == -1)
7901         continue; // Valid anywhere but doesn't tell us anything.
7902       if (i % Scale != 0) {
7903         // Each of the extend elements needs to be zeroable.
7904         if (!Zeroable[i])
7905           return SDValue();
7906
7907         // We no lorger are in the anyext case.
7908         AnyExt = false;
7909         continue;
7910       }
7911
7912       // Each of the base elements needs to be consecutive indices into the
7913       // same input vector.
7914       SDValue V = Mask[i] < NumElements ? V1 : V2;
7915       if (!InputV)
7916         InputV = V;
7917       else if (InputV != V)
7918         return SDValue(); // Flip-flopping inputs.
7919
7920       if (Mask[i] % NumElements != i / Scale)
7921         return SDValue(); // Non-consecutive strided elemenst.
7922     }
7923
7924     // If we fail to find an input, we have a zero-shuffle which should always
7925     // have already been handled.
7926     // FIXME: Maybe handle this here in case during blending we end up with one?
7927     if (!InputV)
7928       return SDValue();
7929
7930     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7931         DL, VT, NumElements, Scale, AnyExt, InputV, Subtarget, DAG);
7932   };
7933
7934   // The widest scale possible for extending is to a 64-bit integer.
7935   assert(Bits % 64 == 0 &&
7936          "The number of bits in a vector must be divisible by 64 on x86!");
7937   int NumExtElements = Bits / 64;
7938
7939   // Each iteration, try extending the elements half as much, but into twice as
7940   // many elements.
7941   for (; NumExtElements < NumElements; NumExtElements *= 2) {
7942     assert(NumElements % NumExtElements == 0 &&
7943            "The input vector size must be divisble by the extended size.");
7944     if (SDValue V = Lower(NumElements / NumExtElements))
7945       return V;
7946   }
7947
7948   // No viable ext lowering found.
7949   return SDValue();
7950 }
7951
7952 /// \brief Try to get a scalar value for a specific element of a vector.
7953 ///
7954 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
7955 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
7956                                               SelectionDAG &DAG) {
7957   MVT VT = V.getSimpleValueType();
7958   MVT EltVT = VT.getVectorElementType();
7959   while (V.getOpcode() == ISD::BITCAST)
7960     V = V.getOperand(0);
7961   // If the bitcasts shift the element size, we can't extract an equivalent
7962   // element from it.
7963   MVT NewVT = V.getSimpleValueType();
7964   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
7965     return SDValue();
7966
7967   if (V.getOpcode() == ISD::BUILD_VECTOR ||
7968       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
7969     return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
7970
7971   return SDValue();
7972 }
7973
7974 /// \brief Helper to test for a load that can be folded with x86 shuffles.
7975 ///
7976 /// This is particularly important because the set of instructions varies
7977 /// significantly based on whether the operand is a load or not.
7978 static bool isShuffleFoldableLoad(SDValue V) {
7979   while (V.getOpcode() == ISD::BITCAST)
7980     V = V.getOperand(0);
7981
7982   return ISD::isNON_EXTLoad(V.getNode());
7983 }
7984
7985 /// \brief Try to lower insertion of a single element into a zero vector.
7986 ///
7987 /// This is a common pattern that we have especially efficient patterns to lower
7988 /// across all subtarget feature sets.
7989 static SDValue lowerVectorShuffleAsElementInsertion(
7990     MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
7991     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7992   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7993   MVT ExtVT = VT;
7994   MVT EltVT = VT.getVectorElementType();
7995
7996   int V2Index = std::find_if(Mask.begin(), Mask.end(),
7997                              [&Mask](int M) { return M >= (int)Mask.size(); }) -
7998                 Mask.begin();
7999   bool IsV1Zeroable = true;
8000   for (int i = 0, Size = Mask.size(); i < Size; ++i)
8001     if (i != V2Index && !Zeroable[i]) {
8002       IsV1Zeroable = false;
8003       break;
8004     }
8005
8006   // Check for a single input from a SCALAR_TO_VECTOR node.
8007   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
8008   // all the smarts here sunk into that routine. However, the current
8009   // lowering of BUILD_VECTOR makes that nearly impossible until the old
8010   // vector shuffle lowering is dead.
8011   if (SDValue V2S = getScalarValueForVectorElement(
8012           V2, Mask[V2Index] - Mask.size(), DAG)) {
8013     // We need to zext the scalar if it is smaller than an i32.
8014     V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
8015     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
8016       // Using zext to expand a narrow element won't work for non-zero
8017       // insertions.
8018       if (!IsV1Zeroable)
8019         return SDValue();
8020
8021       // Zero-extend directly to i32.
8022       ExtVT = MVT::v4i32;
8023       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
8024     }
8025     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
8026   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
8027              EltVT == MVT::i16) {
8028     // Either not inserting from the low element of the input or the input
8029     // element size is too small to use VZEXT_MOVL to clear the high bits.
8030     return SDValue();
8031   }
8032
8033   if (!IsV1Zeroable) {
8034     // If V1 can't be treated as a zero vector we have fewer options to lower
8035     // this. We can't support integer vectors or non-zero targets cheaply, and
8036     // the V1 elements can't be permuted in any way.
8037     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
8038     if (!VT.isFloatingPoint() || V2Index != 0)
8039       return SDValue();
8040     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
8041     V1Mask[V2Index] = -1;
8042     if (!isNoopShuffleMask(V1Mask))
8043       return SDValue();
8044     // This is essentially a special case blend operation, but if we have
8045     // general purpose blend operations, they are always faster. Bail and let
8046     // the rest of the lowering handle these as blends.
8047     if (Subtarget->hasSSE41())
8048       return SDValue();
8049
8050     // Otherwise, use MOVSD or MOVSS.
8051     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
8052            "Only two types of floating point element types to handle!");
8053     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
8054                        ExtVT, V1, V2);
8055   }
8056
8057   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
8058   if (ExtVT != VT)
8059     V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8060
8061   if (V2Index != 0) {
8062     // If we have 4 or fewer lanes we can cheaply shuffle the element into
8063     // the desired position. Otherwise it is more efficient to do a vector
8064     // shift left. We know that we can do a vector shift left because all
8065     // the inputs are zero.
8066     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
8067       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
8068       V2Shuffle[V2Index] = 0;
8069       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
8070     } else {
8071       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
8072       V2 = DAG.getNode(
8073           X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
8074           DAG.getConstant(
8075               V2Index * EltVT.getSizeInBits(),
8076               DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
8077       V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8078     }
8079   }
8080   return V2;
8081 }
8082
8083 /// \brief Try to lower broadcast of a single element.
8084 ///
8085 /// For convenience, this code also bundles all of the subtarget feature set
8086 /// filtering. While a little annoying to re-dispatch on type here, there isn't
8087 /// a convenient way to factor it out.
8088 static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
8089                                              ArrayRef<int> Mask,
8090                                              const X86Subtarget *Subtarget,
8091                                              SelectionDAG &DAG) {
8092   if (!Subtarget->hasAVX())
8093     return SDValue();
8094   if (VT.isInteger() && !Subtarget->hasAVX2())
8095     return SDValue();
8096
8097   // Check that the mask is a broadcast.
8098   int BroadcastIdx = -1;
8099   for (int M : Mask)
8100     if (M >= 0 && BroadcastIdx == -1)
8101       BroadcastIdx = M;
8102     else if (M >= 0 && M != BroadcastIdx)
8103       return SDValue();
8104
8105   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
8106                                             "a sorted mask where the broadcast "
8107                                             "comes from V1.");
8108
8109   // Go up the chain of (vector) values to try and find a scalar load that
8110   // we can combine with the broadcast.
8111   for (;;) {
8112     switch (V.getOpcode()) {
8113     case ISD::CONCAT_VECTORS: {
8114       int OperandSize = Mask.size() / V.getNumOperands();
8115       V = V.getOperand(BroadcastIdx / OperandSize);
8116       BroadcastIdx %= OperandSize;
8117       continue;
8118     }
8119
8120     case ISD::INSERT_SUBVECTOR: {
8121       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
8122       auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
8123       if (!ConstantIdx)
8124         break;
8125
8126       int BeginIdx = (int)ConstantIdx->getZExtValue();
8127       int EndIdx =
8128           BeginIdx + (int)VInner.getValueType().getVectorNumElements();
8129       if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
8130         BroadcastIdx -= BeginIdx;
8131         V = VInner;
8132       } else {
8133         V = VOuter;
8134       }
8135       continue;
8136     }
8137     }
8138     break;
8139   }
8140
8141   // Check if this is a broadcast of a scalar. We special case lowering
8142   // for scalars so that we can more effectively fold with loads.
8143   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8144       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
8145     V = V.getOperand(BroadcastIdx);
8146
8147     // If the scalar isn't a load we can't broadcast from it in AVX1, only with
8148     // AVX2.
8149     if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
8150       return SDValue();
8151   } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
8152     // We can't broadcast from a vector register w/o AVX2, and we can only
8153     // broadcast from the zero-element of a vector register.
8154     return SDValue();
8155   }
8156
8157   return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
8158 }
8159
8160 // Check for whether we can use INSERTPS to perform the shuffle. We only use
8161 // INSERTPS when the V1 elements are already in the correct locations
8162 // because otherwise we can just always use two SHUFPS instructions which
8163 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8164 // perform INSERTPS if a single V1 element is out of place and all V2
8165 // elements are zeroable.
8166 static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
8167                                             ArrayRef<int> Mask,
8168                                             SelectionDAG &DAG) {
8169   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8170   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8171   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8172   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8173
8174   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8175
8176   unsigned ZMask = 0;
8177   int V1DstIndex = -1;
8178   int V2DstIndex = -1;
8179   bool V1UsedInPlace = false;
8180
8181   for (int i = 0; i < 4; i++) {
8182     // Synthesize a zero mask from the zeroable elements (includes undefs).
8183     if (Zeroable[i]) {
8184       ZMask |= 1 << i;
8185       continue;
8186     }
8187
8188     // Flag if we use any V1 inputs in place.
8189     if (i == Mask[i]) {
8190       V1UsedInPlace = true;
8191       continue;
8192     }
8193
8194     // We can only insert a single non-zeroable element.
8195     if (V1DstIndex != -1 || V2DstIndex != -1)
8196       return SDValue();
8197
8198     if (Mask[i] < 4) {
8199       // V1 input out of place for insertion.
8200       V1DstIndex = i;
8201     } else {
8202       // V2 input for insertion.
8203       V2DstIndex = i;
8204     }
8205   }
8206
8207   // Don't bother if we have no (non-zeroable) element for insertion.
8208   if (V1DstIndex == -1 && V2DstIndex == -1)
8209     return SDValue();
8210
8211   // Determine element insertion src/dst indices. The src index is from the
8212   // start of the inserted vector, not the start of the concatenated vector.
8213   unsigned V2SrcIndex = 0;
8214   if (V1DstIndex != -1) {
8215     // If we have a V1 input out of place, we use V1 as the V2 element insertion
8216     // and don't use the original V2 at all.
8217     V2SrcIndex = Mask[V1DstIndex];
8218     V2DstIndex = V1DstIndex;
8219     V2 = V1;
8220   } else {
8221     V2SrcIndex = Mask[V2DstIndex] - 4;
8222   }
8223
8224   // If no V1 inputs are used in place, then the result is created only from
8225   // the zero mask and the V2 insertion - so remove V1 dependency.
8226   if (!V1UsedInPlace)
8227     V1 = DAG.getUNDEF(MVT::v4f32);
8228
8229   unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
8230   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8231
8232   // Insert the V2 element into the desired position.
8233   SDLoc DL(Op);
8234   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8235                      DAG.getConstant(InsertPSMask, MVT::i8));
8236 }
8237
8238 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8239 ///
8240 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
8241 /// support for floating point shuffles but not integer shuffles. These
8242 /// instructions will incur a domain crossing penalty on some chips though so
8243 /// it is better to avoid lowering through this for integer vectors where
8244 /// possible.
8245 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8246                                        const X86Subtarget *Subtarget,
8247                                        SelectionDAG &DAG) {
8248   SDLoc DL(Op);
8249   assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
8250   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8251   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8252   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8253   ArrayRef<int> Mask = SVOp->getMask();
8254   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8255
8256   if (isSingleInputShuffleMask(Mask)) {
8257     // Straight shuffle of a single input vector. Simulate this by using the
8258     // single input as both of the "inputs" to this instruction..
8259     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
8260
8261     if (Subtarget->hasAVX()) {
8262       // If we have AVX, we can use VPERMILPS which will allow folding a load
8263       // into the shuffle.
8264       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
8265                          DAG.getConstant(SHUFPDMask, MVT::i8));
8266     }
8267
8268     return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
8269                        DAG.getConstant(SHUFPDMask, MVT::i8));
8270   }
8271   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
8272   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
8273
8274   // Use dedicated unpack instructions for masks that match their pattern.
8275   if (isShuffleEquivalent(Mask, 0, 2))
8276     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
8277   if (isShuffleEquivalent(Mask, 1, 3))
8278     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
8279
8280   // If we have a single input, insert that into V1 if we can do so cheaply.
8281   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8282     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8283             MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
8284       return Insertion;
8285     // Try inverting the insertion since for v2 masks it is easy to do and we
8286     // can't reliably sort the mask one way or the other.
8287     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8288                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8289     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8290             MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
8291       return Insertion;
8292   }
8293
8294   // Try to use one of the special instruction patterns to handle two common
8295   // blend patterns if a zero-blend above didn't work.
8296   if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
8297     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
8298       // We can either use a special instruction to load over the low double or
8299       // to move just the low double.
8300       return DAG.getNode(
8301           isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
8302           DL, MVT::v2f64, V2,
8303           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
8304
8305   if (Subtarget->hasSSE41())
8306     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
8307                                                   Subtarget, DAG))
8308       return Blend;
8309
8310   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
8311   return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
8312                      DAG.getConstant(SHUFPDMask, MVT::i8));
8313 }
8314
8315 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
8316 ///
8317 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
8318 /// the integer unit to minimize domain crossing penalties. However, for blends
8319 /// it falls back to the floating point shuffle operation with appropriate bit
8320 /// casting.
8321 static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8322                                        const X86Subtarget *Subtarget,
8323                                        SelectionDAG &DAG) {
8324   SDLoc DL(Op);
8325   assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
8326   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8327   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8328   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8329   ArrayRef<int> Mask = SVOp->getMask();
8330   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8331
8332   if (isSingleInputShuffleMask(Mask)) {
8333     // Check for being able to broadcast a single element.
8334     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
8335                                                           Mask, Subtarget, DAG))
8336       return Broadcast;
8337
8338     // Straight shuffle of a single input vector. For everything from SSE2
8339     // onward this has a single fast instruction with no scary immediates.
8340     // We have to map the mask as it is actually a v4i32 shuffle instruction.
8341     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
8342     int WidenedMask[4] = {
8343         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
8344         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
8345     return DAG.getNode(
8346         ISD::BITCAST, DL, MVT::v2i64,
8347         DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
8348                     getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
8349   }
8350
8351   // Try to use byte shift instructions.
8352   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8353           DL, MVT::v2i64, V1, V2, Mask, DAG))
8354     return Shift;
8355
8356   // If we have a single input from V2 insert that into V1 if we can do so
8357   // cheaply.
8358   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8359     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8360             MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
8361       return Insertion;
8362     // Try inverting the insertion since for v2 masks it is easy to do and we
8363     // can't reliably sort the mask one way or the other.
8364     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8365                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8366     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8367             MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
8368       return Insertion;
8369   }
8370
8371   // Use dedicated unpack instructions for masks that match their pattern.
8372   if (isShuffleEquivalent(Mask, 0, 2))
8373     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
8374   if (isShuffleEquivalent(Mask, 1, 3))
8375     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
8376
8377   if (Subtarget->hasSSE41())
8378     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
8379                                                   Subtarget, DAG))
8380       return Blend;
8381
8382   // Try to use byte rotation instructions.
8383   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
8384   if (Subtarget->hasSSSE3())
8385     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8386             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
8387       return Rotate;
8388
8389   // We implement this with SHUFPD which is pretty lame because it will likely
8390   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
8391   // However, all the alternatives are still more cycles and newer chips don't
8392   // have this problem. It would be really nice if x86 had better shuffles here.
8393   V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
8394   V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
8395   return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
8396                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
8397 }
8398
8399 /// \brief Lower a vector shuffle using the SHUFPS instruction.
8400 ///
8401 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
8402 /// It makes no assumptions about whether this is the *best* lowering, it simply
8403 /// uses it.
8404 static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
8405                                             ArrayRef<int> Mask, SDValue V1,
8406                                             SDValue V2, SelectionDAG &DAG) {
8407   SDValue LowV = V1, HighV = V2;
8408   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
8409
8410   int NumV2Elements =
8411       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8412
8413   if (NumV2Elements == 1) {
8414     int V2Index =
8415         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
8416         Mask.begin();
8417
8418     // Compute the index adjacent to V2Index and in the same half by toggling
8419     // the low bit.
8420     int V2AdjIndex = V2Index ^ 1;
8421
8422     if (Mask[V2AdjIndex] == -1) {
8423       // Handles all the cases where we have a single V2 element and an undef.
8424       // This will only ever happen in the high lanes because we commute the
8425       // vector otherwise.
8426       if (V2Index < 2)
8427         std::swap(LowV, HighV);
8428       NewMask[V2Index] -= 4;
8429     } else {
8430       // Handle the case where the V2 element ends up adjacent to a V1 element.
8431       // To make this work, blend them together as the first step.
8432       int V1Index = V2AdjIndex;
8433       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
8434       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
8435                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8436
8437       // Now proceed to reconstruct the final blend as we have the necessary
8438       // high or low half formed.
8439       if (V2Index < 2) {
8440         LowV = V2;
8441         HighV = V1;
8442       } else {
8443         HighV = V2;
8444       }
8445       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
8446       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
8447     }
8448   } else if (NumV2Elements == 2) {
8449     if (Mask[0] < 4 && Mask[1] < 4) {
8450       // Handle the easy case where we have V1 in the low lanes and V2 in the
8451       // high lanes.
8452       NewMask[2] -= 4;
8453       NewMask[3] -= 4;
8454     } else if (Mask[2] < 4 && Mask[3] < 4) {
8455       // We also handle the reversed case because this utility may get called
8456       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
8457       // arrange things in the right direction.
8458       NewMask[0] -= 4;
8459       NewMask[1] -= 4;
8460       HighV = V1;
8461       LowV = V2;
8462     } else {
8463       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
8464       // trying to place elements directly, just blend them and set up the final
8465       // shuffle to place them.
8466
8467       // The first two blend mask elements are for V1, the second two are for
8468       // V2.
8469       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
8470                           Mask[2] < 4 ? Mask[2] : Mask[3],
8471                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
8472                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
8473       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
8474                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8475
8476       // Now we do a normal shuffle of V1 by giving V1 as both operands to
8477       // a blend.
8478       LowV = HighV = V1;
8479       NewMask[0] = Mask[0] < 4 ? 0 : 2;
8480       NewMask[1] = Mask[0] < 4 ? 2 : 0;
8481       NewMask[2] = Mask[2] < 4 ? 1 : 3;
8482       NewMask[3] = Mask[2] < 4 ? 3 : 1;
8483     }
8484   }
8485   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
8486                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
8487 }
8488
8489 /// \brief Lower 4-lane 32-bit floating point shuffles.
8490 ///
8491 /// Uses instructions exclusively from the floating point unit to minimize
8492 /// domain crossing penalties, as these are sufficient to implement all v4f32
8493 /// shuffles.
8494 static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8495                                        const X86Subtarget *Subtarget,
8496                                        SelectionDAG &DAG) {
8497   SDLoc DL(Op);
8498   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8499   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8500   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8501   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8502   ArrayRef<int> Mask = SVOp->getMask();
8503   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8504
8505   int NumV2Elements =
8506       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8507
8508   if (NumV2Elements == 0) {
8509     // Check for being able to broadcast a single element.
8510     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
8511                                                           Mask, Subtarget, DAG))
8512       return Broadcast;
8513
8514     if (Subtarget->hasAVX()) {
8515       // If we have AVX, we can use VPERMILPS which will allow folding a load
8516       // into the shuffle.
8517       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
8518                          getV4X86ShuffleImm8ForMask(Mask, DAG));
8519     }
8520
8521     // Otherwise, use a straight shuffle of a single input vector. We pass the
8522     // input vector to both operands to simulate this with a SHUFPS.
8523     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
8524                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8525   }
8526
8527   // Use dedicated unpack instructions for masks that match their pattern.
8528   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8529     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
8530   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8531     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
8532
8533   // There are special ways we can lower some single-element blends. However, we
8534   // have custom ways we can lower more complex single-element blends below that
8535   // we defer to if both this and BLENDPS fail to match, so restrict this to
8536   // when the V2 input is targeting element 0 of the mask -- that is the fast
8537   // case here.
8538   if (NumV2Elements == 1 && Mask[0] >= 4)
8539     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
8540                                                          Mask, Subtarget, DAG))
8541       return V;
8542
8543   if (Subtarget->hasSSE41()) {
8544     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
8545                                                   Subtarget, DAG))
8546       return Blend;
8547
8548     // Use INSERTPS if we can complete the shuffle efficiently.
8549     if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
8550       return V;
8551   }
8552
8553   // Otherwise fall back to a SHUFPS lowering strategy.
8554   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
8555 }
8556
8557 /// \brief Lower 4-lane i32 vector shuffles.
8558 ///
8559 /// We try to handle these with integer-domain shuffles where we can, but for
8560 /// blends we use the floating point domain blend instructions.
8561 static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8562                                        const X86Subtarget *Subtarget,
8563                                        SelectionDAG &DAG) {
8564   SDLoc DL(Op);
8565   assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
8566   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8567   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8568   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8569   ArrayRef<int> Mask = SVOp->getMask();
8570   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8571
8572   // Whenever we can lower this as a zext, that instruction is strictly faster
8573   // than any alternative. It also allows us to fold memory operands into the
8574   // shuffle in many cases.
8575   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
8576                                                          Mask, Subtarget, DAG))
8577     return ZExt;
8578
8579   int NumV2Elements =
8580       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8581
8582   if (NumV2Elements == 0) {
8583     // Check for being able to broadcast a single element.
8584     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
8585                                                           Mask, Subtarget, DAG))
8586       return Broadcast;
8587
8588     // Straight shuffle of a single input vector. For everything from SSE2
8589     // onward this has a single fast instruction with no scary immediates.
8590     // We coerce the shuffle pattern to be compatible with UNPCK instructions
8591     // but we aren't actually going to use the UNPCK instruction because doing
8592     // so prevents folding a load into this instruction or making a copy.
8593     const int UnpackLoMask[] = {0, 0, 1, 1};
8594     const int UnpackHiMask[] = {2, 2, 3, 3};
8595     if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
8596       Mask = UnpackLoMask;
8597     else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
8598       Mask = UnpackHiMask;
8599
8600     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
8601                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8602   }
8603
8604   // Try to use byte shift instructions.
8605   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8606           DL, MVT::v4i32, V1, V2, Mask, DAG))
8607     return Shift;
8608
8609   // There are special ways we can lower some single-element blends.
8610   if (NumV2Elements == 1)
8611     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
8612                                                          Mask, Subtarget, DAG))
8613       return V;
8614
8615   // Use dedicated unpack instructions for masks that match their pattern.
8616   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8617     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
8618   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8619     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
8620
8621   if (Subtarget->hasSSE41())
8622     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
8623                                                   Subtarget, DAG))
8624       return Blend;
8625
8626   // Try to use byte rotation instructions.
8627   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
8628   if (Subtarget->hasSSSE3())
8629     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8630             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
8631       return Rotate;
8632
8633   // We implement this with SHUFPS because it can blend from two vectors.
8634   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
8635   // up the inputs, bypassing domain shift penalties that we would encur if we
8636   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
8637   // relevant.
8638   return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
8639                      DAG.getVectorShuffle(
8640                          MVT::v4f32, DL,
8641                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
8642                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
8643 }
8644
8645 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
8646 /// shuffle lowering, and the most complex part.
8647 ///
8648 /// The lowering strategy is to try to form pairs of input lanes which are
8649 /// targeted at the same half of the final vector, and then use a dword shuffle
8650 /// to place them onto the right half, and finally unpack the paired lanes into
8651 /// their final position.
8652 ///
8653 /// The exact breakdown of how to form these dword pairs and align them on the
8654 /// correct sides is really tricky. See the comments within the function for
8655 /// more of the details.
8656 static SDValue lowerV8I16SingleInputVectorShuffle(
8657     SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
8658     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8659   assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
8660   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
8661   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
8662
8663   SmallVector<int, 4> LoInputs;
8664   std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
8665                [](int M) { return M >= 0; });
8666   std::sort(LoInputs.begin(), LoInputs.end());
8667   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
8668   SmallVector<int, 4> HiInputs;
8669   std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
8670                [](int M) { return M >= 0; });
8671   std::sort(HiInputs.begin(), HiInputs.end());
8672   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
8673   int NumLToL =
8674       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
8675   int NumHToL = LoInputs.size() - NumLToL;
8676   int NumLToH =
8677       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
8678   int NumHToH = HiInputs.size() - NumLToH;
8679   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
8680   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
8681   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
8682   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
8683
8684   // Check for being able to broadcast a single element.
8685   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
8686                                                         Mask, Subtarget, DAG))
8687     return Broadcast;
8688
8689   // Try to use byte shift instructions.
8690   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8691           DL, MVT::v8i16, V, V, Mask, DAG))
8692     return Shift;
8693
8694   // Use dedicated unpack instructions for masks that match their pattern.
8695   if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
8696     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
8697   if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
8698     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
8699
8700   // Try to use byte rotation instructions.
8701   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8702           DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
8703     return Rotate;
8704
8705   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
8706   // such inputs we can swap two of the dwords across the half mark and end up
8707   // with <=2 inputs to each half in each half. Once there, we can fall through
8708   // to the generic code below. For example:
8709   //
8710   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8711   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
8712   //
8713   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
8714   // and an existing 2-into-2 on the other half. In this case we may have to
8715   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
8716   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
8717   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
8718   // because any other situation (including a 3-into-1 or 1-into-3 in the other
8719   // half than the one we target for fixing) will be fixed when we re-enter this
8720   // path. We will also combine away any sequence of PSHUFD instructions that
8721   // result into a single instruction. Here is an example of the tricky case:
8722   //
8723   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8724   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
8725   //
8726   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
8727   //
8728   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
8729   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
8730   //
8731   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
8732   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
8733   //
8734   // The result is fine to be handled by the generic logic.
8735   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
8736                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
8737                           int AOffset, int BOffset) {
8738     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
8739            "Must call this with A having 3 or 1 inputs from the A half.");
8740     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
8741            "Must call this with B having 1 or 3 inputs from the B half.");
8742     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
8743            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
8744
8745     // Compute the index of dword with only one word among the three inputs in
8746     // a half by taking the sum of the half with three inputs and subtracting
8747     // the sum of the actual three inputs. The difference is the remaining
8748     // slot.
8749     int ADWord, BDWord;
8750     int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
8751     int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
8752     int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
8753     ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
8754     int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
8755     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
8756     int TripleNonInputIdx =
8757         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
8758     TripleDWord = TripleNonInputIdx / 2;
8759
8760     // We use xor with one to compute the adjacent DWord to whichever one the
8761     // OneInput is in.
8762     OneInputDWord = (OneInput / 2) ^ 1;
8763
8764     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
8765     // and BToA inputs. If there is also such a problem with the BToB and AToB
8766     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
8767     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
8768     // is essential that we don't *create* a 3<-1 as then we might oscillate.
8769     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
8770       // Compute how many inputs will be flipped by swapping these DWords. We
8771       // need
8772       // to balance this to ensure we don't form a 3-1 shuffle in the other
8773       // half.
8774       int NumFlippedAToBInputs =
8775           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
8776           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
8777       int NumFlippedBToBInputs =
8778           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
8779           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
8780       if ((NumFlippedAToBInputs == 1 &&
8781            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
8782           (NumFlippedBToBInputs == 1 &&
8783            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
8784         // We choose whether to fix the A half or B half based on whether that
8785         // half has zero flipped inputs. At zero, we may not be able to fix it
8786         // with that half. We also bias towards fixing the B half because that
8787         // will more commonly be the high half, and we have to bias one way.
8788         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
8789                                                        ArrayRef<int> Inputs) {
8790           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
8791           bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
8792                                          PinnedIdx ^ 1) != Inputs.end();
8793           // Determine whether the free index is in the flipped dword or the
8794           // unflipped dword based on where the pinned index is. We use this bit
8795           // in an xor to conditionally select the adjacent dword.
8796           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
8797           bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8798                                              FixFreeIdx) != Inputs.end();
8799           if (IsFixIdxInput == IsFixFreeIdxInput)
8800             FixFreeIdx += 1;
8801           IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8802                                         FixFreeIdx) != Inputs.end();
8803           assert(IsFixIdxInput != IsFixFreeIdxInput &&
8804                  "We need to be changing the number of flipped inputs!");
8805           int PSHUFHalfMask[] = {0, 1, 2, 3};
8806           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
8807           V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
8808                           MVT::v8i16, V,
8809                           getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG));
8810
8811           for (int &M : Mask)
8812             if (M != -1 && M == FixIdx)
8813               M = FixFreeIdx;
8814             else if (M != -1 && M == FixFreeIdx)
8815               M = FixIdx;
8816         };
8817         if (NumFlippedBToBInputs != 0) {
8818           int BPinnedIdx =
8819               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
8820           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
8821         } else {
8822           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
8823           int APinnedIdx =
8824               AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
8825           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
8826         }
8827       }
8828     }
8829
8830     int PSHUFDMask[] = {0, 1, 2, 3};
8831     PSHUFDMask[ADWord] = BDWord;
8832     PSHUFDMask[BDWord] = ADWord;
8833     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
8834                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8835                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
8836                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
8837
8838     // Adjust the mask to match the new locations of A and B.
8839     for (int &M : Mask)
8840       if (M != -1 && M/2 == ADWord)
8841         M = 2 * BDWord + M % 2;
8842       else if (M != -1 && M/2 == BDWord)
8843         M = 2 * ADWord + M % 2;
8844
8845     // Recurse back into this routine to re-compute state now that this isn't
8846     // a 3 and 1 problem.
8847     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
8848                                 Mask);
8849   };
8850   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
8851     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
8852   else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
8853     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
8854
8855   // At this point there are at most two inputs to the low and high halves from
8856   // each half. That means the inputs can always be grouped into dwords and
8857   // those dwords can then be moved to the correct half with a dword shuffle.
8858   // We use at most one low and one high word shuffle to collect these paired
8859   // inputs into dwords, and finally a dword shuffle to place them.
8860   int PSHUFLMask[4] = {-1, -1, -1, -1};
8861   int PSHUFHMask[4] = {-1, -1, -1, -1};
8862   int PSHUFDMask[4] = {-1, -1, -1, -1};
8863
8864   // First fix the masks for all the inputs that are staying in their
8865   // original halves. This will then dictate the targets of the cross-half
8866   // shuffles.
8867   auto fixInPlaceInputs =
8868       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
8869                     MutableArrayRef<int> SourceHalfMask,
8870                     MutableArrayRef<int> HalfMask, int HalfOffset) {
8871     if (InPlaceInputs.empty())
8872       return;
8873     if (InPlaceInputs.size() == 1) {
8874       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
8875           InPlaceInputs[0] - HalfOffset;
8876       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
8877       return;
8878     }
8879     if (IncomingInputs.empty()) {
8880       // Just fix all of the in place inputs.
8881       for (int Input : InPlaceInputs) {
8882         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
8883         PSHUFDMask[Input / 2] = Input / 2;
8884       }
8885       return;
8886     }
8887
8888     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
8889     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
8890         InPlaceInputs[0] - HalfOffset;
8891     // Put the second input next to the first so that they are packed into
8892     // a dword. We find the adjacent index by toggling the low bit.
8893     int AdjIndex = InPlaceInputs[0] ^ 1;
8894     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
8895     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
8896     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
8897   };
8898   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
8899   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
8900
8901   // Now gather the cross-half inputs and place them into a free dword of
8902   // their target half.
8903   // FIXME: This operation could almost certainly be simplified dramatically to
8904   // look more like the 3-1 fixing operation.
8905   auto moveInputsToRightHalf = [&PSHUFDMask](
8906       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
8907       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
8908       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
8909       int DestOffset) {
8910     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
8911       return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
8912     };
8913     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
8914                                                int Word) {
8915       int LowWord = Word & ~1;
8916       int HighWord = Word | 1;
8917       return isWordClobbered(SourceHalfMask, LowWord) ||
8918              isWordClobbered(SourceHalfMask, HighWord);
8919     };
8920
8921     if (IncomingInputs.empty())
8922       return;
8923
8924     if (ExistingInputs.empty()) {
8925       // Map any dwords with inputs from them into the right half.
8926       for (int Input : IncomingInputs) {
8927         // If the source half mask maps over the inputs, turn those into
8928         // swaps and use the swapped lane.
8929         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
8930           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
8931             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
8932                 Input - SourceOffset;
8933             // We have to swap the uses in our half mask in one sweep.
8934             for (int &M : HalfMask)
8935               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
8936                 M = Input;
8937               else if (M == Input)
8938                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
8939           } else {
8940             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
8941                        Input - SourceOffset &&
8942                    "Previous placement doesn't match!");
8943           }
8944           // Note that this correctly re-maps both when we do a swap and when
8945           // we observe the other side of the swap above. We rely on that to
8946           // avoid swapping the members of the input list directly.
8947           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
8948         }
8949
8950         // Map the input's dword into the correct half.
8951         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
8952           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
8953         else
8954           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
8955                      Input / 2 &&
8956                  "Previous placement doesn't match!");
8957       }
8958
8959       // And just directly shift any other-half mask elements to be same-half
8960       // as we will have mirrored the dword containing the element into the
8961       // same position within that half.
8962       for (int &M : HalfMask)
8963         if (M >= SourceOffset && M < SourceOffset + 4) {
8964           M = M - SourceOffset + DestOffset;
8965           assert(M >= 0 && "This should never wrap below zero!");
8966         }
8967       return;
8968     }
8969
8970     // Ensure we have the input in a viable dword of its current half. This
8971     // is particularly tricky because the original position may be clobbered
8972     // by inputs being moved and *staying* in that half.
8973     if (IncomingInputs.size() == 1) {
8974       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
8975         int InputFixed = std::find(std::begin(SourceHalfMask),
8976                                    std::end(SourceHalfMask), -1) -
8977                          std::begin(SourceHalfMask) + SourceOffset;
8978         SourceHalfMask[InputFixed - SourceOffset] =
8979             IncomingInputs[0] - SourceOffset;
8980         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
8981                      InputFixed);
8982         IncomingInputs[0] = InputFixed;
8983       }
8984     } else if (IncomingInputs.size() == 2) {
8985       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
8986           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
8987         // We have two non-adjacent or clobbered inputs we need to extract from
8988         // the source half. To do this, we need to map them into some adjacent
8989         // dword slot in the source mask.
8990         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
8991                               IncomingInputs[1] - SourceOffset};
8992
8993         // If there is a free slot in the source half mask adjacent to one of
8994         // the inputs, place the other input in it. We use (Index XOR 1) to
8995         // compute an adjacent index.
8996         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
8997             SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
8998           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
8999           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9000           InputsFixed[1] = InputsFixed[0] ^ 1;
9001         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
9002                    SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
9003           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
9004           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
9005           InputsFixed[0] = InputsFixed[1] ^ 1;
9006         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
9007                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
9008           // The two inputs are in the same DWord but it is clobbered and the
9009           // adjacent DWord isn't used at all. Move both inputs to the free
9010           // slot.
9011           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
9012           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
9013           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
9014           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
9015         } else {
9016           // The only way we hit this point is if there is no clobbering
9017           // (because there are no off-half inputs to this half) and there is no
9018           // free slot adjacent to one of the inputs. In this case, we have to
9019           // swap an input with a non-input.
9020           for (int i = 0; i < 4; ++i)
9021             assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
9022                    "We can't handle any clobbers here!");
9023           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
9024                  "Cannot have adjacent inputs here!");
9025
9026           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9027           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
9028
9029           // We also have to update the final source mask in this case because
9030           // it may need to undo the above swap.
9031           for (int &M : FinalSourceHalfMask)
9032             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
9033               M = InputsFixed[1] + SourceOffset;
9034             else if (M == InputsFixed[1] + SourceOffset)
9035               M = (InputsFixed[0] ^ 1) + SourceOffset;
9036
9037           InputsFixed[1] = InputsFixed[0] ^ 1;
9038         }
9039
9040         // Point everything at the fixed inputs.
9041         for (int &M : HalfMask)
9042           if (M == IncomingInputs[0])
9043             M = InputsFixed[0] + SourceOffset;
9044           else if (M == IncomingInputs[1])
9045             M = InputsFixed[1] + SourceOffset;
9046
9047         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
9048         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
9049       }
9050     } else {
9051       llvm_unreachable("Unhandled input size!");
9052     }
9053
9054     // Now hoist the DWord down to the right half.
9055     int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
9056     assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
9057     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
9058     for (int &M : HalfMask)
9059       for (int Input : IncomingInputs)
9060         if (M == Input)
9061           M = FreeDWord * 2 + Input % 2;
9062   };
9063   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
9064                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
9065   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
9066                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
9067
9068   // Now enact all the shuffles we've computed to move the inputs into their
9069   // target half.
9070   if (!isNoopShuffleMask(PSHUFLMask))
9071     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9072                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
9073   if (!isNoopShuffleMask(PSHUFHMask))
9074     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9075                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
9076   if (!isNoopShuffleMask(PSHUFDMask))
9077     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9078                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9079                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9080                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9081
9082   // At this point, each half should contain all its inputs, and we can then
9083   // just shuffle them into their final position.
9084   assert(std::count_if(LoMask.begin(), LoMask.end(),
9085                        [](int M) { return M >= 4; }) == 0 &&
9086          "Failed to lift all the high half inputs to the low mask!");
9087   assert(std::count_if(HiMask.begin(), HiMask.end(),
9088                        [](int M) { return M >= 0 && M < 4; }) == 0 &&
9089          "Failed to lift all the low half inputs to the high mask!");
9090
9091   // Do a half shuffle for the low mask.
9092   if (!isNoopShuffleMask(LoMask))
9093     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9094                     getV4X86ShuffleImm8ForMask(LoMask, DAG));
9095
9096   // Do a half shuffle with the high mask after shifting its values down.
9097   for (int &M : HiMask)
9098     if (M >= 0)
9099       M -= 4;
9100   if (!isNoopShuffleMask(HiMask))
9101     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9102                     getV4X86ShuffleImm8ForMask(HiMask, DAG));
9103
9104   return V;
9105 }
9106
9107 /// \brief Detect whether the mask pattern should be lowered through
9108 /// interleaving.
9109 ///
9110 /// This essentially tests whether viewing the mask as an interleaving of two
9111 /// sub-sequences reduces the cross-input traffic of a blend operation. If so,
9112 /// lowering it through interleaving is a significantly better strategy.
9113 static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
9114   int NumEvenInputs[2] = {0, 0};
9115   int NumOddInputs[2] = {0, 0};
9116   int NumLoInputs[2] = {0, 0};
9117   int NumHiInputs[2] = {0, 0};
9118   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9119     if (Mask[i] < 0)
9120       continue;
9121
9122     int InputIdx = Mask[i] >= Size;
9123
9124     if (i < Size / 2)
9125       ++NumLoInputs[InputIdx];
9126     else
9127       ++NumHiInputs[InputIdx];
9128
9129     if ((i % 2) == 0)
9130       ++NumEvenInputs[InputIdx];
9131     else
9132       ++NumOddInputs[InputIdx];
9133   }
9134
9135   // The minimum number of cross-input results for both the interleaved and
9136   // split cases. If interleaving results in fewer cross-input results, return
9137   // true.
9138   int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
9139                                     NumEvenInputs[0] + NumOddInputs[1]);
9140   int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
9141                               NumLoInputs[0] + NumHiInputs[1]);
9142   return InterleavedCrosses < SplitCrosses;
9143 }
9144
9145 /// \brief Blend two v8i16 vectors using a naive unpack strategy.
9146 ///
9147 /// This strategy only works when the inputs from each vector fit into a single
9148 /// half of that vector, and generally there are not so many inputs as to leave
9149 /// the in-place shuffles required highly constrained (and thus expensive). It
9150 /// shifts all the inputs into a single side of both input vectors and then
9151 /// uses an unpack to interleave these inputs in a single vector. At that
9152 /// point, we will fall back on the generic single input shuffle lowering.
     ///
     /// \param DL    Debug location attached to every node created here.
     /// \param V1    First v8i16 input (mask values 0-7).
     /// \param V2    Second v8i16 input (mask values 8-15).
     /// \param Mask  Two-input shuffle mask; entries 0-7 are read. It is rewritten
     ///              in place: first to track where each element lands after the
     ///              per-input moves, then into a single-input mask over the
     ///              unpacked vector.
     /// \param Subtarget  Not referenced in this body.
     /// \param DAG   Selection DAG used to build the replacement nodes.
     ///
     /// \returns a single-input v8i16 shuffle of UNPCKL/UNPCKH(V1', V2'), where
     /// V1' and V2' are the inputs after their used elements have been gathered
     /// into the chosen half.
9153 static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
9154                                                  SDValue V2,
9155                                                  MutableArrayRef<int> Mask,
9156                                                  const X86Subtarget *Subtarget,
9157                                                  SelectionDAG &DAG) {
9158   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9159   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
       // Bucket the *positions* of Mask (not the mask values) by which group of
       // four source elements they read: V1 low [0,4), V1 high [4,8), V2 low
       // [8,12) and V2 high [12,...). Negative (undef) entries land in no bucket.
9160   SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
9161   for (int i = 0; i < 8; ++i)
9162     if (Mask[i] >= 0 && Mask[i] < 4)
9163       LoV1Inputs.push_back(i);
9164     else if (Mask[i] >= 4 && Mask[i] < 8)
9165       HiV1Inputs.push_back(i);
9166     else if (Mask[i] >= 8 && Mask[i] < 12)
9167       LoV2Inputs.push_back(i);
9168     else if (Mask[i] >= 12)
9169       HiV2Inputs.push_back(i);
9170
       // These counts are only read by the asserts below; the (void) casts
       // presumably silence unused-variable warnings when asserts compile away.
9171   int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
9172   int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
9173   (void)NumV1Inputs;
9174   (void)NumV2Inputs;
9175   assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
9176   assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
9177   assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
9178
       // Gather into the low halves when at least as many lanes read low-half
       // elements as read high-half elements (ties go to the low half).
9179   bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
9180                      HiV1Inputs.size() + HiV2Inputs.size();
9181
       // Moves every used element of V into one half (low if MoveToLo, else
       // high). LoInputs/HiInputs are the Mask positions reading V's low/high
       // half. MaskOffset is subtracted to turn a Mask value into an index within
       // V (0 for V1, 8 for V2) and added back when Mask is rewritten. Mask is
       // captured by reference and updated so each entry names the element's new
       // position. Returns V itself when nothing is in the wrong half, otherwise
       // a single-input shuffle of V built from MoveMask.
9182   auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
9183                               ArrayRef<int> HiInputs, bool MoveToLo,
9184                               int MaskOffset) {
         // "Good" positions already read from the target half; "bad" ones read
         // from the other half and must be relocated.
9185     ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
9186     ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
9187     if (BadInputs.empty())
9188       return V;
9189
         // MoveMask is the single-input shuffle applied to V; MoveOffset is the
         // index of the first element of the target half.
9190     int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9191     int MoveOffset = MoveToLo ? 0 : 4;
9192
9193     if (GoodInputs.empty()) {
           // Nothing lives in the target half yet: move each bad element to the
           // same offset (index % 4) within the target half.
9194       for (int BadInput : BadInputs) {
9195         MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
9196         Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
9197       }
9198     } else {
9199       if (GoodInputs.size() == 2) {
9200         // If the low inputs are spread across two dwords, pack them into
9201         // a single dword.
             // The two good elements are placed in the first two slots of the
             // target half and Mask is updated to point at those slots.
9202         MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
9203         MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
9204         Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
9205         Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
9206       } else {
9207         // Otherwise pin the good inputs.
             // (Each good element is mapped to itself, so Mask needs no update.)
9208         for (int GoodInput : GoodInputs)
9209           MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
9210       }
9211
9212       if (BadInputs.size() == 2) {
9213         // If we have two bad inputs then there may be either one or two good
9214         // inputs fixed in place. Find a fixed input, and then find the *other*
9215         // two adjacent indices by using modular arithmetic.
             // GoodMaskIdx is the first occupied MoveMask slot at or after
             // MoveOffset. "& ~1" rounds its offset within the half down to the
             // start of its two-element pair; "+ 2) % 4" then selects the other
             // pair of the same half.
9216         int GoodMaskIdx =
9217             std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
9218                          [](int M) { return M >= 0; }) -
9219             std::begin(MoveMask);
9220         int MoveMaskIdx =
9221             ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
9222         assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
9223         assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
9224         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9225         MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
9226         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9227         Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
9228       } else {
9229         assert(BadInputs.size() == 1 && "All sizes handled");
             // A single bad element takes the first still-empty MoveMask slot
             // found when scanning from MoveOffset to the end of MoveMask.
9230         int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
9231                                     std::end(MoveMask), -1) -
9232                           std::begin(MoveMask);
9233         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9234         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9235       }
9236     }
9237
9238     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9239                                 MoveMask);
9240   };
       // Both inputs are gathered into the same half so that a single unpack can
       // merge them.
9241   V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
9242                         /*MaskOffset*/ 0);
9243   V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
9244                         /*MaskOffset*/ 8);
9245
9246   // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
9247   // cross-half traffic in the final shuffle.
9248
9249   // Munge the mask to be a single-input mask after the unpack merges the
9250   // results.
       // M % 4 is the element's offset within the gathered half and M / 8 is 0
       // for V1 and 1 for V2, so V1 elements map to even lanes (2 * offset) and
       // V2 elements to odd lanes (2 * offset + 1) of the unpacked vector.
9251   for (int &M : Mask)
9252     if (M != -1)
9253       M = 2 * (M % 4) + (M / 8);
9254
9255   return DAG.getVectorShuffle(
9256       MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
9257                                   DL, MVT::v8i16, V1, V2),
9258       DAG.getUNDEF(MVT::v8i16), Mask);
9259 }
9260
9261 /// \brief Generic lowering of 8-lane i16 shuffles.
9262 ///
9263 /// This handles both single-input shuffles and combined shuffle/blends with
9264 /// two inputs. The single input shuffles are immediately delegated to
9265 /// a dedicated lowering routine.
9266 ///
9267 /// The blends are lowered in one of three fundamental ways. If there are few
9268 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
9269 /// of the input is significantly cheaper when lowered as an interleaving of
9270 /// the two inputs, try to interleave them. Otherwise, blend the low and high
9271 /// halves of the inputs separately (making them have relatively few inputs)
9272 /// and then concatenate them.
9273 static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9274                                        const X86Subtarget *Subtarget,
9275                                        SelectionDAG &DAG) {
9276   SDLoc DL(Op);
9277   assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
9278   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9279   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9280   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9281   ArrayRef<int> OrigMask = SVOp->getMask();
9282   int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
9283                         OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
9284   MutableArrayRef<int> Mask(MaskStorage);
9285
9286   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9287
9288   // Whenever we can lower this as a zext, that instruction is strictly faster
9289   // than any alternative.
9290   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9291           DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
9292     return ZExt;
9293
9294   auto isV1 = [](int M) { return M >= 0 && M < 8; };
9295   auto isV2 = [](int M) { return M >= 8; };
9296
9297   int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
9298   int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
9299
9300   if (NumV2Inputs == 0)
9301     return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
9302
9303   assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
9304                             "to be V1-input shuffles.");
9305
9306   // Try to use byte shift instructions.
9307   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9308           DL, MVT::v8i16, V1, V2, Mask, DAG))
9309     return Shift;
9310
9311   // There are special ways we can lower some single-element blends.
9312   if (NumV2Inputs == 1)
9313     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
9314                                                          Mask, Subtarget, DAG))
9315       return V;
9316
9317   // Use dedicated unpack instructions for masks that match their pattern.
9318   if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
9319     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
9320   if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
9321     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
9322
9323   if (Subtarget->hasSSE41())
9324     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
9325                                                   Subtarget, DAG))
9326       return Blend;
9327
9328   // Try to use byte rotation instructions.
9329   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9330           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9331     return Rotate;
9332
9333   if (NumV1Inputs + NumV2Inputs <= 4)
9334     return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
9335
9336   // Check whether an interleaving lowering is likely to be more efficient.
9337   // This isn't perfect but it is a strong heuristic that tends to work well on
9338   // the kinds of shuffles that show up in practice.
9339   //
9340   // FIXME: Handle 1x, 2x, and 4x interleaving.
9341   if (shouldLowerAsInterleaving(Mask)) {
9342     // FIXME: Figure out whether we should pack these into the low or high
9343     // halves.
9344
9345     int EMask[8], OMask[8];
9346     for (int i = 0; i < 4; ++i) {
9347       EMask[i] = Mask[2*i];
9348       OMask[i] = Mask[2*i + 1];
9349       EMask[i + 4] = -1;
9350       OMask[i + 4] = -1;
9351     }
9352
9353     SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
9354     SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
9355
9356     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
9357   }
9358
9359   int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9360   int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9361
9362   for (int i = 0; i < 4; ++i) {
9363     LoBlendMask[i] = Mask[i];
9364     HiBlendMask[i] = Mask[i + 4];
9365   }
9366
9367   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9368   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9369   LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
9370   HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
9371
9372   return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9373                      DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
9374 }
9375
9376 /// \brief Check whether a compaction lowering can be done by dropping even
9377 /// elements and compute how many times even elements must be dropped.
9378 ///
9379 /// This handles shuffles which take every Nth element where N is a power of
9380 /// two. Example shuffle masks:
9381 ///
9382 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
9383 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
9384 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
9385 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
9386 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
9387 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
9388 ///
9389 /// Any of these lanes can of course be undef.
9390 ///
9391 /// This routine only supports N <= 3.
9392 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
9393 /// for larger N.
9394 ///
9395 /// \returns N above, or the number of times even elements must be dropped if
9396 /// there is such a number. Otherwise returns zero.
9397 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
9398   // Figure out whether we're looping over two inputs or just one.
9399   bool IsSingleInput = isSingleInputShuffleMask(Mask);
9400
9401   // The modulus for the shuffle vector entries is based on whether this is
9402   // a single input or not.
9403   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
9404   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
9405          "We should only be called with masks with a power-of-2 size!");
9406
9407   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
9408
9409   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
9410   // and 2^3 simultaneously. This is because we may have ambiguity with
9411   // partially undef inputs.
9412   bool ViableForN[3] = {true, true, true};
9413
9414   for (int i = 0, e = Mask.size(); i < e; ++i) {
9415     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
9416     // want.
9417     if (Mask[i] == -1)
9418       continue;
9419
9420     bool IsAnyViable = false;
9421     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9422       if (ViableForN[j]) {
9423         uint64_t N = j + 1;
9424
9425         // The shuffle mask must be equal to (i * 2^N) % M.
9426         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
9427           IsAnyViable = true;
9428         else
9429           ViableForN[j] = false;
9430       }
9431     // Early exit if we exhaust the possible powers of two.
9432     if (!IsAnyViable)
9433       break;
9434   }
9435
9436   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9437     if (ViableForN[j])
9438       return j + 1;
9439
9440   // Return 0 as there is no viable power of two.
9441   return 0;
9442 }
9443
9444 /// \brief Generic lowering of v16i8 shuffles.
9445 ///
9446 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
9447 /// detect any complexity reducing interleaving. If that doesn't help, it uses
9448 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
9449 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
9450 /// back together.
     ///
     /// The strategies are attempted in this order, returning the first result:
     /// byte shift, byte rotate, zero/any-extend, then (single-input masks only)
     /// broadcast and widening via byte duplication, then interleaving, the SSSE3
     /// PSHUFB path, single-element insertion, compaction by dropping even
     /// elements, and finally the generic unpack / v8i16 blend / PACKUS sequence.
     ///
     /// \param Op         The shuffle being lowered; must be a v16i8
     ///                   ShuffleVectorSDNode with a 16-entry mask.
     /// \param V1         First v16i8 operand (mask values 0-15).
     /// \param V2         Second v16i8 operand (mask values 16-31).
     /// \param Subtarget  Queried for SSSE3 and forwarded to helper lowerings.
     /// \param DAG        Selection DAG used to build the replacement nodes.
9451 static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9452                                        const X86Subtarget *Subtarget,
9453                                        SelectionDAG &DAG) {
9454   SDLoc DL(Op);
9455   assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
9456   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9457   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9458   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9459   ArrayRef<int> OrigMask = SVOp->getMask();
9460   assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
9461
9462   // Try to use byte shift instructions.
9463   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9464           DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9465     return Shift;
9466
9467   // Try to use byte rotation instructions.
9468   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9469           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9470     return Rotate;
9471
9472   // Try to use a zext lowering.
9473   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9474           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9475     return ZExt;
9476
       // Take a mutable copy of the mask. LoMask and HiMask are 8-entry slices
       // of that same storage (entries 0-7 and 8-15); they are rewritten by
       // buildBlendMasks near the end of this function.
9477   int MaskStorage[16] = {
9478       OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],
9479       OrigMask[4],  OrigMask[5],  OrigMask[6],  OrigMask[7],
9480       OrigMask[8],  OrigMask[9],  OrigMask[10], OrigMask[11],
9481       OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
9482   MutableArrayRef<int> Mask(MaskStorage);
9483   MutableArrayRef<int> LoMask = Mask.slice(0, 8);
9484   MutableArrayRef<int> HiMask = Mask.slice(8, 8);
9485
       // Number of lanes that read from V2 (mask values 16 and above).
9486   int NumV2Elements =
9487       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
9488
9489   // For single-input shuffles, there are some nicer lowering tricks we can use.
9490   if (NumV2Elements == 0) {
9491     // Check for being able to broadcast a single element.
9492     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
9493                                                           Mask, Subtarget, DAG))
9494       return Broadcast;
9495
9496     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
9497     // Notably, this handles splat and partial-splat shuffles more efficiently.
9498     // However, it only makes sense if the pre-duplication shuffle simplifies
9499     // things significantly. Currently, this means we need to be able to
9500     // express the pre-duplication shuffle as an i16 shuffle.
9501     //
9502     // FIXME: We should check for other patterns which can be widened into an
9503     // i16 shuffle as well.
         // True when, in every adjacent byte pair (2k, 2k+1), the two entries
         // are equal or at least one of them is undef.
9504     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
9505       for (int i = 0; i < 16; i += 2)
9506         if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
9507           return false;
9508
9509       return true;
9510     };
         // Attempts the duplication lowering; returns an empty SDValue when the
         // mask does not qualify or the moved inputs do not fit into one half.
         // On success the captured V1 has been overwritten with intermediates.
9511     auto tryToWidenViaDuplication = [&]() -> SDValue {
9512       if (!canWidenViaDuplication(Mask))
9513         return SDValue();
           // Sorted, de-duplicated source byte indices from the low half [0, 8)
           // and from the high half (8 and above) of V1.
9514       SmallVector<int, 4> LoInputs;
9515       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
9516                    [](int M) { return M >= 0 && M < 8; });
9517       std::sort(LoInputs.begin(), LoInputs.end());
9518       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
9519                      LoInputs.end());
9520       SmallVector<int, 4> HiInputs;
9521       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
9522                    [](int M) { return M >= 8; });
9523       std::sort(HiInputs.begin(), HiInputs.end());
9524       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
9525                      HiInputs.end());
9526
           // Target the half that already holds at least as many distinct
           // inputs (ties go low); the other half's inputs must be moved.
9527       bool TargetLo = LoInputs.size() >= HiInputs.size();
9528       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
9529       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
9530
           // PreDupI16Shuffle is a v8i16 shuffle (indexed by 16-bit word, i.e.
           // byte index / 2) that gathers all used bytes into the target half.
           // LaneMap records, for each source byte index, its byte index after
           // that shuffle.
9531       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9532       SmallDenseMap<int, int, 8> LaneMap;
9533       for (int I : InPlaceInputs) {
9534         PreDupI16Shuffle[I/2] = I/2;
9535         LaneMap[I] = I;
9536       }
           // j walks the word slots [j, je) of the target half looking for free
           // slots to receive moved inputs.
9537       int j = TargetLo ? 0 : 4, je = j + 4;
9538       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
9539         // Check if j is already a shuffle of this input. This happens when
9540         // there are two adjacent bytes after we move the low one.
9541         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
9542           // If we haven't yet mapped the input, search for a slot into which
9543           // we can map it.
9544           while (j < je && PreDupI16Shuffle[j] != -1)
9545             ++j;
9546
9547           if (j == je)
9548             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
9549             return SDValue();
9550
9551           // Map this input with the i16 shuffle.
9552           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
9553         }
9554
9555         // Update the lane map based on the mapping we ended up with.
             // The byte keeps its position (even/odd) within the moved word.
9556         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
9557       }
9558       V1 = DAG.getNode(
9559           ISD::BITCAST, DL, MVT::v16i8,
9560           DAG.getVectorShuffle(MVT::v8i16, DL,
9561                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9562                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
9563
9564       // Unpack the bytes to form the i16s that will be shuffled into place.
9565       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9566                        MVT::v16i8, V1, V1);
9567
           // Build the final v8i16 shuffle over the duplicated bytes: result
           // word i / 2 takes the word at the mapped byte position, rebased by 8
           // when the high half was the target.
9568       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9569       for (int i = 0; i < 16; ++i)
9570         if (Mask[i] != -1) {
9571           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
9572           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
9573           if (PostDupI16Shuffle[i / 2] == -1)
9574             PostDupI16Shuffle[i / 2] = MappedMask;
9575           else
9576             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
9577                    "Conflicting entrties in the original shuffle!");
9578         }
9579       return DAG.getNode(
9580           ISD::BITCAST, DL, MVT::v16i8,
9581           DAG.getVectorShuffle(MVT::v8i16, DL,
9582                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9583                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
9584     };
9585     if (SDValue V = tryToWidenViaDuplication())
9586       return V;
9587   }
9588
9589   // Check whether an interleaving lowering is likely to be more efficient.
9590   // This isn't perfect but it is a strong heuristic that tends to work well on
9591   // the kinds of shuffles that show up in practice.
9592   //
9593   // FIXME: We need to handle other interleaving widths (i16, i32, ...).
9594   if (shouldLowerAsInterleaving(Mask)) {
         // Count lanes reading from the low eight bytes of either input versus
         // the high eight bytes of either input.
9595     int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9596       return (M >= 0 && M < 8) || (M >= 16 && M < 24);
9597     });
9598     int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9599       return (M >= 8 && M < 16) || M >= 24;
9600     });
9601     int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9602                      -1, -1, -1, -1, -1, -1, -1, -1};
9603     int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9604                      -1, -1, -1, -1, -1, -1, -1, -1};
         // The sources of the even result lanes go into one half of EMask and
         // those of the odd result lanes into the same half of OMask: the low
         // half when unpacking low, the high half otherwise.
9605     bool UnpackLo = NumLoHalf >= NumHiHalf;
9606     MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
9607     MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
9608     for (int i = 0; i < 8; ++i) {
9609       TargetEMask[i] = Mask[2 * i];
9610       TargetOMask[i] = Mask[2 * i + 1];
9611     }
9612
9613     SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
9614     SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
9615
9616     return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9617                        MVT::v16i8, Evens, Odds);
9618   }
9619
9620   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
9621   // with PSHUFB. It is important to do this before we attempt to generate any
9622   // blends but after all of the single-input lowerings. If the single input
9623   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
9624   // want to preserve that and we can DAG combine any longer sequences into
9625   // a PSHUFB in the end. But once we start blending from multiple inputs,
9626   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
9627   // and there are *very* few patterns that would actually be faster than the
9628   // PSHUFB approach because of its ability to zero lanes.
9629   //
9630   // FIXME: The only exceptions to the above are blends which are exact
9631   // interleavings with direct instructions supporting them. We currently don't
9632   // handle those well here.
9633   if (Subtarget->hasSSSE3()) {
         // Per-lane PSHUFB control bytes for each input, plus flags recording
         // whether any lane actually selects a byte from that input.
9634     SDValue V1Mask[16];
9635     SDValue V2Mask[16];
9636     bool V1InUse = false;
9637     bool V2InUse = false;
9638     SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
9639
9640     for (int i = 0; i < 16; ++i) {
9641       if (Mask[i] == -1) {
9642         V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
9643       } else {
             // A lane sourced from V1 gets its byte index in V1's control and
             // ZeroMask in V2's control; a lane sourced from V2 gets the
             // reverse, with its index rebased by 16. Lanes flagged Zeroable
             // get ZeroMask in both controls.
9644         const int ZeroMask = 0x80;
9645         int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask);
9646         int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16);
9647         if (Zeroable[i])
9648           V1Idx = V2Idx = ZeroMask;
9649         V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
9650         V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
9651         V1InUse |= (ZeroMask != V1Idx);
9652         V2InUse |= (ZeroMask != V2Idx);
9653       }
9654     }
9655
         // Only emit a PSHUFB for an input that contributes at least one byte.
9656     if (V1InUse)
9657       V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
9658                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
9659     if (V2InUse)
9660       V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
9661                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
9662
9663     // If we need shuffled inputs from both, blend the two.
         // NOTE(review): the OR presumably relies on each PSHUFB having zeroed
         // the lanes that belong to the other input - confirm against PSHUFB
         // semantics.
9664     if (V1InUse && V2InUse)
9665       return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
9666     if (V1InUse)
9667       return V1; // Single inputs are easy.
9668     if (V2InUse)
9669       return V2; // Single inputs are easy.
9670     // Shuffling to a zeroable vector.
9671     return getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
9672   }
9673
9674   // There are special ways we can lower some single-element blends.
9675   if (NumV2Elements == 1)
9676     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
9677                                                          Mask, Subtarget, DAG))
9678       return V;
9679
9680   // Check whether a compaction lowering can be done. This handles shuffles
9681   // which take every 2^N-th element for N in [1, 3]. See the helper function
9682   // for details.
9683   //
9684   // We special case these as they can be particularly efficiently handled with
9685   // the PACKUSB instruction on x86 and they show up in common patterns of
9686   // rearranging bytes to truncate wide elements.
9687   if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
9688     // NumEvenDrops is the power of two stride of the elements. Another way of
9689     // thinking about it is that we need to drop the even elements this many
9690     // times to get the original input.
9691     bool IsSingleInput = isSingleInputShuffleMask(Mask);
9692
9693     // First we need to zero all the dropped bytes.
9694     assert(NumEvenDrops <= 3 &&
9695            "No support for dropping even elements more than 3 times.");
9696     // We use the mask type to pick which bytes are preserved based on how many
9697     // elements are dropped.
         // A 0xFF constant in each 16-, 32- or 64-bit element, viewed as bytes,
         // keeps one byte per element and clears the rest.
9698     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
9699     SDValue ByteClearMask =
9700         DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
9701                     DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
9702     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
9703     if (!IsSingleInput)
9704       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
9705
9706     // Now pack things back together.
         // The first PACKUS combines V1 with V2 (or V1 with itself for a
         // single-input mask); each further drop packs the result with itself.
9707     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
9708     V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
9709     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
9710     for (int i = 1; i < NumEvenDrops; ++i) {
9711       Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
9712       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
9713     }
9714
9715     return Result;
9716   }
9717
       // Generic fallback. For each half of the result, record which byte of V1
       // and which byte of V2 each lane needs; lanes not using an input stay -1.
9718   int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9719   int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9720   int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9721   int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9722
       // Splits one 8-lane half of the mask by input. A lane sourced from V1
       // stores its byte index in V1HalfBlendMask and is rewritten to i; a lane
       // sourced from V2 stores its index minus 16 in V2HalfBlendMask and is
       // rewritten to i + 8. The rewritten HalfMask is thus a two-input v8i16
       // blend mask that picks lane i from the first or second operand.
9723   auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
9724                             MutableArrayRef<int> V1HalfBlendMask,
9725                             MutableArrayRef<int> V2HalfBlendMask) {
9726     for (int i = 0; i < 8; ++i)
9727       if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
9728         V1HalfBlendMask[i] = HalfMask[i];
9729         HalfMask[i] = i;
9730       } else if (HalfMask[i] >= 16) {
9731         V2HalfBlendMask[i] = HalfMask[i] - 16;
9732         HalfMask[i] = i + 8;
9733       }
9734   };
9735   buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
9736   buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
9737
9738   SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
9739
       // Widens the bytes of V to i16 elements and produces the two v8i16
       // shuffles described by LoBlendMask and HiBlendMask, returned as a
       // (lo, hi) pair. Both masks may be modified in place (see below).
9740   auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
9741                              MutableArrayRef<int> HiBlendMask) {
         // These locals shadow the enclosing function's V1 and V2 operands.
9742     SDValue V1, V2;
9743     // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
9744     // them out and avoid using UNPCK{L,H} to extract the elements of V as
9745     // i16s.
9746     if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
9747                      [](int M) { return M >= 0 && M % 2 == 1; }) &&
9748         std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
9749                      [](int M) { return M >= 0 && M % 2 == 1; })) {
9750       // Use a mask to drop the high bytes.
9751       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
9752       V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
9753                        DAG.getConstant(0x00FF, MVT::v8i16));
9754
9755       // This will be a single vector shuffle instead of a blend so nuke V2.
9756       V2 = DAG.getUNDEF(MVT::v8i16);
9757
9758       // Squash the masks to point directly into V1.
           // (Even byte index 2k becomes i16 element index k.)
9759       for (int &M : LoBlendMask)
9760         if (M >= 0)
9761           M /= 2;
9762       for (int &M : HiBlendMask)
9763         if (M >= 0)
9764           M /= 2;
9765     } else {
9766       // Otherwise just unpack the low half of V into V1 and the high half into
9767       // V2 so that we can blend them as i16s.
9768       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9769                        DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
9770       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9771                        DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
9772     }
9773
9774     SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9775     SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9776     return std::make_pair(BlendedLo, BlendedHi);
9777   };
9778   SDValue V1Lo, V1Hi, V2Lo, V2Hi;
9779   std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
9780   std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
9781
       // Blend the per-input results lane by lane using the rewritten half
       // masks, then pack the two v8i16 halves back into a single v16i8.
9782   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
9783   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
9784
9785   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
9786 }
9787
9788 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
9789 ///
9790 /// This routine breaks down the specific type of 128-bit shuffle and
9791 /// dispatches to the lowering routines accordingly.
9792 static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9793                                         MVT VT, const X86Subtarget *Subtarget,
9794                                         SelectionDAG &DAG) {
9795   switch (VT.SimpleTy) {
9796   case MVT::v2i64:
9797     return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9798   case MVT::v2f64:
9799     return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9800   case MVT::v4i32:
9801     return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
9802   case MVT::v4f32:
9803     return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
9804   case MVT::v8i16:
9805     return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
9806   case MVT::v16i8:
9807     return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
9808
9809   default:
9810     llvm_unreachable("Unimplemented!");
9811   }
9812 }
9813
9814 /// \brief Helper function to test whether a shuffle mask could be
9815 /// simplified by widening the elements being shuffled.
9816 ///
9817 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
9818 /// leaves it in an unspecified state.
9819 ///
9820 /// NOTE: This must handle normal vector shuffle masks and *target* vector
9821 /// shuffle masks. The latter have the special property of a '-2' representing
9822 /// a zero-ed lane of a vector.
9823 static bool canWidenShuffleElements(ArrayRef<int> Mask,
9824                                     SmallVectorImpl<int> &WidenedMask) {
9825   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
9826     // If both elements are undef, its trivial.
9827     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
9828       WidenedMask.push_back(SM_SentinelUndef);
9829       continue;
9830     }
9831
9832     // Check for an undef mask and a mask value properly aligned to fit with
9833     // a pair of values. If we find such a case, use the non-undef mask's value.
9834     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
9835       WidenedMask.push_back(Mask[i + 1] / 2);
9836       continue;
9837     }
9838     if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
9839       WidenedMask.push_back(Mask[i] / 2);
9840       continue;
9841     }
9842
9843     // When zeroing, we need to spread the zeroing across both lanes to widen.
9844     if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
9845       if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
9846           (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
9847         WidenedMask.push_back(SM_SentinelZero);
9848         continue;
9849       }
9850       return false;
9851     }
9852
9853     // Finally check if the two mask values are adjacent and aligned with
9854     // a pair.
9855     if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
9856       WidenedMask.push_back(Mask[i] / 2);
9857       continue;
9858     }
9859
9860     // Otherwise we can't safely widen the elements used in this shuffle.
9861     return false;
9862   }
9863   assert(WidenedMask.size() == Mask.size() / 2 &&
9864          "Incorrect size of mask after widening the elements!");
9865
9866   return true;
9867 }
9868
9869 /// \brief Generic routine to split ector shuffle into half-sized shuffles.
9870 ///
9871 /// This routine just extracts two subvectors, shuffles them independently, and
9872 /// then concatenates them back together. This should work effectively with all
9873 /// AVX vector shuffle types.
9874 static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
9875                                           SDValue V2, ArrayRef<int> Mask,
9876                                           SelectionDAG &DAG) {
9877   assert(VT.getSizeInBits() >= 256 &&
9878          "Only for 256-bit or wider vector shuffles!");
9879   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
9880   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
9881
9882   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
9883   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
9884
9885   int NumElements = VT.getVectorNumElements();
9886   int SplitNumElements = NumElements / 2;
9887   MVT ScalarVT = VT.getScalarType();
9888   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
9889
9890   SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
9891                              DAG.getIntPtrConstant(0));
9892   SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
9893                              DAG.getIntPtrConstant(SplitNumElements));
9894   SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
9895                              DAG.getIntPtrConstant(0));
9896   SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
9897                              DAG.getIntPtrConstant(SplitNumElements));
9898
9899   // Now create two 4-way blends of these half-width vectors.
9900   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
9901     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
9902     SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
9903     for (int i = 0; i < SplitNumElements; ++i) {
9904       int M = HalfMask[i];
9905       if (M >= NumElements) {
9906         if (M >= NumElements + SplitNumElements)
9907           UseHiV2 = true;
9908         else
9909           UseLoV2 = true;
9910         V2BlendMask.push_back(M - NumElements);
9911         V1BlendMask.push_back(-1);
9912         BlendMask.push_back(SplitNumElements + i);
9913       } else if (M >= 0) {
9914         if (M >= SplitNumElements)
9915           UseHiV1 = true;
9916         else
9917           UseLoV1 = true;
9918         V2BlendMask.push_back(-1);
9919         V1BlendMask.push_back(M);
9920         BlendMask.push_back(i);
9921       } else {
9922         V2BlendMask.push_back(-1);
9923         V1BlendMask.push_back(-1);
9924         BlendMask.push_back(-1);
9925       }
9926     }
9927
9928     // Because the lowering happens after all combining takes place, we need to
9929     // manually combine these blend masks as much as possible so that we create
9930     // a minimal number of high-level vector shuffle nodes.
9931
9932     // First try just blending the halves of V1 or V2.
9933     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
9934       return DAG.getUNDEF(SplitVT);
9935     if (!UseLoV2 && !UseHiV2)
9936       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
9937     if (!UseLoV1 && !UseHiV1)
9938       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
9939
9940     SDValue V1Blend, V2Blend;
9941     if (UseLoV1 && UseHiV1) {
9942       V1Blend =
9943         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
9944     } else {
9945       // We only use half of V1 so map the usage down into the final blend mask.
9946       V1Blend = UseLoV1 ? LoV1 : HiV1;
9947       for (int i = 0; i < SplitNumElements; ++i)
9948         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
9949           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
9950     }
9951     if (UseLoV2 && UseHiV2) {
9952       V2Blend =
9953         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
9954     } else {
9955       // We only use half of V2 so map the usage down into the final blend mask.
9956       V2Blend = UseLoV2 ? LoV2 : HiV2;
9957       for (int i = 0; i < SplitNumElements; ++i)
9958         if (BlendMask[i] >= SplitNumElements)
9959           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
9960     }
9961     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
9962   };
9963   SDValue Lo = HalfBlend(LoMask);
9964   SDValue Hi = HalfBlend(HiMask);
9965   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
9966 }
9967
9968 /// \brief Either split a vector in halves or decompose the shuffles and the
9969 /// blend.
9970 ///
9971 /// This is provided as a good fallback for many lowerings of non-single-input
9972 /// shuffles with more than one 128-bit lane. In those cases, we want to select
9973 /// between splitting the shuffle into 128-bit components and stitching those
9974 /// back together vs. extracting the single-input shuffles and blending those
9975 /// results.
9976 static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
9977                                                 SDValue V2, ArrayRef<int> Mask,
9978                                                 SelectionDAG &DAG) {
9979   assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
9980                                             "lower single-input shuffles as it "
9981                                             "could then recurse on itself.");
9982   int Size = Mask.size();
9983
9984   // If this can be modeled as a broadcast of two elements followed by a blend,
9985   // prefer that lowering. This is especially important because broadcasts can
9986   // often fold with memory operands.
9987   auto DoBothBroadcast = [&] {
9988     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
9989     for (int M : Mask)
9990       if (M >= Size) {
9991         if (V2BroadcastIdx == -1)
9992           V2BroadcastIdx = M - Size;
9993         else if (M - Size != V2BroadcastIdx)
9994           return false;
9995       } else if (M >= 0) {
9996         if (V1BroadcastIdx == -1)
9997           V1BroadcastIdx = M;
9998         else if (M != V1BroadcastIdx)
9999           return false;
10000       }
10001     return true;
10002   };
10003   if (DoBothBroadcast())
10004     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
10005                                                       DAG);
10006
10007   // If the inputs all stem from a single 128-bit lane of each input, then we
10008   // split them rather than blending because the split will decompose to
10009   // unusually few instructions.
10010   int LaneCount = VT.getSizeInBits() / 128;
10011   int LaneSize = Size / LaneCount;
10012   SmallBitVector LaneInputs[2];
10013   LaneInputs[0].resize(LaneCount, false);
10014   LaneInputs[1].resize(LaneCount, false);
10015   for (int i = 0; i < Size; ++i)
10016     if (Mask[i] >= 0)
10017       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
10018   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
10019     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10020
10021   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
10022   // that the decomposed single-input shuffles don't end up here.
10023   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10024 }
10025
10026 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
10027 /// a permutation and blend of those lanes.
10028 ///
10029 /// This essentially blends the out-of-lane inputs to each lane into the lane
10030 /// from a permuted copy of the vector. This lowering strategy results in four
10031 /// instructions in the worst case for a single-input cross lane shuffle which
10032 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
10033 /// of. Special cases for each particular shuffle pattern should be handled
10034 /// prior to trying this lowering.
10035 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
10036                                                        SDValue V1, SDValue V2,
10037                                                        ArrayRef<int> Mask,
10038                                                        SelectionDAG &DAG) {
10039   // FIXME: This should probably be generalized for 512-bit vectors as well.
10040   assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
10041   int LaneSize = Mask.size() / 2;
10042
10043   // If there are only inputs from one 128-bit lane, splitting will in fact be
10044   // less expensive. The flags track wether the given lane contains an element
10045   // that crosses to another lane.
10046   bool LaneCrossing[2] = {false, false};
10047   for (int i = 0, Size = Mask.size(); i < Size; ++i)
10048     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10049       LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
10050   if (!LaneCrossing[0] || !LaneCrossing[1])
10051     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10052
10053   if (isSingleInputShuffleMask(Mask)) {
10054     SmallVector<int, 32> FlippedBlendMask;
10055     for (int i = 0, Size = Mask.size(); i < Size; ++i)
10056       FlippedBlendMask.push_back(
10057           Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
10058                                   ? Mask[i]
10059                                   : Mask[i] % LaneSize +
10060                                         (i / LaneSize) * LaneSize + Size));
10061
10062     // Flip the vector, and blend the results which should now be in-lane. The
10063     // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
10064     // 5 for the high source. The value 3 selects the high half of source 2 and
10065     // the value 2 selects the low half of source 2. We only use source 2 to
10066     // allow folding it into a memory operand.
10067     unsigned PERMMask = 3 | 2 << 4;
10068     SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
10069                                   V1, DAG.getConstant(PERMMask, MVT::i8));
10070     return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
10071   }
10072
10073   // This now reduces to two single-input shuffles of V1 and V2 which at worst
10074   // will be handled by the above logic and a blend of the results, much like
10075   // other patterns in AVX.
10076   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10077 }
10078
10079 /// \brief Handle lowering 2-lane 128-bit shuffles.
10080 static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10081                                         SDValue V2, ArrayRef<int> Mask,
10082                                         const X86Subtarget *Subtarget,
10083                                         SelectionDAG &DAG) {
10084   // Blends are faster and handle all the non-lane-crossing cases.
10085   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
10086                                                 Subtarget, DAG))
10087     return Blend;
10088
10089   MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
10090                                VT.getVectorNumElements() / 2);
10091   // Check for patterns which can be matched with a single insert of a 128-bit
10092   // subvector.
10093   if (isShuffleEquivalent(Mask, 0, 1, 0, 1) ||
10094       isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
10095     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10096                               DAG.getIntPtrConstant(0));
10097     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
10098                               Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0));
10099     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10100   }
10101   if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) {
10102     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10103                               DAG.getIntPtrConstant(0));
10104     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
10105                               DAG.getIntPtrConstant(2));
10106     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10107   }
10108
10109   // Otherwise form a 128-bit permutation.
10110   // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
10111   unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4;
10112   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
10113                      DAG.getConstant(PermMask, MVT::i8));
10114 }
10115
10116 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
10117 /// shuffling each lane.
10118 ///
10119 /// This will only succeed when the result of fixing the 128-bit lanes results
10120 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
10121 /// each 128-bit lanes. This handles many cases where we can quickly blend away
10122 /// the lane crosses early and then use simpler shuffles within each lane.
10123 ///
10124 /// FIXME: It might be worthwhile at some point to support this without
10125 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
10126 /// in x86 only floating point has interesting non-repeating shuffles, and even
10127 /// those are still *marginally* more expensive.
10128 static SDValue lowerVectorShuffleByMerging128BitLanes(
10129     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10130     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
10131   assert(!isSingleInputShuffleMask(Mask) &&
10132          "This is only useful with multiple inputs.");
10133
10134   int Size = Mask.size();
10135   int LaneSize = 128 / VT.getScalarSizeInBits();
10136   int NumLanes = Size / LaneSize;
10137   assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
10138
10139   // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
10140   // check whether the in-128-bit lane shuffles share a repeating pattern.
10141   SmallVector<int, 4> Lanes;
10142   Lanes.resize(NumLanes, -1);
10143   SmallVector<int, 4> InLaneMask;
10144   InLaneMask.resize(LaneSize, -1);
10145   for (int i = 0; i < Size; ++i) {
10146     if (Mask[i] < 0)
10147       continue;
10148
10149     int j = i / LaneSize;
10150
10151     if (Lanes[j] < 0) {
10152       // First entry we've seen for this lane.
10153       Lanes[j] = Mask[i] / LaneSize;
10154     } else if (Lanes[j] != Mask[i] / LaneSize) {
10155       // This doesn't match the lane selected previously!
10156       return SDValue();
10157     }
10158
10159     // Check that within each lane we have a consistent shuffle mask.
10160     int k = i % LaneSize;
10161     if (InLaneMask[k] < 0) {
10162       InLaneMask[k] = Mask[i] % LaneSize;
10163     } else if (InLaneMask[k] != Mask[i] % LaneSize) {
10164       // This doesn't fit a repeating in-lane mask.
10165       return SDValue();
10166     }
10167   }
10168
10169   // First shuffle the lanes into place.
10170   MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
10171                                 VT.getSizeInBits() / 64);
10172   SmallVector<int, 8> LaneMask;
10173   LaneMask.resize(NumLanes * 2, -1);
10174   for (int i = 0; i < NumLanes; ++i)
10175     if (Lanes[i] >= 0) {
10176       LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
10177       LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
10178     }
10179
10180   V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
10181   V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
10182   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
10183
10184   // Cast it back to the type we actually want.
10185   LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
10186
10187   // Now do a simple shuffle that isn't lane crossing.
10188   SmallVector<int, 8> NewMask;
10189   NewMask.resize(Size, -1);
10190   for (int i = 0; i < Size; ++i)
10191     if (Mask[i] >= 0)
10192       NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
10193   assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
10194          "Must not introduce lane crosses at this point!");
10195
10196   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
10197 }
10198
10199 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
10200 /// given mask.
10201 ///
10202 /// This returns true if the elements from a particular input are already in the
10203 /// slot required by the given mask and require no permutation.
10204 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
10205   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
10206   int Size = Mask.size();
10207   for (int i = 0; i < Size; ++i)
10208     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
10209       return false;
10210
10211   return true;
10212 }
10213
10214 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
10215 ///
10216 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
10217 /// isn't available.
10218 static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10219                                        const X86Subtarget *Subtarget,
10220                                        SelectionDAG &DAG) {
10221   SDLoc DL(Op);
10222   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10223   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10224   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10225   ArrayRef<int> Mask = SVOp->getMask();
10226   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10227
10228   SmallVector<int, 4> WidenedMask;
10229   if (canWidenShuffleElements(Mask, WidenedMask))
10230     return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
10231                                     DAG);
10232
10233   if (isSingleInputShuffleMask(Mask)) {
10234     // Check for being able to broadcast a single element.
10235     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
10236                                                           Mask, Subtarget, DAG))
10237       return Broadcast;
10238
10239     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
10240       // Non-half-crossing single input shuffles can be lowerid with an
10241       // interleaved permutation.
10242       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
10243                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
10244       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
10245                          DAG.getConstant(VPERMILPMask, MVT::i8));
10246     }
10247
10248     // With AVX2 we have direct support for this permutation.
10249     if (Subtarget->hasAVX2())
10250       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
10251                          getV4X86ShuffleImm8ForMask(Mask, DAG));
10252
10253     // Otherwise, fall back.
10254     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
10255                                                    DAG);
10256   }
10257
10258   // X86 has dedicated unpack instructions that can handle specific blend
10259   // operations: UNPCKH and UNPCKL.
10260   if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10261     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
10262   if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10263     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
10264
10265   // If we have a single input to the zero element, insert that into V1 if we
10266   // can do so cheaply.
10267   int NumV2Elements =
10268       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
10269   if (NumV2Elements == 1 && Mask[0] >= 4)
10270     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10271             MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
10272       return Insertion;
10273
10274   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
10275                                                 Subtarget, DAG))
10276     return Blend;
10277
10278   // Check if the blend happens to exactly fit that of SHUFPD.
10279   if ((Mask[0] == -1 || Mask[0] < 2) &&
10280       (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
10281       (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
10282       (Mask[3] == -1 || Mask[3] >= 6)) {
10283     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
10284                           ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
10285     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
10286                        DAG.getConstant(SHUFPDMask, MVT::i8));
10287   }
10288   if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) &&
10289       (Mask[1] == -1 || Mask[1] < 2) &&
10290       (Mask[2] == -1 || Mask[2] >= 6) &&
10291       (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
10292     unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
10293                           ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
10294     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
10295                        DAG.getConstant(SHUFPDMask, MVT::i8));
10296   }
10297
10298   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10299   // shuffle. However, if we have AVX2 and either inputs are already in place,
10300   // we will be able to shuffle even across lanes the other input in a single
10301   // instruction so skip this pattern.
10302   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10303                                  isShuffleMaskInputInPlace(1, Mask))))
10304     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10305             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
10306       return Result;
10307
10308   // If we have AVX2 then we always want to lower with a blend because an v4 we
10309   // can fully permute the elements.
10310   if (Subtarget->hasAVX2())
10311     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
10312                                                       Mask, DAG);
10313
10314   // Otherwise fall back on generic lowering.
10315   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
10316 }
10317
10318 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
10319 ///
10320 /// This routine is only called when we have AVX2 and thus a reasonable
10321 /// instruction set for v4i64 shuffling..
10322 static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10323                                        const X86Subtarget *Subtarget,
10324                                        SelectionDAG &DAG) {
10325   SDLoc DL(Op);
10326   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10327   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10328   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10329   ArrayRef<int> Mask = SVOp->getMask();
10330   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10331   assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
10332
10333   SmallVector<int, 4> WidenedMask;
10334   if (canWidenShuffleElements(Mask, WidenedMask))
10335     return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
10336                                     DAG);
10337
10338   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
10339                                                 Subtarget, DAG))
10340     return Blend;
10341
10342   // Check for being able to broadcast a single element.
10343   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
10344                                                         Mask, Subtarget, DAG))
10345     return Broadcast;
10346
10347   // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
10348   // use lower latency instructions that will operate on both 128-bit lanes.
10349   SmallVector<int, 2> RepeatedMask;
10350   if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
10351     if (isSingleInputShuffleMask(Mask)) {
10352       int PSHUFDMask[] = {-1, -1, -1, -1};
10353       for (int i = 0; i < 2; ++i)
10354         if (RepeatedMask[i] >= 0) {
10355           PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
10356           PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
10357         }
10358       return DAG.getNode(
10359           ISD::BITCAST, DL, MVT::v4i64,
10360           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
10361                       DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
10362                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
10363     }
10364
10365     // Use dedicated unpack instructions for masks that match their pattern.
10366     if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10367       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
10368     if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10369       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
10370   }
10371
10372   // AVX2 provides a direct instruction for permuting a single input across
10373   // lanes.
10374   if (isSingleInputShuffleMask(Mask))
10375     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
10376                        getV4X86ShuffleImm8ForMask(Mask, DAG));
10377
10378   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10379   // shuffle. However, if we have AVX2 and either inputs are already in place,
10380   // we will be able to shuffle even across lanes the other input in a single
10381   // instruction so skip this pattern.
10382   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10383                                  isShuffleMaskInputInPlace(1, Mask))))
10384     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10385             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
10386       return Result;
10387
10388   // Otherwise fall back on generic blend lowering.
10389   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
10390                                                     Mask, DAG);
10391 }
10392
10393 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
10394 ///
10395 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
10396 /// isn't available.
10397 static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10398                                        const X86Subtarget *Subtarget,
10399                                        SelectionDAG &DAG) {
10400   SDLoc DL(Op);
10401   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10402   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10403   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10404   ArrayRef<int> Mask = SVOp->getMask();
10405   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10406
10407   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
10408                                                 Subtarget, DAG))
10409     return Blend;
10410
10411   // Check for being able to broadcast a single element.
10412   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
10413                                                         Mask, Subtarget, DAG))
10414     return Broadcast;
10415
10416   // If the shuffle mask is repeated in each 128-bit lane, we have many more
10417   // options to efficiently lower the shuffle.
10418   SmallVector<int, 4> RepeatedMask;
10419   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
10420     assert(RepeatedMask.size() == 4 &&
10421            "Repeated masks must be half the mask width!");
10422     if (isSingleInputShuffleMask(Mask))
10423       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
10424                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10425
10426     // Use dedicated unpack instructions for masks that match their pattern.
10427     if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10428       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
10429     if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10430       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
10431
10432     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
10433     // have already handled any direct blends. We also need to squash the
10434     // repeated mask into a simulated v4f32 mask.
10435     for (int i = 0; i < 4; ++i)
10436       if (RepeatedMask[i] >= 8)
10437         RepeatedMask[i] -= 4;
10438     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
10439   }
10440
10441   // If we have a single input shuffle with different shuffle patterns in the
10442   // two 128-bit lanes use the variable mask to VPERMILPS.
10443   if (isSingleInputShuffleMask(Mask)) {
10444     SDValue VPermMask[8];
10445     for (int i = 0; i < 8; ++i)
10446       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10447                                  : DAG.getConstant(Mask[i], MVT::i32);
10448     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
10449       return DAG.getNode(
10450           X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
10451           DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
10452
10453     if (Subtarget->hasAVX2())
10454       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
10455                          DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
10456                                      DAG.getNode(ISD::BUILD_VECTOR, DL,
10457                                                  MVT::v8i32, VPermMask)),
10458                          V1);
10459
10460     // Otherwise, fall back.
10461     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
10462                                                    DAG);
10463   }
10464
10465   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10466   // shuffle.
10467   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10468           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
10469     return Result;
10470
10471   // If we have AVX2 then we always want to lower with a blend because at v8 we
10472   // can fully permute the elements.
10473   if (Subtarget->hasAVX2())
10474     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
10475                                                       Mask, DAG);
10476
10477   // Otherwise fall back on generic lowering.
10478   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
10479 }
10480
10481 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
10482 ///
10483 /// This routine is only called when we have AVX2 and thus a reasonable
10484 /// instruction set for v8i32 shuffling..
10485 static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10486                                        const X86Subtarget *Subtarget,
10487                                        SelectionDAG &DAG) {
10488   SDLoc DL(Op);
10489   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10490   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10491   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10492   ArrayRef<int> Mask = SVOp->getMask();
10493   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10494   assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
10495
10496   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
10497                                                 Subtarget, DAG))
10498     return Blend;
10499
10500   // Check for being able to broadcast a single element.
10501   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
10502                                                         Mask, Subtarget, DAG))
10503     return Broadcast;
10504
10505   // If the shuffle mask is repeated in each 128-bit lane we can use more
10506   // efficient instructions that mirror the shuffles across the two 128-bit
10507   // lanes.
10508   SmallVector<int, 4> RepeatedMask;
10509   if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
10510     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
10511     if (isSingleInputShuffleMask(Mask))
10512       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
10513                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10514
10515     // Use dedicated unpack instructions for masks that match their pattern.
10516     if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10517       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
10518     if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10519       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
10520   }
10521
10522   // If the shuffle patterns aren't repeated but it is a single input, directly
10523   // generate a cross-lane VPERMD instruction.
10524   if (isSingleInputShuffleMask(Mask)) {
10525     SDValue VPermMask[8];
10526     for (int i = 0; i < 8; ++i)
10527       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10528                                  : DAG.getConstant(Mask[i], MVT::i32);
10529     return DAG.getNode(
10530         X86ISD::VPERMV, DL, MVT::v8i32,
10531         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
10532   }
10533
10534   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10535   // shuffle.
10536   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10537           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
10538     return Result;
10539
10540   // Otherwise fall back on generic blend lowering.
10541   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
10542                                                     Mask, DAG);
10543 }
10544
10545 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
10546 ///
10547 /// This routine is only called when we have AVX2 and thus a reasonable
10548 /// instruction set for v16i16 shuffling..
10549 static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10550                                         const X86Subtarget *Subtarget,
10551                                         SelectionDAG &DAG) {
10552   SDLoc DL(Op);
10553   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10554   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10555   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10556   ArrayRef<int> Mask = SVOp->getMask();
10557   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10558   assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
10559
10560   // Check for being able to broadcast a single element.
10561   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
10562                                                         Mask, Subtarget, DAG))
10563     return Broadcast;
10564
10565   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
10566                                                 Subtarget, DAG))
10567     return Blend;
10568
10569   // Use dedicated unpack instructions for masks that match their pattern.
10570   if (isShuffleEquivalent(Mask,
10571                           // First 128-bit lane:
10572                           0, 16, 1, 17, 2, 18, 3, 19,
10573                           // Second 128-bit lane:
10574                           8, 24, 9, 25, 10, 26, 11, 27))
10575     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
10576   if (isShuffleEquivalent(Mask,
10577                           // First 128-bit lane:
10578                           4, 20, 5, 21, 6, 22, 7, 23,
10579                           // Second 128-bit lane:
10580                           12, 28, 13, 29, 14, 30, 15, 31))
10581     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
10582
10583   if (isSingleInputShuffleMask(Mask)) {
10584     // There are no generalized cross-lane shuffle operations available on i16
10585     // element types.
10586     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
10587       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
10588                                                      Mask, DAG);
10589
10590     SDValue PSHUFBMask[32];
10591     for (int i = 0; i < 16; ++i) {
10592       if (Mask[i] == -1) {
10593         PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
10594         continue;
10595       }
10596
10597       int M = i < 8 ? Mask[i] : Mask[i] - 8;
10598       assert(M >= 0 && M < 8 && "Invalid single-input mask!");
10599       PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
10600       PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
10601     }
10602     return DAG.getNode(
10603         ISD::BITCAST, DL, MVT::v16i16,
10604         DAG.getNode(
10605             X86ISD::PSHUFB, DL, MVT::v32i8,
10606             DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
10607             DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
10608   }
10609
10610   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10611   // shuffle.
10612   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10613           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
10614     return Result;
10615
10616   // Otherwise fall back on generic lowering.
10617   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
10618 }
10619
10620 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
10621 ///
10622 /// This routine is only called when we have AVX2 and thus a reasonable
10623 /// instruction set for v32i8 shuffling..
10624 static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10625                                        const X86Subtarget *Subtarget,
10626                                        SelectionDAG &DAG) {
10627   SDLoc DL(Op);
10628   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10629   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10630   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10631   ArrayRef<int> Mask = SVOp->getMask();
10632   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
10633   assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
10634
10635   // Check for being able to broadcast a single element.
10636   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
10637                                                         Mask, Subtarget, DAG))
10638     return Broadcast;
10639
10640   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
10641                                                 Subtarget, DAG))
10642     return Blend;
10643
10644   // Use dedicated unpack instructions for masks that match their pattern.
10645   // Note that these are repeated 128-bit lane unpacks, not unpacks across all
10646   // 256-bit lanes.
10647   if (isShuffleEquivalent(
10648           Mask,
10649           // First 128-bit lane:
10650           0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
10651           // Second 128-bit lane:
10652           16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
10653     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
10654   if (isShuffleEquivalent(
10655           Mask,
10656           // First 128-bit lane:
10657           8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
10658           // Second 128-bit lane:
10659           24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
10660     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
10661
10662   if (isSingleInputShuffleMask(Mask)) {
10663     // There are no generalized cross-lane shuffle operations available on i8
10664     // element types.
10665     if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
10666       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
10667                                                      Mask, DAG);
10668
10669     SDValue PSHUFBMask[32];
10670     for (int i = 0; i < 32; ++i)
10671       PSHUFBMask[i] =
10672           Mask[i] < 0
10673               ? DAG.getUNDEF(MVT::i8)
10674               : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8);
10675
10676     return DAG.getNode(
10677         X86ISD::PSHUFB, DL, MVT::v32i8, V1,
10678         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
10679   }
10680
10681   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10682   // shuffle.
10683   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10684           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
10685     return Result;
10686
10687   // Otherwise fall back on generic lowering.
10688   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
10689 }
10690
10691 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
10692 ///
10693 /// This routine either breaks down the specific type of a 256-bit x86 vector
10694 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
10695 /// together based on the available instructions.
10696 static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10697                                         MVT VT, const X86Subtarget *Subtarget,
10698                                         SelectionDAG &DAG) {
10699   SDLoc DL(Op);
10700   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10701   ArrayRef<int> Mask = SVOp->getMask();
10702
10703   // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
10704   // check for those subtargets here and avoid much of the subtarget querying in
10705   // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
10706   // ability to manipulate a 256-bit vector with integer types. Since we'll use
10707   // floating point types there eventually, just immediately cast everything to
10708   // a float and operate entirely in that domain.
10709   if (VT.isInteger() && !Subtarget->hasAVX2()) {
10710     int ElementBits = VT.getScalarSizeInBits();
10711     if (ElementBits < 32)
10712       // No floating point type available, decompose into 128-bit vectors.
10713       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10714
10715     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
10716                                 VT.getVectorNumElements());
10717     V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
10718     V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
10719     return DAG.getNode(ISD::BITCAST, DL, VT,
10720                        DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
10721   }
10722
10723   switch (VT.SimpleTy) {
10724   case MVT::v4f64:
10725     return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10726   case MVT::v4i64:
10727     return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10728   case MVT::v8f32:
10729     return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10730   case MVT::v8i32:
10731     return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10732   case MVT::v16i16:
10733     return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10734   case MVT::v32i8:
10735     return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10736
10737   default:
10738     llvm_unreachable("Not a valid 256-bit x86 vector type!");
10739   }
10740 }
10741
10742 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
10743 static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10744                                        const X86Subtarget *Subtarget,
10745                                        SelectionDAG &DAG) {
10746   SDLoc DL(Op);
10747   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10748   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10749   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10750   ArrayRef<int> Mask = SVOp->getMask();
10751   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10752
10753   // X86 has dedicated unpack instructions that can handle specific blend
10754   // operations: UNPCKH and UNPCKL.
10755   if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
10756     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
10757   if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
10758     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
10759
10760   // FIXME: Implement direct support for this type!
10761   return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
10762 }
10763
10764 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
10765 static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10766                                        const X86Subtarget *Subtarget,
10767                                        SelectionDAG &DAG) {
10768   SDLoc DL(Op);
10769   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
10770   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
10771   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10772   ArrayRef<int> Mask = SVOp->getMask();
10773   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10774
10775   // Use dedicated unpack instructions for masks that match their pattern.
10776   if (isShuffleEquivalent(Mask,
10777                           0, 16, 1, 17, 4, 20, 5, 21,
10778                           8, 24, 9, 25, 12, 28, 13, 29))
10779     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
10780   if (isShuffleEquivalent(Mask,
10781                           2, 18, 3, 19, 6, 22, 7, 23,
10782                           10, 26, 11, 27, 14, 30, 15, 31))
10783     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
10784
10785   // FIXME: Implement direct support for this type!
10786   return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
10787 }
10788
10789 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
10790 static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10791                                        const X86Subtarget *Subtarget,
10792                                        SelectionDAG &DAG) {
10793   SDLoc DL(Op);
10794   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
10795   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
10796   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10797   ArrayRef<int> Mask = SVOp->getMask();
10798   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10799
10800   // X86 has dedicated unpack instructions that can handle specific blend
10801   // operations: UNPCKH and UNPCKL.
10802   if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
10803     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
10804   if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
10805     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
10806
10807   // FIXME: Implement direct support for this type!
10808   return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
10809 }
10810
10811 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
10812 static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10813                                        const X86Subtarget *Subtarget,
10814                                        SelectionDAG &DAG) {
10815   SDLoc DL(Op);
10816   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
10817   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
10818   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10819   ArrayRef<int> Mask = SVOp->getMask();
10820   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10821
10822   // Use dedicated unpack instructions for masks that match their pattern.
10823   if (isShuffleEquivalent(Mask,
10824                           0, 16, 1, 17, 4, 20, 5, 21,
10825                           8, 24, 9, 25, 12, 28, 13, 29))
10826     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
10827   if (isShuffleEquivalent(Mask,
10828                           2, 18, 3, 19, 6, 22, 7, 23,
10829                           10, 26, 11, 27, 14, 30, 15, 31))
10830     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
10831
10832   // FIXME: Implement direct support for this type!
10833   return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
10834 }
10835
10836 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
10837 static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10838                                         const X86Subtarget *Subtarget,
10839                                         SelectionDAG &DAG) {
10840   SDLoc DL(Op);
10841   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
10842   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
10843   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10844   ArrayRef<int> Mask = SVOp->getMask();
10845   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
10846   assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
10847
10848   // FIXME: Implement direct support for this type!
10849   return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
10850 }
10851
10852 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
10853 static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10854                                        const X86Subtarget *Subtarget,
10855                                        SelectionDAG &DAG) {
10856   SDLoc DL(Op);
10857   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
10858   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
10859   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10860   ArrayRef<int> Mask = SVOp->getMask();
10861   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
10862   assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
10863
10864   // FIXME: Implement direct support for this type!
10865   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
10866 }
10867
10868 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
10869 ///
10870 /// This routine either breaks down the specific type of a 512-bit x86 vector
10871 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
10872 /// together based on the available instructions.
10873 static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10874                                         MVT VT, const X86Subtarget *Subtarget,
10875                                         SelectionDAG &DAG) {
10876   SDLoc DL(Op);
10877   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10878   ArrayRef<int> Mask = SVOp->getMask();
10879   assert(Subtarget->hasAVX512() &&
10880          "Cannot lower 512-bit vectors w/ basic ISA!");
10881
10882   // Check for being able to broadcast a single element.
10883   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
10884                                                         Mask, Subtarget, DAG))
10885     return Broadcast;
10886
10887   // Dispatch to each element type for lowering. If we don't have supprot for
10888   // specific element type shuffles at 512 bits, immediately split them and
10889   // lower them. Each lowering routine of a given type is allowed to assume that
10890   // the requisite ISA extensions for that element type are available.
10891   switch (VT.SimpleTy) {
10892   case MVT::v8f64:
10893     return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10894   case MVT::v16f32:
10895     return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10896   case MVT::v8i64:
10897     return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10898   case MVT::v16i32:
10899     return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10900   case MVT::v32i16:
10901     if (Subtarget->hasBWI())
10902       return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10903     break;
10904   case MVT::v64i8:
10905     if (Subtarget->hasBWI())
10906       return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10907     break;
10908
10909   default:
10910     llvm_unreachable("Not a valid 512-bit x86 vector type!");
10911   }
10912
10913   // Otherwise fall back on splitting.
10914   return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10915 }
10916
10917 /// \brief Top-level lowering for x86 vector shuffles.
10918 ///
10919 /// This handles decomposition, canonicalization, and lowering of all x86
10920 /// vector shuffles. Most of the specific lowering strategies are encapsulated
10921 /// above in helper routines. The canonicalization attempts to widen shuffles
10922 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
10923 /// s.t. only one of the two inputs needs to be tested, etc.
10924 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
10925                                   SelectionDAG &DAG) {
10926   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10927   ArrayRef<int> Mask = SVOp->getMask();
10928   SDValue V1 = Op.getOperand(0);
10929   SDValue V2 = Op.getOperand(1);
10930   MVT VT = Op.getSimpleValueType();
10931   int NumElements = VT.getVectorNumElements();
10932   SDLoc dl(Op);
10933
10934   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
10935
10936   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
10937   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
10938   if (V1IsUndef && V2IsUndef)
10939     return DAG.getUNDEF(VT);
10940
10941   // When we create a shuffle node we put the UNDEF node to second operand,
10942   // but in some cases the first operand may be transformed to UNDEF.
10943   // In this case we should just commute the node.
10944   if (V1IsUndef)
10945     return DAG.getCommutedVectorShuffle(*SVOp);
10946
10947   // Check for non-undef masks pointing at an undef vector and make the masks
10948   // undef as well. This makes it easier to match the shuffle based solely on
10949   // the mask.
10950   if (V2IsUndef)
10951     for (int M : Mask)
10952       if (M >= NumElements) {
10953         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
10954         for (int &M : NewMask)
10955           if (M >= NumElements)
10956             M = -1;
10957         return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
10958       }
10959
10960   // Try to collapse shuffles into using a vector type with fewer elements but
10961   // wider element types. We cap this to not form integers or floating point
10962   // elements wider than 64 bits, but it might be interesting to form i128
10963   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
10964   SmallVector<int, 16> WidenedMask;
10965   if (VT.getScalarSizeInBits() < 64 &&
10966       canWidenShuffleElements(Mask, WidenedMask)) {
10967     MVT NewEltVT = VT.isFloatingPoint()
10968                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
10969                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
10970     MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
10971     // Make sure that the new vector type is legal. For example, v2f64 isn't
10972     // legal on SSE1.
10973     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
10974       V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
10975       V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
10976       return DAG.getNode(ISD::BITCAST, dl, VT,
10977                          DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
10978     }
10979   }
10980
10981   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
10982   for (int M : SVOp->getMask())
10983     if (M < 0)
10984       ++NumUndefElements;
10985     else if (M < NumElements)
10986       ++NumV1Elements;
10987     else
10988       ++NumV2Elements;
10989
10990   // Commute the shuffle as needed such that more elements come from V1 than
10991   // V2. This allows us to match the shuffle pattern strictly on how many
10992   // elements come from V1 without handling the symmetric cases.
10993   if (NumV2Elements > NumV1Elements)
10994     return DAG.getCommutedVectorShuffle(*SVOp);
10995
10996   // When the number of V1 and V2 elements are the same, try to minimize the
10997   // number of uses of V2 in the low half of the vector. When that is tied,
10998   // ensure that the sum of indices for V1 is equal to or lower than the sum
10999   // indices for V2. When those are equal, try to ensure that the number of odd
11000   // indices for V1 is lower than the number of odd indices for V2.
11001   if (NumV1Elements == NumV2Elements) {
11002     int LowV1Elements = 0, LowV2Elements = 0;
11003     for (int M : SVOp->getMask().slice(0, NumElements / 2))
11004       if (M >= NumElements)
11005         ++LowV2Elements;
11006       else if (M >= 0)
11007         ++LowV1Elements;
11008     if (LowV2Elements > LowV1Elements) {
11009       return DAG.getCommutedVectorShuffle(*SVOp);
11010     } else if (LowV2Elements == LowV1Elements) {
11011       int SumV1Indices = 0, SumV2Indices = 0;
11012       for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11013         if (SVOp->getMask()[i] >= NumElements)
11014           SumV2Indices += i;
11015         else if (SVOp->getMask()[i] >= 0)
11016           SumV1Indices += i;
11017       if (SumV2Indices < SumV1Indices) {
11018         return DAG.getCommutedVectorShuffle(*SVOp);
11019       } else if (SumV2Indices == SumV1Indices) {
11020         int NumV1OddIndices = 0, NumV2OddIndices = 0;
11021         for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11022           if (SVOp->getMask()[i] >= NumElements)
11023             NumV2OddIndices += i % 2;
11024           else if (SVOp->getMask()[i] >= 0)
11025             NumV1OddIndices += i % 2;
11026         if (NumV2OddIndices < NumV1OddIndices)
11027           return DAG.getCommutedVectorShuffle(*SVOp);
11028       }
11029     }
11030   }
11031
11032   // For each vector width, delegate to a specialized lowering routine.
11033   if (VT.getSizeInBits() == 128)
11034     return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11035
11036   if (VT.getSizeInBits() == 256)
11037     return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11038
11039   // Force AVX-512 vectors to be scalarized for now.
11040   // FIXME: Implement AVX-512 support!
11041   if (VT.getSizeInBits() == 512)
11042     return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11043
11044   llvm_unreachable("Unimplemented!");
11045 }
11046
11047
11048 //===----------------------------------------------------------------------===//
11049 // Legacy vector shuffle lowering
11050 //
11051 // This code is the legacy code handling vector shuffles until the above
11052 // replaces its functionality and performance.
11053 //===----------------------------------------------------------------------===//
11054
11055 static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
11056                         bool hasInt256, unsigned *MaskOut = nullptr) {
11057   MVT EltVT = VT.getVectorElementType();
11058
11059   // There is no blend with immediate in AVX-512.
11060   if (VT.is512BitVector())
11061     return false;
11062
11063   if (!hasSSE41 || EltVT == MVT::i8)
11064     return false;
11065   if (!hasInt256 && VT == MVT::v16i16)
11066     return false;
11067
11068   unsigned MaskValue = 0;
11069   unsigned NumElems = VT.getVectorNumElements();
11070   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
11071   unsigned NumLanes = (NumElems - 1) / 8 + 1;
11072   unsigned NumElemsInLane = NumElems / NumLanes;
11073
11074   // Blend for v16i16 should be symetric for the both lanes.
11075   for (unsigned i = 0; i < NumElemsInLane; ++i) {
11076
11077     int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
11078     int EltIdx = MaskVals[i];
11079
11080     if ((EltIdx < 0 || EltIdx == (int)i) &&
11081         (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
11082       continue;
11083
11084     if (((unsigned)EltIdx == (i + NumElems)) &&
11085         (SndLaneEltIdx < 0 ||
11086          (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
11087       MaskValue |= (1 << i);
11088     else
11089       return false;
11090   }
11091
11092   if (MaskOut)
11093     *MaskOut = MaskValue;
11094   return true;
11095 }
11096
11097 // Try to lower a shuffle node into a simple blend instruction.
11098 // This function assumes isBlendMask returns true for this
11099 // SuffleVectorSDNode
11100 static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
11101                                           unsigned MaskValue,
11102                                           const X86Subtarget *Subtarget,
11103                                           SelectionDAG &DAG) {
11104   MVT VT = SVOp->getSimpleValueType(0);
11105   MVT EltVT = VT.getVectorElementType();
11106   assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
11107                      Subtarget->hasInt256() && "Trying to lower a "
11108                                                "VECTOR_SHUFFLE to a Blend but "
11109                                                "with the wrong mask"));
11110   SDValue V1 = SVOp->getOperand(0);
11111   SDValue V2 = SVOp->getOperand(1);
11112   SDLoc dl(SVOp);
11113   unsigned NumElems = VT.getVectorNumElements();
11114
11115   // Convert i32 vectors to floating point if it is not AVX2.
11116   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
11117   MVT BlendVT = VT;
11118   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
11119     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
11120                                NumElems);
11121     V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
11122     V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
11123   }
11124
11125   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
11126                             DAG.getConstant(MaskValue, MVT::i32));
11127   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
11128 }
11129
11130 /// In vector type \p VT, return true if the element at index \p InputIdx
11131 /// falls on a different 128-bit lane than \p OutputIdx.
11132 static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
11133                                      unsigned OutputIdx) {
11134   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
11135   return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128;
11136 }
11137
11138 /// Generate a PSHUFB if possible.  Selects elements from \p V1 according to
11139 /// \p MaskVals.  MaskVals[OutputIdx] = InputIdx specifies that we want to
11140 /// shuffle the element at InputIdx in V1 to OutputIdx in the result.  If \p
11141 /// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
11142 /// zero.
11143 static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
11144                          SelectionDAG &DAG) {
11145   MVT VT = V1.getSimpleValueType();
11146   assert(VT.is128BitVector() || VT.is256BitVector());
11147
11148   MVT EltVT = VT.getVectorElementType();
11149   unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
11150   unsigned NumElts = VT.getVectorNumElements();
11151
11152   SmallVector<SDValue, 32> PshufbMask;
11153   for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
11154     int InputIdx = MaskVals[OutputIdx];
11155     unsigned InputByteIdx;
11156
11157     if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
11158       InputByteIdx = 0x80;
11159     else {
11160       // Cross lane is not allowed.
11161       if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
11162         return SDValue();
11163       InputByteIdx = InputIdx * EltSizeInBytes;
11164       // Index is an byte offset within the 128-bit lane.
11165       InputByteIdx &= 0xf;
11166     }
11167
11168     for (unsigned j = 0; j < EltSizeInBytes; ++j) {
11169       PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
11170       if (InputByteIdx != 0x80)
11171         ++InputByteIdx;
11172     }
11173   }
11174
11175   MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
11176   if (ShufVT != VT)
11177     V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
11178   return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
11179                      DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
11180 }
11181
11182 // v8i16 shuffles - Prefer shuffles in the following order:
11183 // 1. [all]   pshuflw, pshufhw, optional move
11184 // 2. [ssse3] 1 x pshufb
11185 // 3. [ssse3] 2 x pshufb + 1 x por
11186 // 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
11187 static SDValue
11188 LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
11189                          SelectionDAG &DAG) {
11190   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11191   SDValue V1 = SVOp->getOperand(0);
11192   SDValue V2 = SVOp->getOperand(1);
11193   SDLoc dl(SVOp);
11194   SmallVector<int, 8> MaskVals;
11195
11196   // Determine if more than 1 of the words in each of the low and high quadwords
11197   // of the result come from the same quadword of one of the two inputs.  Undef
11198   // mask values count as coming from any quadword, for better codegen.
11199   //
11200   // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
11201   // feeds this quad.  For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
11202   unsigned LoQuad[] = { 0, 0, 0, 0 };
11203   unsigned HiQuad[] = { 0, 0, 0, 0 };
11204   // Indices of quads used.
11205   std::bitset<4> InputQuads;
11206   for (unsigned i = 0; i < 8; ++i) {
11207     unsigned *Quad = i < 4 ? LoQuad : HiQuad;
11208     int EltIdx = SVOp->getMaskElt(i);
11209     MaskVals.push_back(EltIdx);
11210     if (EltIdx < 0) {
11211       ++Quad[0];
11212       ++Quad[1];
11213       ++Quad[2];
11214       ++Quad[3];
11215       continue;
11216     }
11217     ++Quad[EltIdx / 4];
11218     InputQuads.set(EltIdx / 4);
11219   }
11220
11221   int BestLoQuad = -1;
11222   unsigned MaxQuad = 1;
11223   for (unsigned i = 0; i < 4; ++i) {
11224     if (LoQuad[i] > MaxQuad) {
11225       BestLoQuad = i;
11226       MaxQuad = LoQuad[i];
11227     }
11228   }
11229
11230   int BestHiQuad = -1;
11231   MaxQuad = 1;
11232   for (unsigned i = 0; i < 4; ++i) {
11233     if (HiQuad[i] > MaxQuad) {
11234       BestHiQuad = i;
11235       MaxQuad = HiQuad[i];
11236     }
11237   }
11238
11239   // For SSSE3, If all 8 words of the result come from only 1 quadword of each
11240   // of the two input vectors, shuffle them into one input vector so only a
11241   // single pshufb instruction is necessary. If there are more than 2 input
11242   // quads, disable the next transformation since it does not help SSSE3.
11243   bool V1Used = InputQuads[0] || InputQuads[1];
11244   bool V2Used = InputQuads[2] || InputQuads[3];
11245   if (Subtarget->hasSSSE3()) {
11246     if (InputQuads.count() == 2 && V1Used && V2Used) {
11247       BestLoQuad = InputQuads[0] ? 0 : 1;
11248       BestHiQuad = InputQuads[2] ? 2 : 3;
11249     }
11250     if (InputQuads.count() > 2) {
11251       BestLoQuad = -1;
11252       BestHiQuad = -1;
11253     }
11254   }
11255
11256   // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
11257   // the shuffle mask.  If a quad is scored as -1, that means that it contains
11258   // words from all 4 input quadwords.
11259   SDValue NewV;
11260   if (BestLoQuad >= 0 || BestHiQuad >= 0) {
11261     int MaskV[] = {
11262       BestLoQuad < 0 ? 0 : BestLoQuad,
11263       BestHiQuad < 0 ? 1 : BestHiQuad
11264     };
11265     NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
11266                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
11267                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
11268     NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
11269
11270     // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
11271     // source words for the shuffle, to aid later transformations.
11272     bool AllWordsInNewV = true;
11273     bool InOrder[2] = { true, true };
11274     for (unsigned i = 0; i != 8; ++i) {
11275       int idx = MaskVals[i];
11276       if (idx != (int)i)
11277         InOrder[i/4] = false;
11278       if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
11279         continue;
11280       AllWordsInNewV = false;
11281       break;
11282     }
11283
11284     bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
11285     if (AllWordsInNewV) {
11286       for (int i = 0; i != 8; ++i) {
11287         int idx = MaskVals[i];
11288         if (idx < 0)
11289           continue;
11290         idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
11291         if ((idx != i) && idx < 4)
11292           pshufhw = false;
11293         if ((idx != i) && idx > 3)
11294           pshuflw = false;
11295       }
11296       V1 = NewV;
11297       V2Used = false;
11298       BestLoQuad = 0;
11299       BestHiQuad = 1;
11300     }
11301
11302     // If we've eliminated the use of V2, and the new mask is a pshuflw or
11303     // pshufhw, that's as cheap as it gets.  Return the new shuffle.
11304     if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
11305       unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
11306       unsigned TargetMask = 0;
11307       NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
11308                                   DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
11309       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11310       TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
11311                              getShufflePSHUFLWImmediate(SVOp);
11312       V1 = NewV.getOperand(0);
11313       return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
11314     }
11315   }
11316
11317   // Promote splats to a larger type which usually leads to more efficient code.
11318   // FIXME: Is this true if pshufb is available?
11319   if (SVOp->isSplat())
11320     return PromoteSplat(SVOp, DAG);
11321
11322   // If we have SSSE3, and all words of the result are from 1 input vector,
11323   // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
11324   // is present, fall back to case 4.
11325   if (Subtarget->hasSSSE3()) {
11326     SmallVector<SDValue,16> pshufbMask;
11327
11328     // If we have elements from both input vectors, set the high bit of the
11329     // shuffle mask element to zero out elements that come from V2 in the V1
11330     // mask, and elements that come from V1 in the V2 mask, so that the two
11331     // results can be OR'd together.
11332     bool TwoInputs = V1Used && V2Used;
11333     V1 = getPSHUFB(MaskVals, V1, dl, DAG);
11334     if (!TwoInputs)
11335       return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11336
11337     // Calculate the shuffle mask for the second input, shuffle it, and
11338     // OR it with the first shuffled input.
11339     CommuteVectorShuffleMask(MaskVals, 8);
11340     V2 = getPSHUFB(MaskVals, V2, dl, DAG);
11341     V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11342     return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11343   }
11344
11345   // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
11346   // and update MaskVals with new element order.
11347   std::bitset<8> InOrder;
11348   if (BestLoQuad >= 0) {
11349     int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
11350     for (int i = 0; i != 4; ++i) {
11351       int idx = MaskVals[i];
11352       if (idx < 0) {
11353         InOrder.set(i);
11354       } else if ((idx / 4) == BestLoQuad) {
11355         MaskV[i] = idx & 3;
11356         InOrder.set(i);
11357       }
11358     }
11359     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11360                                 &MaskV[0]);
11361
11362     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11363       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11364       NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
11365                                   NewV.getOperand(0),
11366                                   getShufflePSHUFLWImmediate(SVOp), DAG);
11367     }
11368   }
11369
11370   // If BestHi >= 0, generate a pshufhw to put the high elements in order,
11371   // and update MaskVals with the new element order.
11372   if (BestHiQuad >= 0) {
11373     int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
11374     for (unsigned i = 4; i != 8; ++i) {
11375       int idx = MaskVals[i];
11376       if (idx < 0) {
11377         InOrder.set(i);
11378       } else if ((idx / 4) == BestHiQuad) {
11379         MaskV[i] = (idx & 3) + 4;
11380         InOrder.set(i);
11381       }
11382     }
11383     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11384                                 &MaskV[0]);
11385
11386     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11387       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11388       NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
11389                                   NewV.getOperand(0),
11390                                   getShufflePSHUFHWImmediate(SVOp), DAG);
11391     }
11392   }
11393
11394   // In case BestHi & BestLo were both -1, which means each quadword has a word
11395   // from each of the four input quadwords, calculate the InOrder bitvector now
11396   // before falling through to the insert/extract cleanup.
11397   if (BestLoQuad == -1 && BestHiQuad == -1) {
11398     NewV = V1;
11399     for (int i = 0; i != 8; ++i)
11400       if (MaskVals[i] < 0 || MaskVals[i] == i)
11401         InOrder.set(i);
11402   }
11403
11404   // The other elements are put in the right place using pextrw and pinsrw.
11405   for (unsigned i = 0; i != 8; ++i) {
11406     if (InOrder[i])
11407       continue;
11408     int EltIdx = MaskVals[i];
11409     if (EltIdx < 0)
11410       continue;
11411     SDValue ExtOp = (EltIdx < 8) ?
11412       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
11413                   DAG.getIntPtrConstant(EltIdx)) :
11414       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
11415                   DAG.getIntPtrConstant(EltIdx - 8));
11416     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
11417                        DAG.getIntPtrConstant(i));
11418   }
11419   return NewV;
11420 }
11421
11422 /// \brief v16i16 shuffles
11423 ///
11424 /// FIXME: We only support generation of a single pshufb currently.  We can
11425 /// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as
11426 /// well (e.g 2 x pshufb + 1 x por).
11427 static SDValue
11428 LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
11429   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11430   SDValue V1 = SVOp->getOperand(0);
11431   SDValue V2 = SVOp->getOperand(1);
11432   SDLoc dl(SVOp);
11433
11434   if (V2.getOpcode() != ISD::UNDEF)
11435     return SDValue();
11436
11437   SmallVector<int, 16> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11438   return getPSHUFB(MaskVals, V1, dl, DAG);
11439 }
11440
11441 // v16i8 shuffles - Prefer shuffles in the following order:
11442 // 1. [ssse3] 1 x pshufb
11443 // 2. [ssse3] 2 x pshufb + 1 x por
11444 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
11445 static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
11446                                         const X86Subtarget* Subtarget,
11447                                         SelectionDAG &DAG) {
11448   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11449   SDValue V1 = SVOp->getOperand(0);
11450   SDValue V2 = SVOp->getOperand(1);
11451   SDLoc dl(SVOp);
11452   ArrayRef<int> MaskVals = SVOp->getMask();
11453
11454   // Promote splats to a larger type which usually leads to more efficient code.
11455   // FIXME: Is this true if pshufb is available?
11456   if (SVOp->isSplat())
11457     return PromoteSplat(SVOp, DAG);
11458
11459   // If we have SSSE3, case 1 is generated when all result bytes come from
11460   // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
11461   // present, fall back to case 3.
11462
11463   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
11464   if (Subtarget->hasSSSE3()) {
11465     SmallVector<SDValue,16> pshufbMask;
11466
11467     // If all result elements are from one input vector, then only translate
11468     // undef mask values to 0x80 (zero out result) in the pshufb mask.
11469     //
11470     // Otherwise, we have elements from both input vectors, and must zero out
11471     // elements that come from V2 in the first mask, and V1 in the second mask
11472     // so that we can OR them together.
11473     for (unsigned i = 0; i != 16; ++i) {
11474       int EltIdx = MaskVals[i];
11475       if (EltIdx < 0 || EltIdx >= 16)
11476         EltIdx = 0x80;
11477       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11478     }
11479     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
11480                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11481                                  MVT::v16i8, pshufbMask));
11482
11483     // As PSHUFB will zero elements with negative indices, it's safe to ignore
11484     // the 2nd operand if it's undefined or zero.
11485     if (V2.getOpcode() == ISD::UNDEF ||
11486         ISD::isBuildVectorAllZeros(V2.getNode()))
11487       return V1;
11488
11489     // Calculate the shuffle mask for the second input, shuffle it, and
11490     // OR it with the first shuffled input.
11491     pshufbMask.clear();
11492     for (unsigned i = 0; i != 16; ++i) {
11493       int EltIdx = MaskVals[i];
11494       EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
11495       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11496     }
11497     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
11498                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11499                                  MVT::v16i8, pshufbMask));
11500     return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11501   }
11502
11503   // No SSSE3 - Calculate in place words and then fix all out of place words
11504   // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
11505   // the 16 different words that comprise the two doublequadword input vectors.
11506   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11507   V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
11508   SDValue NewV = V1;
11509   for (int i = 0; i != 8; ++i) {
11510     int Elt0 = MaskVals[i*2];
11511     int Elt1 = MaskVals[i*2+1];
11512
11513     // This word of the result is all undef, skip it.
11514     if (Elt0 < 0 && Elt1 < 0)
11515       continue;
11516
11517     // This word of the result is already in the correct place, skip it.
11518     if ((Elt0 == i*2) && (Elt1 == i*2+1))
11519       continue;
11520
11521     SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
11522     SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
11523     SDValue InsElt;
11524
11525     // If Elt0 and Elt1 are defined, are consecutive, and can be load
11526     // using a single extract together, load it and store it.
11527     if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
11528       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11529                            DAG.getIntPtrConstant(Elt1 / 2));
11530       NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11531                         DAG.getIntPtrConstant(i));
11532       continue;
11533     }
11534
11535     // If Elt1 is defined, extract it from the appropriate source.  If the
11536     // source byte is not also odd, shift the extracted word left 8 bits
11537     // otherwise clear the bottom 8 bits if we need to do an or.
11538     if (Elt1 >= 0) {
11539       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11540                            DAG.getIntPtrConstant(Elt1 / 2));
11541       if ((Elt1 & 1) == 0)
11542         InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
11543                              DAG.getConstant(8,
11544                                   TLI.getShiftAmountTy(InsElt.getValueType())));
11545       else if (Elt0 >= 0)
11546         InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
11547                              DAG.getConstant(0xFF00, MVT::i16));
11548     }
11549     // If Elt0 is defined, extract it from the appropriate source.  If the
11550     // source byte is not also even, shift the extracted word right 8 bits. If
11551     // Elt1 was also defined, OR the extracted values together before
11552     // inserting them in the result.
11553     if (Elt0 >= 0) {
11554       SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
11555                                     Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
11556       if ((Elt0 & 1) != 0)
11557         InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
11558                               DAG.getConstant(8,
11559                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
11560       else if (Elt1 >= 0)
11561         InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
11562                              DAG.getConstant(0x00FF, MVT::i16));
11563       InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
11564                          : InsElt0;
11565     }
11566     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11567                        DAG.getIntPtrConstant(i));
11568   }
11569   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
11570 }
11571
11572 // v32i8 shuffles - Translate to VPSHUFB if possible.
11573 static
11574 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
11575                                  const X86Subtarget *Subtarget,
11576                                  SelectionDAG &DAG) {
11577   MVT VT = SVOp->getSimpleValueType(0);
11578   SDValue V1 = SVOp->getOperand(0);
11579   SDValue V2 = SVOp->getOperand(1);
11580   SDLoc dl(SVOp);
11581   SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11582
11583   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11584   bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
11585   bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
11586
11587   // VPSHUFB may be generated if
11588   // (1) one of input vector is undefined or zeroinitializer.
11589   // The mask value 0x80 puts 0 in the corresponding slot of the vector.
11590   // And (2) the mask indexes don't cross the 128-bit lane.
11591   if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
11592       (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
11593     return SDValue();
11594
11595   if (V1IsAllZero && !V2IsAllZero) {
11596     CommuteVectorShuffleMask(MaskVals, 32);
11597     V1 = V2;
11598   }
11599   return getPSHUFB(MaskVals, V1, dl, DAG);
11600 }
11601
11602 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
11603 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
11604 /// done when every pair / quad of shuffle mask elements point to elements in
11605 /// the right sequence. e.g.
11606 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
11607 static
11608 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
11609                                  SelectionDAG &DAG) {
11610   MVT VT = SVOp->getSimpleValueType(0);
11611   SDLoc dl(SVOp);
11612   unsigned NumElems = VT.getVectorNumElements();
11613   MVT NewVT;
11614   unsigned Scale;
11615   switch (VT.SimpleTy) {
11616   default: llvm_unreachable("Unexpected!");
11617   case MVT::v2i64:
11618   case MVT::v2f64:
11619            return SDValue(SVOp, 0);
11620   case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
11621   case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
11622   case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
11623   case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
11624   case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
11625   case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
11626   }
11627
11628   SmallVector<int, 8> MaskVec;
11629   for (unsigned i = 0; i != NumElems; i += Scale) {
11630     int StartIdx = -1;
11631     for (unsigned j = 0; j != Scale; ++j) {
11632       int EltIdx = SVOp->getMaskElt(i+j);
11633       if (EltIdx < 0)
11634         continue;
11635       if (StartIdx < 0)
11636         StartIdx = (EltIdx / Scale);
11637       if (EltIdx != (int)(StartIdx*Scale + j))
11638         return SDValue();
11639     }
11640     MaskVec.push_back(StartIdx);
11641   }
11642
11643   SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
11644   SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
11645   return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
11646 }
11647
11648 /// getVZextMovL - Return a zero-extending vector move low node.
11649 ///
11650 static SDValue getVZextMovL(MVT VT, MVT OpVT,
11651                             SDValue SrcOp, SelectionDAG &DAG,
11652                             const X86Subtarget *Subtarget, SDLoc dl) {
11653   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
11654     LoadSDNode *LD = nullptr;
11655     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
11656       LD = dyn_cast<LoadSDNode>(SrcOp);
11657     if (!LD) {
11658       // movssrr and movsdrr do not clear top bits. Try to use movd, movq
11659       // instead.
11660       MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
11661       if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
11662           SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
11663           SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
11664           SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
11665         // PR2108
11666         OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
11667         return DAG.getNode(ISD::BITCAST, dl, VT,
11668                            DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11669                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11670                                                    OpVT,
11671                                                    SrcOp.getOperand(0)
11672                                                           .getOperand(0))));
11673       }
11674     }
11675   }
11676
11677   return DAG.getNode(ISD::BITCAST, dl, VT,
11678                      DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11679                                  DAG.getNode(ISD::BITCAST, dl,
11680                                              OpVT, SrcOp)));
11681 }
11682
11683 /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
11684 /// which could not be matched by any known target speficic shuffle
11685 static SDValue
11686 LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
11687
11688   SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
11689   if (NewOp.getNode())
11690     return NewOp;
11691
11692   MVT VT = SVOp->getSimpleValueType(0);
11693
11694   unsigned NumElems = VT.getVectorNumElements();
11695   unsigned NumLaneElems = NumElems / 2;
11696
11697   SDLoc dl(SVOp);
11698   MVT EltVT = VT.getVectorElementType();
11699   MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
11700   SDValue Output[2];
11701
11702   SmallVector<int, 16> Mask;
11703   for (unsigned l = 0; l < 2; ++l) {
11704     // Build a shuffle mask for the output, discovering on the fly which
11705     // input vectors to use as shuffle operands (recorded in InputUsed).
11706     // If building a suitable shuffle vector proves too hard, then bail
11707     // out with UseBuildVector set.
11708     bool UseBuildVector = false;
11709     int InputUsed[2] = { -1, -1 }; // Not yet discovered.
11710     unsigned LaneStart = l * NumLaneElems;
11711     for (unsigned i = 0; i != NumLaneElems; ++i) {
11712       // The mask element.  This indexes into the input.
11713       int Idx = SVOp->getMaskElt(i+LaneStart);
11714       if (Idx < 0) {
11715         // the mask element does not index into any input vector.
11716         Mask.push_back(-1);
11717         continue;
11718       }
11719
11720       // The input vector this mask element indexes into.
11721       int Input = Idx / NumLaneElems;
11722
11723       // Turn the index into an offset from the start of the input vector.
11724       Idx -= Input * NumLaneElems;
11725
11726       // Find or create a shuffle vector operand to hold this input.
11727       unsigned OpNo;
11728       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
11729         if (InputUsed[OpNo] == Input)
11730           // This input vector is already an operand.
11731           break;
11732         if (InputUsed[OpNo] < 0) {
11733           // Create a new operand for this input vector.
11734           InputUsed[OpNo] = Input;
11735           break;
11736         }
11737       }
11738
11739       if (OpNo >= array_lengthof(InputUsed)) {
11740         // More than two input vectors used!  Give up on trying to create a
11741         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
11742         UseBuildVector = true;
11743         break;
11744       }
11745
11746       // Add the mask index for the new shuffle vector.
11747       Mask.push_back(Idx + OpNo * NumLaneElems);
11748     }
11749
11750     if (UseBuildVector) {
11751       SmallVector<SDValue, 16> SVOps;
11752       for (unsigned i = 0; i != NumLaneElems; ++i) {
11753         // The mask element.  This indexes into the input.
11754         int Idx = SVOp->getMaskElt(i+LaneStart);
11755         if (Idx < 0) {
11756           SVOps.push_back(DAG.getUNDEF(EltVT));
11757           continue;
11758         }
11759
11760         // The input vector this mask element indexes into.
11761         int Input = Idx / NumElems;
11762
11763         // Turn the index into an offset from the start of the input vector.
11764         Idx -= Input * NumElems;
11765
11766         // Extract the vector element by hand.
11767         SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
11768                                     SVOp->getOperand(Input),
11769                                     DAG.getIntPtrConstant(Idx)));
11770       }
11771
11772       // Construct the output using a BUILD_VECTOR.
11773       Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
11774     } else if (InputUsed[0] < 0) {
11775       // No input vectors were used! The result is undefined.
11776       Output[l] = DAG.getUNDEF(NVT);
11777     } else {
11778       SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
11779                                         (InputUsed[0] % 2) * NumLaneElems,
11780                                         DAG, dl);
11781       // If only one input was used, use an undefined vector for the other.
11782       SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
11783         Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
11784                             (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
11785       // At least one input vector was used. Create a new shuffle vector.
11786       Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
11787     }
11788
11789     Mask.clear();
11790   }
11791
11792   // Concatenate the result back
11793   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
11794 }
11795
11796 /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
11797 /// 4 elements, and match them with several different shuffle types.
11798 static SDValue
11799 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
11800   SDValue V1 = SVOp->getOperand(0);
11801   SDValue V2 = SVOp->getOperand(1);
11802   SDLoc dl(SVOp);
11803   MVT VT = SVOp->getSimpleValueType(0);
11804
11805   assert(VT.is128BitVector() && "Unsupported vector size");
11806
11807   std::pair<int, int> Locs[4];
11808   int Mask1[] = { -1, -1, -1, -1 };
11809   SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
11810
11811   unsigned NumHi = 0;
11812   unsigned NumLo = 0;
11813   for (unsigned i = 0; i != 4; ++i) {
11814     int Idx = PermMask[i];
11815     if (Idx < 0) {
11816       Locs[i] = std::make_pair(-1, -1);
11817     } else {
11818       assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
11819       if (Idx < 4) {
11820         Locs[i] = std::make_pair(0, NumLo);
11821         Mask1[NumLo] = Idx;
11822         NumLo++;
11823       } else {
11824         Locs[i] = std::make_pair(1, NumHi);
11825         if (2+NumHi < 4)
11826           Mask1[2+NumHi] = Idx;
11827         NumHi++;
11828       }
11829     }
11830   }
11831
11832   if (NumLo <= 2 && NumHi <= 2) {
11833     // If no more than two elements come from either vector. This can be
11834     // implemented with two shuffles. First shuffle gather the elements.
11835     // The second shuffle, which takes the first shuffle as both of its
11836     // vector operands, put the elements into the right order.
11837     V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
11838
11839     int Mask2[] = { -1, -1, -1, -1 };
11840
11841     for (unsigned i = 0; i != 4; ++i)
11842       if (Locs[i].first != -1) {
11843         unsigned Idx = (i < 2) ? 0 : 4;
11844         Idx += Locs[i].first * 2 + Locs[i].second;
11845         Mask2[i] = Idx;
11846       }
11847
11848     return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
11849   }
11850
11851   if (NumLo == 3 || NumHi == 3) {
11852     // Otherwise, we must have three elements from one vector, call it X, and
11853     // one element from the other, call it Y.  First, use a shufps to build an
11854     // intermediate vector with the one element from Y and the element from X
11855     // that will be in the same half in the final destination (the indexes don't
11856     // matter). Then, use a shufps to build the final vector, taking the half
11857     // containing the element from Y from the intermediate, and the other half
11858     // from X.
11859     if (NumHi == 3) {
11860       // Normalize it so the 3 elements come from V1.
11861       CommuteVectorShuffleMask(PermMask, 4);
11862       std::swap(V1, V2);
11863     }
11864
11865     // Find the element from V2.
11866     unsigned HiIndex;
11867     for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
11868       int Val = PermMask[HiIndex];
11869       if (Val < 0)
11870         continue;
11871       if (Val >= 4)
11872         break;
11873     }
11874
11875     Mask1[0] = PermMask[HiIndex];
11876     Mask1[1] = -1;
11877     Mask1[2] = PermMask[HiIndex^1];
11878     Mask1[3] = -1;
11879     V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
11880
11881     if (HiIndex >= 2) {
11882       Mask1[0] = PermMask[0];
11883       Mask1[1] = PermMask[1];
11884       Mask1[2] = HiIndex & 1 ? 6 : 4;
11885       Mask1[3] = HiIndex & 1 ? 4 : 6;
11886       return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
11887     }
11888
11889     Mask1[0] = HiIndex & 1 ? 2 : 0;
11890     Mask1[1] = HiIndex & 1 ? 0 : 2;
11891     Mask1[2] = PermMask[2];
11892     Mask1[3] = PermMask[3];
11893     if (Mask1[2] >= 0)
11894       Mask1[2] += 4;
11895     if (Mask1[3] >= 0)
11896       Mask1[3] += 4;
11897     return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
11898   }
11899
11900   // Break it into (shuffle shuffle_hi, shuffle_lo).
11901   int LoMask[] = { -1, -1, -1, -1 };
11902   int HiMask[] = { -1, -1, -1, -1 };
11903
11904   int *MaskPtr = LoMask;
11905   unsigned MaskIdx = 0;
11906   unsigned LoIdx = 0;
11907   unsigned HiIdx = 2;
11908   for (unsigned i = 0; i != 4; ++i) {
11909     if (i == 2) {
11910       MaskPtr = HiMask;
11911       MaskIdx = 1;
11912       LoIdx = 0;
11913       HiIdx = 2;
11914     }
11915     int Idx = PermMask[i];
11916     if (Idx < 0) {
11917       Locs[i] = std::make_pair(-1, -1);
11918     } else if (Idx < 4) {
11919       Locs[i] = std::make_pair(MaskIdx, LoIdx);
11920       MaskPtr[LoIdx] = Idx;
11921       LoIdx++;
11922     } else {
11923       Locs[i] = std::make_pair(MaskIdx, HiIdx);
11924       MaskPtr[HiIdx] = Idx;
11925       HiIdx++;
11926     }
11927   }
11928
11929   SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
11930   SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
11931   int MaskOps[] = { -1, -1, -1, -1 };
11932   for (unsigned i = 0; i != 4; ++i)
11933     if (Locs[i].first != -1)
11934       MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
11935   return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
11936 }
11937
11938 static bool MayFoldVectorLoad(SDValue V) {
11939   while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
11940     V = V.getOperand(0);
11941
11942   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
11943     V = V.getOperand(0);
11944   if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
11945       V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
11946     // BUILD_VECTOR (load), undef
11947     V = V.getOperand(0);
11948
11949   return MayFoldLoad(V);
11950 }
11951
11952 static
11953 SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
11954   MVT VT = Op.getSimpleValueType();
11955
11956   // Canonizalize to v2f64.
11957   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
11958   return DAG.getNode(ISD::BITCAST, dl, VT,
11959                      getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
11960                                           V1, DAG));
11961 }
11962
11963 static
11964 SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
11965                         bool HasSSE2) {
11966   SDValue V1 = Op.getOperand(0);
11967   SDValue V2 = Op.getOperand(1);
11968   MVT VT = Op.getSimpleValueType();
11969
11970   assert(VT != MVT::v2i64 && "unsupported shuffle type");
11971
11972   if (HasSSE2 && VT == MVT::v2f64)
11973     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
11974
11975   // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
11976   return DAG.getNode(ISD::BITCAST, dl, VT,
11977                      getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
11978                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
11979                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
11980 }
11981
11982 static
11983 SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
11984   SDValue V1 = Op.getOperand(0);
11985   SDValue V2 = Op.getOperand(1);
11986   MVT VT = Op.getSimpleValueType();
11987
11988   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
11989          "unsupported shuffle type");
11990
11991   if (V2.getOpcode() == ISD::UNDEF)
11992     V2 = V1;
11993
11994   // v4i32 or v4f32
11995   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
11996 }
11997
11998 static
11999 SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
12000   SDValue V1 = Op.getOperand(0);
12001   SDValue V2 = Op.getOperand(1);
12002   MVT VT = Op.getSimpleValueType();
12003   unsigned NumElems = VT.getVectorNumElements();
12004
12005   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
12006   // operand of these instructions is only memory, so check if there's a
12007   // potencial load folding here, otherwise use SHUFPS or MOVSD to match the
12008   // same masks.
12009   bool CanFoldLoad = false;
12010
12011   // Trivial case, when V2 comes from a load.
12012   if (MayFoldVectorLoad(V2))
12013     CanFoldLoad = true;
12014
12015   // When V1 is a load, it can be folded later into a store in isel, example:
12016   //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
12017   //    turns into:
12018   //  (MOVLPSmr addr:$src1, VR128:$src2)
12019   // So, recognize this potential and also use MOVLPS or MOVLPD
12020   else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
12021     CanFoldLoad = true;
12022
12023   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12024   if (CanFoldLoad) {
12025     if (HasSSE2 && NumElems == 2)
12026       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
12027
12028     if (NumElems == 4)
12029       // If we don't care about the second element, proceed to use movss.
12030       if (SVOp->getMaskElt(1) != -1)
12031         return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
12032   }
12033
12034   // movl and movlp will both match v2i64, but v2i64 is never matched by
12035   // movl earlier because we make it strict to avoid messing with the movlp load
12036   // folding logic (see the code above getMOVLP call). Match it here then,
12037   // this is horrible, but will stay like this until we move all shuffle
12038   // matching to x86 specific nodes. Note that for the 1st condition all
12039   // types are matched with movsd.
12040   if (HasSSE2) {
12041     // FIXME: isMOVLMask should be checked and matched before getMOVLP,
12042     // as to remove this logic from here, as much as possible
12043     if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
12044       return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12045     return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12046   }
12047
12048   assert(VT != MVT::v4i32 && "unsupported shuffle type");
12049
12050   // Invert the operand order and use SHUFPS to match it.
12051   return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
12052                               getShuffleSHUFImmediate(SVOp), DAG);
12053 }
12054
12055 static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
12056                                          SelectionDAG &DAG) {
12057   SDLoc dl(Load);
12058   MVT VT = Load->getSimpleValueType(0);
12059   MVT EVT = VT.getVectorElementType();
12060   SDValue Addr = Load->getOperand(1);
12061   SDValue NewAddr = DAG.getNode(
12062       ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
12063       DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
12064
12065   SDValue NewLoad =
12066       DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
12067                   DAG.getMachineFunction().getMachineMemOperand(
12068                       Load->getMemOperand(), 0, EVT.getStoreSize()));
12069   return NewLoad;
12070 }
12071
12072 // It is only safe to call this function if isINSERTPSMask is true for
12073 // this shufflevector mask.
12074 static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
12075                            SelectionDAG &DAG) {
12076   // Generate an insertps instruction when inserting an f32 from memory onto a
12077   // v4f32 or when copying a member from one v4f32 to another.
12078   // We also use it for transferring i32 from one register to another,
12079   // since it simply copies the same bits.
12080   // If we're transferring an i32 from memory to a specific element in a
12081   // register, we output a generic DAG that will match the PINSRD
12082   // instruction.
12083   MVT VT = SVOp->getSimpleValueType(0);
12084   MVT EVT = VT.getVectorElementType();
12085   SDValue V1 = SVOp->getOperand(0);
12086   SDValue V2 = SVOp->getOperand(1);
12087   auto Mask = SVOp->getMask();
12088   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
12089          "unsupported vector type for insertps/pinsrd");
12090
12091   auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
12092   auto FromV2Predicate = [](const int &i) { return i >= 4; };
12093   int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
12094
12095   SDValue From;
12096   SDValue To;
12097   unsigned DestIndex;
12098   if (FromV1 == 1) {
12099     From = V1;
12100     To = V2;
12101     DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
12102                 Mask.begin();
12103
12104     // If we have 1 element from each vector, we have to check if we're
12105     // changing V1's element's place. If so, we're done. Otherwise, we
12106     // should assume we're changing V2's element's place and behave
12107     // accordingly.
12108     int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
12109     assert(DestIndex <= INT32_MAX && "truncated destination index");
12110     if (FromV1 == FromV2 &&
12111         static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
12112       From = V2;
12113       To = V1;
12114       DestIndex =
12115           std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12116     }
12117   } else {
12118     assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
12119            "More than one element from V1 and from V2, or no elements from one "
12120            "of the vectors. This case should not have returned true from "
12121            "isINSERTPSMask");
12122     From = V2;
12123     To = V1;
12124     DestIndex =
12125         std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12126   }
12127
12128   // Get an index into the source vector in the range [0,4) (the mask is
12129   // in the range [0,8) because it can address V1 and V2)
12130   unsigned SrcIndex = Mask[DestIndex] % 4;
12131   if (MayFoldLoad(From)) {
12132     // Trivial case, when From comes from a load and is only used by the
12133     // shuffle. Make it use insertps from the vector that we need from that
12134     // load.
12135     SDValue NewLoad =
12136         NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
12137     if (!NewLoad.getNode())
12138       return SDValue();
12139
12140     if (EVT == MVT::f32) {
12141       // Create this as a scalar to vector to match the instruction pattern.
12142       SDValue LoadScalarToVector =
12143           DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
12144       SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
12145       return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
12146                          InsertpsMask);
12147     } else { // EVT == MVT::i32
12148       // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
12149       // instruction, to match the PINSRD instruction, which loads an i32 to a
12150       // certain vector element.
12151       return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
12152                          DAG.getConstant(DestIndex, MVT::i32));
12153     }
12154   }
12155
12156   // Vector-element-to-vector
12157   SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
12158   return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
12159 }
12160
12161 // Reduce a vector shuffle to zext.
12162 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
12163                                     SelectionDAG &DAG) {
12164   // PMOVZX is only available from SSE41.
12165   if (!Subtarget->hasSSE41())
12166     return SDValue();
12167
12168   MVT VT = Op.getSimpleValueType();
12169
12170   // Only AVX2 support 256-bit vector integer extending.
12171   if (!Subtarget->hasInt256() && VT.is256BitVector())
12172     return SDValue();
12173
12174   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12175   SDLoc DL(Op);
12176   SDValue V1 = Op.getOperand(0);
12177   SDValue V2 = Op.getOperand(1);
12178   unsigned NumElems = VT.getVectorNumElements();
12179
12180   // Extending is an unary operation and the element type of the source vector
12181   // won't be equal to or larger than i64.
12182   if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
12183       VT.getVectorElementType() == MVT::i64)
12184     return SDValue();
12185
12186   // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
12187   unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
12188   while ((1U << Shift) < NumElems) {
12189     if (SVOp->getMaskElt(1U << Shift) == 1)
12190       break;
12191     Shift += 1;
12192     // The maximal ratio is 8, i.e. from i8 to i64.
12193     if (Shift > 3)
12194       return SDValue();
12195   }
12196
12197   // Check the shuffle mask.
12198   unsigned Mask = (1U << Shift) - 1;
12199   for (unsigned i = 0; i != NumElems; ++i) {
12200     int EltIdx = SVOp->getMaskElt(i);
12201     if ((i & Mask) != 0 && EltIdx != -1)
12202       return SDValue();
12203     if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
12204       return SDValue();
12205   }
12206
12207   unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
12208   MVT NeVT = MVT::getIntegerVT(NBits);
12209   MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
12210
12211   if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
12212     return SDValue();
12213
12214   return DAG.getNode(ISD::BITCAST, DL, VT,
12215                      DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
12216 }
12217
12218 static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
12219                                       SelectionDAG &DAG) {
12220   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12221   MVT VT = Op.getSimpleValueType();
12222   SDLoc dl(Op);
12223   SDValue V1 = Op.getOperand(0);
12224   SDValue V2 = Op.getOperand(1);
12225
12226   if (isZeroShuffle(SVOp))
12227     return getZeroVector(VT, Subtarget, DAG, dl);
12228
12229   // Handle splat operations
12230   if (SVOp->isSplat()) {
12231     // Use vbroadcast whenever the splat comes from a foldable load
12232     SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
12233     if (Broadcast.getNode())
12234       return Broadcast;
12235   }
12236
12237   // Check integer expanding shuffles.
12238   SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
12239   if (NewOp.getNode())
12240     return NewOp;
12241
12242   // If the shuffle can be profitably rewritten as a narrower shuffle, then
12243   // do it!
12244   if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
12245       VT == MVT::v32i8) {
12246     SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12247     if (NewOp.getNode())
12248       return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
12249   } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
12250     // FIXME: Figure out a cleaner way to do this.
12251     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
12252       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12253       if (NewOp.getNode()) {
12254         MVT NewVT = NewOp.getSimpleValueType();
12255         if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
12256                                NewVT, true, false))
12257           return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
12258                               dl);
12259       }
12260     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
12261       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12262       if (NewOp.getNode()) {
12263         MVT NewVT = NewOp.getSimpleValueType();
12264         if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
12265           return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
12266                               dl);
12267       }
12268     }
12269   }
12270   return SDValue();
12271 }
12272
12273 SDValue
12274 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
12275   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12276   SDValue V1 = Op.getOperand(0);
12277   SDValue V2 = Op.getOperand(1);
12278   MVT VT = Op.getSimpleValueType();
12279   SDLoc dl(Op);
12280   unsigned NumElems = VT.getVectorNumElements();
12281   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
12282   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
12283   bool V1IsSplat = false;
12284   bool V2IsSplat = false;
12285   bool HasSSE2 = Subtarget->hasSSE2();
12286   bool HasFp256    = Subtarget->hasFp256();
12287   bool HasInt256   = Subtarget->hasInt256();
12288   MachineFunction &MF = DAG.getMachineFunction();
12289   bool OptForSize = MF.getFunction()->getAttributes().
12290     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
12291
12292   // Check if we should use the experimental vector shuffle lowering. If so,
12293   // delegate completely to that code path.
12294   if (ExperimentalVectorShuffleLowering)
12295     return lowerVectorShuffle(Op, Subtarget, DAG);
12296
12297   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
12298
12299   if (V1IsUndef && V2IsUndef)
12300     return DAG.getUNDEF(VT);
12301
12302   // When we create a shuffle node we put the UNDEF node to second operand,
12303   // but in some cases the first operand may be transformed to UNDEF.
12304   // In this case we should just commute the node.
12305   if (V1IsUndef)
12306     return DAG.getCommutedVectorShuffle(*SVOp);
12307
12308   // Vector shuffle lowering takes 3 steps:
12309   //
12310   // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
12311   //    narrowing and commutation of operands should be handled.
12312   // 2) Matching of shuffles with known shuffle masks to x86 target specific
12313   //    shuffle nodes.
12314   // 3) Rewriting of unmatched masks into new generic shuffle operations,
12315   //    so the shuffle can be broken into other shuffles and the legalizer can
12316   //    try the lowering again.
12317   //
12318   // The general idea is that no vector_shuffle operation should be left to
12319   // be matched during isel, all of them must be converted to a target specific
12320   // node here.
12321
12322   // Normalize the input vectors. Here splats, zeroed vectors, profitable
12323   // narrowing and commutation of operands should be handled. The actual code
12324   // doesn't include all of those, work in progress...
12325   SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
12326   if (NewOp.getNode())
12327     return NewOp;
12328
12329   SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
12330
12331   // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
12332   // unpckh_undef). Only use pshufd if speed is more important than size.
12333   if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12334     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12335   if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12336     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12337
12338   if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
12339       V2IsUndef && MayFoldVectorLoad(V1))
12340     return getMOVDDup(Op, dl, V1, DAG);
12341
12342   if (isMOVHLPS_v_undef_Mask(M, VT))
12343     return getMOVHighToLow(Op, dl, DAG);
12344
12345   // Use to match splats
12346   if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
12347       (VT == MVT::v2f64 || VT == MVT::v2i64))
12348     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12349
12350   if (isPSHUFDMask(M, VT)) {
12351     // The actual implementation will match the mask in the if above and then
12352     // during isel it can match several different instructions, not only pshufd
12353     // as its name says, sad but true, emulate the behavior for now...
12354     if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
12355       return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
12356
12357     unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
12358
12359     if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
12360       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
12361
12362     if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
12363       return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
12364                                   DAG);
12365
12366     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
12367                                 TargetMask, DAG);
12368   }
12369
12370   if (isPALIGNRMask(M, VT, Subtarget))
12371     return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
12372                                 getShufflePALIGNRImmediate(SVOp),
12373                                 DAG);
12374
12375   if (isVALIGNMask(M, VT, Subtarget))
12376     return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
12377                                 getShuffleVALIGNImmediate(SVOp),
12378                                 DAG);
12379
12380   // Check if this can be converted into a logical shift.
12381   bool isLeft = false;
12382   unsigned ShAmt = 0;
12383   SDValue ShVal;
12384   bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
12385   if (isShift && ShVal.hasOneUse()) {
12386     // If the shifted value has multiple uses, it may be cheaper to use
12387     // v_set0 + movlhps or movhlps, etc.
12388     MVT EltVT = VT.getVectorElementType();
12389     ShAmt *= EltVT.getSizeInBits();
12390     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12391   }
12392
12393   if (isMOVLMask(M, VT)) {
12394     if (ISD::isBuildVectorAllZeros(V1.getNode()))
12395       return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
12396     if (!isMOVLPMask(M, VT)) {
12397       if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
12398         return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12399
12400       if (VT == MVT::v4i32 || VT == MVT::v4f32)
12401         return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12402     }
12403   }
12404
12405   // FIXME: fold these into legal mask.
12406   if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
12407     return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
12408
12409   if (isMOVHLPSMask(M, VT))
12410     return getMOVHighToLow(Op, dl, DAG);
12411
12412   if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
12413     return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
12414
12415   if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
12416     return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
12417
12418   if (isMOVLPMask(M, VT))
12419     return getMOVLP(Op, dl, DAG, HasSSE2);
12420
12421   if (ShouldXformToMOVHLPS(M, VT) ||
12422       ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
12423     return DAG.getCommutedVectorShuffle(*SVOp);
12424
12425   if (isShift) {
12426     // No better options. Use a vshldq / vsrldq.
12427     MVT EltVT = VT.getVectorElementType();
12428     ShAmt *= EltVT.getSizeInBits();
12429     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12430   }
12431
12432   bool Commuted = false;
12433   // FIXME: This should also accept a bitcast of a splat?  Be careful, not
12434   // 1,1,1,1 -> v8i16 though.
12435   BitVector UndefElements;
12436   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
12437     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12438       V1IsSplat = true;
12439   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
12440     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12441       V2IsSplat = true;
12442
12443   // Canonicalize the splat or undef, if present, to be on the RHS.
12444   if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
12445     CommuteVectorShuffleMask(M, NumElems);
12446     std::swap(V1, V2);
12447     std::swap(V1IsSplat, V2IsSplat);
12448     Commuted = true;
12449   }
12450
12451   if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
12452     // Shuffling low element of v1 into undef, just return v1.
12453     if (V2IsUndef)
12454       return V1;
12455     // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
12456     // the instruction selector will not match, so get a canonical MOVL with
12457     // swapped operands to undo the commute.
12458     return getMOVL(DAG, dl, VT, V2, V1);
12459   }
12460
12461   if (isUNPCKLMask(M, VT, HasInt256))
12462     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12463
12464   if (isUNPCKHMask(M, VT, HasInt256))
12465     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12466
12467   if (V2IsSplat) {
12468     // Normalize mask so all entries that point to V2 points to its first
12469     // element then try to match unpck{h|l} again. If match, return a
12470     // new vector_shuffle with the corrected mask.p
12471     SmallVector<int, 8> NewMask(M.begin(), M.end());
12472     NormalizeMask(NewMask, NumElems);
12473     if (isUNPCKLMask(NewMask, VT, HasInt256, true))
12474       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12475     if (isUNPCKHMask(NewMask, VT, HasInt256, true))
12476       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12477   }
12478
12479   if (Commuted) {
12480     // Commute is back and try unpck* again.
12481     // FIXME: this seems wrong.
12482     CommuteVectorShuffleMask(M, NumElems);
12483     std::swap(V1, V2);
12484     std::swap(V1IsSplat, V2IsSplat);
12485
12486     if (isUNPCKLMask(M, VT, HasInt256))
12487       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12488
12489     if (isUNPCKHMask(M, VT, HasInt256))
12490       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12491   }
12492
12493   // Normalize the node to match x86 shuffle ops if needed
12494   if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
12495     return DAG.getCommutedVectorShuffle(*SVOp);
12496
12497   // The checks below are all present in isShuffleMaskLegal, but they are
12498   // inlined here right now to enable us to directly emit target specific
12499   // nodes, and remove one by one until they don't return Op anymore.
12500
12501   if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
12502       SVOp->getSplatIndex() == 0 && V2IsUndef) {
12503     if (VT == MVT::v2f64 || VT == MVT::v2i64)
12504       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12505   }
12506
12507   if (isPSHUFHWMask(M, VT, HasInt256))
12508     return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
12509                                 getShufflePSHUFHWImmediate(SVOp),
12510                                 DAG);
12511
12512   if (isPSHUFLWMask(M, VT, HasInt256))
12513     return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
12514                                 getShufflePSHUFLWImmediate(SVOp),
12515                                 DAG);
12516
12517   unsigned MaskValue;
12518   if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
12519                   &MaskValue))
12520     return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
12521
12522   if (isSHUFPMask(M, VT))
12523     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
12524                                 getShuffleSHUFImmediate(SVOp), DAG);
12525
12526   if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12527     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12528   if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12529     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12530
12531   //===--------------------------------------------------------------------===//
12532   // Generate target specific nodes for 128 or 256-bit shuffles only
12533   // supported in the AVX instruction set.
12534   //
12535
12536   // Handle VMOVDDUPY permutations
12537   if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
12538     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
12539
12540   // Handle VPERMILPS/D* permutations
12541   if (isVPERMILPMask(M, VT)) {
12542     if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
12543       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
12544                                   getShuffleSHUFImmediate(SVOp), DAG);
12545     return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
12546                                 getShuffleSHUFImmediate(SVOp), DAG);
12547   }
12548
12549   unsigned Idx;
12550   if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
12551     return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
12552                               Idx*(NumElems/2), DAG, dl);
12553
12554   // Handle VPERM2F128/VPERM2I128 permutations
12555   if (isVPERM2X128Mask(M, VT, HasFp256))
12556     return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
12557                                 V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
12558
12559   if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
12560     return getINSERTPS(SVOp, dl, DAG);
12561
12562   unsigned Imm8;
12563   if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
12564     return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
12565
12566   if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
12567       VT.is512BitVector()) {
12568     MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
12569     MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
12570     SmallVector<SDValue, 16> permclMask;
12571     for (unsigned i = 0; i != NumElems; ++i) {
12572       permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
12573     }
12574
12575     SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
12576     if (V2IsUndef)
12577       // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
12578       return DAG.getNode(X86ISD::VPERMV, dl, VT,
12579                           DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
12580     return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
12581                        DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
12582   }
12583
12584   //===--------------------------------------------------------------------===//
12585   // Since no target specific shuffle was selected for this generic one,
12586   // lower it into other known shuffles. FIXME: this isn't true yet, but
12587   // this is the plan.
12588   //
12589
12590   // Handle v8i16 specifically since SSE can do byte extraction and insertion.
12591   if (VT == MVT::v8i16) {
12592     SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
12593     if (NewOp.getNode())
12594       return NewOp;
12595   }
12596
12597   if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
12598     SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
12599     if (NewOp.getNode())
12600       return NewOp;
12601   }
12602
12603   if (VT == MVT::v16i8) {
12604     SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
12605     if (NewOp.getNode())
12606       return NewOp;
12607   }
12608
12609   if (VT == MVT::v32i8) {
12610     SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
12611     if (NewOp.getNode())
12612       return NewOp;
12613   }
12614
12615   // Handle all 128-bit wide vectors with 4 elements, and match them with
12616   // several different shuffle types.
12617   if (NumElems == 4 && VT.is128BitVector())
12618     return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
12619
12620   // Handle general 256-bit shuffles
12621   if (VT.is256BitVector())
12622     return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
12623
12624   return SDValue();
12625 }
12626
12627 // This function assumes its argument is a BUILD_VECTOR of constants or
12628 // undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
12629 // true.
12630 static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
12631                                     unsigned &MaskValue) {
12632   MaskValue = 0;
12633   unsigned NumElems = BuildVector->getNumOperands();
12634   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
12635   unsigned NumLanes = (NumElems - 1) / 8 + 1;
12636   unsigned NumElemsInLane = NumElems / NumLanes;
12637
12638   // Blend for v16i16 should be symetric for the both lanes.
12639   for (unsigned i = 0; i < NumElemsInLane; ++i) {
12640     SDValue EltCond = BuildVector->getOperand(i);
12641     SDValue SndLaneEltCond =
12642         (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
12643
12644     int Lane1Cond = -1, Lane2Cond = -1;
12645     if (isa<ConstantSDNode>(EltCond))
12646       Lane1Cond = !isZero(EltCond);
12647     if (isa<ConstantSDNode>(SndLaneEltCond))
12648       Lane2Cond = !isZero(SndLaneEltCond);
12649
12650     if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
12651       // Lane1Cond != 0, means we want the first argument.
12652       // Lane1Cond == 0, means we want the second argument.
12653       // The encoding of this argument is 0 for the first argument, 1
12654       // for the second. Therefore, invert the condition.
12655       MaskValue |= !Lane1Cond << i;
12656     else if (Lane1Cond < 0)
12657       MaskValue |= !Lane2Cond << i;
12658     else
12659       return false;
12660   }
12661   return true;
12662 }
12663
12664 /// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
12665 /// instruction.
12666 static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
12667                                     SelectionDAG &DAG) {
12668   SDValue Cond = Op.getOperand(0);
12669   SDValue LHS = Op.getOperand(1);
12670   SDValue RHS = Op.getOperand(2);
12671   SDLoc dl(Op);
12672   MVT VT = Op.getSimpleValueType();
12673   MVT EltVT = VT.getVectorElementType();
12674   unsigned NumElems = VT.getVectorNumElements();
12675
12676   // There is no blend with immediate in AVX-512.
12677   if (VT.is512BitVector())
12678     return SDValue();
12679
12680   if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
12681     return SDValue();
12682   if (!Subtarget->hasInt256() && VT == MVT::v16i16)
12683     return SDValue();
12684
12685   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
12686     return SDValue();
12687
12688   // Check the mask for BLEND and build the value.
12689   unsigned MaskValue = 0;
12690   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
12691     return SDValue();
12692
12693   // Convert i32 vectors to floating point if it is not AVX2.
12694   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
12695   MVT BlendVT = VT;
12696   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
12697     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
12698                                NumElems);
12699     LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
12700     RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
12701   }
12702
12703   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
12704                             DAG.getConstant(MaskValue, MVT::i32));
12705   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
12706 }
12707
12708 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
12709   // A vselect where all conditions and data are constants can be optimized into
12710   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
12711   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
12712       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
12713       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
12714     return SDValue();
12715
12716   SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
12717   if (BlendOp.getNode())
12718     return BlendOp;
12719
12720   // Some types for vselect were previously set to Expand, not Legal or
12721   // Custom. Return an empty SDValue so we fall-through to Expand, after
12722   // the Custom lowering phase.
12723   MVT VT = Op.getSimpleValueType();
12724   switch (VT.SimpleTy) {
12725   default:
12726     break;
12727   case MVT::v8i16:
12728   case MVT::v16i16:
12729     if (Subtarget->hasBWI() && Subtarget->hasVLX())
12730       break;
12731     return SDValue();
12732   }
12733
12734   // We couldn't create a "Blend with immediate" node.
12735   // This node should still be legal, but we'll have to emit a blendv*
12736   // instruction.
12737   return Op;
12738 }
12739
12740 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
12741   MVT VT = Op.getSimpleValueType();
12742   SDLoc dl(Op);
12743
12744   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
12745     return SDValue();
12746
12747   if (VT.getSizeInBits() == 8) {
12748     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
12749                                   Op.getOperand(0), Op.getOperand(1));
12750     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12751                                   DAG.getValueType(VT));
12752     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12753   }
12754
12755   if (VT.getSizeInBits() == 16) {
12756     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12757     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
12758     if (Idx == 0)
12759       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
12760                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12761                                      DAG.getNode(ISD::BITCAST, dl,
12762                                                  MVT::v4i32,
12763                                                  Op.getOperand(0)),
12764                                      Op.getOperand(1)));
12765     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
12766                                   Op.getOperand(0), Op.getOperand(1));
12767     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12768                                   DAG.getValueType(VT));
12769     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12770   }
12771
12772   if (VT == MVT::f32) {
12773     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
12774     // the result back to FR32 register. It's only worth matching if the
12775     // result has a single use which is a store or a bitcast to i32.  And in
12776     // the case of a store, it's not worth it if the index is a constant 0,
12777     // because a MOVSSmr can be used instead, which is smaller and faster.
12778     if (!Op.hasOneUse())
12779       return SDValue();
12780     SDNode *User = *Op.getNode()->use_begin();
12781     if ((User->getOpcode() != ISD::STORE ||
12782          (isa<ConstantSDNode>(Op.getOperand(1)) &&
12783           cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
12784         (User->getOpcode() != ISD::BITCAST ||
12785          User->getValueType(0) != MVT::i32))
12786       return SDValue();
12787     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12788                                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
12789                                               Op.getOperand(0)),
12790                                               Op.getOperand(1));
12791     return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
12792   }
12793
12794   if (VT == MVT::i32 || VT == MVT::i64) {
12795     // ExtractPS/pextrq works with constant index.
12796     if (isa<ConstantSDNode>(Op.getOperand(1)))
12797       return Op;
12798   }
12799   return SDValue();
12800 }
12801
12802 /// Extract one bit from mask vector, like v16i1 or v8i1.
12803 /// AVX-512 feature.
12804 SDValue
12805 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
12806   SDValue Vec = Op.getOperand(0);
12807   SDLoc dl(Vec);
12808   MVT VecVT = Vec.getSimpleValueType();
12809   SDValue Idx = Op.getOperand(1);
12810   MVT EltVT = Op.getSimpleValueType();
12811
12812   assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
12813
12814   // variable index can't be handled in mask registers,
12815   // extend vector to VR512
12816   if (!isa<ConstantSDNode>(Idx)) {
12817     MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
12818     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
12819     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12820                               ExtVT.getVectorElementType(), Ext, Idx);
12821     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
12822   }
12823
12824   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12825   const TargetRegisterClass* rc = getRegClassFor(VecVT);
12826   unsigned MaxSift = rc->getSize()*8 - 1;
12827   Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
12828                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
12829   Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
12830                     DAG.getConstant(MaxSift, MVT::i8));
12831   return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
12832                        DAG.getIntPtrConstant(0));
12833 }
12834
12835 SDValue
12836 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
12837                                            SelectionDAG &DAG) const {
12838   SDLoc dl(Op);
12839   SDValue Vec = Op.getOperand(0);
12840   MVT VecVT = Vec.getSimpleValueType();
12841   SDValue Idx = Op.getOperand(1);
12842
12843   if (Op.getSimpleValueType() == MVT::i1)
12844     return ExtractBitFromMaskVector(Op, DAG);
12845
12846   if (!isa<ConstantSDNode>(Idx)) {
12847     if (VecVT.is512BitVector() ||
12848         (VecVT.is256BitVector() && Subtarget->hasInt256() &&
12849          VecVT.getVectorElementType().getSizeInBits() == 32)) {
12850
12851       MVT MaskEltVT =
12852         MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
12853       MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
12854                                     MaskEltVT.getSizeInBits());
12855
12856       Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
12857       SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
12858                                 getZeroVector(MaskVT, Subtarget, DAG, dl),
12859                                 Idx, DAG.getConstant(0, getPointerTy()));
12860       SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
12861       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
12862                         Perm, DAG.getConstant(0, getPointerTy()));
12863     }
12864     return SDValue();
12865   }
12866
12867   // If this is a 256-bit vector result, first extract the 128-bit vector and
12868   // then extract the element from the 128-bit vector.
12869   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
12870
12871     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12872     // Get the 128-bit vector.
12873     Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
12874     MVT EltVT = VecVT.getVectorElementType();
12875
12876     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
12877
12878     //if (IdxVal >= NumElems/2)
12879     //  IdxVal -= NumElems/2;
12880     IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
12881     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
12882                        DAG.getConstant(IdxVal, MVT::i32));
12883   }
12884
12885   assert(VecVT.is128BitVector() && "Unexpected vector length");
12886
12887   if (Subtarget->hasSSE41()) {
12888     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
12889     if (Res.getNode())
12890       return Res;
12891   }
12892
12893   MVT VT = Op.getSimpleValueType();
12894   // TODO: handle v16i8.
12895   if (VT.getSizeInBits() == 16) {
12896     SDValue Vec = Op.getOperand(0);
12897     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12898     if (Idx == 0)
12899       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
12900                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12901                                      DAG.getNode(ISD::BITCAST, dl,
12902                                                  MVT::v4i32, Vec),
12903                                      Op.getOperand(1)));
12904     // Transform it so it match pextrw which produces a 32-bit result.
12905     MVT EltVT = MVT::i32;
12906     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
12907                                   Op.getOperand(0), Op.getOperand(1));
12908     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
12909                                   DAG.getValueType(VT));
12910     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12911   }
12912
12913   if (VT.getSizeInBits() == 32) {
12914     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12915     if (Idx == 0)
12916       return Op;
12917
12918     // SHUFPS the element to the lowest double word, then movss.
12919     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
12920     MVT VVT = Op.getOperand(0).getSimpleValueType();
12921     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
12922                                        DAG.getUNDEF(VVT), Mask);
12923     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12924                        DAG.getIntPtrConstant(0));
12925   }
12926
12927   if (VT.getSizeInBits() == 64) {
12928     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
12929     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
12930     //        to match extract_elt for f64.
12931     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12932     if (Idx == 0)
12933       return Op;
12934
12935     // UNPCKHPD the element to the lowest double word, then movsd.
12936     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
12937     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
12938     int Mask[2] = { 1, -1 };
12939     MVT VVT = Op.getOperand(0).getSimpleValueType();
12940     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
12941                                        DAG.getUNDEF(VVT), Mask);
12942     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12943                        DAG.getIntPtrConstant(0));
12944   }
12945
12946   return SDValue();
12947 }
12948
12949 /// Insert one bit to mask vector, like v16i1 or v8i1.
12950 /// AVX-512 feature.
12951 SDValue
12952 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
12953   SDLoc dl(Op);
12954   SDValue Vec = Op.getOperand(0);
12955   SDValue Elt = Op.getOperand(1);
12956   SDValue Idx = Op.getOperand(2);
12957   MVT VecVT = Vec.getSimpleValueType();
12958
12959   if (!isa<ConstantSDNode>(Idx)) {
12960     // Non constant index. Extend source and destination,
12961     // insert element and then truncate the result.
12962     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
12963     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
12964     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
12965       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
12966       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
12967     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
12968   }
12969
12970   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12971   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
12972   if (Vec.getOpcode() == ISD::UNDEF)
12973     return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
12974                        DAG.getConstant(IdxVal, MVT::i8));
12975   const TargetRegisterClass* rc = getRegClassFor(VecVT);
12976   unsigned MaxSift = rc->getSize()*8 - 1;
12977   EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
12978                     DAG.getConstant(MaxSift, MVT::i8));
12979   EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
12980                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
12981   return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
12982 }
12983
12984 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
12985                                                   SelectionDAG &DAG) const {
12986   MVT VT = Op.getSimpleValueType();
12987   MVT EltVT = VT.getVectorElementType();
12988
12989   if (EltVT == MVT::i1)
12990     return InsertBitToMaskVector(Op, DAG);
12991
12992   SDLoc dl(Op);
12993   SDValue N0 = Op.getOperand(0);
12994   SDValue N1 = Op.getOperand(1);
12995   SDValue N2 = Op.getOperand(2);
12996   if (!isa<ConstantSDNode>(N2))
12997     return SDValue();
12998   auto *N2C = cast<ConstantSDNode>(N2);
12999   unsigned IdxVal = N2C->getZExtValue();
13000
13001   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
13002   // into that, and then insert the subvector back into the result.
13003   if (VT.is256BitVector() || VT.is512BitVector()) {
13004     // Get the desired 128-bit vector half.
13005     SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
13006
13007     // Insert the element into the desired half.
13008     unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
13009     unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
13010
13011     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
13012                     DAG.getConstant(IdxIn128, MVT::i32));
13013
13014     // Insert the changed part back to the 256-bit vector
13015     return Insert128BitVector(N0, V, IdxVal, DAG, dl);
13016   }
13017   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
13018
13019   if (Subtarget->hasSSE41()) {
13020     if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
13021       unsigned Opc;
13022       if (VT == MVT::v8i16) {
13023         Opc = X86ISD::PINSRW;
13024       } else {
13025         assert(VT == MVT::v16i8);
13026         Opc = X86ISD::PINSRB;
13027       }
13028
13029       // Transform it so it match pinsr{b,w} which expects a GR32 as its second
13030       // argument.
13031       if (N1.getValueType() != MVT::i32)
13032         N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13033       if (N2.getValueType() != MVT::i32)
13034         N2 = DAG.getIntPtrConstant(IdxVal);
13035       return DAG.getNode(Opc, dl, VT, N0, N1, N2);
13036     }
13037
13038     if (EltVT == MVT::f32) {
13039       // Bits [7:6] of the constant are the source select.  This will always be
13040       //  zero here.  The DAG Combiner may combine an extract_elt index into
13041       //  these
13042       //  bits.  For example (insert (extract, 3), 2) could be matched by
13043       //  putting
13044       //  the '3' into bits [7:6] of X86ISD::INSERTPS.
13045       // Bits [5:4] of the constant are the destination select.  This is the
13046       //  value of the incoming immediate.
13047       // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
13048       //   combine either bitwise AND or insert of float 0.0 to set these bits.
13049       N2 = DAG.getIntPtrConstant(IdxVal << 4);
13050       // Create this as a scalar to vector..
13051       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13052       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
13053     }
13054
13055     if (EltVT == MVT::i32 || EltVT == MVT::i64) {
13056       // PINSR* works with constant index.
13057       return Op;
13058     }
13059   }
13060
13061   if (EltVT == MVT::i8)
13062     return SDValue();
13063
13064   if (EltVT.getSizeInBits() == 16) {
13065     // Transform it so it match pinsrw which expects a 16-bit value in a GR32
13066     // as its second argument.
13067     if (N1.getValueType() != MVT::i32)
13068       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13069     if (N2.getValueType() != MVT::i32)
13070       N2 = DAG.getIntPtrConstant(IdxVal);
13071     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
13072   }
13073   return SDValue();
13074 }
13075
13076 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
13077   SDLoc dl(Op);
13078   MVT OpVT = Op.getSimpleValueType();
13079
13080   // If this is a 256-bit vector result, first insert into a 128-bit
13081   // vector and then insert into the 256-bit vector.
13082   if (!OpVT.is128BitVector()) {
13083     // Insert into a 128-bit vector.
13084     unsigned SizeFactor = OpVT.getSizeInBits()/128;
13085     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
13086                                  OpVT.getVectorNumElements() / SizeFactor);
13087
13088     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
13089
13090     // Insert the 128-bit vector.
13091     return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
13092   }
13093
13094   if (OpVT == MVT::v1i64 &&
13095       Op.getOperand(0).getValueType() == MVT::i64)
13096     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
13097
13098   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
13099   assert(OpVT.is128BitVector() && "Expected an SSE type!");
13100   return DAG.getNode(ISD::BITCAST, dl, OpVT,
13101                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
13102 }
13103
13104 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
13105 // a simple subregister reference or explicit instructions to grab
13106 // upper bits of a vector.
13107 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13108                                       SelectionDAG &DAG) {
13109   SDLoc dl(Op);
13110   SDValue In =  Op.getOperand(0);
13111   SDValue Idx = Op.getOperand(1);
13112   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13113   MVT ResVT   = Op.getSimpleValueType();
13114   MVT InVT    = In.getSimpleValueType();
13115
13116   if (Subtarget->hasFp256()) {
13117     if (ResVT.is128BitVector() &&
13118         (InVT.is256BitVector() || InVT.is512BitVector()) &&
13119         isa<ConstantSDNode>(Idx)) {
13120       return Extract128BitVector(In, IdxVal, DAG, dl);
13121     }
13122     if (ResVT.is256BitVector() && InVT.is512BitVector() &&
13123         isa<ConstantSDNode>(Idx)) {
13124       return Extract256BitVector(In, IdxVal, DAG, dl);
13125     }
13126   }
13127   return SDValue();
13128 }
13129
13130 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
13131 // simple superregister reference or explicit instructions to insert
13132 // the upper bits of a vector.
13133 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13134                                      SelectionDAG &DAG) {
13135   if (Subtarget->hasFp256()) {
13136     SDLoc dl(Op.getNode());
13137     SDValue Vec = Op.getNode()->getOperand(0);
13138     SDValue SubVec = Op.getNode()->getOperand(1);
13139     SDValue Idx = Op.getNode()->getOperand(2);
13140
13141     if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
13142          Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
13143         SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
13144         isa<ConstantSDNode>(Idx)) {
13145       unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13146       return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
13147     }
13148
13149     if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
13150         SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
13151         isa<ConstantSDNode>(Idx)) {
13152       unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13153       return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
13154     }
13155   }
13156   return SDValue();
13157 }
13158
13159 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
13160 // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
13161 // one of the above mentioned nodes. It has to be wrapped because otherwise
13162 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
13163 // be used to form addressing mode. These wrapped nodes will be selected
13164 // into MOV32ri.
13165 SDValue
13166 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
13167   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
13168
13169   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13170   // global base reg.
13171   unsigned char OpFlag = 0;
13172   unsigned WrapperKind = X86ISD::Wrapper;
13173   CodeModel::Model M = DAG.getTarget().getCodeModel();
13174
13175   if (Subtarget->isPICStyleRIPRel() &&
13176       (M == CodeModel::Small || M == CodeModel::Kernel))
13177     WrapperKind = X86ISD::WrapperRIP;
13178   else if (Subtarget->isPICStyleGOT())
13179     OpFlag = X86II::MO_GOTOFF;
13180   else if (Subtarget->isPICStyleStubPIC())
13181     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13182
13183   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
13184                                              CP->getAlignment(),
13185                                              CP->getOffset(), OpFlag);
13186   SDLoc DL(CP);
13187   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13188   // With PIC, the address is actually $g + Offset.
13189   if (OpFlag) {
13190     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13191                          DAG.getNode(X86ISD::GlobalBaseReg,
13192                                      SDLoc(), getPointerTy()),
13193                          Result);
13194   }
13195
13196   return Result;
13197 }
13198
13199 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
13200   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
13201
13202   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13203   // global base reg.
13204   unsigned char OpFlag = 0;
13205   unsigned WrapperKind = X86ISD::Wrapper;
13206   CodeModel::Model M = DAG.getTarget().getCodeModel();
13207
13208   if (Subtarget->isPICStyleRIPRel() &&
13209       (M == CodeModel::Small || M == CodeModel::Kernel))
13210     WrapperKind = X86ISD::WrapperRIP;
13211   else if (Subtarget->isPICStyleGOT())
13212     OpFlag = X86II::MO_GOTOFF;
13213   else if (Subtarget->isPICStyleStubPIC())
13214     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13215
13216   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
13217                                           OpFlag);
13218   SDLoc DL(JT);
13219   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13220
13221   // With PIC, the address is actually $g + Offset.
13222   if (OpFlag)
13223     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13224                          DAG.getNode(X86ISD::GlobalBaseReg,
13225                                      SDLoc(), getPointerTy()),
13226                          Result);
13227
13228   return Result;
13229 }
13230
13231 SDValue
13232 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
13233   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
13234
13235   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13236   // global base reg.
13237   unsigned char OpFlag = 0;
13238   unsigned WrapperKind = X86ISD::Wrapper;
13239   CodeModel::Model M = DAG.getTarget().getCodeModel();
13240
13241   if (Subtarget->isPICStyleRIPRel() &&
13242       (M == CodeModel::Small || M == CodeModel::Kernel)) {
13243     if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
13244       OpFlag = X86II::MO_GOTPCREL;
13245     WrapperKind = X86ISD::WrapperRIP;
13246   } else if (Subtarget->isPICStyleGOT()) {
13247     OpFlag = X86II::MO_GOT;
13248   } else if (Subtarget->isPICStyleStubPIC()) {
13249     OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
13250   } else if (Subtarget->isPICStyleStubNoDynamic()) {
13251     OpFlag = X86II::MO_DARWIN_NONLAZY;
13252   }
13253
13254   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
13255
13256   SDLoc DL(Op);
13257   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13258
13259   // With PIC, the address is actually $g + Offset.
13260   if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
13261       !Subtarget->is64Bit()) {
13262     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13263                          DAG.getNode(X86ISD::GlobalBaseReg,
13264                                      SDLoc(), getPointerTy()),
13265                          Result);
13266   }
13267
13268   // For symbols that require a load from a stub to get the address, emit the
13269   // load.
13270   if (isGlobalStubReference(OpFlag))
13271     Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
13272                          MachinePointerInfo::getGOT(), false, false, false, 0);
13273
13274   return Result;
13275 }
13276
13277 SDValue
13278 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
13279   // Create the TargetBlockAddressAddress node.
13280   unsigned char OpFlags =
13281     Subtarget->ClassifyBlockAddressReference();
13282   CodeModel::Model M = DAG.getTarget().getCodeModel();
13283   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
13284   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
13285   SDLoc dl(Op);
13286   SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
13287                                              OpFlags);
13288
13289   if (Subtarget->isPICStyleRIPRel() &&
13290       (M == CodeModel::Small || M == CodeModel::Kernel))
13291     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13292   else
13293     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13294
13295   // With PIC, the address is actually $g + Offset.
13296   if (isGlobalRelativeToPICBase(OpFlags)) {
13297     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13298                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13299                          Result);
13300   }
13301
13302   return Result;
13303 }
13304
13305 SDValue
13306 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
13307                                       int64_t Offset, SelectionDAG &DAG) const {
13308   // Create the TargetGlobalAddress node, folding in the constant
13309   // offset if it is legal.
13310   unsigned char OpFlags =
13311       Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
13312   CodeModel::Model M = DAG.getTarget().getCodeModel();
13313   SDValue Result;
13314   if (OpFlags == X86II::MO_NO_FLAG &&
13315       X86::isOffsetSuitableForCodeModel(Offset, M)) {
13316     // A direct static reference to a global.
13317     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
13318     Offset = 0;
13319   } else {
13320     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
13321   }
13322
13323   if (Subtarget->isPICStyleRIPRel() &&
13324       (M == CodeModel::Small || M == CodeModel::Kernel))
13325     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13326   else
13327     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13328
13329   // With PIC, the address is actually $g + Offset.
13330   if (isGlobalRelativeToPICBase(OpFlags)) {
13331     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13332                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13333                          Result);
13334   }
13335
13336   // For globals that require a load from a stub to get the address, emit the
13337   // load.
13338   if (isGlobalStubReference(OpFlags))
13339     Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
13340                          MachinePointerInfo::getGOT(), false, false, false, 0);
13341
13342   // If there was a non-zero offset that we didn't fold, create an explicit
13343   // addition for it.
13344   if (Offset != 0)
13345     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
13346                          DAG.getConstant(Offset, getPointerTy()));
13347
13348   return Result;
13349 }
13350
13351 SDValue
13352 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
13353   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
13354   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
13355   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
13356 }
13357
13358 static SDValue
13359 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
13360            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
13361            unsigned char OperandFlags, bool LocalDynamic = false) {
13362   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13363   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13364   SDLoc dl(GA);
13365   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13366                                            GA->getValueType(0),
13367                                            GA->getOffset(),
13368                                            OperandFlags);
13369
13370   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
13371                                            : X86ISD::TLSADDR;
13372
13373   if (InFlag) {
13374     SDValue Ops[] = { Chain,  TGA, *InFlag };
13375     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13376   } else {
13377     SDValue Ops[]  = { Chain, TGA };
13378     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13379   }
13380
13381   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
13382   MFI->setAdjustsStack(true);
13383   MFI->setHasCalls(true);
13384
13385   SDValue Flag = Chain.getValue(1);
13386   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
13387 }
13388
13389 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
13390 static SDValue
13391 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13392                                 const EVT PtrVT) {
13393   SDValue InFlag;
13394   SDLoc dl(GA);  // ? function entry point might be better
13395   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13396                                    DAG.getNode(X86ISD::GlobalBaseReg,
13397                                                SDLoc(), PtrVT), InFlag);
13398   InFlag = Chain.getValue(1);
13399
13400   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
13401 }
13402
13403 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
13404 static SDValue
13405 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13406                                 const EVT PtrVT) {
13407   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
13408                     X86::RAX, X86II::MO_TLSGD);
13409 }
13410
13411 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
13412                                            SelectionDAG &DAG,
13413                                            const EVT PtrVT,
13414                                            bool is64Bit) {
13415   SDLoc dl(GA);
13416
13417   // Get the start address of the TLS block for this module.
13418   X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
13419       .getInfo<X86MachineFunctionInfo>();
13420   MFI->incNumLocalDynamicTLSAccesses();
13421
13422   SDValue Base;
13423   if (is64Bit) {
13424     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
13425                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
13426   } else {
13427     SDValue InFlag;
13428     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13429         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
13430     InFlag = Chain.getValue(1);
13431     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
13432                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
13433   }
13434
13435   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
13436   // of Base.
13437
13438   // Build x@dtpoff.
13439   unsigned char OperandFlags = X86II::MO_DTPOFF;
13440   unsigned WrapperKind = X86ISD::Wrapper;
13441   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13442                                            GA->getValueType(0),
13443                                            GA->getOffset(), OperandFlags);
13444   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13445
13446   // Add x@dtpoff with the base.
13447   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
13448 }
13449
13450 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
13451 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13452                                    const EVT PtrVT, TLSModel::Model model,
13453                                    bool is64Bit, bool isPIC) {
13454   SDLoc dl(GA);
13455
13456   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
13457   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
13458                                                          is64Bit ? 257 : 256));
13459
13460   SDValue ThreadPointer =
13461       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
13462                   MachinePointerInfo(Ptr), false, false, false, 0);
13463
13464   unsigned char OperandFlags = 0;
13465   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
13466   // initialexec.
13467   unsigned WrapperKind = X86ISD::Wrapper;
13468   if (model == TLSModel::LocalExec) {
13469     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
13470   } else if (model == TLSModel::InitialExec) {
13471     if (is64Bit) {
13472       OperandFlags = X86II::MO_GOTTPOFF;
13473       WrapperKind = X86ISD::WrapperRIP;
13474     } else {
13475       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
13476     }
13477   } else {
13478     llvm_unreachable("Unexpected model");
13479   }
13480
13481   // emit "addl x@ntpoff,%eax" (local exec)
13482   // or "addl x@indntpoff,%eax" (initial exec)
13483   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
13484   SDValue TGA =
13485       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
13486                                  GA->getOffset(), OperandFlags);
13487   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13488
13489   if (model == TLSModel::InitialExec) {
13490     if (isPIC && !is64Bit) {
13491       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
13492                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13493                            Offset);
13494     }
13495
13496     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
13497                          MachinePointerInfo::getGOT(), false, false, false, 0);
13498   }
13499
13500   // The address of the thread local variable is the add of the thread
13501   // pointer with the offset of the variable.
13502   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
13503 }
13504
13505 SDValue
13506 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
13507
13508   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
13509   const GlobalValue *GV = GA->getGlobal();
13510
13511   if (Subtarget->isTargetELF()) {
13512     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
13513
13514     switch (model) {
13515       case TLSModel::GeneralDynamic:
13516         if (Subtarget->is64Bit())
13517           return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
13518         return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
13519       case TLSModel::LocalDynamic:
13520         return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
13521                                            Subtarget->is64Bit());
13522       case TLSModel::InitialExec:
13523       case TLSModel::LocalExec:
13524         return LowerToTLSExecModel(
13525             GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
13526             DAG.getTarget().getRelocationModel() == Reloc::PIC_);
13527     }
13528     llvm_unreachable("Unknown TLS model.");
13529   }
13530
13531   if (Subtarget->isTargetDarwin()) {
13532     // Darwin only has one model of TLS.  Lower to that.
13533     unsigned char OpFlag = 0;
13534     unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
13535                            X86ISD::WrapperRIP : X86ISD::Wrapper;
13536
13537     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13538     // global base reg.
13539     bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
13540                  !Subtarget->is64Bit();
13541     if (PIC32)
13542       OpFlag = X86II::MO_TLVP_PIC_BASE;
13543     else
13544       OpFlag = X86II::MO_TLVP;
13545     SDLoc DL(Op);
13546     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
13547                                                 GA->getValueType(0),
13548                                                 GA->getOffset(), OpFlag);
13549     SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13550
13551     // With PIC32, the address is actually $g + Offset.
13552     if (PIC32)
13553       Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13554                            DAG.getNode(X86ISD::GlobalBaseReg,
13555                                        SDLoc(), getPointerTy()),
13556                            Offset);
13557
13558     // Lowering the machine isd will make sure everything is in the right
13559     // location.
13560     SDValue Chain = DAG.getEntryNode();
13561     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13562     SDValue Args[] = { Chain, Offset };
13563     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
13564
13565     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
13566     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13567     MFI->setAdjustsStack(true);
13568
13569     // And our return value (tls address) is in the standard call return value
13570     // location.
13571     unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
13572     return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
13573                               Chain.getValue(1));
13574   }
13575
13576   if (Subtarget->isTargetKnownWindowsMSVC() ||
13577       Subtarget->isTargetWindowsGNU()) {
13578     // Just use the implicit TLS architecture
13579     // Need to generate someting similar to:
13580     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
13581     //                                  ; from TEB
13582     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
13583     //   mov     rcx, qword [rdx+rcx*8]
13584     //   mov     eax, .tls$:tlsvar
13585     //   [rax+rcx] contains the address
13586     // Windows 64bit: gs:0x58
13587     // Windows 32bit: fs:__tls_array
13588
13589     SDLoc dl(GA);
13590     SDValue Chain = DAG.getEntryNode();
13591
13592     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
13593     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
13594     // use its literal value of 0x2C.
13595     Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
13596                                         ? Type::getInt8PtrTy(*DAG.getContext(),
13597                                                              256)
13598                                         : Type::getInt32PtrTy(*DAG.getContext(),
13599                                                               257));
13600
13601     SDValue TlsArray =
13602         Subtarget->is64Bit()
13603             ? DAG.getIntPtrConstant(0x58)
13604             : (Subtarget->isTargetWindowsGNU()
13605                    ? DAG.getIntPtrConstant(0x2C)
13606                    : DAG.getExternalSymbol("_tls_array", getPointerTy()));
13607
13608     SDValue ThreadPointer =
13609         DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
13610                     MachinePointerInfo(Ptr), false, false, false, 0);
13611
13612     // Load the _tls_index variable
13613     SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
13614     if (Subtarget->is64Bit())
13615       IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
13616                            IDX, MachinePointerInfo(), MVT::i32,
13617                            false, false, false, 0);
13618     else
13619       IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
13620                         false, false, false, 0);
13621
13622     SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
13623                                     getPointerTy());
13624     IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
13625
13626     SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
13627     res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
13628                       false, false, false, 0);
13629
13630     // Get the offset of start of .tls section
13631     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13632                                              GA->getValueType(0),
13633                                              GA->getOffset(), X86II::MO_SECREL);
13634     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
13635
13636     // The address of the thread local variable is the add of the thread
13637     // pointer with the offset of the variable.
13638     return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
13639   }
13640
13641   llvm_unreachable("TLS not implemented for this target.");
13642 }
13643
13644 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
13645 /// and take a 2 x i32 value to shift plus a shift amount.
13646 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
13647   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
13648   MVT VT = Op.getSimpleValueType();
13649   unsigned VTBits = VT.getSizeInBits();
13650   SDLoc dl(Op);
13651   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
13652   SDValue ShOpLo = Op.getOperand(0);
13653   SDValue ShOpHi = Op.getOperand(1);
13654   SDValue ShAmt  = Op.getOperand(2);
13655   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
13656   // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
13657   // during isel.
13658   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13659                                   DAG.getConstant(VTBits - 1, MVT::i8));
13660   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
13661                                      DAG.getConstant(VTBits - 1, MVT::i8))
13662                        : DAG.getConstant(0, VT);
13663
13664   SDValue Tmp2, Tmp3;
13665   if (Op.getOpcode() == ISD::SHL_PARTS) {
13666     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
13667     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
13668   } else {
13669     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
13670     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
13671   }
13672
13673   // If the shift amount is larger or equal than the width of a part we can't
13674   // rely on the results of shld/shrd. Insert a test and select the appropriate
13675   // values for large shift amounts.
13676   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13677                                 DAG.getConstant(VTBits, MVT::i8));
13678   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
13679                              AndNode, DAG.getConstant(0, MVT::i8));
13680
13681   SDValue Hi, Lo;
13682   SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
13683   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
13684   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
13685
13686   if (Op.getOpcode() == ISD::SHL_PARTS) {
13687     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13688     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13689   } else {
13690     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13691     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13692   }
13693
13694   SDValue Ops[2] = { Lo, Hi };
13695   return DAG.getMergeValues(Ops, dl);
13696 }
13697
13698 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
13699                                            SelectionDAG &DAG) const {
13700   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
13701   SDLoc dl(Op);
13702
13703   if (SrcVT.isVector()) {
13704     if (SrcVT.getVectorElementType() == MVT::i1) {
13705       MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
13706       return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13707                          DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
13708                                      Op.getOperand(0)));
13709     }
13710     return SDValue();
13711   }
13712
13713   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
13714          "Unknown SINT_TO_FP to lower!");
13715
13716   // These are really Legal; return the operand so the caller accepts it as
13717   // Legal.
13718   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
13719     return Op;
13720   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13721       Subtarget->is64Bit()) {
13722     return Op;
13723   }
13724
13725   unsigned Size = SrcVT.getSizeInBits()/8;
13726   MachineFunction &MF = DAG.getMachineFunction();
13727   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
13728   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
13729   SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
13730                                StackSlot,
13731                                MachinePointerInfo::getFixedStack(SSFI),
13732                                false, false, 0);
13733   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
13734 }
13735
13736 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
13737                                      SDValue StackSlot,
13738                                      SelectionDAG &DAG) const {
13739   // Build the FILD
13740   SDLoc DL(Op);
13741   SDVTList Tys;
13742   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
13743   if (useSSE)
13744     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
13745   else
13746     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
13747
13748   unsigned ByteSize = SrcVT.getSizeInBits()/8;
13749
13750   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
13751   MachineMemOperand *MMO;
13752   if (FI) {
13753     int SSFI = FI->getIndex();
13754     MMO =
13755       DAG.getMachineFunction()
13756       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
13757                             MachineMemOperand::MOLoad, ByteSize, ByteSize);
13758   } else {
13759     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
13760     StackSlot = StackSlot.getOperand(1);
13761   }
13762   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
13763   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
13764                                            X86ISD::FILD, DL,
13765                                            Tys, Ops, SrcVT, MMO);
13766
13767   if (useSSE) {
13768     Chain = Result.getValue(1);
13769     SDValue InFlag = Result.getValue(2);
13770
13771     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
13772     // shouldn't be necessary except that RFP cannot be live across
13773     // multiple blocks. When stackifier is fixed, they can be uncoupled.
13774     MachineFunction &MF = DAG.getMachineFunction();
13775     unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
13776     int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
13777     SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
13778     Tys = DAG.getVTList(MVT::Other);
13779     SDValue Ops[] = {
13780       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
13781     };
13782     MachineMemOperand *MMO =
13783       DAG.getMachineFunction()
13784       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
13785                             MachineMemOperand::MOStore, SSFISize, SSFISize);
13786
13787     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
13788                                     Ops, Op.getValueType(), MMO);
13789     Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
13790                          MachinePointerInfo::getFixedStack(SSFI),
13791                          false, false, false, 0);
13792   }
13793
13794   return Result;
13795 }
13796
13797 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
13798 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
13799                                                SelectionDAG &DAG) const {
13800   // This algorithm is not obvious. Here it is what we're trying to output:
13801   /*
13802      movq       %rax,  %xmm0
13803      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
13804      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
13805      #ifdef __SSE3__
13806        haddpd   %xmm0, %xmm0
13807      #else
13808        pshufd   $0x4e, %xmm0, %xmm1
13809        addpd    %xmm1, %xmm0
13810      #endif
13811   */
13812
13813   SDLoc dl(Op);
13814   LLVMContext *Context = DAG.getContext();
13815
13816   // Build some magic constants.
13817   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
13818   Constant *C0 = ConstantDataVector::get(*Context, CV0);
13819   SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
13820
13821   SmallVector<Constant*,2> CV1;
13822   CV1.push_back(
13823     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13824                                       APInt(64, 0x4330000000000000ULL))));
13825   CV1.push_back(
13826     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13827                                       APInt(64, 0x4530000000000000ULL))));
13828   Constant *C1 = ConstantVector::get(CV1);
13829   SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
13830
13831   // Load the 64-bit value into an XMM register.
13832   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
13833                             Op.getOperand(0));
13834   SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
13835                               MachinePointerInfo::getConstantPool(),
13836                               false, false, false, 16);
13837   SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
13838                               DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
13839                               CLod0);
13840
13841   SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
13842                               MachinePointerInfo::getConstantPool(),
13843                               false, false, false, 16);
13844   SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
13845   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
13846   SDValue Result;
13847
13848   if (Subtarget->hasSSE3()) {
13849     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
13850     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
13851   } else {
13852     SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
13853     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
13854                                            S2F, 0x4E, DAG);
13855     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
13856                          DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
13857                          Sub);
13858   }
13859
13860   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
13861                      DAG.getIntPtrConstant(0));
13862 }
13863
13864 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
13865 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
13866                                                SelectionDAG &DAG) const {
13867   SDLoc dl(Op);
13868   // FP constant to bias correct the final result.
13869   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
13870                                    MVT::f64);
13871
13872   // Load the 32-bit value into an XMM register.
13873   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
13874                              Op.getOperand(0));
13875
13876   // Zero out the upper parts of the register.
13877   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
13878
13879   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13880                      DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
13881                      DAG.getIntPtrConstant(0));
13882
13883   // Or the load with the bias.
13884   SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
13885                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
13886                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
13887                                                    MVT::v2f64, Load)),
13888                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
13889                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
13890                                                    MVT::v2f64, Bias)));
13891   Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13892                    DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
13893                    DAG.getIntPtrConstant(0));
13894
13895   // Subtract the bias.
13896   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
13897
13898   // Handle final rounding.
13899   EVT DestVT = Op.getValueType();
13900
13901   if (DestVT.bitsLT(MVT::f64))
13902     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
13903                        DAG.getIntPtrConstant(0));
13904   if (DestVT.bitsGT(MVT::f64))
13905     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
13906
13907   // Handle final rounding.
13908   return Sub;
13909 }
13910
13911 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
13912                                      const X86Subtarget &Subtarget) {
13913   // The algorithm is the following:
13914   // #ifdef __SSE4_1__
13915   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
13916   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
13917   //                                 (uint4) 0x53000000, 0xaa);
13918   // #else
13919   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
13920   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
13921   // #endif
13922   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
13923   //     return (float4) lo + fhi;
13924
13925   SDLoc DL(Op);
13926   SDValue V = Op->getOperand(0);
13927   EVT VecIntVT = V.getValueType();
13928   bool Is128 = VecIntVT == MVT::v4i32;
13929   EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
13930   // If we convert to something else than the supported type, e.g., to v4f64,
13931   // abort early.
13932   if (VecFloatVT != Op->getValueType(0))
13933     return SDValue();
13934
13935   unsigned NumElts = VecIntVT.getVectorNumElements();
13936   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
13937          "Unsupported custom type");
13938   assert(NumElts <= 8 && "The size of the constant array must be fixed");
13939
13940   // In the #idef/#else code, we have in common:
13941   // - The vector of constants:
13942   // -- 0x4b000000
13943   // -- 0x53000000
13944   // - A shift:
13945   // -- v >> 16
13946
13947   // Create the splat vector for 0x4b000000.
13948   SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32);
13949   SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
13950                            CstLow, CstLow, CstLow, CstLow};
13951   SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
13952                                   makeArrayRef(&CstLowArray[0], NumElts));
13953   // Create the splat vector for 0x53000000.
13954   SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32);
13955   SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
13956                             CstHigh, CstHigh, CstHigh, CstHigh};
13957   SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
13958                                    makeArrayRef(&CstHighArray[0], NumElts));
13959
13960   // Create the right shift.
13961   SDValue CstShift = DAG.getConstant(16, MVT::i32);
13962   SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
13963                              CstShift, CstShift, CstShift, CstShift};
13964   SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
13965                                     makeArrayRef(&CstShiftArray[0], NumElts));
13966   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
13967
13968   SDValue Low, High;
13969   if (Subtarget.hasSSE41()) {
13970     EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
13971     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
13972     SDValue VecCstLowBitcast =
13973         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
13974     SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
13975     // Low will be bitcasted right away, so do not bother bitcasting back to its
13976     // original type.
13977     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
13978                       VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32));
13979     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
13980     //                                 (uint4) 0x53000000, 0xaa);
13981     SDValue VecCstHighBitcast =
13982         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
13983     SDValue VecShiftBitcast =
13984         DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
13985     // High will be bitcasted right away, so do not bother bitcasting back to
13986     // its original type.
13987     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
13988                        VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32));
13989   } else {
13990     SDValue CstMask = DAG.getConstant(0xffff, MVT::i32);
13991     SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
13992                                      CstMask, CstMask, CstMask);
13993     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
13994     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
13995     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
13996
13997     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
13998     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
13999   }
14000
14001   // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
14002   SDValue CstFAdd = DAG.getConstantFP(
14003       APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32);
14004   SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
14005                             CstFAdd, CstFAdd, CstFAdd, CstFAdd};
14006   SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
14007                                    makeArrayRef(&CstFAddArray[0], NumElts));
14008
14009   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14010   SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
14011   SDValue FHigh =
14012       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
14013   //     return (float4) lo + fhi;
14014   SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
14015   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
14016 }
14017
14018 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
14019                                                SelectionDAG &DAG) const {
14020   SDValue N0 = Op.getOperand(0);
14021   MVT SVT = N0.getSimpleValueType();
14022   SDLoc dl(Op);
14023
14024   switch (SVT.SimpleTy) {
14025   default:
14026     llvm_unreachable("Custom UINT_TO_FP is not supported!");
14027   case MVT::v4i8:
14028   case MVT::v4i16:
14029   case MVT::v8i8:
14030   case MVT::v8i16: {
14031     MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
14032     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14033                        DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
14034   }
14035   case MVT::v4i32:
14036   case MVT::v8i32:
14037     return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
14038   }
14039   llvm_unreachable(nullptr);
14040 }
14041
14042 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
14043                                            SelectionDAG &DAG) const {
14044   SDValue N0 = Op.getOperand(0);
14045   SDLoc dl(Op);
14046
14047   if (Op.getValueType().isVector())
14048     return lowerUINT_TO_FP_vec(Op, DAG);
14049
14050   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
14051   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
14052   // the optimization here.
14053   if (DAG.SignBitIsZero(N0))
14054     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
14055
14056   MVT SrcVT = N0.getSimpleValueType();
14057   MVT DstVT = Op.getSimpleValueType();
14058   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
14059     return LowerUINT_TO_FP_i64(Op, DAG);
14060   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
14061     return LowerUINT_TO_FP_i32(Op, DAG);
14062   if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
14063     return SDValue();
14064
14065   // Make a 64-bit buffer, and use it to build an FILD.
14066   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
14067   if (SrcVT == MVT::i32) {
14068     SDValue WordOff = DAG.getConstant(4, getPointerTy());
14069     SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
14070                                      getPointerTy(), StackSlot, WordOff);
14071     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14072                                   StackSlot, MachinePointerInfo(),
14073                                   false, false, 0);
14074     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
14075                                   OffsetSlot, MachinePointerInfo(),
14076                                   false, false, 0);
14077     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
14078     return Fild;
14079   }
14080
14081   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
14082   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14083                                StackSlot, MachinePointerInfo(),
14084                                false, false, 0);
14085   // For i64 source, we need to add the appropriate power of 2 if the input
14086   // was negative.  This is the same as the optimization in
14087   // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
14088   // we must be careful to do the computation in x87 extended precision, not
14089   // in SSE. (The generic code can't know it's OK to do this, or how to.)
14090   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
14091   MachineMemOperand *MMO =
14092     DAG.getMachineFunction()
14093     .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14094                           MachineMemOperand::MOLoad, 8, 8);
14095
14096   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
14097   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
14098   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
14099                                          MVT::i64, MMO);
14100
14101   APInt FF(32, 0x5F800000ULL);
14102
14103   // Check whether the sign bit is set.
14104   SDValue SignSet = DAG.getSetCC(dl,
14105                                  getSetCCResultType(*DAG.getContext(), MVT::i64),
14106                                  Op.getOperand(0), DAG.getConstant(0, MVT::i64),
14107                                  ISD::SETLT);
14108
14109   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
14110   SDValue FudgePtr = DAG.getConstantPool(
14111                              ConstantInt::get(*DAG.getContext(), FF.zext(64)),
14112                                          getPointerTy());
14113
14114   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
14115   SDValue Zero = DAG.getIntPtrConstant(0);
14116   SDValue Four = DAG.getIntPtrConstant(4);
14117   SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
14118                                Zero, Four);
14119   FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
14120
14121   // Load the value out, extending it from f32 to f80.
14122   // FIXME: Avoid the extend by constructing the right constant pool?
14123   SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
14124                                  FudgePtr, MachinePointerInfo::getConstantPool(),
14125                                  MVT::f32, false, false, false, 4);
14126   // Extend everything to 80 bits to force it to be done on x87.
14127   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
14128   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
14129 }
14130
14131 std::pair<SDValue,SDValue>
14132 X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
14133                                     bool IsSigned, bool IsReplace) const {
14134   SDLoc DL(Op);
14135
14136   EVT DstTy = Op.getValueType();
14137
14138   if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
14139     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
14140     DstTy = MVT::i64;
14141   }
14142
14143   assert(DstTy.getSimpleVT() <= MVT::i64 &&
14144          DstTy.getSimpleVT() >= MVT::i16 &&
14145          "Unknown FP_TO_INT to lower!");
14146
14147   // These are really Legal.
14148   if (DstTy == MVT::i32 &&
14149       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14150     return std::make_pair(SDValue(), SDValue());
14151   if (Subtarget->is64Bit() &&
14152       DstTy == MVT::i64 &&
14153       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14154     return std::make_pair(SDValue(), SDValue());
14155
14156   // We lower FP->int64 either into FISTP64 followed by a load from a temporary
14157   // stack slot, or into the FTOL runtime function.
14158   MachineFunction &MF = DAG.getMachineFunction();
14159   unsigned MemSize = DstTy.getSizeInBits()/8;
14160   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14161   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14162
14163   unsigned Opc;
14164   if (!IsSigned && isIntegerTypeFTOL(DstTy))
14165     Opc = X86ISD::WIN_FTOL;
14166   else
14167     switch (DstTy.getSimpleVT().SimpleTy) {
14168     default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
14169     case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
14170     case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
14171     case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
14172     }
14173
14174   SDValue Chain = DAG.getEntryNode();
14175   SDValue Value = Op.getOperand(0);
14176   EVT TheVT = Op.getOperand(0).getValueType();
14177   // FIXME This causes a redundant load/store if the SSE-class value is already
14178   // in memory, such as if it is on the callstack.
14179   if (isScalarFPTypeInSSEReg(TheVT)) {
14180     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
14181     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
14182                          MachinePointerInfo::getFixedStack(SSFI),
14183                          false, false, 0);
14184     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
14185     SDValue Ops[] = {
14186       Chain, StackSlot, DAG.getValueType(TheVT)
14187     };
14188
14189     MachineMemOperand *MMO =
14190       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14191                               MachineMemOperand::MOLoad, MemSize, MemSize);
14192     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
14193     Chain = Value.getValue(1);
14194     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14195     StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14196   }
14197
14198   MachineMemOperand *MMO =
14199     MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14200                             MachineMemOperand::MOStore, MemSize, MemSize);
14201
14202   if (Opc != X86ISD::WIN_FTOL) {
14203     // Build the FP_TO_INT*_IN_MEM
14204     SDValue Ops[] = { Chain, Value, StackSlot };
14205     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
14206                                            Ops, DstTy, MMO);
14207     return std::make_pair(FIST, StackSlot);
14208   } else {
14209     SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
14210       DAG.getVTList(MVT::Other, MVT::Glue),
14211       Chain, Value);
14212     SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
14213       MVT::i32, ftol.getValue(1));
14214     SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
14215       MVT::i32, eax.getValue(2));
14216     SDValue Ops[] = { eax, edx };
14217     SDValue pair = IsReplace
14218       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
14219       : DAG.getMergeValues(Ops, DL);
14220     return std::make_pair(pair, SDValue());
14221   }
14222 }
14223
14224 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
14225                               const X86Subtarget *Subtarget) {
14226   MVT VT = Op->getSimpleValueType(0);
14227   SDValue In = Op->getOperand(0);
14228   MVT InVT = In.getSimpleValueType();
14229   SDLoc dl(Op);
14230
14231   // Optimize vectors in AVX mode:
14232   //
14233   //   v8i16 -> v8i32
14234   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
14235   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
14236   //   Concat upper and lower parts.
14237   //
14238   //   v4i32 -> v4i64
14239   //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
14240   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
14241   //   Concat upper and lower parts.
14242   //
14243
14244   if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
14245       ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
14246       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
14247     return SDValue();
14248
14249   if (Subtarget->hasInt256())
14250     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
14251
14252   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
14253   SDValue Undef = DAG.getUNDEF(InVT);
14254   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
14255   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14256   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14257
14258   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
14259                              VT.getVectorNumElements()/2);
14260
14261   OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
14262   OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
14263
14264   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14265 }
14266
14267 static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
14268                                         SelectionDAG &DAG) {
14269   MVT VT = Op->getSimpleValueType(0);
14270   SDValue In = Op->getOperand(0);
14271   MVT InVT = In.getSimpleValueType();
14272   SDLoc DL(Op);
14273   unsigned int NumElts = VT.getVectorNumElements();
14274   if (NumElts != 8 && NumElts != 16)
14275     return SDValue();
14276
14277   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
14278     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
14279
14280   EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
14281   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14282   // Now we have only mask extension
14283   assert(InVT.getVectorElementType() == MVT::i1);
14284   SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
14285   const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14286   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
14287   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14288   SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14289                            MachinePointerInfo::getConstantPool(),
14290                            false, false, false, Alignment);
14291
14292   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
14293   if (VT.is512BitVector())
14294     return Brcst;
14295   return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
14296 }
14297
14298 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14299                                SelectionDAG &DAG) {
14300   if (Subtarget->hasFp256()) {
14301     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14302     if (Res.getNode())
14303       return Res;
14304   }
14305
14306   return SDValue();
14307 }
14308
14309 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14310                                 SelectionDAG &DAG) {
14311   SDLoc DL(Op);
14312   MVT VT = Op.getSimpleValueType();
14313   SDValue In = Op.getOperand(0);
14314   MVT SVT = In.getSimpleValueType();
14315
14316   if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
14317     return LowerZERO_EXTEND_AVX512(Op, DAG);
14318
14319   if (Subtarget->hasFp256()) {
14320     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14321     if (Res.getNode())
14322       return Res;
14323   }
14324
14325   assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
14326          VT.getVectorNumElements() != SVT.getVectorNumElements());
14327   return SDValue();
14328 }
14329
14330 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
14331   SDLoc DL(Op);
14332   MVT VT = Op.getSimpleValueType();
14333   SDValue In = Op.getOperand(0);
14334   MVT InVT = In.getSimpleValueType();
14335
14336   if (VT == MVT::i1) {
14337     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
14338            "Invalid scalar TRUNCATE operation");
14339     if (InVT.getSizeInBits() >= 32)
14340       return SDValue();
14341     In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
14342     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
14343   }
14344   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
14345          "Invalid TRUNCATE operation");
14346
14347   if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
14348     if (VT.getVectorElementType().getSizeInBits() >=8)
14349       return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
14350
14351     assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
14352     unsigned NumElts = InVT.getVectorNumElements();
14353     assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
14354     if (InVT.getSizeInBits() < 512) {
14355       MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
14356       In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
14357       InVT = ExtVT;
14358     }
14359
14360     SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
14361     const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14362     SDValue CP = DAG.getConstantPool(C, getPointerTy());
14363     unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14364     SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14365                            MachinePointerInfo::getConstantPool(),
14366                            false, false, false, Alignment);
14367     SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
14368     SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
14369     return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
14370   }
14371
14372   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
14373     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
14374     if (Subtarget->hasInt256()) {
14375       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
14376       In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
14377       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
14378                                 ShufMask);
14379       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
14380                          DAG.getIntPtrConstant(0));
14381     }
14382
14383     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14384                                DAG.getIntPtrConstant(0));
14385     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14386                                DAG.getIntPtrConstant(2));
14387     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14388     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14389     static const int ShufMask[] = {0, 2, 4, 6};
14390     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
14391   }
14392
14393   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
14394     // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
14395     if (Subtarget->hasInt256()) {
14396       In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
14397
14398       SmallVector<SDValue,32> pshufbMask;
14399       for (unsigned i = 0; i < 2; ++i) {
14400         pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
14401         pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
14402         pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
14403         pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
14404         pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
14405         pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
14406         pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
14407         pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
14408         for (unsigned j = 0; j < 8; ++j)
14409           pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
14410       }
14411       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
14412       In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
14413       In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
14414
14415       static const int ShufMask[] = {0,  2,  -1,  -1};
14416       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
14417                                 &ShufMask[0]);
14418       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14419                        DAG.getIntPtrConstant(0));
14420       return DAG.getNode(ISD::BITCAST, DL, VT, In);
14421     }
14422
14423     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14424                                DAG.getIntPtrConstant(0));
14425
14426     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14427                                DAG.getIntPtrConstant(4));
14428
14429     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
14430     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
14431
14432     // The PSHUFB mask:
14433     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
14434                                    -1, -1, -1, -1, -1, -1, -1, -1};
14435
14436     SDValue Undef = DAG.getUNDEF(MVT::v16i8);
14437     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
14438     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
14439
14440     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14441     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14442
14443     // The MOVLHPS Mask:
14444     static const int ShufMask2[] = {0, 1, 4, 5};
14445     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
14446     return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
14447   }
14448
14449   // Handle truncation of V256 to V128 using shuffles.
14450   if (!VT.is128BitVector() || !InVT.is256BitVector())
14451     return SDValue();
14452
14453   assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
14454
14455   unsigned NumElems = VT.getVectorNumElements();
14456   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
14457
14458   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
14459   // Prepare truncation shuffle mask
14460   for (unsigned i = 0; i != NumElems; ++i)
14461     MaskVec[i] = i * 2;
14462   SDValue V = DAG.getVectorShuffle(NVT, DL,
14463                                    DAG.getNode(ISD::BITCAST, DL, NVT, In),
14464                                    DAG.getUNDEF(NVT), &MaskVec[0]);
14465   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
14466                      DAG.getIntPtrConstant(0));
14467 }
14468
14469 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
14470                                            SelectionDAG &DAG) const {
14471   assert(!Op.getSimpleValueType().isVector());
14472
14473   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14474     /*IsSigned=*/ true, /*IsReplace=*/ false);
14475   SDValue FIST = Vals.first, StackSlot = Vals.second;
14476   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14477   if (!FIST.getNode()) return Op;
14478
14479   if (StackSlot.getNode())
14480     // Load the result.
14481     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14482                        FIST, StackSlot, MachinePointerInfo(),
14483                        false, false, false, 0);
14484
14485   // The node is the result.
14486   return FIST;
14487 }
14488
14489 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
14490                                            SelectionDAG &DAG) const {
14491   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14492     /*IsSigned=*/ false, /*IsReplace=*/ false);
14493   SDValue FIST = Vals.first, StackSlot = Vals.second;
14494   assert(FIST.getNode() && "Unexpected failure");
14495
14496   if (StackSlot.getNode())
14497     // Load the result.
14498     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14499                        FIST, StackSlot, MachinePointerInfo(),
14500                        false, false, false, 0);
14501
14502   // The node is the result.
14503   return FIST;
14504 }
14505
14506 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
14507   SDLoc DL(Op);
14508   MVT VT = Op.getSimpleValueType();
14509   SDValue In = Op.getOperand(0);
14510   MVT SVT = In.getSimpleValueType();
14511
14512   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
14513
14514   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
14515                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
14516                                  In, DAG.getUNDEF(SVT)));
14517 }
14518
14519 /// The only differences between FABS and FNEG are the mask and the logic op.
14520 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
14521 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
14522   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
14523          "Wrong opcode for lowering FABS or FNEG.");
14524
14525   bool IsFABS = (Op.getOpcode() == ISD::FABS);
14526
14527   // If this is a FABS and it has an FNEG user, bail out to fold the combination
14528   // into an FNABS. We'll lower the FABS after that if it is still in use.
14529   if (IsFABS)
14530     for (SDNode *User : Op->uses())
14531       if (User->getOpcode() == ISD::FNEG)
14532         return Op;
14533
14534   SDValue Op0 = Op.getOperand(0);
14535   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
14536
14537   SDLoc dl(Op);
14538   MVT VT = Op.getSimpleValueType();
14539   // Assume scalar op for initialization; update for vector if needed.
14540   // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
14541   // generate a 16-byte vector constant and logic op even for the scalar case.
14542   // Using a 16-byte mask allows folding the load of the mask with
14543   // the logic op, so it can save (~4 bytes) on code size.
14544   MVT EltVT = VT;
14545   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
14546   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
14547   // decide if we should generate a 16-byte constant mask when we only need 4 or
14548   // 8 bytes for the scalar case.
14549   if (VT.isVector()) {
14550     EltVT = VT.getVectorElementType();
14551     NumElts = VT.getVectorNumElements();
14552   }
14553
14554   unsigned EltBits = EltVT.getSizeInBits();
14555   LLVMContext *Context = DAG.getContext();
14556   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
14557   APInt MaskElt =
14558     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
14559   Constant *C = ConstantInt::get(*Context, MaskElt);
14560   C = ConstantVector::getSplat(NumElts, C);
14561   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14562   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
14563   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
14564   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14565                              MachinePointerInfo::getConstantPool(),
14566                              false, false, false, Alignment);
14567
14568   if (VT.isVector()) {
14569     // For a vector, cast operands to a vector type, perform the logic op,
14570     // and cast the result back to the original value type.
14571     MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
14572     SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
14573     SDValue Operand = IsFNABS ?
14574       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
14575       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
14576     unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
14577     return DAG.getNode(ISD::BITCAST, dl, VT,
14578                        DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
14579   }
14580
14581   // If not vector, then scalar.
14582   unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
14583   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
14584   return DAG.getNode(BitOp, dl, VT, Operand, Mask);
14585 }
14586
14587 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
14588   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14589   LLVMContext *Context = DAG.getContext();
14590   SDValue Op0 = Op.getOperand(0);
14591   SDValue Op1 = Op.getOperand(1);
14592   SDLoc dl(Op);
14593   MVT VT = Op.getSimpleValueType();
14594   MVT SrcVT = Op1.getSimpleValueType();
14595
14596   // If second operand is smaller, extend it first.
14597   if (SrcVT.bitsLT(VT)) {
14598     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
14599     SrcVT = VT;
14600   }
14601   // And if it is bigger, shrink it first.
14602   if (SrcVT.bitsGT(VT)) {
14603     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
14604     SrcVT = VT;
14605   }
14606
14607   // At this point the operands and the result should have the same
14608   // type, and that won't be f80 since that is not custom lowered.
14609
14610   const fltSemantics &Sem =
14611       VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
14612   const unsigned SizeInBits = VT.getSizeInBits();
14613
14614   SmallVector<Constant *, 4> CV(
14615       VT == MVT::f64 ? 2 : 4,
14616       ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
14617
14618   // First, clear all bits but the sign bit from the second operand (sign).
14619   CV[0] = ConstantFP::get(*Context,
14620                           APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
14621   Constant *C = ConstantVector::get(CV);
14622   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14623   SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
14624                               MachinePointerInfo::getConstantPool(),
14625                               false, false, false, 16);
14626   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
14627
14628   // Next, clear the sign bit from the first operand (magnitude).
14629   // If it's a constant, we can clear it here.
14630   if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
14631     APFloat APF = Op0CN->getValueAPF();
14632     // If the magnitude is a positive zero, the sign bit alone is enough.
14633     if (APF.isPosZero())
14634       return SignBit;
14635     APF.clearSign();
14636     CV[0] = ConstantFP::get(*Context, APF);
14637   } else {
14638     CV[0] = ConstantFP::get(
14639         *Context,
14640         APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
14641   }
14642   C = ConstantVector::get(CV);
14643   CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14644   SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14645                             MachinePointerInfo::getConstantPool(),
14646                             false, false, false, 16);
14647   // If the magnitude operand wasn't a constant, we need to AND out the sign.
14648   if (!isa<ConstantFPSDNode>(Op0))
14649     Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
14650
14651   // OR the magnitude value with the sign bit.
14652   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
14653 }
14654
14655 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
14656   SDValue N0 = Op.getOperand(0);
14657   SDLoc dl(Op);
14658   MVT VT = Op.getSimpleValueType();
14659
14660   // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
14661   SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
14662                                   DAG.getConstant(1, VT));
14663   return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
14664 }
14665
14666 // Check whether an OR'd tree is PTEST-able.
14667 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
14668                                       SelectionDAG &DAG) {
14669   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
14670
14671   if (!Subtarget->hasSSE41())
14672     return SDValue();
14673
14674   if (!Op->hasOneUse())
14675     return SDValue();
14676
14677   SDNode *N = Op.getNode();
14678   SDLoc DL(N);
14679
14680   SmallVector<SDValue, 8> Opnds;
14681   DenseMap<SDValue, unsigned> VecInMap;
14682   SmallVector<SDValue, 8> VecIns;
14683   EVT VT = MVT::Other;
14684
14685   // Recognize a special case where a vector is casted into wide integer to
14686   // test all 0s.
14687   Opnds.push_back(N->getOperand(0));
14688   Opnds.push_back(N->getOperand(1));
14689
14690   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
14691     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
14692     // BFS traverse all OR'd operands.
14693     if (I->getOpcode() == ISD::OR) {
14694       Opnds.push_back(I->getOperand(0));
14695       Opnds.push_back(I->getOperand(1));
14696       // Re-evaluate the number of nodes to be traversed.
14697       e += 2; // 2 more nodes (LHS and RHS) are pushed.
14698       continue;
14699     }
14700
14701     // Quit if a non-EXTRACT_VECTOR_ELT
14702     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14703       return SDValue();
14704
14705     // Quit if without a constant index.
14706     SDValue Idx = I->getOperand(1);
14707     if (!isa<ConstantSDNode>(Idx))
14708       return SDValue();
14709
14710     SDValue ExtractedFromVec = I->getOperand(0);
14711     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
14712     if (M == VecInMap.end()) {
14713       VT = ExtractedFromVec.getValueType();
14714       // Quit if not 128/256-bit vector.
14715       if (!VT.is128BitVector() && !VT.is256BitVector())
14716         return SDValue();
14717       // Quit if not the same type.
14718       if (VecInMap.begin() != VecInMap.end() &&
14719           VT != VecInMap.begin()->first.getValueType())
14720         return SDValue();
14721       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
14722       VecIns.push_back(ExtractedFromVec);
14723     }
14724     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
14725   }
14726
14727   assert((VT.is128BitVector() || VT.is256BitVector()) &&
14728          "Not extracted from 128-/256-bit vector.");
14729
14730   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
14731
14732   for (DenseMap<SDValue, unsigned>::const_iterator
14733         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
14734     // Quit if not all elements are used.
14735     if (I->second != FullMask)
14736       return SDValue();
14737   }
14738
14739   EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
14740
14741   // Cast all vectors into TestVT for PTEST.
14742   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
14743     VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
14744
14745   // If more than one full vectors are evaluated, OR them first before PTEST.
14746   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
14747     // Each iteration will OR 2 nodes and append the result until there is only
14748     // 1 node left, i.e. the final OR'd value of all vectors.
14749     SDValue LHS = VecIns[Slot];
14750     SDValue RHS = VecIns[Slot + 1];
14751     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
14752   }
14753
14754   return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
14755                      VecIns.back(), VecIns.back());
14756 }
14757
14758 /// \brief return true if \c Op has a use that doesn't just read flags.
14759 static bool hasNonFlagsUse(SDValue Op) {
14760   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
14761        ++UI) {
14762     SDNode *User = *UI;
14763     unsigned UOpNo = UI.getOperandNo();
14764     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
14765       // Look pass truncate.
14766       UOpNo = User->use_begin().getOperandNo();
14767       User = *User->use_begin();
14768     }
14769
14770     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
14771         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
14772       return true;
14773   }
14774   return false;
14775 }
14776
14777 /// Emit nodes that will be selected as "test Op0,Op0", or something
14778 /// equivalent.
      ///
      /// \param Op    The value whose comparison against zero is being lowered.
      /// \param X86CC The X86 condition code that will read the flags; it
      ///              determines whether CF and/or OF must be valid.
      /// \param dl    Location attached to every node created here.
      /// \param DAG   The DAG in which nodes are created and uses are replaced.
      /// \returns an i32 flags value: either an X86ISD::CMP against 0, a PTEST
      ///          produced by LowerVectorAllZeroTest, or result number 1 of a
      ///          flag-producing X86ISD arithmetic node.
14779 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
14780                                     SelectionDAG &DAG) const {
14781   if (Op.getValueType() == MVT::i1)
14782     // KORTEST instruction should be selected
14783     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
14784                        DAG.getConstant(0, Op.getValueType()));
14785
14786   // CF and OF aren't always set the way we want. Determine which
14787   // of these we need.
14788   bool NeedCF = false;
14789   bool NeedOF = false;
14790   switch (X86CC) {
14791   default: break;
        // These conditions need a valid carry flag.
14792   case X86::COND_A: case X86::COND_AE:
14793   case X86::COND_B: case X86::COND_BE:
14794     NeedCF = true;
14795     break;
14796   case X86::COND_G: case X86::COND_GE:
14797   case X86::COND_L: case X86::COND_LE:
14798   case X86::COND_O: case X86::COND_NO: {
14799     // Check if we really need to set the
14800     // Overflow flag. If NoSignedWrap is present
14801     // that is not actually needed.
14802     switch (Op->getOpcode()) {
14803     case ISD::ADD:
14804     case ISD::SUB:
14805     case ISD::MUL:
14806     case ISD::SHL: {
14807       const BinaryWithFlagsSDNode *BinNode =
14808           cast<BinaryWithFlagsSDNode>(Op.getNode());
14809       if (BinNode->hasNoSignedWrap())
14810         break;
14811     }
          // Without the no-signed-wrap flag, deliberately fall through to the
          // default case and require OF.
14812     default:
14813       NeedOF = true;
14814       break;
14815     }
14816     break;
14817   }
14818   }
14819   // See if we can use the EFLAGS value from the operand instead of
14820   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
14821   // we prove that the arithmetic won't overflow, we can't use OF or CF.
14822   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
14823     // Emit a CMP with 0, which is the TEST pattern.
14824     //if (Op.getValueType() == MVT::i1)
14825     //  return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
14826     //                     DAG.getConstant(0, MVT::i1));
14827     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
14828                        DAG.getConstant(0, Op.getValueType()));
14829   }
        // Opcode stays 0 when no flag-producing arithmetic node is chosen to stand
        // in for Op; NumOperands is the number of operands of Op that are copied
        // into that replacement node.
14830   unsigned Opcode = 0;
14831   unsigned NumOperands = 0;
14832
14833   // Truncate operations may prevent the merge of the SETCC instruction
14834   // and the arithmetic instruction before it. Attempt to truncate the operands
14835   // of the arithmetic instruction and use a reduced bit-width instruction.
14836   bool NeedTruncation = false;
14837   SDValue ArithOp = Op;
14838   if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
14839     SDValue Arith = Op->getOperand(0);
14840     // Both the trunc and the arithmetic op need to have one user each.
14841     if (Arith->hasOneUse())
14842       switch (Arith.getOpcode()) {
14843         default: break;
14844         case ISD::ADD:
14845         case ISD::SUB:
14846         case ISD::AND:
14847         case ISD::OR:
14848         case ISD::XOR: {
14849           NeedTruncation = true;
14850           ArithOp = Arith;
14851         }
14852       }
14853   }
14854
14855   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
14856   // which may be the result of a CAST.  We use the variable 'Op', which is the
14857   // non-casted variable when we check for possible users.
14858   switch (ArithOp.getOpcode()) {
14859   case ISD::ADD:
14860     // Due to an isel shortcoming, be conservative if this add is likely to be
14861     // selected as part of a load-modify-store instruction. When the root node
14862     // in a match is a store, isel doesn't know how to remap non-chain non-flag
14863     // uses of other nodes in the match, such as the ADD in this case. This
14864     // leads to the ADD being left around and reselected, with the result being
14865     // two adds in the output.  Alas, even if none of our users are stores, that
14866     // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
14867     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
14868     // climbing the DAG back to the root, and it doesn't seem to be worth the
14869     // effort.
          // NOTE(review): the check below also lets STORE users through, which
          // the comment above does not mention -- confirm this is intended.
14870     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
14871          UE = Op.getNode()->use_end(); UI != UE; ++UI)
14872       if (UI->getOpcode() != ISD::CopyToReg &&
14873           UI->getOpcode() != ISD::SETCC &&
14874           UI->getOpcode() != ISD::STORE)
14875         goto default_case;
14876
14877     if (ConstantSDNode *C =
14878         dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
14879       // An add of one will be selected as an INC.
14880       if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
14881         Opcode = X86ISD::INC;
14882         NumOperands = 1;
14883         break;
14884       }
14885
14886       // An add of negative one (subtract of one) will be selected as a DEC.
14887       if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
14888         Opcode = X86ISD::DEC;
14889         NumOperands = 1;
14890         break;
14891       }
14892     }
14893
14894     // Otherwise use a regular EFLAGS-setting add.
14895     Opcode = X86ISD::ADD;
14896     NumOperands = 2;
14897     break;
14898   case ISD::SHL:
14899   case ISD::SRL:
14900     // If we have a constant logical shift that's only used in a comparison
14901     // against zero turn it into an equivalent AND. This allows turning it into
14902     // a TEST instruction later.
14903     if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
14904         isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
14905       EVT VT = Op.getValueType();
14906       unsigned BitWidth = VT.getSizeInBits();
14907       unsigned ShAmt = Op->getConstantOperandVal(1);
14908       if (ShAmt >= BitWidth) // Avoid undefined shifts.
14909         break;
            // The mask keeps exactly the bits of the shift input that survive the
            // shift: the high bits for SRL, the low bits for SHL.
14910       APInt Mask = ArithOp.getOpcode() == ISD::SRL
14911                        ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
14912                        : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
14913       if (!Mask.isSignedIntN(32)) // Avoid large immediates.
14914         break;
14915       SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
14916                                 DAG.getConstant(Mask, VT));
14917       DAG.ReplaceAllUsesWith(Op, New);
14918       Op = New;
14919     }
          // Opcode is still 0 here, so a CMP against 0 is emitted further down.
14920     break;
14921
14922   case ISD::AND:
14923     // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
14924     // because a TEST instruction will be better.
14925     if (!hasNonFlagsUse(Op))
14926       break;
14927     // FALL THROUGH
14928   case ISD::SUB:
14929   case ISD::OR:
14930   case ISD::XOR:
14931     // Due to the ISEL shortcoming noted above, be conservative if this op is
14932     // likely to be selected as part of a load-modify-store instruction.
14933     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
14934            UE = Op.getNode()->use_end(); UI != UE; ++UI)
14935       if (UI->getOpcode() == ISD::STORE)
14936         goto default_case;
14937
14938     // Otherwise use a regular EFLAGS-setting instruction.
14939     switch (ArithOp.getOpcode()) {
14940     default: llvm_unreachable("unexpected operator!");
14941     case ISD::SUB: Opcode = X86ISD::SUB; break;
14942     case ISD::XOR: Opcode = X86ISD::XOR; break;
14943     case ISD::AND: Opcode = X86ISD::AND; break;
14944     case ISD::OR: {
            // For an equality test of an untruncated OR, first try the PTEST-based
            // all-zero vector test.
14945       if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
14946         SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
14947         if (EFLAGS.getNode())
14948           return EFLAGS;
14949       }
14950       Opcode = X86ISD::OR;
14951       break;
14952     }
14953     }
14954
14955     NumOperands = 2;
14956     break;
        // Op is already a flag-producing X86 node: hand back its result number 1.
14957   case X86ISD::ADD:
14958   case X86ISD::SUB:
14959   case X86ISD::INC:
14960   case X86ISD::DEC:
14961   case X86ISD::OR:
14962   case X86ISD::XOR:
14963   case X86ISD::AND:
14964     return SDValue(Op.getNode(), 1);
14965   default:
14966   default_case:
14967     break;
14968   }
14969
14970   // If we found that truncation is beneficial, perform the truncation and
14971   // update 'Op'.
        // NOTE(review): when the wide operation is not legal, Op is left as the
        // TRUNCATE while Opcode/NumOperands above were chosen from ArithOp --
        // confirm that combination cannot reach the operand copy at the end.
14972   if (NeedTruncation) {
14973     EVT VT = Op.getValueType();
14974     SDValue WideVal = Op->getOperand(0);
14975     EVT WideVT = WideVal.getValueType();
14976     unsigned ConvertedOp = 0;
14977     // Use a target machine opcode to prevent further DAGCombine
14978     // optimizations that may separate the arithmetic operations
14979     // from the setcc node.
14980     switch (WideVal.getOpcode()) {
14981       default: break;
14982       case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
14983       case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
14984       case ISD::AND: ConvertedOp = X86ISD::AND; break;
14985       case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
14986       case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
14987     }
14988
14989     if (ConvertedOp) {
14990       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14991       if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
              // Truncate both wide operands and redo the arithmetic at Op's type.
14992         SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
14993         SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
14994         Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
14995       }
14996     }
14997   }
14998
14999   if (Opcode == 0)
15000     // Emit a CMP with 0, which is the TEST pattern.
15001     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15002                        DAG.getConstant(0, Op.getValueType()));
15003
        // Rebuild Op as a node that also produces an i32 flags result, redirect
        // the users of Op to the new node, and return the flags result.
15004   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
15005   SmallVector<SDValue, 4> Ops;
15006   for (unsigned i = 0; i != NumOperands; ++i)
15007     Ops.push_back(Op.getOperand(i));
15008
15009   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
15010   DAG.ReplaceAllUsesWith(Op, New);
15011   return SDValue(New.getNode(), 1);
15012 }
15013
15014 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
15015 /// equivalent.
15016 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
15017                                    SDLoc dl, SelectionDAG &DAG) const {
15018   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
15019     if (C->getAPIntValue() == 0)
15020       return EmitTest(Op0, X86CC, dl, DAG);
15021
15022      if (Op0.getValueType() == MVT::i1)
15023        llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
15024   }
15025
15026   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
15027        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
15028     // Do the comparison at i32 if it's smaller, besides the Atom case.
15029     // This avoids subregister aliasing issues. Keep the smaller reference
15030     // if we're optimizing for size, however, as that'll allow better folding
15031     // of memory operations.
15032     if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
15033         !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
15034              AttributeSet::FunctionIndex, Attribute::MinSize) &&
15035         !Subtarget->isAtom()) {
15036       unsigned ExtendOp =
15037           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
15038       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
15039       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
15040     }
15041     // Use SUB instead of CMP to enable CSE between SUB and CMP.
15042     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
15043     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
15044                               Op0, Op1);
15045     return SDValue(Sub.getNode(), 1);
15046   }
15047   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
15048 }
15049
15050 /// Convert a comparison if required by the subtarget.
15051 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
15052                                                  SelectionDAG &DAG) const {
15053   // If the subtarget does not support the FUCOMI instruction, floating-point
15054   // comparisons have to be converted.
15055   if (Subtarget->hasCMov() ||
15056       Cmp.getOpcode() != X86ISD::CMP ||
15057       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
15058       !Cmp.getOperand(1).getValueType().isFloatingPoint())
15059     return Cmp;
15060
15061   // The instruction selector will select an FUCOM instruction instead of
15062   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
15063   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
15064   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
15065   SDLoc dl(Cmp);
15066   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
15067   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
15068   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
15069                             DAG.getConstant(8, MVT::i8));
15070   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
15071   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
15072 }
15073
15074 /// The minimum architected relative accuracy is 2^-12. We need one
15075 /// Newton-Raphson step to have a good float result (24 bits of precision).
15076 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
15077                                             DAGCombinerInfo &DCI,
15078                                             unsigned &RefinementSteps,
15079                                             bool &UseOneConstNR) const {
15080   // FIXME: We should use instruction latency models to calculate the cost of
15081   // each potential sequence, but this is very hard to do reliably because
15082   // at least Intel's Core* chips have variable timing based on the number of
15083   // significant digits in the divisor and/or sqrt operand.
15084   if (!Subtarget->useSqrtEst())
15085     return SDValue();
15086
15087   EVT VT = Op.getValueType();
15088
15089   // SSE1 has rsqrtss and rsqrtps.
15090   // TODO: Add support for AVX512 (v16f32).
15091   // It is likely not profitable to do this for f64 because a double-precision
15092   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
15093   // instructions: convert to single, rsqrtss, convert back to double, refine
15094   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
15095   // along with FMA, this could be a throughput win.
15096   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15097       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15098     RefinementSteps = 1;
15099     UseOneConstNR = false;
15100     return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
15101   }
15102   return SDValue();
15103 }
15104
15105 /// The minimum architected relative accuracy is 2^-12. We need one
15106 /// Newton-Raphson step to have a good float result (24 bits of precision).
15107 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
15108                                             DAGCombinerInfo &DCI,
15109                                             unsigned &RefinementSteps) const {
15110   // FIXME: We should use instruction latency models to calculate the cost of
15111   // each potential sequence, but this is very hard to do reliably because
15112   // at least Intel's Core* chips have variable timing based on the number of
15113   // significant digits in the divisor.
15114   if (!Subtarget->useReciprocalEst())
15115     return SDValue();
15116
15117   EVT VT = Op.getValueType();
15118
15119   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
15120   // TODO: Add support for AVX512 (v16f32).
15121   // It is likely not profitable to do this for f64 because a double-precision
15122   // reciprocal estimate with refinement on x86 prior to FMA requires
15123   // 15 instructions: convert to single, rcpss, convert back to double, refine
15124   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
15125   // along with FMA, this could be a throughput win.
15126   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15127       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15128     RefinementSteps = ReciprocalEstimateRefinementSteps;
15129     return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
15130   }
15131   return SDValue();
15132 }
15133
15134 static bool isAllOnes(SDValue V) { // Is V an all-ones ConstantSDNode?
15135   ConstantSDNode *CN = dyn_cast<ConstantSDNode>(V);
15136   return CN ? CN->isAllOnesValue() : false;
15137 }
15138
15139 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
15140 /// if it's possible.
15141 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
15142                                      SDLoc dl, SelectionDAG &DAG) const {
15143   SDValue Op0 = And.getOperand(0);
15144   SDValue Op1 = And.getOperand(1);
15145   if (Op0.getOpcode() == ISD::TRUNCATE)
15146     Op0 = Op0.getOperand(0);
15147   if (Op1.getOpcode() == ISD::TRUNCATE)
15148     Op1 = Op1.getOperand(0);
15149
15150   SDValue LHS, RHS;
15151   if (Op1.getOpcode() == ISD::SHL)
15152     std::swap(Op0, Op1);
15153   if (Op0.getOpcode() == ISD::SHL) {
15154     if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
15155       if (And00C->getZExtValue() == 1) {
15156         // If we looked past a truncate, check that it's only truncating away
15157         // known zeros.
15158         unsigned BitWidth = Op0.getValueSizeInBits();
15159         unsigned AndBitWidth = And.getValueSizeInBits();
15160         if (BitWidth > AndBitWidth) {
15161           APInt Zeros, Ones;
15162           DAG.computeKnownBits(Op0, Zeros, Ones);
15163           if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
15164             return SDValue();
15165         }
15166         LHS = Op1;
15167         RHS = Op0.getOperand(1);
15168       }
15169   } else if (Op1.getOpcode() == ISD::Constant) {
15170     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
15171     uint64_t AndRHSVal = AndRHS->getZExtValue();
15172     SDValue AndLHS = Op0;
15173
15174     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
15175       LHS = AndLHS.getOperand(0);
15176       RHS = AndLHS.getOperand(1);
15177     }
15178
15179     // Use BT if the immediate can't be encoded in a TEST instruction.
15180     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
15181       LHS = AndLHS;
15182       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
15183     }
15184   }
15185
15186   if (LHS.getNode()) {
15187     // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
15188     // instruction.  Since the shift amount is in-range-or-undefined, we know
15189     // that doing a bittest on the i32 value is ok.  We extend to i32 because
15190     // the encoding for the i16 version is larger than the i32 version.
15191     // Also promote i16 to i32 for performance / code size reason.
15192     if (LHS.getValueType() == MVT::i8 ||
15193         LHS.getValueType() == MVT::i16)
15194       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
15195
15196     // If the operand types disagree, extend the shift amount to match.  Since
15197     // BT ignores high bits (like shifts) we can use anyextend.
15198     if (LHS.getValueType() != RHS.getValueType())
15199       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
15200
15201     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
15202     X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
15203     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15204                        DAG.getConstant(Cond, MVT::i8), BT);
15205   }
15206
15207   return SDValue();
15208 }
15209
15210 /// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
15211 /// mask CMPs.
15212 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
15213                               SDValue &Op1) {
15214   unsigned SSECC;
15215   bool Swap = false;
15216
15217   // SSE Condition code mapping:
15218   //  0 - EQ
15219   //  1 - LT
15220   //  2 - LE
15221   //  3 - UNORD
15222   //  4 - NEQ
15223   //  5 - NLT
15224   //  6 - NLE
15225   //  7 - ORD
15226   switch (SetCCOpcode) {
15227   default: llvm_unreachable("Unexpected SETCC condition");
15228   case ISD::SETOEQ:
15229   case ISD::SETEQ:  SSECC = 0; break;
15230   case ISD::SETOGT:
15231   case ISD::SETGT:  Swap = true; // Fallthrough
15232   case ISD::SETLT:
15233   case ISD::SETOLT: SSECC = 1; break;
15234   case ISD::SETOGE:
15235   case ISD::SETGE:  Swap = true; // Fallthrough
15236   case ISD::SETLE:
15237   case ISD::SETOLE: SSECC = 2; break;
15238   case ISD::SETUO:  SSECC = 3; break;
15239   case ISD::SETUNE:
15240   case ISD::SETNE:  SSECC = 4; break;
15241   case ISD::SETULE: Swap = true; // Fallthrough
15242   case ISD::SETUGE: SSECC = 5; break;
15243   case ISD::SETULT: Swap = true; // Fallthrough
15244   case ISD::SETUGT: SSECC = 6; break;
15245   case ISD::SETO:   SSECC = 7; break;
15246   case ISD::SETUEQ:
15247   case ISD::SETONE: SSECC = 8; break;
15248   }
15249   if (Swap)
15250     std::swap(Op0, Op1);
15251
15252   return SSECC;
15253 }
15254
15255 // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
15256 // ones, and then concatenate the result back.
15257 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
15258   MVT VT = Op.getSimpleValueType();
15259
15260   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
15261          "Unsupported value type for operation");
15262
15263   unsigned NumElems = VT.getVectorNumElements();
15264   SDLoc dl(Op);
15265   SDValue CC = Op.getOperand(2);
15266
15267   // Extract the LHS vectors
15268   SDValue LHS = Op.getOperand(0);
15269   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
15270   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
15271
15272   // Extract the RHS vectors
15273   SDValue RHS = Op.getOperand(1);
15274   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
15275   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
15276
15277   // Issue the operation on the smaller types and concatenate the result back
15278   MVT EltVT = VT.getVectorElementType();
15279   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
15280   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
15281                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
15282                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
15283 }
15284
15285 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
15286                                      const X86Subtarget *Subtarget) {
15287   SDValue Op0 = Op.getOperand(0);
15288   SDValue Op1 = Op.getOperand(1);
15289   SDValue CC = Op.getOperand(2);
15290   MVT VT = Op.getSimpleValueType();
15291   SDLoc dl(Op);
15292
15293   assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 &&
15294          Op.getValueType().getScalarType() == MVT::i1 &&
15295          "Cannot set masked compare for this operation");
15296
15297   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15298   unsigned  Opc = 0;
15299   bool Unsigned = false;
15300   bool Swap = false;
15301   unsigned SSECC;
15302   switch (SetCCOpcode) {
15303   default: llvm_unreachable("Unexpected SETCC condition");
15304   case ISD::SETNE:  SSECC = 4; break;
15305   case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
15306   case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
15307   case ISD::SETLT:  Swap = true; //fall-through
15308   case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
15309   case ISD::SETULT: SSECC = 1; Unsigned = true; break;
15310   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
15311   case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
15312   case ISD::SETULE: Unsigned = true; //fall-through
15313   case ISD::SETLE:  SSECC = 2; break;
15314   }
15315
15316   if (Swap)
15317     std::swap(Op0, Op1);
15318   if (Opc)
15319     return DAG.getNode(Opc, dl, VT, Op0, Op1);
15320   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
15321   return DAG.getNode(Opc, dl, VT, Op0, Op1,
15322                      DAG.getConstant(SSECC, MVT::i8));
15323 }
15324
15325 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
15326 /// operand \p Op1.  If non-trivial (for example because it's not constant)
15327 /// return an empty value.
15328 static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
15329 {
15330   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
15331   if (!BV)
15332     return SDValue();
15333
15334   MVT VT = Op1.getSimpleValueType();
15335   MVT EVT = VT.getVectorElementType();
15336   unsigned n = VT.getVectorNumElements();
15337   SmallVector<SDValue, 8> ULTOp1;
15338
15339   for (unsigned i = 0; i < n; ++i) {
15340     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
15341     if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
15342       return SDValue();
15343
15344     // Avoid underflow.
15345     APInt Val = Elt->getAPIntValue();
15346     if (Val == 0)
15347       return SDValue();
15348
15349     ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
15350   }
15351
15352   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
15353 }
15354
/// \brief Lower a vector ISD::SETCC node to X86-specific nodes.
///
/// Operands 0 and 1 of \p Op are the compared vectors and operand 2 is the
/// condition code.  The work is dispatched as follows:
///  - Floating-point compares become X86ISD::CMPP (X86ISD::CMPM when AVX-512
///    is available and the result has i1 elements) with an immediate
///    predicate.  SETUEQ and SETONE have no single predicate and are built
///    from two compares joined by OR / AND respectively.
///  - 256-bit integer compares without Int256 go to Lower256IntVSETCC.
///  - With AVX-512, compares on 512-bit operands, any compare when both BWI
///    and VLX are present, and mask-result compares on elements of at least
///    32 bits go to LowerIntVSETCC_AVX512; mask-result compares on elements of
///    8 to 31 bits are performed at the operand type and then truncated.
///  - Everything else is built from PCMPEQ / PCMPGT, optionally swapping the
///    operands, flipping sign bits (unsigned predicates) and inverting the
///    result, or from UMIN/UMAX or SUBUS followed by a PCMPEQ.
15355 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
15356                            SelectionDAG &DAG) {
15357   SDValue Op0 = Op.getOperand(0);
15358   SDValue Op1 = Op.getOperand(1);
15359   SDValue CC = Op.getOperand(2);
15360   MVT VT = Op.getSimpleValueType();
15361   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15362   bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
15363   SDLoc dl(Op);
15364
15365   if (isFP) {
15366 #ifndef NDEBUG
15367     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
15368     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
15369 #endif
15370
          // translateX86FSETCC receives Op0/Op1 as well as the condition code;
          // a return value of 8 marks the predicates handled specially below.
15371     unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
15372     unsigned Opc = X86ISD::CMPP;
15373     if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
15374       assert(VT.getVectorNumElements() <= 16);
15375       Opc = X86ISD::CMPM;
15376     }
15377     // In the two special cases we can't handle, emit two comparisons.
15378     if (SSECC == 8) {
15379       unsigned CC0, CC1;
15380       unsigned CombineOpc;
15381       if (SetCCOpcode == ISD::SETUEQ) {
15382         CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
15383       } else {
15384         assert(SetCCOpcode == ISD::SETONE);
15385         CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
15386       }
15387
15388       SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15389                                  DAG.getConstant(CC0, MVT::i8));
15390       SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15391                                  DAG.getConstant(CC1, MVT::i8));
15392       return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
15393     }
15394     // Handle all other FP comparisons here.
15395     return DAG.getNode(Opc, dl, VT, Op0, Op1,
15396                        DAG.getConstant(SSECC, MVT::i8));
15397   }
15398
15399   // Break 256-bit integer vector compare into smaller ones.
15400   if (VT.is256BitVector() && !Subtarget->hasInt256())
15401     return Lower256IntVSETCC(Op, DAG);
15402
        // MaskResult: the compare produces a vector of i1 rather than a vector of
        // full-width elements.
15403   bool MaskResult = (VT.getVectorElementType() == MVT::i1);
15404   EVT OpVT = Op1.getValueType();
15405   if (Subtarget->hasAVX512()) {
15406     if (Op1.getValueType().is512BitVector() ||
15407         (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
15408         (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
15409       return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
15410
15411     // In AVX-512 architecture setcc returns mask with i1 elements,
15412     // But there is no compare instruction for i8 and i16 elements in KNL.
15413     // We are not talking about 512-bit operands in this case, these
15414     // types are illegal.
          // The compare is therefore re-issued with the operand type as its result
          // type and the outcome is truncated to the i1 mask.
15415     if (MaskResult &&
15416         (OpVT.getVectorElementType().getSizeInBits() < 32 &&
15417          OpVT.getVectorElementType().getSizeInBits() >= 8))
15418       return DAG.getNode(ISD::TRUNCATE, dl, VT,
15419                          DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
15420   }
15421
15422   // We are handling one of the integer comparisons here.  Since SSE only has
15423   // GT and EQ comparisons for integer, swapping operands and multiple
15424   // operations may be required for some comparisons.
15425   unsigned Opc;
15426   bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
15427   bool Subus = false;
15428
        // Case labels without a break fall through on purpose: the first label
        // only sets one extra flag (Swap or Invert) and then shares the opcode
        // selection of the label that follows it.
15429   switch (SetCCOpcode) {
15430   default: llvm_unreachable("Unexpected SETCC condition");
15431   case ISD::SETNE:  Invert = true;
15432   case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
15433   case ISD::SETLT:  Swap = true;
15434   case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
15435   case ISD::SETGE:  Swap = true;
15436   case ISD::SETLE:  Opc = X86ISD::PCMPGT;
15437                     Invert = true; break;
15438   case ISD::SETULT: Swap = true;
15439   case ISD::SETUGT: Opc = X86ISD::PCMPGT;
15440                     FlipSigns = true; break;
15441   case ISD::SETUGE: Swap = true;
15442   case ISD::SETULE: Opc = X86ISD::PCMPGT;
15443                     FlipSigns = true; Invert = true; break;
15444   }
15445
15446   // Special case: Use min/max operations for SETULE/SETUGE
        // The result is later formed as PCMPEQ(Op0, UMIN/UMAX(Op0, Op1)), so the
        // swap, inversion and sign flip chosen above are all cancelled.
15447   MVT VET = VT.getVectorElementType();
15448   bool hasMinMax =
15449        (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
15450     || (Subtarget->hasSSE2()  && (VET == MVT::i8));
15451
15452   if (hasMinMax) {
15453     switch (SetCCOpcode) {
15454     default: break;
15455     case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
15456     case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
15457     }
15458
15459     if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
15460   }
15461
15462   bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
15463   if (!MinMax && hasSubus) {
15464     // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
15465     // Op0 u<= Op1:
15466     //   t = psubus Op0, Op1
15467     //   pcmpeq t, <0..0>
15468     switch (SetCCOpcode) {
15469     default: break;
15470     case ISD::SETULT: {
15471       // If the comparison is against a constant we can turn this into a
15472       // setule.  With psubus, setule does not require a swap.  This is
15473       // beneficial because the constant in the register is no longer
15474       // clobbered as the destination so it can be hoisted out of a loop.
15475       // Only do this pre-AVX since vpcmp* is no longer destructive.
15476       if (Subtarget->hasAVX())
15477         break;
15478       SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
15479       if (ULEOp1.getNode()) {
15480         Op1 = ULEOp1;
15481         Subus = true; Invert = false; Swap = false;
15482       }
15483       break;
15484     }
15485     // Psubus is better than flip-sign because it requires no inversion.
15486     case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
15487     case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
15488     }
15489
15490     if (Subus) {
15491       Opc = X86ISD::SUBUS;
15492       FlipSigns = false;
15493     }
15494   }
15495
15496   if (Swap)
15497     std::swap(Op0, Op1);
15498
15499   // Check that the operation in question is available (most are plain SSE2,
15500   // but PCMPGTQ and PCMPEQQ have different requirements).
15501   if (VT == MVT::v2i64) {
15502     if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
15503       assert(Subtarget->hasSSE2() && "Don't know how to lower!");
15504
15505       // First cast everything to the right type.
15506       Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
15507       Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
15508
15509       // Since SSE has no unsigned integer comparisons, we need to flip the sign
15510       // bits of the inputs before performing those operations. The lower
15511       // compare is always unsigned.
            // For an unsigned compare every i32 lane gets its sign bit flipped;
            // for a signed compare only lanes 0 and 2 (the lanes MaskLo below
            // treats as the low parts) are flipped.
15512       SDValue SB;
15513       if (FlipSigns) {
15514         SB = DAG.getConstant(0x80000000U, MVT::v4i32);
15515       } else {
15516         SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
15517         SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
15518         SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
15519                          Sign, Zero, Sign, Zero);
15520       }
15521       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
15522       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
15523
15524       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
15525       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
15526       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
15527
15528       // Create masks for only the low parts/high parts of the 64 bit integers.
15529       static const int MaskHi[] = { 1, 1, 3, 3 };
15530       static const int MaskLo[] = { 0, 0, 2, 2 };
15531       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
15532       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
15533       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
15534
15535       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
15536       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
15537
15538       if (Invert)
15539         Result = DAG.getNOT(dl, Result, MVT::v4i32);
15540
15541       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
15542     }
15543
15544     if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
15545       // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
15546       // pcmpeqd + pshufd + pand.
15547       assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
15548
15549       // First cast everything to the right type.
15550       Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
15551       Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
15552
15553       // Do the compare.
15554       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
15555
15556       // Make sure the lower and upper halves are both all-ones.
            // The shuffle exchanges the two i32 lanes of each 64-bit element so
            // that the AND combines both halves' equality results.
15557       static const int Mask[] = { 1, 0, 3, 2 };
15558       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
15559       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
15560
15561       if (Invert)
15562         Result = DAG.getNOT(dl, Result, MVT::v4i32);
15563
15564       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
15565     }
15566   }
15567
15568   // Since SSE has no unsigned integer comparisons, we need to flip the sign
15569   // bits of the inputs before performing those operations.
15570   if (FlipSigns) {
15571     EVT EltVT = VT.getVectorElementType();
15572     SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
15573     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
15574     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
15575   }
15576
15577   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
15578
15579   // If the logical-not of the result is required, perform that now.
15580   if (Invert)
15581     Result = DAG.getNOT(dl, Result, VT);
15582
        // Min/max form: the predicate holds where Op0 equals min/max(Op0, Op1).
15583   if (MinMax)
15584     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
15585
        // Saturating-subtract form: the predicate holds where the difference is 0.
15586   if (Subus)
15587     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
15588                          getZeroVector(VT, Subtarget, DAG, dl));
15589
15590   return Result;
15591 }
15592
15593 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
15594
15595   MVT VT = Op.getSimpleValueType();
15596
15597   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
15598
15599   assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
15600          && "SetCC type must be 8-bit or 1-bit integer");
15601   SDValue Op0 = Op.getOperand(0);
15602   SDValue Op1 = Op.getOperand(1);
15603   SDLoc dl(Op);
15604   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15605
15606   // Optimize to BT if possible.
15607   // Lower (X & (1 << N)) == 0 to BT(X, N).
15608   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
15609   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
15610   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
15611       Op1.getOpcode() == ISD::Constant &&
15612       cast<ConstantSDNode>(Op1)->isNullValue() &&
15613       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15614     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
15615     if (NewSetCC.getNode()) {
15616       if (VT == MVT::i1)
15617         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
15618       return NewSetCC;
15619     }
15620   }
15621
15622   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
15623   // these.
15624   if (Op1.getOpcode() == ISD::Constant &&
15625       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
15626        cast<ConstantSDNode>(Op1)->isNullValue()) &&
15627       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15628
15629     // If the input is a setcc, then reuse the input setcc or use a new one with
15630     // the inverted condition.
15631     if (Op0.getOpcode() == X86ISD::SETCC) {
15632       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
15633       bool Invert = (CC == ISD::SETNE) ^
15634         cast<ConstantSDNode>(Op1)->isNullValue();
15635       if (!Invert)
15636         return Op0;
15637
15638       CCode = X86::GetOppositeBranchCondition(CCode);
15639       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15640                                   DAG.getConstant(CCode, MVT::i8),
15641                                   Op0.getOperand(1));
15642       if (VT == MVT::i1)
15643         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15644       return SetCC;
15645     }
15646   }
15647   if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
15648       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
15649       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15650
15651     ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
15652     return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
15653   }
15654
15655   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
15656   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
15657   if (X86CC == X86::COND_INVALID)
15658     return SDValue();
15659
15660   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
15661   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
15662   SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15663                               DAG.getConstant(X86CC, MVT::i8), EFLAGS);
15664   if (VT == MVT::i1)
15665     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15666   return SetCC;
15667 }
15668
15669 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
15670 static bool isX86LogicalCmp(SDValue Op) {
15671   unsigned Opc = Op.getNode()->getOpcode();
15672   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
15673       Opc == X86ISD::SAHF)
15674     return true;
15675   if (Op.getResNo() == 1 &&
15676       (Opc == X86ISD::ADD ||
15677        Opc == X86ISD::SUB ||
15678        Opc == X86ISD::ADC ||
15679        Opc == X86ISD::SBB ||
15680        Opc == X86ISD::SMUL ||
15681        Opc == X86ISD::UMUL ||
15682        Opc == X86ISD::INC ||
15683        Opc == X86ISD::DEC ||
15684        Opc == X86ISD::OR ||
15685        Opc == X86ISD::XOR ||
15686        Opc == X86ISD::AND))
15687     return true;
15688
15689   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
15690     return true;
15691
15692   return false;
15693 }
15694
15695 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
15696   if (V.getOpcode() != ISD::TRUNCATE)
15697     return false;
15698
15699   SDValue VOp0 = V.getOperand(0);
15700   unsigned InBits = VOp0.getValueSizeInBits();
15701   unsigned Bits = V.getValueSizeInBits();
15702   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
15703 }
15704
/// \brief Lower an ISD::SELECT node (operands: condition, value if true,
/// value if false) to X86 nodes.
///
/// The strategies below are tried in order:
///  - f32/f64 select whose condition is a single-use SETCC on operands of the
///    same type, with the required SSE level: X86ISD::FSETCC followed by
///    FAND/FANDN/FOR, or by X86ISD::SELECT when AVX-512 is available.
///  - select between all-ones and another value on "x == 0" / "x != 0":
///    built from X86ISD::SETCC_CARRY, optionally NOT-ed and OR-ed with the
///    other value.
///  - Otherwise an existing flag-producing node (X86 compare, overflow
///    arithmetic, bit test) is looked for to feed the conditional move; when
///    none is found, EmitTest is called on the condition with COND_NE.
///  - select between all-ones and zero on COND_B / COND_AE of an X86ISD::SUB
///    becomes X86ISD::SETCC_CARRY, possibly NOT-ed.
///  - An i8 select whose two values are both truncates of the same type is
///    performed on the wider type and truncated afterwards.
///  - Default: X86ISD::CMOV.
15705 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
        // addTest is cleared once Cond and CC have been replaced by a
        // flag-producing node and the matching X86 condition code; if it is still
        // set further down, EmitTest is used to produce the flags.
15706   bool addTest = true;
15707   SDValue Cond  = Op.getOperand(0);
15708   SDValue Op1 = Op.getOperand(1);
15709   SDValue Op2 = Op.getOperand(2);
15710   SDLoc DL(Op);
15711   EVT VT = Op1.getValueType();
15712   SDValue CC;
15713
15714   // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
15715   // are available. Otherwise fp cmovs get lowered into a less efficient branch
15716   // sequence later on.
15717   if (Cond.getOpcode() == ISD::SETCC &&
15718       ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
15719        (Subtarget->hasSSE1() && VT == MVT::f32)) &&
15720       VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
15721     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
15722     int SSECC = translateX86FSETCC(
15723         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
15724
          // A result of 8 means the predicate is not usable here, so this
          // transform is skipped.
15725     if (SSECC != 8) {
15726       if (Subtarget->hasAVX512()) {
15727         SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
15728                                   DAG.getConstant(SSECC, MVT::i8));
15729         return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
15730       }
15731       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
15732                                 DAG.getConstant(SSECC, MVT::i8));
15733       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
15734       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
15735       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
15736     }
15737   }
15738
        // Lower the condition first so that the X86ISD::SETCC patterns below can
        // match; the original condition is kept when LowerSETCC returns an empty
        // value.
15739   if (Cond.getOpcode() == ISD::SETCC) {
15740     SDValue NewCond = LowerSETCC(Cond, DAG);
15741     if (NewCond.getNode())
15742       Cond = NewCond;
15743   }
15744
15745   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
15746   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
15747   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
15748   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
15749   if (Cond.getOpcode() == X86ISD::SETCC &&
15750       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
15751       isZero(Cond.getOperand(1).getOperand(1))) {
15752     SDValue Cmp = Cond.getOperand(1);
15753
15754     unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
15755
15756     if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
15757         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
            // Y is the select value that is not the all-ones constant (Op2 when
            // both are all-ones).
15758       SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
15759
15760       SDValue CmpOp0 = Cmp.getOperand(0);
15761       // Apply further optimizations for special cases
15762       // (select (x != 0), -1, 0) -> neg & sbb
15763       // (select (x == 0), 0, -1) -> neg & sbb
15764       if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
15765         if (YC->isNullValue() &&
15766             (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
15767           SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
15768           SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
15769                                     DAG.getConstant(0, CmpOp0.getValueType()),
15770                                     CmpOp0);
15771           SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15772                                     DAG.getConstant(X86::COND_B, MVT::i8),
15773                                     SDValue(Neg.getNode(), 1));
15774           return Res;
15775         }
15776
15777       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
15778                         CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
15779       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
15780
15781       SDValue Res =   // Res = 0 or -1.
15782         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15783                     DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
15784
15785       if (isAllOnes(Op1) != (CondCode == X86::COND_E))
15786         Res = DAG.getNOT(DL, Res, Res.getValueType());
15787
            // The OR with Y is skipped only when Op2 is the constant zero.
15788       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
15789       if (!N2C || !N2C->isNullValue())
15790         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
15791       return Res;
15792     }
15793   }
15794
15795   // Look past (and (setcc_carry (cmp ...)), 1).
15796   if (Cond.getOpcode() == ISD::AND &&
15797       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
15798     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
15799     if (C && C->getAPIntValue() == 1)
15800       Cond = Cond.getOperand(0);
15801   }
15802
15803   // If condition flag is set by a X86ISD::CMP, then use it as the condition
15804   // setting operand in place of the X86ISD::SETCC.
15805   unsigned CondOpcode = Cond.getOpcode();
15806   if (CondOpcode == X86ISD::SETCC ||
15807       CondOpcode == X86ISD::SETCC_CARRY) {
15808     CC = Cond.getOperand(0);
15809
15810     SDValue Cmp = Cond.getOperand(1);
15811     unsigned Opc = Cmp.getOpcode();
15812     MVT VT = Op.getSimpleValueType();
15813
          // For a scalar FP result that is not kept in an SSE register, the
          // condition code must be one hasFPCMov accepts.
15814     bool IllegalFPCMov = false;
15815     if (VT.isFloatingPoint() && !VT.isVector() &&
15816         !isScalarFPTypeInSSEReg(VT))  // FPStack?
15817       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
15818
15819     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
15820         Opc == X86ISD::BT) { // FIXME
15821       Cond = Cmp;
15822       addTest = false;
15823     }
15824   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
15825              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
15826              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
15827               Cond.getOperand(0).getValueType() != MVT::i8)) {
          // The condition is the overflow result of an arithmetic node: rebuild
          // the arithmetic as the corresponding X86 node and use its i32 result
          // as the condition.
15828     SDValue LHS = Cond.getOperand(0);
15829     SDValue RHS = Cond.getOperand(1);
15830     unsigned X86Opcode;
15831     unsigned X86Cond;
15832     SDVTList VTs;
15833     switch (CondOpcode) {
15834     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
15835     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
15836     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
15837     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
15838     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
15839     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
15840     default: llvm_unreachable("unexpected overflowing operator");
15841     }
          // UMUL is created with three results, the i32 one last (index 2); the
          // other nodes have two results with the i32 one at index 1.
15842     if (CondOpcode == ISD::UMULO)
15843       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
15844                           MVT::i32);
15845     else
15846       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
15847
15848     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
15849
15850     if (CondOpcode == ISD::UMULO)
15851       Cond = X86Op.getValue(2);
15852     else
15853       Cond = X86Op.getValue(1);
15854
15855     CC = DAG.getConstant(X86Cond, MVT::i8);
15856     addTest = false;
15857   }
15858
15859   if (addTest) {
15860     // Look past the truncate if the high bits are known zero.
15861     if (isTruncWithZeroHighBitsInput(Cond, DAG))
15862         Cond = Cond.getOperand(0);
15863
15864     // We know the result of AND is compared against zero. Try to match
15865     // it to BT.
15866     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
15867       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
15868       if (NewSetCC.getNode()) {
15869         CC = NewSetCC.getOperand(0);
15870         Cond = NewSetCC.getOperand(1);
15871         addTest = false;
15872       }
15873     }
15874   }
15875
        // Nothing above supplied flags: produce them from Cond with EmitTest and
        // select on COND_NE.
15876   if (addTest) {
15877     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
15878     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
15879   }
15880
15881   // a <  b ? -1 :  0 -> RES = ~setcc_carry
15882   // a <  b ?  0 : -1 -> RES = setcc_carry
15883   // a >= b ? -1 :  0 -> RES = setcc_carry
15884   // a >= b ?  0 : -1 -> RES = ~setcc_carry
15885   if (Cond.getOpcode() == X86ISD::SUB) {
15886     Cond = ConvertCmpIfNecessary(Cond, DAG);
15887     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
15888
15889     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
15890         (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
15891       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15892                                 DAG.getConstant(X86::COND_B, MVT::i8), Cond);
15893       if (isAllOnes(Op1) != (CondCode == X86::COND_B))
15894         return DAG.getNOT(DL, Res, Res.getValueType());
15895       return Res;
15896     }
15897   }
15898
15899   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
15900   // widen the cmov and push the truncate through. This avoids introducing a new
15901   // branch during isel and doesn't add any extensions.
15902   if (Op.getValueType() == MVT::i8 &&
15903       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
15904     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
15905     if (T1.getValueType() == T2.getValueType() &&
15906         // Blacklist CopyFromReg to avoid partial register stalls.
15907         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
15908       SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
            // Same operand order as the final CMOV below: false value first.
15909       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
15910       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
15911     }
15912   }
15913
15914   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
15915   // condition is true.
15916   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
15917   SDValue Ops[] = { Op2, Op1, CC, Cond };
15918   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
15919 }
15920
15921 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget,
15922                                        SelectionDAG &DAG) {
15923   MVT VT = Op->getSimpleValueType(0);
15924   SDValue In = Op->getOperand(0);
15925   MVT InVT = In.getSimpleValueType();
15926   MVT VTElt = VT.getVectorElementType();
15927   MVT InVTElt = InVT.getVectorElementType();
15928   SDLoc dl(Op);
15929
15930   // SKX processor
15931   if ((InVTElt == MVT::i1) &&
15932       (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
15933         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
15934
15935        ((Subtarget->hasBWI() && VT.is512BitVector() &&
15936         VTElt.getSizeInBits() <= 16)) ||
15937
15938        ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
15939         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
15940
15941        ((Subtarget->hasDQI() && VT.is512BitVector() &&
15942         VTElt.getSizeInBits() >= 32))))
15943     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
15944
15945   unsigned int NumElts = VT.getVectorNumElements();
15946
15947   if (NumElts != 8 && NumElts != 16)
15948     return SDValue();
15949
15950   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
15951     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
15952       return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
15953     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
15954   }
15955
15956   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15957   assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
15958
15959   MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
15960   Constant *C = ConstantInt::get(*DAG.getContext(),
15961     APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
15962
15963   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
15964   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
15965   SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
15966                           MachinePointerInfo::getConstantPool(),
15967                           false, false, false, Alignment);
15968   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
15969   if (VT.is512BitVector())
15970     return Brcst;
15971   return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
15972 }
15973
15974 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
15975                                 SelectionDAG &DAG) {
15976   MVT VT = Op->getSimpleValueType(0);
15977   SDValue In = Op->getOperand(0);
15978   MVT InVT = In.getSimpleValueType();
15979   SDLoc dl(Op);
15980
15981   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
15982     return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
15983
15984   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
15985       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
15986       (VT != MVT::v16i16 || InVT != MVT::v16i8))
15987     return SDValue();
15988
15989   if (Subtarget->hasInt256())
15990     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
15991
15992   // Optimize vectors in AVX mode
15993   // Sign extend  v8i16 to v8i32 and
15994   //              v4i32 to v4i64
15995   //
15996   // Divide input vector into two parts
15997   // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
15998   // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
15999   // concat the vectors to original VT
16000
16001   unsigned NumElems = InVT.getVectorNumElements();
16002   SDValue Undef = DAG.getUNDEF(InVT);
16003
16004   SmallVector<int,8> ShufMask1(NumElems, -1);
16005   for (unsigned i = 0; i != NumElems/2; ++i)
16006     ShufMask1[i] = i;
16007
16008   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
16009
16010   SmallVector<int,8> ShufMask2(NumElems, -1);
16011   for (unsigned i = 0; i != NumElems/2; ++i)
16012     ShufMask2[i] = i + NumElems/2;
16013
16014   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
16015
16016   MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
16017                                 VT.getVectorNumElements()/2);
16018
16019   OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
16020   OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
16021
16022   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16023 }
16024
16025 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
16026 // may emit an illegal shuffle but the expansion is still better than scalar
16027 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
16028 // we'll emit a shuffle and an arithmetic shift.
16029 // TODO: It is possible to support ZExt by zeroing the undef values during
16030 // the shuffle phase or after the shuffle.
16031 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
16032                                  SelectionDAG &DAG) {
16033   MVT RegVT = Op.getSimpleValueType();
16034   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
16035   assert(RegVT.isInteger() &&
16036          "We only custom lower integer vector sext loads.");
16037
16038   // Nothing useful we can do without SSE2 shuffles.
16039   assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
16040
16041   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16042   SDLoc dl(Ld);
16043   EVT MemVT = Ld->getMemoryVT();
16044   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16045   unsigned RegSz = RegVT.getSizeInBits();
16046
16047   ISD::LoadExtType Ext = Ld->getExtensionType();
16048
16049   assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
16050          && "Only anyext and sext are currently implemented.");
16051   assert(MemVT != RegVT && "Cannot extend to the same type");
16052   assert(MemVT.isVector() && "Must load a vector from memory");
16053
16054   unsigned NumElems = RegVT.getVectorNumElements();
16055   unsigned MemSz = MemVT.getSizeInBits();
16056   assert(RegSz > MemSz && "Register size must be greater than the mem size");
16057
16058   if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
16059     // The only way in which we have a legal 256-bit vector result but not the
16060     // integer 256-bit operations needed to directly lower a sextload is if we
16061     // have AVX1 but not AVX2. In that case, we can always emit a sextload to
16062     // a 128-bit vector and a normal sign_extend to 256-bits that should get
16063     // correctly legalized. We do this late to allow the canonical form of
16064     // sextload to persist throughout the rest of the DAG combiner -- it wants
16065     // to fold together any extensions it can, and so will fuse a sign_extend
16066     // of an sextload into a sextload targeting a wider value.
16067     SDValue Load;
16068     if (MemSz == 128) {
16069       // Just switch this to a normal load.
16070       assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
16071                                        "it must be a legal 128-bit vector "
16072                                        "type!");
16073       Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
16074                   Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
16075                   Ld->isInvariant(), Ld->getAlignment());
16076     } else {
16077       assert(MemSz < 128 &&
16078              "Can't extend a type wider than 128 bits to a 256 bit vector!");
16079       // Do an sext load to a 128-bit vector type. We want to use the same
16080       // number of elements, but elements half as wide. This will end up being
16081       // recursively lowered by this routine, but will succeed as we definitely
16082       // have all the necessary features if we're using AVX1.
16083       EVT HalfEltVT =
16084           EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
16085       EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
16086       Load =
16087           DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
16088                          Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
16089                          Ld->isNonTemporal(), Ld->isInvariant(),
16090                          Ld->getAlignment());
16091     }
16092
16093     // Replace chain users with the new chain.
16094     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16095     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16096
16097     // Finally, do a normal sign-extend to the desired register.
16098     return DAG.getSExtOrTrunc(Load, dl, RegVT);
16099   }
16100
16101   // All sizes must be a power of two.
16102   assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
16103          "Non-power-of-two elements are not custom lowered!");
16104
16105   // Attempt to load the original value using scalar loads.
16106   // Find the largest scalar type that divides the total loaded size.
16107   MVT SclrLoadTy = MVT::i8;
16108   for (MVT Tp : MVT::integer_valuetypes()) {
16109     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16110       SclrLoadTy = Tp;
16111     }
16112   }
16113
16114   // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
16115   if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16116       (64 <= MemSz))
16117     SclrLoadTy = MVT::f64;
16118
16119   // Calculate the number of scalar loads that we need to perform
16120   // in order to load our vector from memory.
16121   unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
16122
16123   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
16124          "Can only lower sext loads with a single scalar load!");
16125
16126   unsigned loadRegZize = RegSz;
16127   if (Ext == ISD::SEXTLOAD && RegSz == 256)
16128     loadRegZize /= 2;
16129
16130   // Represent our vector as a sequence of elements which are the
16131   // largest scalar that we can load.
16132   EVT LoadUnitVecVT = EVT::getVectorVT(
16133       *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
16134
16135   // Represent the data using the same element type that is stored in
16136   // memory. In practice, we ''widen'' MemVT.
16137   EVT WideVecVT =
16138       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16139                        loadRegZize / MemVT.getScalarType().getSizeInBits());
16140
16141   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16142          "Invalid vector type");
16143
16144   // We can't shuffle using an illegal type.
16145   assert(TLI.isTypeLegal(WideVecVT) &&
16146          "We only lower types that form legal widened vector types");
16147
16148   SmallVector<SDValue, 8> Chains;
16149   SDValue Ptr = Ld->getBasePtr();
16150   SDValue Increment =
16151       DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy());
16152   SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16153
16154   for (unsigned i = 0; i < NumLoads; ++i) {
16155     // Perform a single load.
16156     SDValue ScalarLoad =
16157         DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
16158                     Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
16159                     Ld->getAlignment());
16160     Chains.push_back(ScalarLoad.getValue(1));
16161     // Create the first element type using SCALAR_TO_VECTOR in order to avoid
16162     // another round of DAGCombining.
16163     if (i == 0)
16164       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
16165     else
16166       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
16167                         ScalarLoad, DAG.getIntPtrConstant(i));
16168
16169     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16170   }
16171
16172   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16173
16174   // Bitcast the loaded value to a vector of the original element type, in
16175   // the size of the target vector type.
16176   SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
16177   unsigned SizeRatio = RegSz / MemSz;
16178
16179   if (Ext == ISD::SEXTLOAD) {
16180     // If we have SSE4.1, we can directly emit a VSEXT node.
16181     if (Subtarget->hasSSE41()) {
16182       SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
16183       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16184       return Sext;
16185     }
16186
16187     // Otherwise we'll shuffle the small elements in the high bits of the
16188     // larger type and perform an arithmetic shift. If the shift is not legal
16189     // it's better to scalarize.
16190     assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
16191            "We can't implement a sext load without an arithmetic right shift!");
16192
16193     // Redistribute the loaded elements into the different locations.
16194     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16195     for (unsigned i = 0; i != NumElems; ++i)
16196       ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
16197
16198     SDValue Shuff = DAG.getVectorShuffle(
16199         WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16200
16201     Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16202
16203     // Build the arithmetic shift.
16204     unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
16205                    MemVT.getVectorElementType().getSizeInBits();
16206     Shuff =
16207         DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT));
16208
16209     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16210     return Shuff;
16211   }
16212
16213   // Redistribute the loaded elements into the different locations.
16214   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16215   for (unsigned i = 0; i != NumElems; ++i)
16216     ShuffleVec[i * SizeRatio] = i;
16217
16218   SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16219                                        DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16220
16221   // Bitcast to the requested type.
16222   Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16223   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16224   return Shuff;
16225 }
16226
16227 // isAndOrOfSetCCs - Return true if node is an ISD::AND or
16228 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
16229 // from the AND / OR.
      //
      // Opc is an output parameter: it is overwritten with Op's opcode on every
      // call, including calls that return false.
16230 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
16231   Opc = Op.getOpcode();
        // Anything other than an AND / OR is rejected before its operands are
        // looked at.
16232   if (Opc != ISD::OR && Opc != ISD::AND)
16233     return false;
        // Both operands must be X86ISD::SETCC values with exactly one use each.
16234   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16235           Op.getOperand(0).hasOneUse() &&
16236           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
16237           Op.getOperand(1).hasOneUse());
16238 }
16239
16240 // isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
16241 // 1 and that the SETCC node has a single use.
      //
      // The constant 1 is only looked for in operand 1 of the XOR and the SETCC
      // only in operand 0; the swapped operand order is not matched here.
16242 static bool isXor1OfSetCC(SDValue Op) {
16243   if (Op.getOpcode() != ISD::XOR)
16244     return false;
        // dyn_cast yields null when operand 1 is not a ConstantSDNode.
16245   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16246   if (N1C && N1C->getAPIntValue() == 1) {
16247     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16248       Op.getOperand(0).hasOneUse();
16249   }
16250   return false;
16251 }
16252
16253 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
        // Lower a conditional branch node whose operands are (chain, condition,
        // destination) into an X86ISD::BRCOND node with operands
        // (Chain, Dest, CC, Cond). CC is the X86 condition-code operand and Cond
        // is the node whose result the branch tests.
        //
        // The body tries a series of pattern matches on the condition. Each
        // successful match fills in CC/Cond and clears addTest. Some matches
        // (FCMP_OEQ / FCMP_UNE style conditions) emit a first X86ISD::BRCOND
        // into Chain and leave a second one to be built by the common tail.
        //
        // addTest: stays true until a pattern supplies both CC and Cond; if it is
        // still true at the end, an explicit test of Cond is emitted via EmitTest.
16254   bool addTest = true;
16255   SDValue Chain = Op.getOperand(0);
16256   SDValue Cond  = Op.getOperand(1);
16257   SDValue Dest  = Op.getOperand(2);
16258   SDLoc dl(Op);
16259   SDValue CC;
        // Inverted: set when the condition has the form "overflow flag == 0";
        // the condition code chosen later is flipped to compensate.
16260   bool Inverted = false;
16261
16262   if (Cond.getOpcode() == ISD::SETCC) {
16263     // Check for setcc([su]{add,sub,mul}o == 0).
16264     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
16265         isa<ConstantSDNode>(Cond.getOperand(1)) &&
16266         cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
16267         Cond.getOperand(0).getResNo() == 1 &&
16268         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
16269          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
16270          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
16271          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
16272          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
16273          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
            // Branch directly on the overflow result (result number 1 of the
            // arithmetic node) and remember that the sense is inverted.
16274       Inverted = true;
16275       Cond = Cond.getOperand(0);
16276     } else {
            // Any other setcc goes through LowerSETCC; its result is used only
            // if it returned a node.
16277       SDValue NewCond = LowerSETCC(Cond, DAG);
16278       if (NewCond.getNode())
16279         Cond = NewCond;
16280     }
16281   }
16282 #if 0
16283   // FIXME: LowerXALUO doesn't handle these!!
16284   else if (Cond.getOpcode() == X86ISD::ADD  ||
16285            Cond.getOpcode() == X86ISD::SUB  ||
16286            Cond.getOpcode() == X86ISD::SMUL ||
16287            Cond.getOpcode() == X86ISD::UMUL)
16288     Cond = LowerXALUO(Cond, DAG);
16289 #endif
16290
16291   // Look past (and (setcc_carry (cmp ...)), 1).
16292   if (Cond.getOpcode() == ISD::AND &&
16293       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16294     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16295     if (C && C->getAPIntValue() == 1)
16296       Cond = Cond.getOperand(0);
16297   }
16298
16299   // If condition flag is set by a X86ISD::CMP, then use it as the condition
16300   // setting operand in place of the X86ISD::SETCC.
16301   unsigned CondOpcode = Cond.getOpcode();
16302   if (CondOpcode == X86ISD::SETCC ||
16303       CondOpcode == X86ISD::SETCC_CARRY) {
          // Operand 0 of the SETCC is its condition code, operand 1 the node it
          // reads its flags from.
16304     CC = Cond.getOperand(0);
16305
16306     SDValue Cmp = Cond.getOperand(1);
16307     unsigned Opc = Cmp.getOpcode();
16308     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
16309     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
16310       Cond = Cmp;
16311       addTest = false;
16312     } else {
16313       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
16314       default: break;
16315       case X86::COND_O:
16316       case X86::COND_B:
16317         // These can only come from an arithmetic instruction with overflow,
16318         // e.g. SADDO, UADDO.
16319         Cond = Cond.getNode()->getOperand(1);
16320         addTest = false;
16321         break;
16322       }
16323     }
16324   }
        // Cond may have been replaced above, so re-read its opcode.
16325   CondOpcode = Cond.getOpcode();
        // Overflow arithmetic used directly as the condition. Multiplies whose
        // operand type is i8 are excluded by this test; the reason is not visible
        // in this function.
16326   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16327       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16328       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16329        Cond.getOperand(0).getValueType() != MVT::i8)) {
16330     SDValue LHS = Cond.getOperand(0);
16331     SDValue RHS = Cond.getOperand(1);
16332     unsigned X86Opcode;
16333     unsigned X86Cond;
16334     SDVTList VTs;
16335     // Keep this in sync with LowerXALUO, otherwise we might create redundant
16336     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
16337     // X86ISD::INC).
16338     switch (CondOpcode) {
16339     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16340     case ISD::SADDO:
            // Adding the constant 1 is emitted as INC instead of ADD.
16341       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16342         if (C->isOne()) {
16343           X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
16344           break;
16345         }
16346       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16347     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16348     case ISD::SSUBO:
            // Subtracting the constant 1 is emitted as DEC instead of SUB.
16349       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16350         if (C->isOne()) {
16351           X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
16352           break;
16353         }
16354       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16355     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16356     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16357     default: llvm_unreachable("unexpected overflowing operator");
16358     }
16359     if (Inverted)
16360       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
          // The node built for UMULO has three results (two of the operand type
          // plus an i32), so the i32 result branched on is number 2; every other
          // opcode here has two results and the i32 one is number 1.
16361     if (CondOpcode == ISD::UMULO)
16362       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16363                           MVT::i32);
16364     else
16365       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16366
16367     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
16368
16369     if (CondOpcode == ISD::UMULO)
16370       Cond = X86Op.getValue(2);
16371     else
16372       Cond = X86Op.getValue(1);
16373
16374     CC = DAG.getConstant(X86Cond, MVT::i8);
16375     addTest = false;
16376   } else {
16377     unsigned CondOpc;
16378     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
            // Cmp is the flag source of the first SETCC; the two-branch forms
            // below only apply when the second SETCC reads the same node.
16379       SDValue Cmp = Cond.getOperand(0).getOperand(1);
16380       if (CondOpc == ISD::OR) {
16381         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
16382         // two branches instead of an explicit OR instruction with a
16383         // separate test.
16384         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16385             isX86LogicalCmp(Cmp)) {
                // Emit the branch for the first SETCC now, threaded into Chain;
                // the branch for the second SETCC's condition code is built by
                // the common code at the end of the function.
16386           CC = Cond.getOperand(0).getOperand(0);
16387           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16388                               Chain, Dest, CC, Cmp);
16389           CC = Cond.getOperand(1).getOperand(0);
16390           Cond = Cmp;
16391           addTest = false;
16392         }
16393       } else { // ISD::AND
16394         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
16395         // two branches instead of an explicit AND instruction with a
16396         // separate test. However, we only do this if this block doesn't
16397         // have a fall-through edge, because this requires an explicit
16398         // jmp when the condition is false.
16399         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16400             isX86LogicalCmp(Cmp) &&
16401             Op.getNode()->hasOneUse()) {
16402           X86::CondCode CCode =
16403             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16404           CCode = X86::GetOppositeBranchCondition(CCode);
16405           CC = DAG.getConstant(CCode, MVT::i8);
16406           SDNode *User = *Op.getNode()->use_begin();
16407           // Look for an unconditional branch following this conditional branch.
16408           // We need this because we need to reverse the successors in order
16409           // to implement FCMP_OEQ.
16410           if (User->getOpcode() == ISD::BR) {
                  // Swap the two successors: the following unconditional BR is
                  // rewritten to jump to the original Dest, and both conditional
                  // branches emitted here target the BR's old destination using
                  // the opposite condition codes.
16411             SDValue FalseBB = User->getOperand(1);
16412             SDNode *NewBR =
16413               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16414             assert(NewBR == User);
16415             (void)NewBR;
16416             Dest = FalseBB;
16417
16418             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16419                                 Chain, Dest, CC, Cmp);
                  // This CCode intentionally shadows the outer one; it holds the
                  // opposite of the second SETCC's condition code.
16420             X86::CondCode CCode =
16421               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
16422             CCode = X86::GetOppositeBranchCondition(CCode);
16423             CC = DAG.getConstant(CCode, MVT::i8);
16424             Cond = Cmp;
16425             addTest = false;
16426           }
16427         }
16428       }
16429     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
16430       // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
16431       // It should be transformed during dag combiner except when the condition
16432       // is set by an arithmetic-with-overflow node.
16433       X86::CondCode CCode =
16434         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16435       CCode = X86::GetOppositeBranchCondition(CCode);
16436       CC = DAG.getConstant(CCode, MVT::i8);
16437       Cond = Cond.getOperand(0).getOperand(1);
16438       addTest = false;
16439     } else if (Cond.getOpcode() == ISD::SETCC &&
16440                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
16441       // For FCMP_OEQ, we can emit
16442       // two branches instead of an explicit AND instruction with a
16443       // separate test. However, we only do this if this block doesn't
16444       // have a fall-through edge, because this requires an explicit
16445       // jmp when the condition is false.
16446       if (Op.getNode()->hasOneUse()) {
16447         SDNode *User = *Op.getNode()->use_begin();
16448         // Look for an unconditional branch following this conditional branch.
16449         // We need this because we need to reverse the successors in order
16450         // to implement FCMP_OEQ.
16451         if (User->getOpcode() == ISD::BR) {
16452           SDValue FalseBB = User->getOperand(1);
16453           SDNode *NewBR =
16454             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16455           assert(NewBR == User);
16456           (void)NewBR;
16457           Dest = FalseBB;
16458
                // Branch to the old false block on COND_NE here and on COND_P in
                // the common tail; the rewritten BR reaches the original Dest.
16459           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16460                                     Cond.getOperand(0), Cond.getOperand(1));
16461           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16462           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16463           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16464                               Chain, Dest, CC, Cmp);
16465           CC = DAG.getConstant(X86::COND_P, MVT::i8);
16466           Cond = Cmp;
16467           addTest = false;
16468         }
16469       }
16470     } else if (Cond.getOpcode() == ISD::SETCC &&
16471                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
16472       // For FCMP_UNE, we can emit
16473       // two branches instead of an explicit OR instruction with a
16474       // separate test. However, we only do this if this block doesn't
16475       // have a fall-through edge, because this requires an explicit
16476       // jmp when the condition is false.
16477       if (Op.getNode()->hasOneUse()) {
16478         SDNode *User = *Op.getNode()->use_begin();
16479         // Look for an unconditional branch following this conditional branch.
16480         // We need this because we need to reverse the successors in order
16481         // to implement FCMP_UNE.
16482         if (User->getOpcode() == ISD::BR) {
16483           SDValue FalseBB = User->getOperand(1);
16484           SDNode *NewBR =
16485             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16486           assert(NewBR == User);
16487           (void)NewBR;
16488
                // Unlike the FCMP_OEQ case, the first branch (COND_NE) still
                // targets the original Dest; only the second branch (COND_NP),
                // built in the common tail, targets the old false block.
16489           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16490                                     Cond.getOperand(0), Cond.getOperand(1));
16491           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16492           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16493           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16494                               Chain, Dest, CC, Cmp);
16495           CC = DAG.getConstant(X86::COND_NP, MVT::i8);
16496           Cond = Cmp;
16497           addTest = false;
16498           Dest = FalseBB;
16499         }
16500       }
16501     }
16502   }
16503
16504   if (addTest) {
16505     // Look past the truncate if the high bits are known zero.
16506     if (isTruncWithZeroHighBitsInput(Cond, DAG))
16507         Cond = Cond.getOperand(0);
16508
16509     // We know the result of AND is compared against zero. Try to match
16510     // it to BT.
16511     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16512       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
16513       if (NewSetCC.getNode()) {
16514         CC = NewSetCC.getOperand(0);
16515         Cond = NewSetCC.getOperand(1);
16516         addTest = false;
16517       }
16518     }
16519   }
16520
        // No pattern matched: test Cond explicitly, branching on COND_NE, or on
        // COND_E when the original condition was an inverted overflow check.
16521   if (addTest) {
16522     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
16523     CC = DAG.getConstant(X86Cond, MVT::i8);
16524     Cond = EmitTest(Cond, X86Cond, dl, DAG);
16525   }
16526   Cond = ConvertCmpIfNecessary(Cond, DAG);
16527   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16528                      Chain, Dest, CC, Cond);
16529 }
16530
16531 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
16532 // Calls to _alloca are needed to probe the stack when allocating more than 4k
16533 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
16534 // that the guard pages used by the OS virtual memory manager are allocated in
16535 // correct sequence.
      //
      // Op's operands are (chain, size, alignment constant). Three strategies are
      // used, each returning the merged pair (allocated address, chain):
      //  - neither Windows (non-MachO) nor split stacks: subtract the size from
      //    the stack pointer inline, bracketed by CALLSEQ_START / CALLSEQ_END;
      //  - split stacks: emit an X86ISD::SEG_ALLOCA node;
      //  - otherwise: emit an X86ISD::WIN_ALLOCA node with the size in RAX/EAX.
16536 SDValue
16537 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16538                                            SelectionDAG &DAG) const {
16539   MachineFunction &MF = DAG.getMachineFunction();
16540   bool SplitStack = MF.shouldSplitStack();
        // Lower == true selects one of the two special node forms below.
16541   bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
16542                SplitStack;
16543   SDLoc dl(Op);
16544
16545   if (!Lower) {
16546     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16547     SDNode* Node = Op.getNode();
16548
16549     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
16550     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
16551         " not tell us which reg is the stack pointer!");
16552     EVT VT = Node->getValueType(0);
          // Tmp1 / Tmp2 start as the node's two results and are later reused to
          // hold the new address and the new chain; Tmp3 is the alignment
          // operand.
16553     SDValue Tmp1 = SDValue(Node, 0);
16554     SDValue Tmp2 = SDValue(Node, 1);
16555     SDValue Tmp3 = Node->getOperand(2);
16556     SDValue Chain = Tmp1.getOperand(0);
16557
16558     // Chain the dynamic stack allocation so that it doesn't modify the stack
16559     // pointer when other instructions are using the stack.
16560     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
16561         SDLoc(Node));
16562
16563     SDValue Size = Tmp2.getOperand(1);
16564     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
16565     Chain = SP.getValue(1);
16566     unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
16567     const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering();
16568     unsigned StackAlign = TFI.getStackAlignment();
16569     Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
          // Only mask the new address when the requested alignment exceeds the
          // stack alignment. The -Align mask clears the low bits, assuming Align
          // is a power of two (TODO confirm).
16570     if (Align > StackAlign)
16571       Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
16572           DAG.getConstant(-(uint64_t)Align, VT));
16573     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
16574
16575     Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
16576         DAG.getIntPtrConstant(0, true), SDValue(),
16577         SDLoc(Node));
16578
16579     SDValue Ops[2] = { Tmp1, Tmp2 };
16580     return DAG.getMergeValues(Ops, dl);
16581   }
16582
16583   // Get the inputs.
16584   SDValue Chain = Op.getOperand(0);
16585   SDValue Size  = Op.getOperand(1);
16586   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
16587   EVT VT = Op.getNode()->getValueType(0);
16588
16589   bool Is64Bit = Subtarget->is64Bit();
16590   EVT SPTy = getPointerTy();
16591
16592   if (SplitStack) {
16593     MachineRegisterInfo &MRI = MF.getRegInfo();
16594
16595     if (Is64Bit) {
16596       // The 64 bit implementation of segmented stacks needs to clobber both r10
16597       // r11. This makes it impossible to use it along with nested parameters.
16598       const Function *F = MF.getFunction();
16599
16600       for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
16601            I != E; ++I)
16602         if (I->hasNestAttr())
16603           report_fatal_error("Cannot use segmented stacks with functions that "
16604                              "have nested arguments.");
16605     }
16606
          // The size is copied into a fresh virtual register of the pointer
          // register class, which is the SEG_ALLOCA node's only non-chain
          // operand. Align is not used on this path.
16607     const TargetRegisterClass *AddrRegClass =
16608       getRegClassFor(getPointerTy());
16609     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
16610     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
16611     SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
16612                                 DAG.getRegister(Vreg, SPTy));
16613     SDValue Ops1[2] = { Value, Chain };
16614     return DAG.getMergeValues(Ops1, dl);
16615   } else {
16616     SDValue Flag;
          // The size is passed to WIN_ALLOCA in RAX for LP64 targets and in EAX
          // otherwise; the copy is glued to the WIN_ALLOCA node.
16617     const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
16618
16619     Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
16620     Flag = Chain.getValue(1);
16621     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16622
16623     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
16624
          // The result address is read back from the stack register after the
          // WIN_ALLOCA node.
16625     const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
16626         DAG.getSubtarget().getRegisterInfo());
16627     unsigned SPReg = RegInfo->getStackRegister();
16628     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
16629     Chain = SP.getValue(1);
16630
          // Any non-zero alignment masks the stack pointer and writes it back;
          // unlike the inline path, it is not compared against the stack
          // alignment first.
16631     if (Align) {
16632       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16633                        DAG.getConstant(-(uint64_t)Align, VT));
16634       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
16635     }
16636
16637     SDValue Ops1[2] = { SP, Chain };
16638     return DAG.getMergeValues(Ops1, dl);
16639   }
16640 }
16641
16642 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
        // Lower a va_start node. Operand 0 is the chain, operand 1 the address to
        // initialize, and operand 2 a SrcValue describing that memory. Returns
        // the chain of the emitted store(s).
16643   MachineFunction &MF = DAG.getMachineFunction();
16644   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
16645
16646   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16647   SDLoc DL(Op);
16648
16649   if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
16650     // vastart just stores the address of the VarArgsFrameIndex slot into the
16651     // memory location argument.
16652     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16653                                    getPointerTy());
16654     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
16655                         MachinePointerInfo(SV), false, false, 0);
16656   }
16657
16658   // __va_list_tag:
16659   //   gp_offset         (0 - 6 * 8)
16660   //   fp_offset         (48 - 48 + 8 * 16)
16661   //   overflow_arg_area (point to parameters coming in memory).
16662   //   reg_save_area
        //
        // The four fields are written at byte offsets 0, 4, 8 and 16. Every store
        // uses the incoming chain (operand 0) rather than the previous store, and
        // the stores are joined by a TokenFactor at the end.
16663   SmallVector<SDValue, 8> MemOps;
        // FIN tracks the address of the field currently being written.
16664   SDValue FIN = Op.getOperand(1);
16665   // Store gp_offset
16666   SDValue Store = DAG.getStore(Op.getOperand(0), DL,
16667                                DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
16668                                                MVT::i32),
16669                                FIN, MachinePointerInfo(SV), false, false, 0);
16670   MemOps.push_back(Store);
16671
16672   // Store fp_offset
16673   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16674                     FIN, DAG.getIntPtrConstant(4));
16675   Store = DAG.getStore(Op.getOperand(0), DL,
16676                        DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
16677                                        MVT::i32),
16678                        FIN, MachinePointerInfo(SV, 4), false, false, 0);
16679   MemOps.push_back(Store);
16680
16681   // Store ptr to overflow_arg_area
        // (FIN advances from offset 4 to offset 8.)
16682   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16683                     FIN, DAG.getIntPtrConstant(4));
16684   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16685                                     getPointerTy());
16686   Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
16687                        MachinePointerInfo(SV, 8),
16688                        false, false, 0);
16689   MemOps.push_back(Store);
16690
16691   // Store ptr to reg_save_area.
        // (FIN advances from offset 8 to offset 16.)
16692   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16693                     FIN, DAG.getIntPtrConstant(8));
16694   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
16695                                     getPointerTy());
16696   Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
16697                        MachinePointerInfo(SV, 16), false, false, 0);
16698   MemOps.push_back(Store);
16699   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
16700 }
16701
16702 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
        // Lower a va_arg node in two steps: an X86ISD::VAARG_64 node that yields
        // the address of the next argument, followed by a load of the argument
        // type from that address. Op's operands are (chain, va_list pointer,
        // SrcValue, alignment constant).
16703   assert(Subtarget->is64Bit() &&
16704          "LowerVAARG only handles 64-bit va_arg!");
16705   assert((Subtarget->isTargetLinux() ||
16706           Subtarget->isTargetDarwin()) &&
16707           "Unhandled target in LowerVAARG");
16708   assert(Op.getNode()->getNumOperands() == 4);
16709   SDValue Chain = Op.getOperand(0);
16710   SDValue SrcPtr = Op.getOperand(1);
16711   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16712   unsigned Align = Op.getConstantOperandVal(3);
16713   SDLoc dl(Op);
16714
16715   EVT ArgVT = Op.getNode()->getValueType(0);
16716   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
16717   uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
        // ArgMode selects the va_list offset field VAARG_64 uses: 1 = gp_offset,
        // 2 = fp_offset. Every path that does not assign it is unreachable.
16718   uint8_t ArgMode;
16719
16720   // Decide which area this value should be read from.
16721   // TODO: Implement the AMD64 ABI in its entirety. This simple
16722   // selection mechanism works only for the basic types.
16723   if (ArgVT == MVT::f80) {
16724     llvm_unreachable("va_arg for f80 not yet implemented");
16725   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
16726     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
16727   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
16728     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
16729   } else {
16730     llvm_unreachable("Unhandled argument type in LowerVAARG");
16731   }
16732
16733   if (ArgMode == 2) {
16734     // Sanity Check: Make sure using fp_offset makes sense.
16735     assert(!DAG.getTarget().Options.UseSoftFloat &&
16736            !(DAG.getMachineFunction()
16737                 .getFunction()->getAttributes()
16738                 .hasAttribute(AttributeSet::FunctionIndex,
16739                               Attribute::NoImplicitFloat)) &&
16740            Subtarget->hasSSE1());
16741   }
16742
16743   // Insert VAARG_64 node into the DAG
16744   // VAARG_64 returns two values: Variable Argument Address, Chain
        // Its operands are (chain, va_list pointer, argument size, argument mode,
        // alignment). It is created as a memory intrinsic that both reads and
        // writes the memory described by SV.
16745   SmallVector<SDValue, 11> InstOps;
16746   InstOps.push_back(Chain);
16747   InstOps.push_back(SrcPtr);
16748   InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
16749   InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
16750   InstOps.push_back(DAG.getConstant(Align, MVT::i32));
16751   SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
16752   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
16753                                           VTs, InstOps, MVT::i64,
16754                                           MachinePointerInfo(SV),
16755                                           /*Align=*/0,
16756                                           /*Volatile=*/false,
16757                                           /*ReadMem=*/true,
16758                                           /*WriteMem=*/true);
16759   Chain = VAARG.getValue(1);
16760
16761   // Load the next argument and return it
        // The load carries no pointer info and passes 0 as its alignment.
16762   return DAG.getLoad(ArgVT, dl,
16763                      Chain,
16764                      VAARG,
16765                      MachinePointerInfo(),
16766                      false, false, false, 0);
16767 }
16768
16769 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
16770                            SelectionDAG &DAG) {
16771   // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
16772   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
16773   SDValue Chain = Op.getOperand(0);
16774   SDValue DstPtr = Op.getOperand(1);
16775   SDValue SrcPtr = Op.getOperand(2);
16776   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
16777   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
16778   SDLoc DL(Op);
16779
16780   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
16781                        DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
16782                        false,
16783                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
16784 }
16785
16786 // getTargetVShiftByConstNode - Handle vector element shifts where the shift
16787 // amount is a constant. Takes immediate version of shift as input.
16788 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
16789                                           SDValue SrcOp, uint64_t ShiftAmt,
16790                                           SelectionDAG &DAG) {
16791   MVT ElementType = VT.getVectorElementType();
16792
16793   // Fold this packed shift into its first operand if ShiftAmt is 0.
16794   if (ShiftAmt == 0)
16795     return SrcOp;
16796
16797   // Check for ShiftAmt >= element width
16798   if (ShiftAmt >= ElementType.getSizeInBits()) {
16799     if (Opc == X86ISD::VSRAI)
16800       ShiftAmt = ElementType.getSizeInBits() - 1;
16801     else
16802       return DAG.getConstant(0, VT);
16803   }
16804
16805   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
16806          && "Unknown target vector shift-by-constant node");
16807
16808   // Fold this packed vector shift into a build vector if SrcOp is a
16809   // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
16810   if (VT == SrcOp.getSimpleValueType() &&
16811       ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
16812     SmallVector<SDValue, 8> Elts;
16813     unsigned NumElts = SrcOp->getNumOperands();
16814     ConstantSDNode *ND;
16815
16816     switch(Opc) {
16817     default: llvm_unreachable(nullptr);
16818     case X86ISD::VSHLI:
16819       for (unsigned i=0; i!=NumElts; ++i) {
16820         SDValue CurrentOp = SrcOp->getOperand(i);
16821         if (CurrentOp->getOpcode() == ISD::UNDEF) {
16822           Elts.push_back(CurrentOp);
16823           continue;
16824         }
16825         ND = cast<ConstantSDNode>(CurrentOp);
16826         const APInt &C = ND->getAPIntValue();
16827         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
16828       }
16829       break;
16830     case X86ISD::VSRLI:
16831       for (unsigned i=0; i!=NumElts; ++i) {
16832         SDValue CurrentOp = SrcOp->getOperand(i);
16833         if (CurrentOp->getOpcode() == ISD::UNDEF) {
16834           Elts.push_back(CurrentOp);
16835           continue;
16836         }
16837         ND = cast<ConstantSDNode>(CurrentOp);
16838         const APInt &C = ND->getAPIntValue();
16839         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
16840       }
16841       break;
16842     case X86ISD::VSRAI:
16843       for (unsigned i=0; i!=NumElts; ++i) {
16844         SDValue CurrentOp = SrcOp->getOperand(i);
16845         if (CurrentOp->getOpcode() == ISD::UNDEF) {
16846           Elts.push_back(CurrentOp);
16847           continue;
16848         }
16849         ND = cast<ConstantSDNode>(CurrentOp);
16850         const APInt &C = ND->getAPIntValue();
16851         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
16852       }
16853       break;
16854     }
16855
16856     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
16857   }
16858
16859   return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
16860 }
16861
16862 // getTargetVShiftNode - Handle vector element shifts where the shift amount
16863 // may or may not be a constant. Takes immediate version of shift as input.
16864 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
16865                                    SDValue SrcOp, SDValue ShAmt,
16866                                    SelectionDAG &DAG) {
16867   MVT SVT = ShAmt.getSimpleValueType();
16868   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
16869
16870   // Catch shift-by-constant.
16871   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
16872     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
16873                                       CShAmt->getZExtValue(), DAG);
16874
16875   // Change opcode to non-immediate version
16876   switch (Opc) {
16877     default: llvm_unreachable("Unknown target vector shift node");
16878     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
16879     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
16880     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
16881   }
16882
16883   const X86Subtarget &Subtarget =
16884       DAG.getTarget().getSubtarget<X86Subtarget>();
16885   if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
16886       ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
16887     // Let the shuffle legalizer expand this shift amount node.
16888     SDValue Op0 = ShAmt.getOperand(0);
16889     Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
16890     ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
16891   } else {
16892     // Need to build a vector containing shift amount.
16893     // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
16894     SmallVector<SDValue, 4> ShOps;
16895     ShOps.push_back(ShAmt);
16896     if (SVT == MVT::i32) {
16897       ShOps.push_back(DAG.getConstant(0, SVT));
16898       ShOps.push_back(DAG.getUNDEF(SVT));
16899     }
16900     ShOps.push_back(DAG.getUNDEF(SVT));
16901
16902     MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
16903     ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
16904   }
16905
16906   // The return type has to be a 128-bit type with the same element
16907   // type as the input type.
16908   MVT EltVT = VT.getVectorElementType();
16909   EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
16910
16911   ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
16912   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
16913 }
16914
16915 /// \brief Return (and \p Op, \p Mask) for compare instructions or
16916 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
16917 /// necessary casting for \p Mask when lowering masking intrinsics.
16918 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
16919                                     SDValue PreservedSrc,
16920                                     const X86Subtarget *Subtarget,
16921                                     SelectionDAG &DAG) {
16922     EVT VT = Op.getValueType();
16923     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
16924                                   MVT::i1, VT.getVectorNumElements());
16925     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
16926                                      Mask.getValueType().getSizeInBits());
16927     SDLoc dl(Op);
16928
16929     assert(MaskVT.isSimple() && "invalid mask type");
16930
16931     if (isAllOnes(Mask))
16932       return Op;
16933
16934     // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
16935     // are extracted by EXTRACT_SUBVECTOR.
16936     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
16937                               DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
16938                               DAG.getIntPtrConstant(0));
16939
16940     switch (Op.getOpcode()) {
16941       default: break;
16942       case X86ISD::PCMPEQM:
16943       case X86ISD::PCMPGTM:
16944       case X86ISD::CMPM:
16945       case X86ISD::CMPMU:
16946         return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
16947     }
16948     if (PreservedSrc.getOpcode() == ISD::UNDEF)
16949       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
16950     return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
16951 }
16952
16953 /// \brief Creates an SDNode for a predicated scalar operation.
16954 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
16955 /// The mask is comming as MVT::i8 and it should be truncated
16956 /// to MVT::i1 while lowering masking intrinsics.
16957 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
16958 /// "X86select" instead of "vselect". We just can't create the "vselect" node for
16959 /// a scalar instruction.
16960 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
16961                                     SDValue PreservedSrc,
16962                                     const X86Subtarget *Subtarget,
16963                                     SelectionDAG &DAG) {
16964     if (isAllOnes(Mask))
16965       return Op;
16966
16967     EVT VT = Op.getValueType();
16968     SDLoc dl(Op);
16969     // The mask should be of type MVT::i1
16970     SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
16971
16972     if (PreservedSrc.getOpcode() == ISD::UNDEF)
16973       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
16974     return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
16975 }
16976
16977 static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {
16978     switch (IntNo) {
16979     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
16980     case Intrinsic::x86_fma_vfmadd_ps:
16981     case Intrinsic::x86_fma_vfmadd_pd:
16982     case Intrinsic::x86_fma_vfmadd_ps_256:
16983     case Intrinsic::x86_fma_vfmadd_pd_256:
16984     case Intrinsic::x86_fma_mask_vfmadd_ps_512:
16985     case Intrinsic::x86_fma_mask_vfmadd_pd_512:
16986       return X86ISD::FMADD;
16987     case Intrinsic::x86_fma_vfmsub_ps:
16988     case Intrinsic::x86_fma_vfmsub_pd:
16989     case Intrinsic::x86_fma_vfmsub_ps_256:
16990     case Intrinsic::x86_fma_vfmsub_pd_256:
16991     case Intrinsic::x86_fma_mask_vfmsub_ps_512:
16992     case Intrinsic::x86_fma_mask_vfmsub_pd_512:
16993       return X86ISD::FMSUB;
16994     case Intrinsic::x86_fma_vfnmadd_ps:
16995     case Intrinsic::x86_fma_vfnmadd_pd:
16996     case Intrinsic::x86_fma_vfnmadd_ps_256:
16997     case Intrinsic::x86_fma_vfnmadd_pd_256:
16998     case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
16999     case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
17000       return X86ISD::FNMADD;
17001     case Intrinsic::x86_fma_vfnmsub_ps:
17002     case Intrinsic::x86_fma_vfnmsub_pd:
17003     case Intrinsic::x86_fma_vfnmsub_ps_256:
17004     case Intrinsic::x86_fma_vfnmsub_pd_256:
17005     case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
17006     case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
17007       return X86ISD::FNMSUB;
17008     case Intrinsic::x86_fma_vfmaddsub_ps:
17009     case Intrinsic::x86_fma_vfmaddsub_pd:
17010     case Intrinsic::x86_fma_vfmaddsub_ps_256:
17011     case Intrinsic::x86_fma_vfmaddsub_pd_256:
17012     case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
17013     case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
17014       return X86ISD::FMADDSUB;
17015     case Intrinsic::x86_fma_vfmsubadd_ps:
17016     case Intrinsic::x86_fma_vfmsubadd_pd:
17017     case Intrinsic::x86_fma_vfmsubadd_ps_256:
17018     case Intrinsic::x86_fma_vfmsubadd_pd_256:
17019     case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
17020     case Intrinsic::x86_fma_mask_vfmsubadd_pd_512:
17021       return X86ISD::FMSUBADD;
17022     }
17023 }
17024
17025 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17026                                        SelectionDAG &DAG) {
        // Custom-lowers an intrinsic node. Operand 0 of Op is the intrinsic ID
        // (a constant); the intrinsic's own arguments start at operand 1.
        // Lowering is attempted in two stages: first through the descriptor
        // returned by getIntrinsicWithoutChain(), then through an explicit
        // switch on the intrinsic ID. Returns the replacement node, or an empty
        // SDValue() when the intrinsic is not custom lowered here.
17027   SDLoc dl(Op);
17028   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17029   EVT VT = Op.getValueType();
        // A null descriptor means this intrinsic is only (if at all) handled by
        // the IntNo switch further down.
17030   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
17031   if (IntrData) {
17032     switch(IntrData->Type) {
          // Plain 1/2/3-argument intrinsics: forward the arguments unchanged to
          // a node with opcode Opc0 and the intrinsic's result type.
17033     case INTR_TYPE_1OP:
17034       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17035     case INTR_TYPE_2OP:
17036       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17037         Op.getOperand(2));
17038     case INTR_TYPE_3OP:
17039       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17040         Op.getOperand(2), Op.getOperand(3));
          // Operand layout: 1 = source, 2 = pass-through value, 3 = mask,
          // 4 = rounding mode. The rounding mode becomes an operand of the
          // Opc0 node; the mask is applied afterwards.
17041     case INTR_TYPE_1OP_MASK_RM: {
17042       SDValue Src = Op.getOperand(1);
17043       SDValue Src0 = Op.getOperand(2);
17044       SDValue Mask = Op.getOperand(3);
17045       SDValue RoundingMode = Op.getOperand(4);
17046       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
17047                                               RoundingMode),
17048                                   Mask, Src0, Subtarget, DAG);
17049     }
          // Operand layout: 1, 2 = sources, 3 = pass-through value, 4 = mask,
          // 5 = rounding mode. Uses the scalar masking helper.
17050     case INTR_TYPE_SCALAR_MASK_RM: {
17051       SDValue Src1 = Op.getOperand(1);
17052       SDValue Src2 = Op.getOperand(2);
17053       SDValue Src0 = Op.getOperand(3);
17054       SDValue Mask = Op.getOperand(4);
17055       SDValue RoundingMode = Op.getOperand(5);
17056       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
17057                                               RoundingMode),
17058                                   Mask, Src0, Subtarget, DAG);
17059     }
          // Operand layout: 1, 2 = sources, 3 = pass-through value, 4 = mask.
17060     case INTR_TYPE_2OP_MASK: {
17061       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
17062                                               Op.getOperand(2)),
17063                                   Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);
17064     }
17065     case CMP_MASK:
17066     case CMP_MASK_CC: {
17067       // Comparison intrinsics with masks.
17068       // Example of transformation:
17069       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
17070       //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
17071       // (i8 (bitcast
17072       //   (v8i1 (insert_subvector undef,
17073       //           (v2i1 (and (PCMPEQM %a, %b),
17074       //                      (extract_subvector
17075       //                         (v8i1 (bitcast %mask)), 0))), 0))))
            // Note: this VT shadows the function-level VT. It is the type of
            // the compared vectors (operand 1), not the intrinsic's result type.
17076       EVT VT = Op.getOperand(1).getValueType();
17077       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17078                                     VT.getVectorNumElements());
            // CMP_MASK_CC has one extra argument (operand 3, forwarded to the
            // compare node), which pushes the mask from operand 3 to operand 4.
17079       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
17080       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17081                                        Mask.getValueType().getSizeInBits());
17082       SDValue Cmp;
17083       if (IntrData->Type == CMP_MASK_CC) {
17084         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17085                     Op.getOperand(2), Op.getOperand(3));
17086       } else {
17087         assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
17088         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17089                     Op.getOperand(2));
17090       }
17091       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
17092                                              DAG.getTargetConstant(0, MaskVT),
17093                                              Subtarget, DAG);
            // Widen the masked compare result to one i1 per mask bit (upper
            // lanes undefined) so it can be bitcast to the integer result type.
17094       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17095                                 DAG.getUNDEF(BitcastVT), CmpMask,
17096                                 DAG.getIntPtrConstant(0));
17097       return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
17098     }
17099     case COMI: { // Comparison intrinsics
            // Opc1 of the descriptor holds the ISD condition code. The Opc0
            // compare node produces an i32 value that feeds an i8 SETCC, which
            // is then zero-extended to the i32 result.
17100       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
17101       SDValue LHS = Op.getOperand(1);
17102       SDValue RHS = Op.getOperand(2);
17103       unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
17104       assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
17105       SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
17106       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17107                                   DAG.getConstant(X86CC, MVT::i8), Cond);
17108       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17109     }
          // Vector shift: operand 1 is the value, operand 2 the shift amount.
17110     case VSHIFT:
17111       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
17112                                  Op.getOperand(1), Op.getOperand(2), DAG);
          // Masked vector shift: as VSHIFT, with operand 3 as the pass-through
          // value and operand 4 as the mask.
17113     case VSHIFT_MASK:
17114       return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
17115                                                       Op.getSimpleValueType(),
17116                                                       Op.getOperand(1),
17117                                                       Op.getOperand(2), DAG),
17118                                   Op.getOperand(4), Op.getOperand(3), Subtarget,
17119                                   DAG);
          // Operand layout: 1 = data, 2 = pass-through value, 3 = mask. The
          // mask is converted to an i1 vector and handed to the Opc0 node
          // itself rather than applied through a separate select.
17120     case COMPRESS_EXPAND_IN_REG: {
17121       SDValue Mask = Op.getOperand(3);
17122       SDValue DataToCompress = Op.getOperand(1);
17123       SDValue PassThru = Op.getOperand(2);
17124       if (isAllOnes(Mask)) // return data as is
17125         return Op.getOperand(1);
            // VT and dl below shadow the function-level variables; they are
            // initialized from the same Op.
17126       EVT VT = Op.getValueType();
17127       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17128                                     VT.getVectorNumElements());
17129       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17130                                        Mask.getValueType().getSizeInBits());
17131       SDLoc dl(Op);
17132       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17133                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17134                                   DAG.getIntPtrConstant(0));
17135
17136       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
17137                          PassThru);
17138     }
          // Operand layout: 1, 2 = values to blend, 3 = mask. The i1-vector
          // form of the mask is the first operand of the Opc0 node.
17139     case BLEND: {
17140       SDValue Mask = Op.getOperand(3);
            // VT and dl below shadow the function-level variables; they are
            // initialized from the same Op.
17141       EVT VT = Op.getValueType();
17142       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17143                                     VT.getVectorNumElements());
17144       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17145                                        Mask.getValueType().getSizeInBits());
17146       SDLoc dl(Op);
17147       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17148                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17149                                   DAG.getIntPtrConstant(0));
17150       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
17151                          Op.getOperand(2));
17152     }
          // Masked FMA: operands 1-3 are the sources and operand 4 is the mask.
          // Operand 1 doubles as the pass-through value for masked-off lanes.
17153     case FMA_OP_MASK:
17154     {
17155         return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17156             dl, Op.getValueType(),
17157             Op.getOperand(1),
17158             Op.getOperand(2),
17159             Op.getOperand(3)),
17160             Op.getOperand(4), Op.getOperand(1),
17161             Subtarget, DAG);
17162     }
          // Descriptor types not handled above fall through to the switch on
          // the intrinsic ID below.
17163     default:
17164       break;
17165     }
17166   }
17167
17168   switch (IntNo) {
17169   default: return SDValue();    // Don't custom lower most intrinsics.
17170
17171   case Intrinsic::x86_avx512_mask_valign_q_512:
17172   case Intrinsic::x86_avx512_mask_valign_d_512:
17173     // Vector source operands are swapped.
          // Operand 3 is passed through to VALIGN as its third operand;
          // operand 4 is the pass-through value and operand 5 the mask.
17174     return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
17175                                             Op.getValueType(), Op.getOperand(2),
17176                                             Op.getOperand(1),
17177                                             Op.getOperand(3)),
17178                                 Op.getOperand(5), Op.getOperand(4),
17179                                 Subtarget, DAG);
17180
17181   // ptest and testp intrinsics. The intrinsic these come from are designed to
17182   // return an integer value, not just an instruction so lower it to the ptest
17183   // or testp pattern and a setcc for the result.
17184   case Intrinsic::x86_sse41_ptestz:
17185   case Intrinsic::x86_sse41_ptestc:
17186   case Intrinsic::x86_sse41_ptestnzc:
17187   case Intrinsic::x86_avx_ptestz_256:
17188   case Intrinsic::x86_avx_ptestc_256:
17189   case Intrinsic::x86_avx_ptestnzc_256:
17190   case Intrinsic::x86_avx_vtestz_ps:
17191   case Intrinsic::x86_avx_vtestc_ps:
17192   case Intrinsic::x86_avx_vtestnzc_ps:
17193   case Intrinsic::x86_avx_vtestz_pd:
17194   case Intrinsic::x86_avx_vtestc_pd:
17195   case Intrinsic::x86_avx_vtestnzc_pd:
17196   case Intrinsic::x86_avx_vtestz_ps_256:
17197   case Intrinsic::x86_avx_vtestc_ps_256:
17198   case Intrinsic::x86_avx_vtestnzc_ps_256:
17199   case Intrinsic::x86_avx_vtestz_pd_256:
17200   case Intrinsic::x86_avx_vtestc_pd_256:
17201   case Intrinsic::x86_avx_vtestnzc_pd_256: {
          // The inner switch picks the condition code to test and records
          // whether the vtest* (TESTP) or the ptest* (PTEST) node is needed.
          // Each vtest* group deliberately falls through into the matching
          // ptest* group to share the condition code assignment.
17202     bool IsTestPacked = false;
17203     unsigned X86CC;
17204     switch (IntNo) {
17205     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
17206     case Intrinsic::x86_avx_vtestz_ps:
17207     case Intrinsic::x86_avx_vtestz_pd:
17208     case Intrinsic::x86_avx_vtestz_ps_256:
17209     case Intrinsic::x86_avx_vtestz_pd_256:
17210       IsTestPacked = true; // Fallthrough
17211     case Intrinsic::x86_sse41_ptestz:
17212     case Intrinsic::x86_avx_ptestz_256:
17213       // ZF = 1
17214       X86CC = X86::COND_E;
17215       break;
17216     case Intrinsic::x86_avx_vtestc_ps:
17217     case Intrinsic::x86_avx_vtestc_pd:
17218     case Intrinsic::x86_avx_vtestc_ps_256:
17219     case Intrinsic::x86_avx_vtestc_pd_256:
17220       IsTestPacked = true; // Fallthrough
17221     case Intrinsic::x86_sse41_ptestc:
17222     case Intrinsic::x86_avx_ptestc_256:
17223       // CF = 1
17224       X86CC = X86::COND_B;
17225       break;
17226     case Intrinsic::x86_avx_vtestnzc_ps:
17227     case Intrinsic::x86_avx_vtestnzc_pd:
17228     case Intrinsic::x86_avx_vtestnzc_ps_256:
17229     case Intrinsic::x86_avx_vtestnzc_pd_256:
17230       IsTestPacked = true; // Fallthrough
17231     case Intrinsic::x86_sse41_ptestnzc:
17232     case Intrinsic::x86_avx_ptestnzc_256:
17233       // ZF and CF = 0
17234       X86CC = X86::COND_A;
17235       break;
17236     }
17237
          // Emit the test node (i32 result), read the chosen condition through
          // an i8 SETCC and zero-extend it to the intrinsic's i32 result.
17238     SDValue LHS = Op.getOperand(1);
17239     SDValue RHS = Op.getOperand(2);
17240     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
17241     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
17242     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17243     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
17244     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17245   }
17246   case Intrinsic::x86_avx512_kortestz_w:
17247   case Intrinsic::x86_avx512_kortestc_w: {
          // Both arguments are bitcast to v16i1 before the KORTEST node; the
          // "z" variant tests COND_E and the "c" variant COND_B.
          // NOTE(review): the SETCC here is created with type i1, whereas the
          // other SETCC nodes in this function use i8 - confirm this is
          // intentional.
17248     unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
17249     SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
17250     SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
17251     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17252     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
17253     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
17254     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17255   }
17256
        // String-compare intrinsics whose result is a single condition: select
        // the implicit- (PCMPISTRI) or explicit-length (PCMPESTRI) node and the
        // condition code to read from its second result.
17257   case Intrinsic::x86_sse42_pcmpistria128:
17258   case Intrinsic::x86_sse42_pcmpestria128:
17259   case Intrinsic::x86_sse42_pcmpistric128:
17260   case Intrinsic::x86_sse42_pcmpestric128:
17261   case Intrinsic::x86_sse42_pcmpistrio128:
17262   case Intrinsic::x86_sse42_pcmpestrio128:
17263   case Intrinsic::x86_sse42_pcmpistris128:
17264   case Intrinsic::x86_sse42_pcmpestris128:
17265   case Intrinsic::x86_sse42_pcmpistriz128:
17266   case Intrinsic::x86_sse42_pcmpestriz128: {
17267     unsigned Opcode;
17268     unsigned X86CC;
17269     switch (IntNo) {
17270     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
17271     case Intrinsic::x86_sse42_pcmpistria128:
17272       Opcode = X86ISD::PCMPISTRI;
17273       X86CC = X86::COND_A;
17274       break;
17275     case Intrinsic::x86_sse42_pcmpestria128:
17276       Opcode = X86ISD::PCMPESTRI;
17277       X86CC = X86::COND_A;
17278       break;
17279     case Intrinsic::x86_sse42_pcmpistric128:
17280       Opcode = X86ISD::PCMPISTRI;
17281       X86CC = X86::COND_B;
17282       break;
17283     case Intrinsic::x86_sse42_pcmpestric128:
17284       Opcode = X86ISD::PCMPESTRI;
17285       X86CC = X86::COND_B;
17286       break;
17287     case Intrinsic::x86_sse42_pcmpistrio128:
17288       Opcode = X86ISD::PCMPISTRI;
17289       X86CC = X86::COND_O;
17290       break;
17291     case Intrinsic::x86_sse42_pcmpestrio128:
17292       Opcode = X86ISD::PCMPESTRI;
17293       X86CC = X86::COND_O;
17294       break;
17295     case Intrinsic::x86_sse42_pcmpistris128:
17296       Opcode = X86ISD::PCMPISTRI;
17297       X86CC = X86::COND_S;
17298       break;
17299     case Intrinsic::x86_sse42_pcmpestris128:
17300       Opcode = X86ISD::PCMPESTRI;
17301       X86CC = X86::COND_S;
17302       break;
17303     case Intrinsic::x86_sse42_pcmpistriz128:
17304       Opcode = X86ISD::PCMPISTRI;
17305       X86CC = X86::COND_E;
17306       break;
17307     case Intrinsic::x86_sse42_pcmpestriz128:
17308       Opcode = X86ISD::PCMPESTRI;
17309       X86CC = X86::COND_E;
17310       break;
17311     }
          // NewOps is every operand except operand 0 (the intrinsic ID). The
          // node has two results; result 1 (i32) is what the SETCC consumes.
17312     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17313     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17314     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
17315     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17316                                 DAG.getConstant(X86CC, MVT::i8),
17317                                 SDValue(PCMP.getNode(), 1));
17318     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17319   }
17320
        // Index-returning string compares: the node's first result is the
        // intrinsic's value, so the node is returned directly.
17321   case Intrinsic::x86_sse42_pcmpistri128:
17322   case Intrinsic::x86_sse42_pcmpestri128: {
17323     unsigned Opcode;
17324     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
17325       Opcode = X86ISD::PCMPISTRI;
17326     else
17327       Opcode = X86ISD::PCMPESTRI;
17328
          // As above, drop operand 0 (the intrinsic ID) and add an i32 second
          // result to the node.
17329     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17330     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17331     return DAG.getNode(Opcode, dl, VTs, NewOps);
17332   }
17333
        // Masked 512-bit FMA intrinsics. Operands 1-3 are the sources, operand
        // 4 the mask and operand 5 a constant rounding control.
17334   case Intrinsic::x86_fma_mask_vfmadd_ps_512:
17335   case Intrinsic::x86_fma_mask_vfmadd_pd_512:
17336   case Intrinsic::x86_fma_mask_vfmsub_ps_512:
17337   case Intrinsic::x86_fma_mask_vfmsub_pd_512:
17338   case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
17339   case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
17340   case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
17341   case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
17342   case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
17343   case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
17344   case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
17345   case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: {
          // Only the CUR_DIRECTION rounding control is lowered to a masked FMA
          // node here (with operand 1 as the pass-through value); any other
          // value returns SDValue(), i.e. no custom lowering.
17346     auto *SAE = cast<ConstantSDNode>(Op.getOperand(5));
17347     if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION)
17348       return getVectorMaskingNode(DAG.getNode(getOpcodeForFMAIntrinsic(IntNo),
17349                                               dl, Op.getValueType(),
17350                                               Op.getOperand(1),
17351                                               Op.getOperand(2),
17352                                               Op.getOperand(3)),
17353                                   Op.getOperand(4), Op.getOperand(1),
17354                                   Subtarget, DAG);
17355     else
17356       return SDValue();
17357   }
17358
        // Unmasked 128/256-bit FMA intrinsics: forward the three sources to
        // the FMA node chosen by getOpcodeForFMAIntrinsic().
17359   case Intrinsic::x86_fma_vfmadd_ps:
17360   case Intrinsic::x86_fma_vfmadd_pd:
17361   case Intrinsic::x86_fma_vfmsub_ps:
17362   case Intrinsic::x86_fma_vfmsub_pd:
17363   case Intrinsic::x86_fma_vfnmadd_ps:
17364   case Intrinsic::x86_fma_vfnmadd_pd:
17365   case Intrinsic::x86_fma_vfnmsub_ps:
17366   case Intrinsic::x86_fma_vfnmsub_pd:
17367   case Intrinsic::x86_fma_vfmaddsub_ps:
17368   case Intrinsic::x86_fma_vfmaddsub_pd:
17369   case Intrinsic::x86_fma_vfmsubadd_ps:
17370   case Intrinsic::x86_fma_vfmsubadd_pd:
17371   case Intrinsic::x86_fma_vfmadd_ps_256:
17372   case Intrinsic::x86_fma_vfmadd_pd_256:
17373   case Intrinsic::x86_fma_vfmsub_ps_256:
17374   case Intrinsic::x86_fma_vfmsub_pd_256:
17375   case Intrinsic::x86_fma_vfnmadd_ps_256:
17376   case Intrinsic::x86_fma_vfnmadd_pd_256:
17377   case Intrinsic::x86_fma_vfnmsub_ps_256:
17378   case Intrinsic::x86_fma_vfnmsub_pd_256:
17379   case Intrinsic::x86_fma_vfmaddsub_ps_256:
17380   case Intrinsic::x86_fma_vfmaddsub_pd_256:
17381   case Intrinsic::x86_fma_vfmsubadd_ps_256:
17382   case Intrinsic::x86_fma_vfmsubadd_pd_256:
17383     return DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), dl, Op.getValueType(),
17384                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
17385   }
17386 }
17387
17388 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17389                               SDValue Src, SDValue Mask, SDValue Base,
17390                               SDValue Index, SDValue ScaleOp, SDValue Chain,
17391                               const X86Subtarget * Subtarget) {
17392   SDLoc dl(Op);
17393   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17394   assert(C && "Invalid scale type");
17395   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17396   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17397                              Index.getSimpleValueType().getVectorNumElements());
17398   SDValue MaskInReg;
17399   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17400   if (MaskC)
17401     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17402   else
17403     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17404   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
17405   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17406   SDValue Segment = DAG.getRegister(0, MVT::i32);
17407   if (Src.getOpcode() == ISD::UNDEF)
17408     Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
17409   SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17410   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17411   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
17412   return DAG.getMergeValues(RetOps, dl);
17413 }
17414
17415 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17416                                SDValue Src, SDValue Mask, SDValue Base,
17417                                SDValue Index, SDValue ScaleOp, SDValue Chain) {
17418   SDLoc dl(Op);
17419   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17420   assert(C && "Invalid scale type");
17421   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17422   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17423   SDValue Segment = DAG.getRegister(0, MVT::i32);
17424   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17425                              Index.getSimpleValueType().getVectorNumElements());
17426   SDValue MaskInReg;
17427   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17428   if (MaskC)
17429     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17430   else
17431     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17432   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
17433   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
17434   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17435   return SDValue(Res, 1);
17436 }
17437
17438 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17439                                SDValue Mask, SDValue Base, SDValue Index,
17440                                SDValue ScaleOp, SDValue Chain) {
17441   SDLoc dl(Op);
17442   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17443   assert(C && "Invalid scale type");
17444   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17445   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17446   SDValue Segment = DAG.getRegister(0, MVT::i32);
17447   EVT MaskVT =
17448     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
17449   SDValue MaskInReg;
17450   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17451   if (MaskC)
17452     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17453   else
17454     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17455   //SDVTList VTs = DAG.getVTList(MVT::Other);
17456   SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17457   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
17458   return SDValue(Res, 0);
17459 }
17460
17461 // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
17462 // read performance monitor counters (x86_rdpmc).
17463 static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
17464                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17465                               SmallVectorImpl<SDValue> &Results) {
17466   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17467   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17468   SDValue LO, HI;
17469
17470   // The ECX register is used to select the index of the performance counter
17471   // to read.
17472   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
17473                                    N->getOperand(2));
17474   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
17475
17476   // Reads the content of a 64-bit performance counter and returns it in the
17477   // registers EDX:EAX.
17478   if (Subtarget->is64Bit()) {
17479     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17480     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17481                             LO.getValue(2));
17482   } else {
17483     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17484     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17485                             LO.getValue(2));
17486   }
17487   Chain = HI.getValue(1);
17488
17489   if (Subtarget->is64Bit()) {
17490     // The EAX register is loaded with the low-order 32 bits. The EDX register
17491     // is loaded with the supported high-order bits of the counter.
17492     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17493                               DAG.getConstant(32, MVT::i8));
17494     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17495     Results.push_back(Chain);
17496     return;
17497   }
17498
17499   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17500   SDValue Ops[] = { LO, HI };
17501   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17502   Results.push_back(Pair);
17503   Results.push_back(Chain);
17504 }
17505
17506 // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
17507 // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
17508 // also used to custom lower READCYCLECOUNTER nodes.
17509 static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
17510                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17511                               SmallVectorImpl<SDValue> &Results) {
17512   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17513   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
17514   SDValue LO, HI;
17515
17516   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
17517   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
17518   // and the EAX register is loaded with the low-order 32 bits.
17519   if (Subtarget->is64Bit()) {
17520     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17521     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17522                             LO.getValue(2));
17523   } else {
17524     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17525     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17526                             LO.getValue(2));
17527   }
17528   SDValue Chain = HI.getValue(1);
17529
17530   if (Opcode == X86ISD::RDTSCP_DAG) {
17531     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17532
17533     // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
17534     // the ECX register. Add 'ecx' explicitly to the chain.
17535     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
17536                                      HI.getValue(2));
17537     // Explicitly store the content of ECX at the location passed in input
17538     // to the 'rdtscp' intrinsic.
17539     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
17540                          MachinePointerInfo(), false, false, 0);
17541   }
17542
17543   if (Subtarget->is64Bit()) {
17544     // The EDX register is loaded with the high-order 32 bits of the MSR, and
17545     // the EAX register is loaded with the low-order 32 bits.
17546     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17547                               DAG.getConstant(32, MVT::i8));
17548     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17549     Results.push_back(Chain);
17550     return;
17551   }
17552
17553   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17554   SDValue Ops[] = { LO, HI };
17555   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17556   Results.push_back(Pair);
17557   Results.push_back(Chain);
17558 }
17559
17560 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
17561                                      SelectionDAG &DAG) {
17562   SmallVector<SDValue, 2> Results;
17563   SDLoc DL(Op);
17564   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
17565                           Results);
17566   return DAG.getMergeValues(Results, DL);
17567 }
17568
17569
17570 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17571                                       SelectionDAG &DAG) {
17572   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
17573
17574   const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
17575   if (!IntrData)
17576     return SDValue();
17577
17578   SDLoc dl(Op);
17579   switch(IntrData->Type) {
17580   default:
17581     llvm_unreachable("Unknown Intrinsic Type");
17582     break;
17583   case RDSEED:
17584   case RDRAND: {
17585     // Emit the node with the right value type.
17586     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
17587     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17588
17589     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
17590     // Otherwise return the value from Rand, which is always 0, casted to i32.
17591     SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
17592                       DAG.getConstant(1, Op->getValueType(1)),
17593                       DAG.getConstant(X86::COND_B, MVT::i32),
17594                       SDValue(Result.getNode(), 1) };
17595     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
17596                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
17597                                   Ops);
17598
17599     // Return { result, isValid, chain }.
17600     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
17601                        SDValue(Result.getNode(), 2));
17602   }
17603   case GATHER: {
17604   //gather(v1, mask, index, base, scale);
17605     SDValue Chain = Op.getOperand(0);
17606     SDValue Src   = Op.getOperand(2);
17607     SDValue Base  = Op.getOperand(3);
17608     SDValue Index = Op.getOperand(4);
17609     SDValue Mask  = Op.getOperand(5);
17610     SDValue Scale = Op.getOperand(6);
17611     return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
17612                           Subtarget);
17613   }
17614   case SCATTER: {
17615   //scatter(base, mask, index, v1, scale);
17616     SDValue Chain = Op.getOperand(0);
17617     SDValue Base  = Op.getOperand(2);
17618     SDValue Mask  = Op.getOperand(3);
17619     SDValue Index = Op.getOperand(4);
17620     SDValue Src   = Op.getOperand(5);
17621     SDValue Scale = Op.getOperand(6);
17622     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
17623   }
17624   case PREFETCH: {
17625     SDValue Hint = Op.getOperand(6);
17626     unsigned HintVal;
17627     if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
17628         (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
17629       llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
17630     unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
17631     SDValue Chain = Op.getOperand(0);
17632     SDValue Mask  = Op.getOperand(2);
17633     SDValue Index = Op.getOperand(3);
17634     SDValue Base  = Op.getOperand(4);
17635     SDValue Scale = Op.getOperand(5);
17636     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
17637   }
17638   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
17639   case RDTSC: {
17640     SmallVector<SDValue, 2> Results;
17641     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results);
17642     return DAG.getMergeValues(Results, dl);
17643   }
17644   // Read Performance Monitoring Counters.
17645   case RDPMC: {
17646     SmallVector<SDValue, 2> Results;
17647     getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
17648     return DAG.getMergeValues(Results, dl);
17649   }
17650   // XTEST intrinsics.
17651   case XTEST: {
17652     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17653     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17654     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17655                                 DAG.getConstant(X86::COND_NE, MVT::i8),
17656                                 InTrans);
17657     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
17658     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
17659                        Ret, SDValue(InTrans.getNode(), 1));
17660   }
17661   // ADC/ADCX/SBB
17662   case ADX: {
17663     SmallVector<SDValue, 2> Results;
17664     SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17665     SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
17666     SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
17667                                 DAG.getConstant(-1, MVT::i8));
17668     SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
17669                               Op.getOperand(4), GenCF.getValue(1));
17670     SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
17671                                  Op.getOperand(5), MachinePointerInfo(),
17672                                  false, false, 0);
17673     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17674                                 DAG.getConstant(X86::COND_B, MVT::i8),
17675                                 Res.getValue(1));
17676     Results.push_back(SetCC);
17677     Results.push_back(Store);
17678     return DAG.getMergeValues(Results, dl);
17679   }
17680   case COMPRESS_TO_MEM: {
17681     SDLoc dl(Op);
17682     SDValue Mask = Op.getOperand(4);
17683     SDValue DataToCompress = Op.getOperand(3);
17684     SDValue Addr = Op.getOperand(2);
17685     SDValue Chain = Op.getOperand(0);
17686
17687     if (isAllOnes(Mask)) // return just a store
17688       return DAG.getStore(Chain, dl, DataToCompress, Addr,
17689                           MachinePointerInfo(), false, false, 0);
17690
17691     EVT VT = DataToCompress.getValueType();
17692     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17693                                   VT.getVectorNumElements());
17694     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17695                                      Mask.getValueType().getSizeInBits());
17696     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17697                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17698                                 DAG.getIntPtrConstant(0));
17699
17700     SDValue Compressed =  DAG.getNode(IntrData->Opc0, dl, VT, VMask,
17701                                       DataToCompress, DAG.getUNDEF(VT));
17702     return DAG.getStore(Chain, dl, Compressed, Addr,
17703                         MachinePointerInfo(), false, false, 0);
17704   }
17705   case EXPAND_FROM_MEM: {
17706     SDLoc dl(Op);
17707     SDValue Mask = Op.getOperand(4);
17708     SDValue PathThru = Op.getOperand(3);
17709     SDValue Addr = Op.getOperand(2);
17710     SDValue Chain = Op.getOperand(0);
17711     EVT VT = Op.getValueType();
17712
17713     if (isAllOnes(Mask)) // return just a load
17714       return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
17715                          false, 0);
17716     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17717                                   VT.getVectorNumElements());
17718     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17719                                      Mask.getValueType().getSizeInBits());
17720     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17721                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17722                                 DAG.getIntPtrConstant(0));
17723
17724     SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
17725                                    false, false, false, 0);
17726
17727     SmallVector<SDValue, 2> Results;
17728     Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand,
17729                                   PathThru));
17730     Results.push_back(Chain);
17731     return DAG.getMergeValues(Results, dl);
17732   }
17733   }
17734 }
17735
17736 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
17737                                            SelectionDAG &DAG) const {
17738   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17739   MFI->setReturnAddressIsTaken(true);
17740
17741   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
17742     return SDValue();
17743
17744   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17745   SDLoc dl(Op);
17746   EVT PtrVT = getPointerTy();
17747
17748   if (Depth > 0) {
17749     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
17750     const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17751         DAG.getSubtarget().getRegisterInfo());
17752     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
17753     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17754                        DAG.getNode(ISD::ADD, dl, PtrVT,
17755                                    FrameAddr, Offset),
17756                        MachinePointerInfo(), false, false, false, 0);
17757   }
17758
17759   // Just load the return address.
17760   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
17761   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17762                      RetAddrFI, MachinePointerInfo(), false, false, false, 0);
17763 }
17764
17765 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
17766   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17767   MFI->setFrameAddressIsTaken(true);
17768
17769   EVT VT = Op.getValueType();
17770   SDLoc dl(Op);  // FIXME probably not meaningful
17771   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17772   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17773       DAG.getSubtarget().getRegisterInfo());
17774   unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(
17775       DAG.getMachineFunction());
17776   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
17777           (FrameReg == X86::EBP && VT == MVT::i32)) &&
17778          "Invalid Frame Register!");
17779   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
17780   while (Depth--)
17781     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
17782                             MachinePointerInfo(),
17783                             false, false, false, 0);
17784   return FrameAddr;
17785 }
17786
17787 // FIXME? Maybe this could be a TableGen attribute on some registers and
17788 // this table could be generated automatically from RegInfo.
17789 unsigned X86TargetLowering::getRegisterByName(const char* RegName,
17790                                               EVT VT) const {
17791   unsigned Reg = StringSwitch<unsigned>(RegName)
17792                        .Case("esp", X86::ESP)
17793                        .Case("rsp", X86::RSP)
17794                        .Default(0);
17795   if (Reg)
17796     return Reg;
17797   report_fatal_error("Invalid register name global variable");
17798 }
17799
17800 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
17801                                                      SelectionDAG &DAG) const {
17802   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17803       DAG.getSubtarget().getRegisterInfo());
17804   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
17805 }
17806
17807 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
17808   SDValue Chain     = Op.getOperand(0);
17809   SDValue Offset    = Op.getOperand(1);
17810   SDValue Handler   = Op.getOperand(2);
17811   SDLoc dl      (Op);
17812
17813   EVT PtrVT = getPointerTy();
17814   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17815       DAG.getSubtarget().getRegisterInfo());
17816   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
17817   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
17818           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
17819          "Invalid Frame Register!");
17820   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
17821   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
17822
17823   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
17824                                  DAG.getIntPtrConstant(RegInfo->getSlotSize()));
17825   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
17826   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
17827                        false, false, 0);
17828   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
17829
17830   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
17831                      DAG.getRegister(StoreAddrReg, PtrVT));
17832 }
17833
17834 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
17835                                                SelectionDAG &DAG) const {
17836   SDLoc DL(Op);
17837   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
17838                      DAG.getVTList(MVT::i32, MVT::Other),
17839                      Op.getOperand(0), Op.getOperand(1));
17840 }
17841
17842 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
17843                                                 SelectionDAG &DAG) const {
17844   SDLoc DL(Op);
17845   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
17846                      Op.getOperand(0), Op.getOperand(1));
17847 }
17848
17849 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
17850   return Op.getOperand(0);
17851 }
17852
// LowerINIT_TRAMPOLINE - Lower INIT_TRAMPOLINE by storing the machine code of
// the trampoline, byte by byte, into the trampoline memory. All stores hang
// off the incoming chain and are joined by a TokenFactor, which is returned.
//
// Bytes written on 64-bit targets (offsets relative to Trmp):
//    0: REX.WB, MOV64ri|r11     2: <FPtr>     -- movabsq $FPtr, %r11
//   10: REX.WB, MOV64ri|r10    12: <Nest>     -- movabsq $Nest, %r10
//   20: REX.WB, 0xFF           22: ModRM      -- jmpq *%r11
//
// Bytes written on 32-bit targets:
//    0: MOV32ri|NestReg         1: <Nest>     -- movl $Nest, %NestReg
//    5: 0xE9                    6: <Disp>     -- jmp FPtr (relative)
SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl (Op);

  // IR value of the trampoline address; used to build the MachinePointerInfo
  // of every store below.
  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();

  if (Subtarget->is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    // Only the low three bits of the register encodings go into the opcode
    // and ModRM bytes.
    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Each two-byte prefix+opcode pair is written with a single i16 store,
    // with the REX prefix in the low byte.
    // Load the pointer to the nested function into R11.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr),
                                false, false, 0);

    // The 64-bit immediate of the movabsq: the nested function pointer.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, MVT::i64));
    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
                                MachinePointerInfo(TrmpAddr, 2),
                                false, false, 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 10),
                                false, false, 0);

    // The 64-bit immediate of the movabsq: the 'nest' value.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, MVT::i64));
    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 12),
                                false, false, 2);

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 20),
                                false, false, 0);

    // ModRM byte: mod = 3, reg = 4, rm = low bits of r11.
    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
                                MachinePointerInfo(TrmpAddr, 22),
                                false, false, 0);

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  } else {
    // On 32-bit targets the register that carries 'nest' depends on the
    // calling convention of the nested function (operand 5).
    const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      FunctionType *FTy = Func->getFunctionType();
      const AttributeSet &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        unsigned Idx = 1; // Attribute index used for the first parameter.

        // Count the 32-bit registers taken up by 'inreg' parameters.
        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.hasAttribute(Idx, Attribute::InReg))
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;

        // More than two 'inreg' registers are rejected because ECX is needed
        // for 'nest'.
        if (InRegCount > 2) {
          report_fatal_error("Nest register in use - reduce number of inreg"
                             " parameters!");
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::X86_ThisCall:
    case CallingConv::Fast:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::EAX;
      break;
    }

    SDValue OutChains[4];
    SDValue Addr, Disp;

    // The jmp displacement is relative to the first byte after the jmp
    // instruction, i.e. Trmp + 10.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
    OutChains[0] = DAG.getStore(Root, dl,
                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
                                Trmp, MachinePointerInfo(TrmpAddr),
                                false, false, 0);

    // The 32-bit immediate of the mov: the 'nest' value.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, MVT::i32));
    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 1),
                                false, false, 1);

    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, MVT::i32));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
                                MachinePointerInfo(TrmpAddr, 5),
                                false, false, 1);

    // The 32-bit relative displacement of the jmp.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, MVT::i32));
    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
                                MachinePointerInfo(TrmpAddr, 6),
                                false, false, 1);

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  }
}
18002
18003 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
18004                                             SelectionDAG &DAG) const {
18005   /*
18006    The rounding mode is in bits 11:10 of FPSR, and has the following
18007    settings:
18008      00 Round to nearest
18009      01 Round to -inf
18010      10 Round to +inf
18011      11 Round to 0
18012
18013   FLT_ROUNDS, on the other hand, expects the following:
18014     -1 Undefined
18015      0 Round to 0
18016      1 Round to nearest
18017      2 Round to +inf
18018      3 Round to -inf
18019
18020   To perform the conversion, we do:
18021     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
18022   */
18023
18024   MachineFunction &MF = DAG.getMachineFunction();
18025   const TargetMachine &TM = MF.getTarget();
18026   const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
18027   unsigned StackAlignment = TFI.getStackAlignment();
18028   MVT VT = Op.getSimpleValueType();
18029   SDLoc DL(Op);
18030
18031   // Save FP Control Word to stack slot
18032   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
18033   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
18034
18035   MachineMemOperand *MMO =
18036    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
18037                            MachineMemOperand::MOStore, 2, 2);
18038
18039   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
18040   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
18041                                           DAG.getVTList(MVT::Other),
18042                                           Ops, MVT::i16, MMO);
18043
18044   // Load FP Control Word from stack slot
18045   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
18046                             MachinePointerInfo(), false, false, false, 0);
18047
18048   // Transform as necessary
18049   SDValue CWD1 =
18050     DAG.getNode(ISD::SRL, DL, MVT::i16,
18051                 DAG.getNode(ISD::AND, DL, MVT::i16,
18052                             CWD, DAG.getConstant(0x800, MVT::i16)),
18053                 DAG.getConstant(11, MVT::i8));
18054   SDValue CWD2 =
18055     DAG.getNode(ISD::SRL, DL, MVT::i16,
18056                 DAG.getNode(ISD::AND, DL, MVT::i16,
18057                             CWD, DAG.getConstant(0x400, MVT::i16)),
18058                 DAG.getConstant(9, MVT::i8));
18059
18060   SDValue RetVal =
18061     DAG.getNode(ISD::AND, DL, MVT::i16,
18062                 DAG.getNode(ISD::ADD, DL, MVT::i16,
18063                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
18064                             DAG.getConstant(1, MVT::i16)),
18065                 DAG.getConstant(3, MVT::i16));
18066
18067   return DAG.getNode((VT.getSizeInBits() < 16 ?
18068                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
18069 }
18070
18071 static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
18072   MVT VT = Op.getSimpleValueType();
18073   EVT OpVT = VT;
18074   unsigned NumBits = VT.getSizeInBits();
18075   SDLoc dl(Op);
18076
18077   Op = Op.getOperand(0);
18078   if (VT == MVT::i8) {
18079     // Zero extend to i32 since there is not an i8 bsr.
18080     OpVT = MVT::i32;
18081     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18082   }
18083
18084   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
18085   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18086   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18087
18088   // If src is zero (i.e. bsr sets ZF), returns NumBits.
18089   SDValue Ops[] = {
18090     Op,
18091     DAG.getConstant(NumBits+NumBits-1, OpVT),
18092     DAG.getConstant(X86::COND_E, MVT::i8),
18093     Op.getValue(1)
18094   };
18095   Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
18096
18097   // Finally xor with NumBits-1.
18098   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18099
18100   if (VT == MVT::i8)
18101     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18102   return Op;
18103 }
18104
18105 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
18106   MVT VT = Op.getSimpleValueType();
18107   EVT OpVT = VT;
18108   unsigned NumBits = VT.getSizeInBits();
18109   SDLoc dl(Op);
18110
18111   Op = Op.getOperand(0);
18112   if (VT == MVT::i8) {
18113     // Zero extend to i32 since there is not an i8 bsr.
18114     OpVT = MVT::i32;
18115     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18116   }
18117
18118   // Issue a bsr (scan bits in reverse).
18119   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18120   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18121
18122   // And xor with NumBits-1.
18123   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18124
18125   if (VT == MVT::i8)
18126     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18127   return Op;
18128 }
18129
18130 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
18131   MVT VT = Op.getSimpleValueType();
18132   unsigned NumBits = VT.getSizeInBits();
18133   SDLoc dl(Op);
18134   Op = Op.getOperand(0);
18135
18136   // Issue a bsf (scan bits forward) which also sets EFLAGS.
18137   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18138   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
18139
18140   // If src is zero (i.e. bsf sets ZF), returns NumBits.
18141   SDValue Ops[] = {
18142     Op,
18143     DAG.getConstant(NumBits, VT),
18144     DAG.getConstant(X86::COND_E, MVT::i8),
18145     Op.getValue(1)
18146   };
18147   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
18148 }
18149
18150 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
18151 // ones, and then concatenate the result back.
18152 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
18153   MVT VT = Op.getSimpleValueType();
18154
18155   assert(VT.is256BitVector() && VT.isInteger() &&
18156          "Unsupported value type for operation");
18157
18158   unsigned NumElems = VT.getVectorNumElements();
18159   SDLoc dl(Op);
18160
18161   // Extract the LHS vectors
18162   SDValue LHS = Op.getOperand(0);
18163   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
18164   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
18165
18166   // Extract the RHS vectors
18167   SDValue RHS = Op.getOperand(1);
18168   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
18169   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
18170
18171   MVT EltVT = VT.getVectorElementType();
18172   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18173
18174   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18175                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
18176                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
18177 }
18178
18179 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
18180   assert(Op.getSimpleValueType().is256BitVector() &&
18181          Op.getSimpleValueType().isInteger() &&
18182          "Only handle AVX 256-bit vector integer operation");
18183   return Lower256IntArith(Op, DAG);
18184 }
18185
18186 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
18187   assert(Op.getSimpleValueType().is256BitVector() &&
18188          Op.getSimpleValueType().isInteger() &&
18189          "Only handle AVX 256-bit vector integer operation");
18190   return Lower256IntArith(Op, DAG);
18191 }
18192
18193 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
18194                         SelectionDAG &DAG) {
18195   SDLoc dl(Op);
18196   MVT VT = Op.getSimpleValueType();
18197
18198   // Decompose 256-bit ops into smaller 128-bit ops.
18199   if (VT.is256BitVector() && !Subtarget->hasInt256())
18200     return Lower256IntArith(Op, DAG);
18201
18202   SDValue A = Op.getOperand(0);
18203   SDValue B = Op.getOperand(1);
18204
18205   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
18206   if (VT == MVT::v4i32) {
18207     assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
18208            "Should not custom lower when pmuldq is available!");
18209
18210     // Extract the odd parts.
18211     static const int UnpackMask[] = { 1, -1, 3, -1 };
18212     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
18213     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
18214
18215     // Multiply the even parts.
18216     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
18217     // Now multiply odd parts.
18218     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
18219
18220     Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
18221     Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
18222
18223     // Merge the two vectors back together with a shuffle. This expands into 2
18224     // shuffles.
18225     static const int ShufMask[] = { 0, 4, 2, 6 };
18226     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
18227   }
18228
18229   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
18230          "Only know how to lower V2I64/V4I64/V8I64 multiply");
18231
18232   //  Ahi = psrlqi(a, 32);
18233   //  Bhi = psrlqi(b, 32);
18234   //
18235   //  AloBlo = pmuludq(a, b);
18236   //  AloBhi = pmuludq(a, Bhi);
18237   //  AhiBlo = pmuludq(Ahi, b);
18238
18239   //  AloBhi = psllqi(AloBhi, 32);
18240   //  AhiBlo = psllqi(AhiBlo, 32);
18241   //  return AloBlo + AloBhi + AhiBlo;
18242
18243   SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
18244   SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
18245
18246   // Bit cast to 32-bit vectors for MULUDQ
18247   EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
18248                                   (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
18249   A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
18250   B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
18251   Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
18252   Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
18253
18254   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
18255   SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
18256   SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
18257
18258   AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
18259   AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
18260
18261   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
18262   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
18263 }
18264
18265 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
18266   assert(Subtarget->isTargetWin64() && "Unexpected target");
18267   EVT VT = Op.getValueType();
18268   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
18269          "Unexpected return type for lowering");
18270
18271   RTLIB::Libcall LC;
18272   bool isSigned;
18273   switch (Op->getOpcode()) {
18274   default: llvm_unreachable("Unexpected request for libcall!");
18275   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
18276   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
18277   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
18278   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
18279   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
18280   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
18281   }
18282
18283   SDLoc dl(Op);
18284   SDValue InChain = DAG.getEntryNode();
18285
18286   TargetLowering::ArgListTy Args;
18287   TargetLowering::ArgListEntry Entry;
18288   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
18289     EVT ArgVT = Op->getOperand(i).getValueType();
18290     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
18291            "Unexpected argument type for lowering");
18292     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
18293     Entry.Node = StackPtr;
18294     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
18295                            false, false, 16);
18296     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18297     Entry.Ty = PointerType::get(ArgTy,0);
18298     Entry.isSExt = false;
18299     Entry.isZExt = false;
18300     Args.push_back(Entry);
18301   }
18302
18303   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
18304                                          getPointerTy());
18305
18306   TargetLowering::CallLoweringInfo CLI(DAG);
18307   CLI.setDebugLoc(dl).setChain(InChain)
18308     .setCallee(getLibcallCallingConv(LC),
18309                static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
18310                Callee, std::move(Args), 0)
18311     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
18312
18313   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
18314   return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
18315 }
18316
18317 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
18318                              SelectionDAG &DAG) {
18319   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
18320   EVT VT = Op0.getValueType();
18321   SDLoc dl(Op);
18322
18323   assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
18324          (VT == MVT::v8i32 && Subtarget->hasInt256()));
18325
18326   // PMULxD operations multiply each even value (starting at 0) of LHS with
18327   // the related value of RHS and produce a widen result.
18328   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18329   // => <2 x i64> <ae|cg>
18330   //
18331   // In other word, to have all the results, we need to perform two PMULxD:
18332   // 1. one with the even values.
18333   // 2. one with the odd values.
18334   // To achieve #2, with need to place the odd values at an even position.
18335   //
18336   // Place the odd value at an even position (basically, shift all values 1
18337   // step to the left):
18338   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
18339   // <a|b|c|d> => <b|undef|d|undef>
18340   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
18341   // <e|f|g|h> => <f|undef|h|undef>
18342   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
18343
18344   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
18345   // ints.
18346   MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
18347   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
18348   unsigned Opcode =
18349       (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
18350   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18351   // => <2 x i64> <ae|cg>
18352   SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
18353                              DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
18354   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
18355   // => <2 x i64> <bf|dh>
18356   SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
18357                              DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
18358
18359   // Shuffle it back into the right order.
18360   SDValue Highs, Lows;
18361   if (VT == MVT::v8i32) {
18362     const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
18363     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18364     const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
18365     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18366   } else {
18367     const int HighMask[] = {1, 5, 3, 7};
18368     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18369     const int LowMask[] = {0, 4, 2, 6};
18370     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18371   }
18372
18373   // If we have a signed multiply but no PMULDQ fix up the high parts of a
18374   // unsigned multiply.
18375   if (IsSigned && !Subtarget->hasSSE41()) {
18376     SDValue ShAmt =
18377         DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
18378     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
18379                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
18380     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
18381                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
18382
18383     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
18384     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
18385   }
18386
18387   // The first result of MUL_LOHI is actually the low value, followed by the
18388   // high value.
18389   SDValue Ops[] = {Lows, Highs};
18390   return DAG.getMergeValues(Ops, dl);
18391 }
18392
18393 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
18394                                          const X86Subtarget *Subtarget) {
18395   MVT VT = Op.getSimpleValueType();
18396   SDLoc dl(Op);
18397   SDValue R = Op.getOperand(0);
18398   SDValue Amt = Op.getOperand(1);
18399
18400   // Optimize shl/srl/sra with constant shift amount.
18401   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
18402     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
18403       uint64_t ShiftAmt = ShiftConst->getZExtValue();
18404
18405       if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
18406           (Subtarget->hasInt256() &&
18407            (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18408           (Subtarget->hasAVX512() &&
18409            (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18410         if (Op.getOpcode() == ISD::SHL)
18411           return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18412                                             DAG);
18413         if (Op.getOpcode() == ISD::SRL)
18414           return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18415                                             DAG);
18416         if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
18417           return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18418                                             DAG);
18419       }
18420
18421       if (VT == MVT::v16i8) {
18422         if (Op.getOpcode() == ISD::SHL) {
18423           // Make a large shift.
18424           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18425                                                    MVT::v8i16, R, ShiftAmt,
18426                                                    DAG);
18427           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18428           // Zero out the rightmost bits.
18429           SmallVector<SDValue, 16> V(16,
18430                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18431                                                      MVT::i8));
18432           return DAG.getNode(ISD::AND, dl, VT, SHL,
18433                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18434         }
18435         if (Op.getOpcode() == ISD::SRL) {
18436           // Make a large shift.
18437           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18438                                                    MVT::v8i16, R, ShiftAmt,
18439                                                    DAG);
18440           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18441           // Zero out the leftmost bits.
18442           SmallVector<SDValue, 16> V(16,
18443                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18444                                                      MVT::i8));
18445           return DAG.getNode(ISD::AND, dl, VT, SRL,
18446                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18447         }
18448         if (Op.getOpcode() == ISD::SRA) {
18449           if (ShiftAmt == 7) {
18450             // R s>> 7  ===  R s< 0
18451             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18452             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18453           }
18454
18455           // R s>> a === ((R u>> a) ^ m) - m
18456           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18457           SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
18458                                                          MVT::i8));
18459           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18460           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18461           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18462           return Res;
18463         }
18464         llvm_unreachable("Unknown shift opcode.");
18465       }
18466
18467       if (Subtarget->hasInt256() && VT == MVT::v32i8) {
18468         if (Op.getOpcode() == ISD::SHL) {
18469           // Make a large shift.
18470           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18471                                                    MVT::v16i16, R, ShiftAmt,
18472                                                    DAG);
18473           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18474           // Zero out the rightmost bits.
18475           SmallVector<SDValue, 32> V(32,
18476                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18477                                                      MVT::i8));
18478           return DAG.getNode(ISD::AND, dl, VT, SHL,
18479                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18480         }
18481         if (Op.getOpcode() == ISD::SRL) {
18482           // Make a large shift.
18483           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18484                                                    MVT::v16i16, R, ShiftAmt,
18485                                                    DAG);
18486           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18487           // Zero out the leftmost bits.
18488           SmallVector<SDValue, 32> V(32,
18489                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18490                                                      MVT::i8));
18491           return DAG.getNode(ISD::AND, dl, VT, SRL,
18492                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18493         }
18494         if (Op.getOpcode() == ISD::SRA) {
18495           if (ShiftAmt == 7) {
18496             // R s>> 7  ===  R s< 0
18497             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18498             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18499           }
18500
18501           // R s>> a === ((R u>> a) ^ m) - m
18502           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18503           SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
18504                                                          MVT::i8));
18505           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18506           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18507           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18508           return Res;
18509         }
18510         llvm_unreachable("Unknown shift opcode.");
18511       }
18512     }
18513   }
18514
18515   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18516   if (!Subtarget->is64Bit() &&
18517       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
18518       Amt.getOpcode() == ISD::BITCAST &&
18519       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18520     Amt = Amt.getOperand(0);
18521     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18522                      VT.getVectorNumElements();
18523     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
18524     uint64_t ShiftAmt = 0;
18525     for (unsigned i = 0; i != Ratio; ++i) {
18526       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
18527       if (!C)
18528         return SDValue();
18529       // 6 == Log2(64)
18530       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
18531     }
18532     // Check remaining shift amounts.
18533     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18534       uint64_t ShAmt = 0;
18535       for (unsigned j = 0; j != Ratio; ++j) {
18536         ConstantSDNode *C =
18537           dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
18538         if (!C)
18539           return SDValue();
18540         // 6 == Log2(64)
18541         ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
18542       }
18543       if (ShAmt != ShiftAmt)
18544         return SDValue();
18545     }
18546     switch (Op.getOpcode()) {
18547     default:
18548       llvm_unreachable("Unknown shift opcode!");
18549     case ISD::SHL:
18550       return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18551                                         DAG);
18552     case ISD::SRL:
18553       return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18554                                         DAG);
18555     case ISD::SRA:
18556       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18557                                         DAG);
18558     }
18559   }
18560
18561   return SDValue();
18562 }
18563
18564 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
18565                                         const X86Subtarget* Subtarget) {
18566   MVT VT = Op.getSimpleValueType();
18567   SDLoc dl(Op);
18568   SDValue R = Op.getOperand(0);
18569   SDValue Amt = Op.getOperand(1);
18570
18571   if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
18572       VT == MVT::v4i32 || VT == MVT::v8i16 ||
18573       (Subtarget->hasInt256() &&
18574        ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
18575         VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18576        (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18577     SDValue BaseShAmt;
18578     EVT EltVT = VT.getVectorElementType();
18579
18580     if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
18581       // Check if this build_vector node is doing a splat.
18582       // If so, then set BaseShAmt equal to the splat value.
18583       BaseShAmt = BV->getSplatValue();
18584       if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
18585         BaseShAmt = SDValue();
18586     } else {
18587       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
18588         Amt = Amt.getOperand(0);
18589
18590       ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
18591       if (SVN && SVN->isSplat()) {
18592         unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
18593         SDValue InVec = Amt.getOperand(0);
18594         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
18595           assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
18596                  "Unexpected shuffle index found!");
18597           BaseShAmt = InVec.getOperand(SplatIdx);
18598         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
18599            if (ConstantSDNode *C =
18600                dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
18601              if (C->getZExtValue() == SplatIdx)
18602                BaseShAmt = InVec.getOperand(1);
18603            }
18604         }
18605
18606         if (!BaseShAmt)
18607           // Avoid introducing an extract element from a shuffle.
18608           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
18609                                     DAG.getIntPtrConstant(SplatIdx));
18610       }
18611     }
18612
18613     if (BaseShAmt.getNode()) {
18614       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
18615       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
18616         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
18617       else if (EltVT.bitsLT(MVT::i32))
18618         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
18619
18620       switch (Op.getOpcode()) {
18621       default:
18622         llvm_unreachable("Unknown shift opcode!");
18623       case ISD::SHL:
18624         switch (VT.SimpleTy) {
18625         default: return SDValue();
18626         case MVT::v2i64:
18627         case MVT::v4i32:
18628         case MVT::v8i16:
18629         case MVT::v4i64:
18630         case MVT::v8i32:
18631         case MVT::v16i16:
18632         case MVT::v16i32:
18633         case MVT::v8i64:
18634           return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
18635         }
18636       case ISD::SRA:
18637         switch (VT.SimpleTy) {
18638         default: return SDValue();
18639         case MVT::v4i32:
18640         case MVT::v8i16:
18641         case MVT::v8i32:
18642         case MVT::v16i16:
18643         case MVT::v16i32:
18644         case MVT::v8i64:
18645           return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
18646         }
18647       case ISD::SRL:
18648         switch (VT.SimpleTy) {
18649         default: return SDValue();
18650         case MVT::v2i64:
18651         case MVT::v4i32:
18652         case MVT::v8i16:
18653         case MVT::v4i64:
18654         case MVT::v8i32:
18655         case MVT::v16i16:
18656         case MVT::v16i32:
18657         case MVT::v8i64:
18658           return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
18659         }
18660       }
18661     }
18662   }
18663
18664   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18665   if (!Subtarget->is64Bit() &&
18666       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
18667       (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
18668       Amt.getOpcode() == ISD::BITCAST &&
18669       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18670     Amt = Amt.getOperand(0);
18671     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18672                      VT.getVectorNumElements();
18673     std::vector<SDValue> Vals(Ratio);
18674     for (unsigned i = 0; i != Ratio; ++i)
18675       Vals[i] = Amt.getOperand(i);
18676     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18677       for (unsigned j = 0; j != Ratio; ++j)
18678         if (Vals[j] != Amt.getOperand(i + j))
18679           return SDValue();
18680     }
18681     switch (Op.getOpcode()) {
18682     default:
18683       llvm_unreachable("Unknown shift opcode!");
18684     case ISD::SHL:
18685       return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
18686     case ISD::SRL:
18687       return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
18688     case ISD::SRA:
18689       return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
18690     }
18691   }
18692
18693   return SDValue();
18694 }
18695
18696 static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
18697                           SelectionDAG &DAG) {
18698   MVT VT = Op.getSimpleValueType();
18699   SDLoc dl(Op);
18700   SDValue R = Op.getOperand(0);
18701   SDValue Amt = Op.getOperand(1);
18702   SDValue V;
18703
18704   assert(VT.isVector() && "Custom lowering only for vector shifts!");
18705   assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
18706
18707   V = LowerScalarImmediateShift(Op, DAG, Subtarget);
18708   if (V.getNode())
18709     return V;
18710
18711   V = LowerScalarVariableShift(Op, DAG, Subtarget);
18712   if (V.getNode())
18713       return V;
18714
18715   if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
18716     return Op;
18717   // AVX2 has VPSLLV/VPSRAV/VPSRLV.
18718   if (Subtarget->hasInt256()) {
18719     if (Op.getOpcode() == ISD::SRL &&
18720         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18721          VT == MVT::v4i64 || VT == MVT::v8i32))
18722       return Op;
18723     if (Op.getOpcode() == ISD::SHL &&
18724         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18725          VT == MVT::v4i64 || VT == MVT::v8i32))
18726       return Op;
18727     if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
18728       return Op;
18729   }
18730
18731   // If possible, lower this packed shift into a vector multiply instead of
18732   // expanding it into a sequence of scalar shifts.
18733   // Do this only if the vector shift count is a constant build_vector.
18734   if (Op.getOpcode() == ISD::SHL &&
18735       (VT == MVT::v8i16 || VT == MVT::v4i32 ||
18736        (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
18737       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18738     SmallVector<SDValue, 8> Elts;
18739     EVT SVT = VT.getScalarType();
18740     unsigned SVTBits = SVT.getSizeInBits();
18741     const APInt &One = APInt(SVTBits, 1);
18742     unsigned NumElems = VT.getVectorNumElements();
18743
18744     for (unsigned i=0; i !=NumElems; ++i) {
18745       SDValue Op = Amt->getOperand(i);
18746       if (Op->getOpcode() == ISD::UNDEF) {
18747         Elts.push_back(Op);
18748         continue;
18749       }
18750
18751       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
18752       const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
18753       uint64_t ShAmt = C.getZExtValue();
18754       if (ShAmt >= SVTBits) {
18755         Elts.push_back(DAG.getUNDEF(SVT));
18756         continue;
18757       }
18758       Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
18759     }
18760     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
18761     return DAG.getNode(ISD::MUL, dl, VT, R, BV);
18762   }
18763
18764   // Lower SHL with variable shift amount.
18765   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
18766     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
18767
18768     Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
18769     Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
18770     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
18771     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
18772   }
18773
18774   // If possible, lower this shift as a sequence of two shifts by
18775   // constant plus a MOVSS/MOVSD instead of scalarizing it.
18776   // Example:
18777   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
18778   //
18779   // Could be rewritten as:
18780   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
18781   //
18782   // The advantage is that the two shifts from the example would be
18783   // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
18784   // the vector shift into four scalar shifts plus four pairs of vector
18785   // insert/extract.
18786   if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
18787       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18788     unsigned TargetOpcode = X86ISD::MOVSS;
18789     bool CanBeSimplified;
18790     // The splat value for the first packed shift (the 'X' from the example).
18791     SDValue Amt1 = Amt->getOperand(0);
18792     // The splat value for the second packed shift (the 'Y' from the example).
18793     SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
18794                                         Amt->getOperand(2);
18795
18796     // See if it is possible to replace this node with a sequence of
18797     // two shifts followed by a MOVSS/MOVSD
18798     if (VT == MVT::v4i32) {
18799       // Check if it is legal to use a MOVSS.
18800       CanBeSimplified = Amt2 == Amt->getOperand(2) &&
18801                         Amt2 == Amt->getOperand(3);
18802       if (!CanBeSimplified) {
18803         // Otherwise, check if we can still simplify this node using a MOVSD.
18804         CanBeSimplified = Amt1 == Amt->getOperand(1) &&
18805                           Amt->getOperand(2) == Amt->getOperand(3);
18806         TargetOpcode = X86ISD::MOVSD;
18807         Amt2 = Amt->getOperand(2);
18808       }
18809     } else {
18810       // Do similar checks for the case where the machine value type
18811       // is MVT::v8i16.
18812       CanBeSimplified = Amt1 == Amt->getOperand(1);
18813       for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
18814         CanBeSimplified = Amt2 == Amt->getOperand(i);
18815
18816       if (!CanBeSimplified) {
18817         TargetOpcode = X86ISD::MOVSD;
18818         CanBeSimplified = true;
18819         Amt2 = Amt->getOperand(4);
18820         for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
18821           CanBeSimplified = Amt1 == Amt->getOperand(i);
18822         for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
18823           CanBeSimplified = Amt2 == Amt->getOperand(j);
18824       }
18825     }
18826
18827     if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
18828         isa<ConstantSDNode>(Amt2)) {
18829       // Replace this node with two shifts followed by a MOVSS/MOVSD.
18830       EVT CastVT = MVT::v4i32;
18831       SDValue Splat1 =
18832         DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
18833       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
18834       SDValue Splat2 =
18835         DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
18836       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
18837       if (TargetOpcode == X86ISD::MOVSD)
18838         CastVT = MVT::v2i64;
18839       SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
18840       SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
18841       SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
18842                                             BitCast1, DAG);
18843       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
18844     }
18845   }
18846
18847   if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
18848     assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
18849
18850     // a = a << 5;
18851     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
18852     Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
18853
18854     // Turn 'a' into a mask suitable for VSELECT
18855     SDValue VSelM = DAG.getConstant(0x80, VT);
18856     SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
18857     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
18858
18859     SDValue CM1 = DAG.getConstant(0x0f, VT);
18860     SDValue CM2 = DAG.getConstant(0x3f, VT);
18861
18862     // r = VSELECT(r, psllw(r & (char16)15, 4), a);
18863     SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
18864     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
18865     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
18866     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
18867
18868     // a += a
18869     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
18870     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
18871     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
18872
18873     // r = VSELECT(r, psllw(r & (char16)63, 2), a);
18874     M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
18875     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
18876     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
18877     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
18878
18879     // a += a
18880     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
18881     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
18882     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
18883
18884     // return VSELECT(r, r+r, a);
18885     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
18886                     DAG.getNode(ISD::ADD, dl, VT, R, R), R);
18887     return R;
18888   }
18889
18890   // It's worth extending once and using the v8i32 shifts for 16-bit types, but
18891   // the extra overheads to get from v16i8 to v8i32 make the existing SSE
18892   // solution better.
18893   if (Subtarget->hasInt256() && VT == MVT::v8i16) {
18894     MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
18895     unsigned ExtOpc =
18896         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
18897     R = DAG.getNode(ExtOpc, dl, NewVT, R);
18898     Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
18899     return DAG.getNode(ISD::TRUNCATE, dl, VT,
18900                        DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
18901     }
18902
18903   // Decompose 256-bit shifts into smaller 128-bit shifts.
18904   if (VT.is256BitVector()) {
18905     unsigned NumElems = VT.getVectorNumElements();
18906     MVT EltVT = VT.getVectorElementType();
18907     EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18908
18909     // Extract the two vectors
18910     SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
18911     SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
18912
18913     // Recreate the shift amount vectors
18914     SDValue Amt1, Amt2;
18915     if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
18916       // Constant shift amount
18917       SmallVector<SDValue, 4> Amt1Csts;
18918       SmallVector<SDValue, 4> Amt2Csts;
18919       for (unsigned i = 0; i != NumElems/2; ++i)
18920         Amt1Csts.push_back(Amt->getOperand(i));
18921       for (unsigned i = NumElems/2; i != NumElems; ++i)
18922         Amt2Csts.push_back(Amt->getOperand(i));
18923
18924       Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
18925       Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
18926     } else {
18927       // Variable shift amount
18928       Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
18929       Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
18930     }
18931
18932     // Issue new vector shifts for the smaller types
18933     V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
18934     V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
18935
18936     // Concatenate the result back
18937     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
18938   }
18939
18940   return SDValue();
18941 }
18942
18943 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
18944   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
18945   // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
18946   // looks for this combo and may remove the "setcc" instruction if the "setcc"
18947   // has only one use.
18948   SDNode *N = Op.getNode();
18949   SDValue LHS = N->getOperand(0);
18950   SDValue RHS = N->getOperand(1);
18951   unsigned BaseOp = 0;
18952   unsigned Cond = 0;
18953   SDLoc DL(Op);
18954   switch (Op.getOpcode()) {
18955   default: llvm_unreachable("Unknown ovf instruction!");
18956   case ISD::SADDO:
18957     // A subtract of one will be selected as a INC. Note that INC doesn't
18958     // set CF, so we can't do this for UADDO.
18959     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
18960       if (C->isOne()) {
18961         BaseOp = X86ISD::INC;
18962         Cond = X86::COND_O;
18963         break;
18964       }
18965     BaseOp = X86ISD::ADD;
18966     Cond = X86::COND_O;
18967     break;
18968   case ISD::UADDO:
18969     BaseOp = X86ISD::ADD;
18970     Cond = X86::COND_B;
18971     break;
18972   case ISD::SSUBO:
18973     // A subtract of one will be selected as a DEC. Note that DEC doesn't
18974     // set CF, so we can't do this for USUBO.
18975     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
18976       if (C->isOne()) {
18977         BaseOp = X86ISD::DEC;
18978         Cond = X86::COND_O;
18979         break;
18980       }
18981     BaseOp = X86ISD::SUB;
18982     Cond = X86::COND_O;
18983     break;
18984   case ISD::USUBO:
18985     BaseOp = X86ISD::SUB;
18986     Cond = X86::COND_B;
18987     break;
18988   case ISD::SMULO:
18989     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
18990     Cond = X86::COND_O;
18991     break;
18992   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
18993     if (N->getValueType(0) == MVT::i8) {
18994       BaseOp = X86ISD::UMUL8;
18995       Cond = X86::COND_O;
18996       break;
18997     }
18998     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
18999                                  MVT::i32);
19000     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
19001
19002     SDValue SetCC =
19003       DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
19004                   DAG.getConstant(X86::COND_O, MVT::i32),
19005                   SDValue(Sum.getNode(), 2));
19006
19007     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19008   }
19009   }
19010
19011   // Also sets EFLAGS.
19012   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
19013   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
19014
19015   SDValue SetCC =
19016     DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
19017                 DAG.getConstant(Cond, MVT::i32),
19018                 SDValue(Sum.getNode(), 1));
19019
19020   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19021 }
19022
19023 // Sign extension of the low part of vector elements. This may be used either
19024 // when sign extend instructions are not available or if the vector element
19025 // sizes already match the sign-extended size. If the vector elements are in
19026 // their pre-extended size and sign extend instructions are available, that will
19027 // be handled by LowerSIGN_EXTEND.
19028 SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
19029                                                   SelectionDAG &DAG) const {
19030   SDLoc dl(Op);
19031   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
19032   MVT VT = Op.getSimpleValueType();
19033
19034   if (!Subtarget->hasSSE2() || !VT.isVector())
19035     return SDValue();
19036
19037   unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
19038                       ExtraVT.getScalarType().getSizeInBits();
19039
19040   switch (VT.SimpleTy) {
19041     default: return SDValue();
19042     case MVT::v8i32:
19043     case MVT::v16i16:
19044       if (!Subtarget->hasFp256())
19045         return SDValue();
19046       if (!Subtarget->hasInt256()) {
19047         // needs to be split
19048         unsigned NumElems = VT.getVectorNumElements();
19049
19050         // Extract the LHS vectors
19051         SDValue LHS = Op.getOperand(0);
19052         SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
19053         SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
19054
19055         MVT EltVT = VT.getVectorElementType();
19056         EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19057
19058         EVT ExtraEltVT = ExtraVT.getVectorElementType();
19059         unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
19060         ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
19061                                    ExtraNumElems/2);
19062         SDValue Extra = DAG.getValueType(ExtraVT);
19063
19064         LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
19065         LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
19066
19067         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
19068       }
19069       // fall through
19070     case MVT::v4i32:
19071     case MVT::v8i16: {
19072       SDValue Op0 = Op.getOperand(0);
19073
19074       // This is a sign extension of some low part of vector elements without
19075       // changing the size of the vector elements themselves:
19076       // Shift-Left + Shift-Right-Algebraic.
19077       SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
19078                                                BitsDiff, DAG);
19079       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
19080                                         DAG);
19081     }
19082   }
19083 }
19084
19085 /// Returns true if the operand type is exactly twice the native width, and
19086 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
19087 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
19088 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
19089 bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
19090   const X86Subtarget &Subtarget =
19091       getTargetMachine().getSubtarget<X86Subtarget>();
19092   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
19093
19094   if (OpWidth == 64)
19095     return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
19096   else if (OpWidth == 128)
19097     return Subtarget.hasCmpxchg16b();
19098   else
19099     return false;
19100 }
19101
19102 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
19103   return needsCmpXchgNb(SI->getValueOperand()->getType());
19104 }
19105
19106 // Note: this turns large loads into lock cmpxchg8b/16b.
19107 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
19108 bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
19109   auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
19110   return needsCmpXchgNb(PTy->getElementType());
19111 }
19112
19113 bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
19114   const X86Subtarget &Subtarget =
19115       getTargetMachine().getSubtarget<X86Subtarget>();
19116   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
19117   const Type *MemType = AI->getType();
19118
19119   // If the operand is too big, we must see if cmpxchg8/16b is available
19120   // and default to library calls otherwise.
19121   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19122     return needsCmpXchgNb(MemType);
19123
19124   AtomicRMWInst::BinOp Op = AI->getOperation();
19125   switch (Op) {
19126   default:
19127     llvm_unreachable("Unknown atomic operation");
19128   case AtomicRMWInst::Xchg:
19129   case AtomicRMWInst::Add:
19130   case AtomicRMWInst::Sub:
19131     // It's better to use xadd, xsub or xchg for these in all cases.
19132     return false;
19133   case AtomicRMWInst::Or:
19134   case AtomicRMWInst::And:
19135   case AtomicRMWInst::Xor:
19136     // If the atomicrmw's result isn't actually used, we can just add a "lock"
19137     // prefix to a normal instruction for these operations.
19138     return !AI->use_empty();
19139   case AtomicRMWInst::Nand:
19140   case AtomicRMWInst::Max:
19141   case AtomicRMWInst::Min:
19142   case AtomicRMWInst::UMax:
19143   case AtomicRMWInst::UMin:
19144     // These always require a non-trivial set of data operations on x86. We must
19145     // use a cmpxchg loop.
19146     return true;
19147   }
19148 }
19149
19150 static bool hasMFENCE(const X86Subtarget& Subtarget) {
19151   // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
19152   // no-sse2). There isn't any reason to disable it if the target processor
19153   // supports it.
19154   return Subtarget.hasSSE2() || Subtarget.is64Bit();
19155 }
19156
19157 LoadInst *
19158 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19159   const X86Subtarget &Subtarget =
19160       getTargetMachine().getSubtarget<X86Subtarget>();
19161   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
19162   const Type *MemType = AI->getType();
19163   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
19164   // there is no benefit in turning such RMWs into loads, and it is actually
19165   // harmful as it introduces a mfence.
19166   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19167     return nullptr;
19168
19169   auto Builder = IRBuilder<>(AI);
19170   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
19171   auto SynchScope = AI->getSynchScope();
19172   // We must restrict the ordering to avoid generating loads with Release or
19173   // ReleaseAcquire orderings.
19174   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
19175   auto Ptr = AI->getPointerOperand();
19176
19177   // Before the load we need a fence. Here is an example lifted from
19178   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
19179   // is required:
19180   // Thread 0:
19181   //   x.store(1, relaxed);
19182   //   r1 = y.fetch_add(0, release);
19183   // Thread 1:
19184   //   y.fetch_add(42, acquire);
19185   //   r2 = x.load(relaxed);
19186   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
19187   // lowered to just a load without a fence. A mfence flushes the store buffer,
19188   // making the optimization clearly correct.
19189   // FIXME: it is required if isAtLeastRelease(Order) but it is not clear
19190   // otherwise, we might be able to be more agressive on relaxed idempotent
19191   // rmw. In practice, they do not look useful, so we don't try to be
19192   // especially clever.
19193   if (SynchScope == SingleThread) {
19194     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
19195     // the IR level, so we must wrap it in an intrinsic.
19196     return nullptr;
19197   } else if (hasMFENCE(Subtarget)) {
19198     Function *MFence = llvm::Intrinsic::getDeclaration(M,
19199             Intrinsic::x86_sse2_mfence);
19200     Builder.CreateCall(MFence);
19201   } else {
19202     // FIXME: it might make sense to use a locked operation here but on a
19203     // different cache-line to prevent cache-line bouncing. In practice it
19204     // is probably a small win, and x86 processors without mfence are rare
19205     // enough that we do not bother.
19206     return nullptr;
19207   }
19208
19209   // Finally we can emit the atomic load.
19210   LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
19211           AI->getType()->getPrimitiveSizeInBits());
19212   Loaded->setAtomic(Order, SynchScope);
19213   AI->replaceAllUsesWith(Loaded);
19214   AI->eraseFromParent();
19215   return Loaded;
19216 }
19217
19218 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
19219                                  SelectionDAG &DAG) {
19220   SDLoc dl(Op);
19221   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
19222     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
19223   SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
19224     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
19225
19226   // The only fence that needs an instruction is a sequentially-consistent
19227   // cross-thread fence.
19228   if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
19229     if (hasMFENCE(*Subtarget))
19230       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
19231
19232     SDValue Chain = Op.getOperand(0);
19233     SDValue Zero = DAG.getConstant(0, MVT::i32);
19234     SDValue Ops[] = {
19235       DAG.getRegister(X86::ESP, MVT::i32), // Base
19236       DAG.getTargetConstant(1, MVT::i8),   // Scale
19237       DAG.getRegister(0, MVT::i32),        // Index
19238       DAG.getTargetConstant(0, MVT::i32),  // Disp
19239       DAG.getRegister(0, MVT::i32),        // Segment.
19240       Zero,
19241       Chain
19242     };
19243     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
19244     return SDValue(Res, 0);
19245   }
19246
19247   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
19248   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
19249 }
19250
19251 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
19252                              SelectionDAG &DAG) {
19253   MVT T = Op.getSimpleValueType();
19254   SDLoc DL(Op);
19255   unsigned Reg = 0;
19256   unsigned size = 0;
19257   switch(T.SimpleTy) {
19258   default: llvm_unreachable("Invalid value type!");
19259   case MVT::i8:  Reg = X86::AL;  size = 1; break;
19260   case MVT::i16: Reg = X86::AX;  size = 2; break;
19261   case MVT::i32: Reg = X86::EAX; size = 4; break;
19262   case MVT::i64:
19263     assert(Subtarget->is64Bit() && "Node not type legal!");
19264     Reg = X86::RAX; size = 8;
19265     break;
19266   }
19267   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
19268                                   Op.getOperand(2), SDValue());
19269   SDValue Ops[] = { cpIn.getValue(0),
19270                     Op.getOperand(1),
19271                     Op.getOperand(3),
19272                     DAG.getTargetConstant(size, MVT::i8),
19273                     cpIn.getValue(1) };
19274   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19275   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
19276   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
19277                                            Ops, T, MMO);
19278
19279   SDValue cpOut =
19280     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
19281   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
19282                                       MVT::i32, cpOut.getValue(2));
19283   SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
19284                                 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
19285
19286   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
19287   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
19288   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
19289   return SDValue();
19290 }
19291
19292 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
19293                             SelectionDAG &DAG) {
19294   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
19295   MVT DstVT = Op.getSimpleValueType();
19296
19297   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
19298     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19299     if (DstVT != MVT::f64)
19300       // This conversion needs to be expanded.
19301       return SDValue();
19302
19303     SDValue InVec = Op->getOperand(0);
19304     SDLoc dl(Op);
19305     unsigned NumElts = SrcVT.getVectorNumElements();
19306     EVT SVT = SrcVT.getVectorElementType();
19307
19308     // Widen the vector in input in the case of MVT::v2i32.
19309     // Example: from MVT::v2i32 to MVT::v4i32.
19310     SmallVector<SDValue, 16> Elts;
19311     for (unsigned i = 0, e = NumElts; i != e; ++i)
19312       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
19313                                  DAG.getIntPtrConstant(i)));
19314
19315     // Explicitly mark the extra elements as Undef.
19316     SDValue Undef = DAG.getUNDEF(SVT);
19317     for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
19318       Elts.push_back(Undef);
19319
19320     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19321     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
19322     SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
19323     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
19324                        DAG.getIntPtrConstant(0));
19325   }
19326
19327   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
19328          Subtarget->hasMMX() && "Unexpected custom BITCAST");
19329   assert((DstVT == MVT::i64 ||
19330           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
19331          "Unexpected custom BITCAST");
19332   // i64 <=> MMX conversions are Legal.
19333   if (SrcVT==MVT::i64 && DstVT.isVector())
19334     return Op;
19335   if (DstVT==MVT::i64 && SrcVT.isVector())
19336     return Op;
19337   // MMX <=> MMX conversions are Legal.
19338   if (SrcVT.isVector() && DstVT.isVector())
19339     return Op;
19340   // All other conversions need to be expanded.
19341   return SDValue();
19342 }
19343
19344 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
19345                           SelectionDAG &DAG) {
19346   SDNode *Node = Op.getNode();
19347   SDLoc dl(Node);
19348
19349   Op = Op.getOperand(0);
19350   EVT VT = Op.getValueType();
19351   assert((VT.is128BitVector() || VT.is256BitVector()) &&
19352          "CTPOP lowering only implemented for 128/256-bit wide vector types");
19353
19354   unsigned NumElts = VT.getVectorNumElements();
19355   EVT EltVT = VT.getVectorElementType();
19356   unsigned Len = EltVT.getSizeInBits();
19357
19358   // This is the vectorized version of the "best" algorithm from
19359   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
19360   // with a minor tweak to use a series of adds + shifts instead of vector
19361   // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
19362   //
19363   //  v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
19364   //  v8i32 => Always profitable
19365   //
19366   // FIXME: There a couple of possible improvements:
19367   //
19368   // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
19369   // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
19370   //
19371   assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
19372          "CTPOP not implemented for this vector element type.");
19373
19374   // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid
19375   // extra legalization.
19376   bool NeedsBitcast = EltVT == MVT::i32;
19377   MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
19378
19379   SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
19380   SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
19381   SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
19382
19383   // v = v - ((v >> 1) & 0x55555555...)
19384   SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
19385   SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
19386   SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
19387   if (NeedsBitcast)
19388     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19389
19390   SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
19391   SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
19392   if (NeedsBitcast)
19393     M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
19394
19395   SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
19396   if (VT != And.getValueType())
19397     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19398   SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
19399
19400   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
19401   SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
19402   SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
19403   SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
19404   SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
19405
19406   Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
19407   if (NeedsBitcast) {
19408     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19409     M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
19410     Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
19411   }
19412
19413   SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
19414   SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
19415   if (VT != AndRHS.getValueType()) {
19416     AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
19417     AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
19418   }
19419   SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
19420
19421   // v = (v + (v >> 4)) & 0x0F0F0F0F...
19422   SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
19423   SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
19424   Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
19425   Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19426
19427   SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
19428   SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
19429   if (NeedsBitcast) {
19430     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19431     M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
19432   }
19433   And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
19434   if (VT != And.getValueType())
19435     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19436
19437   // The algorithm mentioned above uses:
19438   //    v = (v * 0x01010101...) >> (Len - 8)
19439   //
19440   // Change it to use vector adds + vector shifts which yield faster results on
19441   // Haswell than using vector integer multiplication.
19442   //
19443   // For i32 elements:
19444   //    v = v + (v >> 8)
19445   //    v = v + (v >> 16)
19446   //
19447   // For i64 elements:
19448   //    v = v + (v >> 8)
19449   //    v = v + (v >> 16)
19450   //    v = v + (v >> 32)
19451   //
19452   Add = And;
19453   SmallVector<SDValue, 8> Csts;
19454   for (unsigned i = 8; i <= Len/2; i *= 2) {
19455     Csts.assign(NumElts, DAG.getConstant(i, EltVT));
19456     SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
19457     Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
19458     Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19459     Csts.clear();
19460   }
19461
19462   // The result is on the least significant 6-bits on i32 and 7-bits on i64.
19463   SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
19464   SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
19465   SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
19466   if (NeedsBitcast) {
19467     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19468     M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
19469   }
19470   And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
19471   if (VT != And.getValueType())
19472     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19473
19474   return And;
19475 }
19476
19477 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
19478   SDNode *Node = Op.getNode();
19479   SDLoc dl(Node);
19480   EVT T = Node->getValueType(0);
19481   SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
19482                               DAG.getConstant(0, T), Node->getOperand(2));
19483   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
19484                        cast<AtomicSDNode>(Node)->getMemoryVT(),
19485                        Node->getOperand(0),
19486                        Node->getOperand(1), negOp,
19487                        cast<AtomicSDNode>(Node)->getMemOperand(),
19488                        cast<AtomicSDNode>(Node)->getOrdering(),
19489                        cast<AtomicSDNode>(Node)->getSynchScope());
19490 }
19491
19492 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
19493   SDNode *Node = Op.getNode();
19494   SDLoc dl(Node);
19495   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
19496
19497   // Convert seq_cst store -> xchg
19498   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
19499   // FIXME: On 32-bit, store -> fist or movq would be more efficient
19500   //        (The only way to get a 16-byte store is cmpxchg16b)
19501   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
19502   if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
19503       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
19504     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
19505                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
19506                                  Node->getOperand(0),
19507                                  Node->getOperand(1), Node->getOperand(2),
19508                                  cast<AtomicSDNode>(Node)->getMemOperand(),
19509                                  cast<AtomicSDNode>(Node)->getOrdering(),
19510                                  cast<AtomicSDNode>(Node)->getSynchScope());
19511     return Swap.getValue(1);
19512   }
19513   // Other atomic stores have a simple pattern.
19514   return Op;
19515 }
19516
19517 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
19518   EVT VT = Op.getNode()->getSimpleValueType(0);
19519
19520   // Let legalize expand this if it isn't a legal type yet.
19521   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19522     return SDValue();
19523
19524   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
19525
19526   unsigned Opc;
19527   bool ExtraOp = false;
19528   switch (Op.getOpcode()) {
19529   default: llvm_unreachable("Invalid code");
19530   case ISD::ADDC: Opc = X86ISD::ADD; break;
19531   case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
19532   case ISD::SUBC: Opc = X86ISD::SUB; break;
19533   case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
19534   }
19535
19536   if (!ExtraOp)
19537     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19538                        Op.getOperand(1));
19539   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19540                      Op.getOperand(1), Op.getOperand(2));
19541 }
19542
19543 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
19544                             SelectionDAG &DAG) {
19545   assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
19546
19547   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
19548   // which returns the values as { float, float } (in XMM0) or
19549   // { double, double } (which is returned in XMM0, XMM1).
19550   SDLoc dl(Op);
19551   SDValue Arg = Op.getOperand(0);
19552   EVT ArgVT = Arg.getValueType();
19553   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19554
19555   TargetLowering::ArgListTy Args;
19556   TargetLowering::ArgListEntry Entry;
19557
19558   Entry.Node = Arg;
19559   Entry.Ty = ArgTy;
19560   Entry.isSExt = false;
19561   Entry.isZExt = false;
19562   Args.push_back(Entry);
19563
19564   bool isF64 = ArgVT == MVT::f64;
19565   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
19566   // the small struct {f32, f32} is returned in (eax, edx). For f64,
19567   // the results are returned via SRet in memory.
19568   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
19569   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19570   SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
19571
19572   Type *RetTy = isF64
19573     ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
19574     : (Type*)VectorType::get(ArgTy, 4);
19575
19576   TargetLowering::CallLoweringInfo CLI(DAG);
19577   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
19578     .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
19579
19580   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
19581
19582   if (isF64)
19583     // Returned in xmm0 and xmm1.
19584     return CallResult.first;
19585
19586   // Returned in bits 0:31 and 32:64 xmm0.
19587   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19588                                CallResult.first, DAG.getIntPtrConstant(0));
19589   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19590                                CallResult.first, DAG.getIntPtrConstant(1));
19591   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
19592   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
19593 }
19594
19595 /// LowerOperation - Provide custom lowering hooks for some operations.
19596 ///
19597 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
19598   switch (Op.getOpcode()) {
19599   default: llvm_unreachable("Should not custom lower this!");
19600   case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
19601   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
19602   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
19603     return LowerCMP_SWAP(Op, Subtarget, DAG);
19604   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
19605   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
19606   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
19607   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
19608   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
19609   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
19610   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
19611   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
19612   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
19613   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
19614   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
19615   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
19616   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
19617   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
19618   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
19619   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
19620   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
19621   case ISD::SHL_PARTS:
19622   case ISD::SRA_PARTS:
19623   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
19624   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
19625   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
19626   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
19627   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
19628   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
19629   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
19630   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
19631   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
19632   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
19633   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
19634   case ISD::FABS:
19635   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
19636   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
19637   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
19638   case ISD::SETCC:              return LowerSETCC(Op, DAG);
19639   case ISD::SELECT:             return LowerSELECT(Op, DAG);
19640   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
19641   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
19642   case ISD::VASTART:            return LowerVASTART(Op, DAG);
19643   case ISD::VAARG:              return LowerVAARG(Op, DAG);
19644   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
19645   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
19646   case ISD::INTRINSIC_VOID:
19647   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
19648   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
19649   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
19650   case ISD::FRAME_TO_ARGS_OFFSET:
19651                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
19652   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
19653   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
19654   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
19655   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
19656   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
19657   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
19658   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
19659   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
19660   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
19661   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
19662   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
19663   case ISD::UMUL_LOHI:
19664   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
19665   case ISD::SRA:
19666   case ISD::SRL:
19667   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
19668   case ISD::SADDO:
19669   case ISD::UADDO:
19670   case ISD::SSUBO:
19671   case ISD::USUBO:
19672   case ISD::SMULO:
19673   case ISD::UMULO:              return LowerXALUO(Op, DAG);
19674   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
19675   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
19676   case ISD::ADDC:
19677   case ISD::ADDE:
19678   case ISD::SUBC:
19679   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
19680   case ISD::ADD:                return LowerADD(Op, DAG);
19681   case ISD::SUB:                return LowerSUB(Op, DAG);
19682   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
19683   }
19684 }
19685
19686 /// ReplaceNodeResults - Replace a node with an illegal result type
19687 /// with a new node built out of custom code.
19688 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
19689                                            SmallVectorImpl<SDValue>&Results,
19690                                            SelectionDAG &DAG) const {
19691   SDLoc dl(N);
19692   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19693   switch (N->getOpcode()) {
19694   default:
19695     llvm_unreachable("Do not know how to custom type legalize this operation!");
19696   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
19697   case X86ISD::FMINC:
19698   case X86ISD::FMIN:
19699   case X86ISD::FMAXC:
19700   case X86ISD::FMAX: {
19701     EVT VT = N->getValueType(0);
19702     if (VT != MVT::v2f32)
19703       llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
19704     SDValue UNDEF = DAG.getUNDEF(VT);
19705     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19706                               N->getOperand(0), UNDEF);
19707     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19708                               N->getOperand(1), UNDEF);
19709     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
19710     return;
19711   }
19712   case ISD::SIGN_EXTEND_INREG:
19713   case ISD::ADDC:
19714   case ISD::ADDE:
19715   case ISD::SUBC:
19716   case ISD::SUBE:
19717     // We don't want to expand or promote these.
19718     return;
19719   case ISD::SDIV:
19720   case ISD::UDIV:
19721   case ISD::SREM:
19722   case ISD::UREM:
19723   case ISD::SDIVREM:
19724   case ISD::UDIVREM: {
19725     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
19726     Results.push_back(V);
19727     return;
19728   }
19729   case ISD::FP_TO_SINT:
19730   case ISD::FP_TO_UINT: {
19731     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
19732
19733     if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
19734       return;
19735
19736     std::pair<SDValue,SDValue> Vals =
19737         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
19738     SDValue FIST = Vals.first, StackSlot = Vals.second;
19739     if (FIST.getNode()) {
19740       EVT VT = N->getValueType(0);
19741       // Return a load from the stack slot.
19742       if (StackSlot.getNode())
19743         Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
19744                                       MachinePointerInfo(),
19745                                       false, false, false, 0));
19746       else
19747         Results.push_back(FIST);
19748     }
19749     return;
19750   }
19751   case ISD::UINT_TO_FP: {
19752     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19753     if (N->getOperand(0).getValueType() != MVT::v2i32 ||
19754         N->getValueType(0) != MVT::v2f32)
19755       return;
19756     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
19757                                  N->getOperand(0));
19758     SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
19759                                      MVT::f64);
19760     SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
19761     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
19762                              DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
19763     Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
19764     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
19765     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
19766     return;
19767   }
19768   case ISD::FP_ROUND: {
19769     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
19770         return;
19771     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
19772     Results.push_back(V);
19773     return;
19774   }
19775   case ISD::INTRINSIC_W_CHAIN: {
19776     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
19777     switch (IntNo) {
19778     default : llvm_unreachable("Do not know how to custom type "
19779                                "legalize this intrinsic operation!");
19780     case Intrinsic::x86_rdtsc:
19781       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19782                                      Results);
19783     case Intrinsic::x86_rdtscp:
19784       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
19785                                      Results);
19786     case Intrinsic::x86_rdpmc:
19787       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
19788     }
19789   }
19790   case ISD::READCYCLECOUNTER: {
19791     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19792                                    Results);
19793   }
19794   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
19795     EVT T = N->getValueType(0);
19796     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
19797     bool Regs64bit = T == MVT::i128;
19798     EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
19799     SDValue cpInL, cpInH;
19800     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19801                         DAG.getConstant(0, HalfT));
19802     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19803                         DAG.getConstant(1, HalfT));
19804     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
19805                              Regs64bit ? X86::RAX : X86::EAX,
19806                              cpInL, SDValue());
19807     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
19808                              Regs64bit ? X86::RDX : X86::EDX,
19809                              cpInH, cpInL.getValue(1));
19810     SDValue swapInL, swapInH;
19811     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19812                           DAG.getConstant(0, HalfT));
19813     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19814                           DAG.getConstant(1, HalfT));
19815     swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
19816                                Regs64bit ? X86::RBX : X86::EBX,
19817                                swapInL, cpInH.getValue(1));
19818     swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
19819                                Regs64bit ? X86::RCX : X86::ECX,
19820                                swapInH, swapInL.getValue(1));
19821     SDValue Ops[] = { swapInH.getValue(0),
19822                       N->getOperand(1),
19823                       swapInH.getValue(1) };
19824     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19825     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
19826     unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
19827                                   X86ISD::LCMPXCHG8_DAG;
19828     SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
19829     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
19830                                         Regs64bit ? X86::RAX : X86::EAX,
19831                                         HalfT, Result.getValue(1));
19832     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
19833                                         Regs64bit ? X86::RDX : X86::EDX,
19834                                         HalfT, cpOutL.getValue(2));
19835     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
19836
19837     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
19838                                         MVT::i32, cpOutH.getValue(2));
19839     SDValue Success =
19840         DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
19841                     DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
19842     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
19843
19844     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
19845     Results.push_back(Success);
19846     Results.push_back(EFLAGS.getValue(1));
19847     return;
19848   }
19849   case ISD::ATOMIC_SWAP:
19850   case ISD::ATOMIC_LOAD_ADD:
19851   case ISD::ATOMIC_LOAD_SUB:
19852   case ISD::ATOMIC_LOAD_AND:
19853   case ISD::ATOMIC_LOAD_OR:
19854   case ISD::ATOMIC_LOAD_XOR:
19855   case ISD::ATOMIC_LOAD_NAND:
19856   case ISD::ATOMIC_LOAD_MIN:
19857   case ISD::ATOMIC_LOAD_MAX:
19858   case ISD::ATOMIC_LOAD_UMIN:
19859   case ISD::ATOMIC_LOAD_UMAX:
19860   case ISD::ATOMIC_LOAD: {
19861     // Delegate to generic TypeLegalization. Situations we can really handle
19862     // should have already been dealt with by AtomicExpandPass.cpp.
19863     break;
19864   }
19865   case ISD::BITCAST: {
19866     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19867     EVT DstVT = N->getValueType(0);
19868     EVT SrcVT = N->getOperand(0)->getValueType(0);
19869
19870     if (SrcVT != MVT::f64 ||
19871         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
19872       return;
19873
19874     unsigned NumElts = DstVT.getVectorNumElements();
19875     EVT SVT = DstVT.getVectorElementType();
19876     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19877     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
19878                                    MVT::v2f64, N->getOperand(0));
19879     SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
19880
19881     if (ExperimentalVectorWideningLegalization) {
19882       // If we are legalizing vectors by widening, we already have the desired
19883       // legal vector type, just return it.
19884       Results.push_back(ToVecInt);
19885       return;
19886     }
19887
19888     SmallVector<SDValue, 8> Elts;
19889     for (unsigned i = 0, e = NumElts; i != e; ++i)
19890       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
19891                                    ToVecInt, DAG.getIntPtrConstant(i)));
19892
19893     Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
19894   }
19895   }
19896 }
19897
19898 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
19899   switch (Opcode) {
19900   default: return nullptr;
19901   case X86ISD::BSF:                return "X86ISD::BSF";
19902   case X86ISD::BSR:                return "X86ISD::BSR";
19903   case X86ISD::SHLD:               return "X86ISD::SHLD";
19904   case X86ISD::SHRD:               return "X86ISD::SHRD";
19905   case X86ISD::FAND:               return "X86ISD::FAND";
19906   case X86ISD::FANDN:              return "X86ISD::FANDN";
19907   case X86ISD::FOR:                return "X86ISD::FOR";
19908   case X86ISD::FXOR:               return "X86ISD::FXOR";
19909   case X86ISD::FSRL:               return "X86ISD::FSRL";
19910   case X86ISD::FILD:               return "X86ISD::FILD";
19911   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
19912   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
19913   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
19914   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
19915   case X86ISD::FLD:                return "X86ISD::FLD";
19916   case X86ISD::FST:                return "X86ISD::FST";
19917   case X86ISD::CALL:               return "X86ISD::CALL";
19918   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
19919   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
19920   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
19921   case X86ISD::BT:                 return "X86ISD::BT";
19922   case X86ISD::CMP:                return "X86ISD::CMP";
19923   case X86ISD::COMI:               return "X86ISD::COMI";
19924   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
19925   case X86ISD::CMPM:               return "X86ISD::CMPM";
19926   case X86ISD::CMPMU:              return "X86ISD::CMPMU";
19927   case X86ISD::SETCC:              return "X86ISD::SETCC";
19928   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
19929   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
19930   case X86ISD::CMOV:               return "X86ISD::CMOV";
19931   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
19932   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
19933   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
19934   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
19935   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
19936   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
19937   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
19938   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
19939   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
19940   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
19941   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
19942   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
19943   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
19944   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
19945   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
19946   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
19947   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
19948   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
19949   case X86ISD::HADD:               return "X86ISD::HADD";
19950   case X86ISD::HSUB:               return "X86ISD::HSUB";
19951   case X86ISD::FHADD:              return "X86ISD::FHADD";
19952   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
19953   case X86ISD::UMAX:               return "X86ISD::UMAX";
19954   case X86ISD::UMIN:               return "X86ISD::UMIN";
19955   case X86ISD::SMAX:               return "X86ISD::SMAX";
19956   case X86ISD::SMIN:               return "X86ISD::SMIN";
19957   case X86ISD::FMAX:               return "X86ISD::FMAX";
19958   case X86ISD::FMIN:               return "X86ISD::FMIN";
19959   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
19960   case X86ISD::FMINC:              return "X86ISD::FMINC";
19961   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
19962   case X86ISD::FRCP:               return "X86ISD::FRCP";
19963   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
19964   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
19965   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
19966   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
19967   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
19968   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
19969   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
19970   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
19971   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
19972   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
19973   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
19974   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
19975   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
19976   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
19977   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
19978   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
19979   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
19980   case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
19981   case X86ISD::VINSERT:            return "X86ISD::VINSERT";
19982   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
19983   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
19984   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
19985   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
19986   case X86ISD::VSHL:               return "X86ISD::VSHL";
19987   case X86ISD::VSRL:               return "X86ISD::VSRL";
19988   case X86ISD::VSRA:               return "X86ISD::VSRA";
19989   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
19990   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
19991   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
19992   case X86ISD::CMPP:               return "X86ISD::CMPP";
19993   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
19994   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
19995   case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
19996   case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
19997   case X86ISD::ADD:                return "X86ISD::ADD";
19998   case X86ISD::SUB:                return "X86ISD::SUB";
19999   case X86ISD::ADC:                return "X86ISD::ADC";
20000   case X86ISD::SBB:                return "X86ISD::SBB";
20001   case X86ISD::SMUL:               return "X86ISD::SMUL";
20002   case X86ISD::UMUL:               return "X86ISD::UMUL";
20003   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
20004   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
20005   case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
20006   case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
20007   case X86ISD::INC:                return "X86ISD::INC";
20008   case X86ISD::DEC:                return "X86ISD::DEC";
20009   case X86ISD::OR:                 return "X86ISD::OR";
20010   case X86ISD::XOR:                return "X86ISD::XOR";
20011   case X86ISD::AND:                return "X86ISD::AND";
20012   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
20013   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
20014   case X86ISD::PTEST:              return "X86ISD::PTEST";
20015   case X86ISD::TESTP:              return "X86ISD::TESTP";
20016   case X86ISD::TESTM:              return "X86ISD::TESTM";
20017   case X86ISD::TESTNM:             return "X86ISD::TESTNM";
20018   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
20019   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
20020   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
20021   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
20022   case X86ISD::VALIGN:             return "X86ISD::VALIGN";
20023   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
20024   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
20025   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
20026   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
20027   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
20028   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
20029   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
20030   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
20031   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
20032   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
20033   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
20034   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
20035   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
20036   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
20037   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
20038   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
20039   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
20040   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
20041   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
20042   case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
20043   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
20044   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
20045   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
20046   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
20047   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
20048   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
20049   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
20050   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
20051   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
20052   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
20053   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
20054   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
20055   case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
20056   case X86ISD::SAHF:               return "X86ISD::SAHF";
20057   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
20058   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
20059   case X86ISD::FMADD:              return "X86ISD::FMADD";
20060   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
20061   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
20062   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
20063   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
20064   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
20065   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
20066   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
20067   case X86ISD::XTEST:              return "X86ISD::XTEST";
20068   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
20069   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
20070   case X86ISD::SELECT:             return "X86ISD::SELECT";
20071   }
20072 }
20073
20074 // isLegalAddressingMode - Return true if the addressing mode represented
20075 // by AM is legal for this target, for a load/store of the specified type.
20076 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
20077                                               Type *Ty) const {
20078   // X86 supports extremely general addressing modes.
20079   CodeModel::Model M = getTargetMachine().getCodeModel();
20080   Reloc::Model R = getTargetMachine().getRelocationModel();
20081
20082   // X86 allows a sign-extended 32-bit immediate field as a displacement.
20083   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
20084     return false;
20085
20086   if (AM.BaseGV) {
20087     unsigned GVFlags =
20088       Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
20089
20090     // If a reference to this global requires an extra load, we can't fold it.
20091     if (isGlobalStubReference(GVFlags))
20092       return false;
20093
20094     // If BaseGV requires a register for the PIC base, we cannot also have a
20095     // BaseReg specified.
20096     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
20097       return false;
20098
20099     // If lower 4G is not available, then we must use rip-relative addressing.
20100     if ((M != CodeModel::Small || R != Reloc::Static) &&
20101         Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
20102       return false;
20103   }
20104
20105   switch (AM.Scale) {
20106   case 0:
20107   case 1:
20108   case 2:
20109   case 4:
20110   case 8:
20111     // These scales always work.
20112     break;
20113   case 3:
20114   case 5:
20115   case 9:
20116     // These scales are formed with basereg+scalereg.  Only accept if there is
20117     // no basereg yet.
20118     if (AM.HasBaseReg)
20119       return false;
20120     break;
20121   default:  // Other stuff never works.
20122     return false;
20123   }
20124
20125   return true;
20126 }
20127
20128 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
20129   unsigned Bits = Ty->getScalarSizeInBits();
20130
20131   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
20132   // particularly cheaper than those without.
20133   if (Bits == 8)
20134     return false;
20135
20136   // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
20137   // variable shifts just as cheap as scalar ones.
20138   if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
20139     return false;
20140
20141   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
20142   // fully general vector.
20143   return true;
20144 }
20145
20146 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
20147   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20148     return false;
20149   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
20150   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
20151   return NumBits1 > NumBits2;
20152 }
20153
20154 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
20155   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20156     return false;
20157
20158   if (!isTypeLegal(EVT::getEVT(Ty1)))
20159     return false;
20160
20161   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
20162
20163   // Assuming the caller doesn't have a zeroext or signext return parameter,
20164   // truncation all the way down to i1 is valid.
20165   return true;
20166 }
20167
20168 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
20169   return isInt<32>(Imm);
20170 }
20171
20172 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
20173   // Can also use sub to handle negated immediates.
20174   return isInt<32>(Imm);
20175 }
20176
20177 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
20178   if (!VT1.isInteger() || !VT2.isInteger())
20179     return false;
20180   unsigned NumBits1 = VT1.getSizeInBits();
20181   unsigned NumBits2 = VT2.getSizeInBits();
20182   return NumBits1 > NumBits2;
20183 }
20184
20185 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
20186   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20187   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
20188 }
20189
20190 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
20191   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20192   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
20193 }
20194
20195 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
20196   EVT VT1 = Val.getValueType();
20197   if (isZExtFree(VT1, VT2))
20198     return true;
20199
20200   if (Val.getOpcode() != ISD::LOAD)
20201     return false;
20202
20203   if (!VT1.isSimple() || !VT1.isInteger() ||
20204       !VT2.isSimple() || !VT2.isInteger())
20205     return false;
20206
20207   switch (VT1.getSimpleVT().SimpleTy) {
20208   default: break;
20209   case MVT::i8:
20210   case MVT::i16:
20211   case MVT::i32:
20212     // X86 has 8, 16, and 32-bit zero-extending loads.
20213     return true;
20214   }
20215
20216   return false;
20217 }
20218
20219 bool
20220 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
20221   if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
20222     return false;
20223
20224   VT = VT.getScalarType();
20225
20226   if (!VT.isSimple())
20227     return false;
20228
20229   switch (VT.getSimpleVT().SimpleTy) {
20230   case MVT::f32:
20231   case MVT::f64:
20232     return true;
20233   default:
20234     break;
20235   }
20236
20237   return false;
20238 }
20239
20240 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
20241   // i16 instructions are longer (0x66 prefix) and potentially slower.
20242   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
20243 }
20244
20245 /// isShuffleMaskLegal - Targets can use this to indicate that they only
20246 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
20247 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
20248 /// are assumed to be legal.
20249 bool
20250 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
20251                                       EVT VT) const {
20252   if (!VT.isSimple())
20253     return false;
20254
20255   MVT SVT = VT.getSimpleVT();
20256
20257   // Very little shuffling can be done for 64-bit vectors right now.
20258   if (VT.getSizeInBits() == 64)
20259     return false;
20260
20261   // This is an experimental legality test that is tailored to match the
20262   // legality test of the experimental lowering more closely. They are gated
20263   // separately to ease testing of performance differences.
20264   if (ExperimentalVectorShuffleLegality)
20265     // We only care that the types being shuffled are legal. The lowering can
20266     // handle any possible shuffle mask that results.
20267     return isTypeLegal(SVT);
20268
20269   // If this is a single-input shuffle with no 128 bit lane crossings we can
20270   // lower it into pshufb.
20271   if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
20272       (SVT.is256BitVector() && Subtarget->hasInt256())) {
20273     bool isLegal = true;
20274     for (unsigned I = 0, E = M.size(); I != E; ++I) {
20275       if (M[I] >= (int)SVT.getVectorNumElements() ||
20276           ShuffleCrosses128bitLane(SVT, I, M[I])) {
20277         isLegal = false;
20278         break;
20279       }
20280     }
20281     if (isLegal)
20282       return true;
20283   }
20284
20285   // FIXME: blends, shifts.
20286   return (SVT.getVectorNumElements() == 2 ||
20287           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
20288           isMOVLMask(M, SVT) ||
20289           isCommutedMOVLMask(M, SVT) ||
20290           isMOVHLPSMask(M, SVT) ||
20291           isSHUFPMask(M, SVT) ||
20292           isSHUFPMask(M, SVT, /* Commuted */ true) ||
20293           isPSHUFDMask(M, SVT) ||
20294           isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
20295           isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
20296           isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
20297           isPALIGNRMask(M, SVT, Subtarget) ||
20298           isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
20299           isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
20300           isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20301           isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20302           isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) ||
20303           (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT)));
20304 }
20305
20306 bool
20307 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
20308                                           EVT VT) const {
20309   if (!VT.isSimple())
20310     return false;
20311
20312   MVT SVT = VT.getSimpleVT();
20313
20314   // This is an experimental legality test that is tailored to match the
20315   // legality test of the experimental lowering more closely. They are gated
20316   // separately to ease testing of performance differences.
20317   if (ExperimentalVectorShuffleLegality)
20318     // The new vector shuffle lowering is very good at managing zero-inputs.
20319     return isShuffleMaskLegal(Mask, VT);
20320
20321   unsigned NumElts = SVT.getVectorNumElements();
20322   // FIXME: This collection of masks seems suspect.
20323   if (NumElts == 2)
20324     return true;
20325   if (NumElts == 4 && SVT.is128BitVector()) {
20326     return (isMOVLMask(Mask, SVT)  ||
20327             isCommutedMOVLMask(Mask, SVT, true) ||
20328             isSHUFPMask(Mask, SVT) ||
20329             isSHUFPMask(Mask, SVT, /* Commuted */ true) ||
20330             isBlendMask(Mask, SVT, Subtarget->hasSSE41(),
20331                         Subtarget->hasInt256()));
20332   }
20333   return false;
20334 }
20335
20336 //===----------------------------------------------------------------------===//
20337 //                           X86 Scheduler Hooks
20338 //===----------------------------------------------------------------------===//
20339
20340 /// Utility function to emit xbegin specifying the start of an RTM region.
20341 static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
20342                                      const TargetInstrInfo *TII) {
20343   DebugLoc DL = MI->getDebugLoc();
20344
20345   const BasicBlock *BB = MBB->getBasicBlock();
20346   MachineFunction::iterator I = MBB;
20347   ++I;
20348
20349   // For the v = xbegin(), we generate
20350   //
20351   // thisMBB:
20352   //  xbegin sinkMBB
20353   //
20354   // mainMBB:
20355   //  eax = -1
20356   //
20357   // sinkMBB:
20358   //  v = eax
20359
20360   MachineBasicBlock *thisMBB = MBB;
20361   MachineFunction *MF = MBB->getParent();
20362   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
20363   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
20364   MF->insert(I, mainMBB);
20365   MF->insert(I, sinkMBB);
20366
20367   // Transfer the remainder of BB and its successor edges to sinkMBB.
20368   sinkMBB->splice(sinkMBB->begin(), MBB,
20369                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20370   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
20371
20372   // thisMBB:
20373   //  xbegin sinkMBB
20374   //  # fallthrough to mainMBB
20375   //  # abortion to sinkMBB
20376   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
20377   thisMBB->addSuccessor(mainMBB);
20378   thisMBB->addSuccessor(sinkMBB);
20379
20380   // mainMBB:
20381   //  EAX = -1
20382   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
20383   mainMBB->addSuccessor(sinkMBB);
20384
20385   // sinkMBB:
20386   // EAX is live into the sinkMBB
20387   sinkMBB->addLiveIn(X86::EAX);
20388   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
20389           TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20390     .addReg(X86::EAX);
20391
20392   MI->eraseFromParent();
20393   return sinkMBB;
20394 }
20395
20396 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
20397 // or XMM0_V32I8 in AVX all of this code can be replaced with that
20398 // in the .td file.
20399 static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
20400                                        const TargetInstrInfo *TII) {
20401   unsigned Opc;
20402   switch (MI->getOpcode()) {
20403   default: llvm_unreachable("illegal opcode!");
20404   case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
20405   case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
20406   case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
20407   case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
20408   case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
20409   case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
20410   case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
20411   case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
20412   }
20413
20414   DebugLoc dl = MI->getDebugLoc();
20415   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20416
20417   unsigned NumArgs = MI->getNumOperands();
20418   for (unsigned i = 1; i < NumArgs; ++i) {
20419     MachineOperand &Op = MI->getOperand(i);
20420     if (!(Op.isReg() && Op.isImplicit()))
20421       MIB.addOperand(Op);
20422   }
20423   if (MI->hasOneMemOperand())
20424     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20425
20426   BuildMI(*BB, MI, dl,
20427     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20428     .addReg(X86::XMM0);
20429
20430   MI->eraseFromParent();
20431   return BB;
20432 }
20433
20434 // FIXME: Custom handling because TableGen doesn't support multiple implicit
20435 // defs in an instruction pattern
20436 static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
20437                                        const TargetInstrInfo *TII) {
20438   unsigned Opc;
20439   switch (MI->getOpcode()) {
20440   default: llvm_unreachable("illegal opcode!");
20441   case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
20442   case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
20443   case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
20444   case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
20445   case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
20446   case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
20447   case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
20448   case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
20449   }
20450
20451   DebugLoc dl = MI->getDebugLoc();
20452   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20453
20454   unsigned NumArgs = MI->getNumOperands(); // remove the results
20455   for (unsigned i = 1; i < NumArgs; ++i) {
20456     MachineOperand &Op = MI->getOperand(i);
20457     if (!(Op.isReg() && Op.isImplicit()))
20458       MIB.addOperand(Op);
20459   }
20460   if (MI->hasOneMemOperand())
20461     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20462
20463   BuildMI(*BB, MI, dl,
20464     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20465     .addReg(X86::ECX);
20466
20467   MI->eraseFromParent();
20468   return BB;
20469 }
20470
20471 static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
20472                                        const TargetInstrInfo *TII,
20473                                        const X86Subtarget* Subtarget) {
20474   DebugLoc dl = MI->getDebugLoc();
20475
20476   // Address into RAX/EAX, other two args into ECX, EDX.
20477   unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
20478   unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
20479   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
20480   for (int i = 0; i < X86::AddrNumOperands; ++i)
20481     MIB.addOperand(MI->getOperand(i));
20482
20483   unsigned ValOps = X86::AddrNumOperands;
20484   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
20485     .addReg(MI->getOperand(ValOps).getReg());
20486   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
20487     .addReg(MI->getOperand(ValOps+1).getReg());
20488
20489   // The instruction doesn't actually take any operands though.
20490   BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
20491
20492   MI->eraseFromParent(); // The pseudo is gone now.
20493   return BB;
20494 }
20495
/// Expand the VAARG_64 pseudo instruction MI, located in MBB, into machine
/// code that computes the address of the next variadic argument and updates
/// the va_list in memory.
///
/// When ArgMode selects gp_offset or fp_offset, the block is split into a
/// compare-and-branch diamond (offsetMBB / overflowMBB joined by a PHI in
/// endMBB). Otherwise only the overflow area is used and no control flow is
/// added.
///
/// \returns the block that holds the instructions which followed MI: endMBB
/// when the diamond was created, otherwise the original MBB.
MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(
                   MachineInstr *MI,
                   MachineBasicBlock *MBB) const {
  // Emit va_arg instruction on X86-64.

  // Operands to this pseudo-instruction:
  // 0  ) Output        : destination address (reg)
  // 1-5) Input         : va_list address (addr, i64mem)
  // 6  ) ArgSize       : Size (in bytes) of vararg type
  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
  // 8  ) Align         : Alignment of type
  // 9  ) EFLAGS (implicit-def)

  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
  assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");

  unsigned DestReg = MI->getOperand(0).getReg();
  MachineOperand &Base = MI->getOperand(1);
  MachineOperand &Scale = MI->getOperand(2);
  MachineOperand &Index = MI->getOperand(3);
  MachineOperand &Disp = MI->getOperand(4);
  MachineOperand &Segment = MI->getOperand(5);
  unsigned ArgSize = MI->getOperand(6).getImm();
  unsigned ArgMode = MI->getOperand(7).getImm();
  unsigned Align = MI->getOperand(8).getImm();

  // Memory Reference
  // The pseudo's single memoperand is attached to every va_list load/store
  // emitted below.
  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();

  // Machine Information
  const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
  DebugLoc DL = MI->getDebugLoc();

  // struct va_list {
  //   i32   gp_offset
  //   i32   fp_offset
  //   i64   overflow_area (address)
  //   i64   reg_save_area (address)
  // }
  // sizeof(va_list) = 24
  // alignment(va_list) = 8
  //
  // The displacements used below follow from this layout: gp_offset at +0,
  // fp_offset at +4, overflow_area at +8 and reg_save_area at +16.

  unsigned TotalNumIntRegs = 6;
  unsigned TotalNumXMMRegs = 8;
  bool UseGPOffset = (ArgMode == 1);
  bool UseFPOffset = (ArgMode == 2);
  // Upper bound for the selected offset field: the integer registers occupy
  // 8 bytes each, and in fp_offset mode the XMM registers add 16 bytes each.
  unsigned MaxOffset = TotalNumIntRegs * 8 +
                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);

  /* Align ArgSize to a multiple of 8 */
  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
  // Extra alignment code is only emitted for alignments above 8.
  bool NeedsAlign = (Align > 8);

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *overflowMBB;
  MachineBasicBlock *offsetMBB;
  MachineBasicBlock *endMBB;

  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
  unsigned OffsetReg = 0;        // Loaded gp_offset/fp_offset (branching case)

  if (!UseGPOffset && !UseFPOffset) {
    // If we only pull from the overflow region, we don't create a branch.
    // We don't need to alter control flow.
    OffsetDestReg = 0; // unused
    OverflowDestReg = DestReg;

    offsetMBB = nullptr;
    overflowMBB = thisMBB;
    endMBB = thisMBB;
  } else {
    // First emit code to check if gp_offset (or fp_offset) is below the bound.
    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
    // If not, pull from overflow_area. (branch to overflowMBB)
    //
    //       thisMBB
    //         |     .
    //         |        .
    //     offsetMBB   overflowMBB
    //         |        .
    //         |     .
    //        endMBB

    // Registers for the PHI in endMBB
    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
    MachineFunction *MF = MBB->getParent();
    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

    MachineFunction::iterator MBBIter = MBB;
    ++MBBIter;

    // Insert the new basic blocks
    // Layout order is offsetMBB, overflowMBB, endMBB, so thisMBB falls
    // through into offsetMBB.
    MF->insert(MBBIter, offsetMBB);
    MF->insert(MBBIter, overflowMBB);
    MF->insert(MBBIter, endMBB);

    // Transfer the remainder of MBB and its successor edges to endMBB.
    endMBB->splice(endMBB->begin(), thisMBB,
                   std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

    // Make offsetMBB and overflowMBB successors of thisMBB
    thisMBB->addSuccessor(offsetMBB);
    thisMBB->addSuccessor(overflowMBB);

    // endMBB is a successor of both offsetMBB and overflowMBB
    offsetMBB->addSuccessor(endMBB);
    overflowMBB->addSuccessor(endMBB);

    // Load the offset value into a register
    // (fp_offset lives 4 bytes into the va_list, gp_offset at its start.)
    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, UseFPOffset ? 4 : 0)
      .addOperand(Segment)
      .setMemRefs(MMOBegin, MMOEnd);

    // Check if there is enough room left to pull this argument.
    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
      .addReg(OffsetReg)
      .addImm(MaxOffset + 8 - ArgSizeA8);

    // Branch to "overflowMBB" if offset >= max
    // Fall through to "offsetMBB" otherwise
    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
      .addMBB(overflowMBB);
  }

  // In offsetMBB, emit code to use the reg_save_area.
  if (offsetMBB) {
    assert(OffsetReg != 0);

    // Read the reg_save_area address.
    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, 16)
      .addOperand(Segment)
      .setMemRefs(MMOBegin, MMOEnd);

    // Zero-extend the offset
    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
      BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
        .addImm(0)
        .addReg(OffsetReg)
        .addImm(X86::sub_32bit);

    // Add the offset to the reg_save_area to get the final address.
    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
      .addReg(OffsetReg64)
      .addReg(RegSaveReg);

    // Compute the offset for the next argument
    // (advance by 16 bytes in fp_offset mode, by 8 in gp_offset mode).
    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
      .addReg(OffsetReg)
      .addImm(UseFPOffset ? 16 : 8);

    // Store it back into the va_list.
    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, UseFPOffset ? 4 : 0)
      .addOperand(Segment)
      .addReg(NextOffsetReg)
      .setMemRefs(MMOBegin, MMOEnd);

    // Jump to endMBB
    BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
      .addMBB(endMBB);
  }

  //
  // Emit code to use overflow area
  //
  // In the non-branching case overflowMBB is thisMBB, so these instructions
  // are appended directly to the original block.

  // Load the overflow_area address into a register.
  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
    .addOperand(Base)
    .addOperand(Scale)
    .addOperand(Index)
    .addDisp(Disp, 8)
    .addOperand(Segment)
    .setMemRefs(MMOBegin, MMOEnd);

  // If we need to align it, do so. Otherwise, just copy the address
  // to OverflowDestReg.
  if (NeedsAlign) {
    // Align the overflow address
    assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);

    // aligned_addr = (addr + (align-1)) & ~(align-1)
    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
      .addReg(OverflowAddrReg)
      .addImm(Align-1);

    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
      .addReg(TmpReg)
      .addImm(~(uint64_t)(Align-1));
  } else {
    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
      .addReg(OverflowAddrReg);
  }

  // Compute the next overflow address after this argument.
  // (the overflow address should be kept 8-byte aligned)
  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
    .addReg(OverflowDestReg)
    .addImm(ArgSizeA8);

  // Store the new overflow address.
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
    .addOperand(Base)
    .addOperand(Scale)
    .addOperand(Index)
    .addDisp(Disp, 8)
    .addOperand(Segment)
    .addReg(NextAddrReg)
    .setMemRefs(MMOBegin, MMOEnd);

  // If we branched, emit the PHI to the front of endMBB.
  // It merges the two candidate addresses into DestReg.
  if (offsetMBB) {
    BuildMI(*endMBB, endMBB->begin(), DL,
            TII->get(X86::PHI), DestReg)
      .addReg(OffsetDestReg).addMBB(offsetMBB)
      .addReg(OverflowDestReg).addMBB(overflowMBB);
  }

  // Erase the pseudo instruction
  MI->eraseFromParent();

  return endMBB;
}
20749
20750 MachineBasicBlock *
20751 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
20752                                                  MachineInstr *MI,
20753                                                  MachineBasicBlock *MBB) const {
20754   // Emit code to save XMM registers to the stack. The ABI says that the
20755   // number of registers to save is given in %al, so it's theoretically
20756   // possible to do an indirect jump trick to avoid saving all of them,
20757   // however this code takes a simpler approach and just executes all
20758   // of the stores if %al is non-zero. It's less code, and it's probably
20759   // easier on the hardware branch predictor, and stores aren't all that
20760   // expensive anyway.
20761
20762   // Create the new basic blocks. One block contains all the XMM stores,
20763   // and one block is the final destination regardless of whether any
20764   // stores were performed.
20765   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20766   MachineFunction *F = MBB->getParent();
20767   MachineFunction::iterator MBBIter = MBB;
20768   ++MBBIter;
20769   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
20770   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
20771   F->insert(MBBIter, XMMSaveMBB);
20772   F->insert(MBBIter, EndMBB);
20773
20774   // Transfer the remainder of MBB and its successor edges to EndMBB.
20775   EndMBB->splice(EndMBB->begin(), MBB,
20776                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20777   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
20778
20779   // The original block will now fall through to the XMM save block.
20780   MBB->addSuccessor(XMMSaveMBB);
20781   // The XMMSaveMBB will fall through to the end block.
20782   XMMSaveMBB->addSuccessor(EndMBB);
20783
20784   // Now add the instructions.
20785   const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
20786   DebugLoc DL = MI->getDebugLoc();
20787
20788   unsigned CountReg = MI->getOperand(0).getReg();
20789   int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
20790   int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
20791
20792   if (!Subtarget->isTargetWin64()) {
20793     // If %al is 0, branch around the XMM save block.
20794     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
20795     BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
20796     MBB->addSuccessor(EndMBB);
20797   }
20798
20799   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
20800   // that was just emitted, but clearly shouldn't be "saved".
20801   assert((MI->getNumOperands() <= 3 ||
20802           !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
20803           MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
20804          && "Expected last argument to be EFLAGS");
20805   unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
20806   // In the XMM save block, save all the XMM argument registers.
20807   for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
20808     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
20809     MachineMemOperand *MMO =
20810       F->getMachineMemOperand(
20811           MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
20812         MachineMemOperand::MOStore,
20813         /*Size=*/16, /*Align=*/16);
20814     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
20815       .addFrameIndex(RegSaveFrameIndex)
20816       .addImm(/*Scale=*/1)
20817       .addReg(/*IndexReg=*/0)
20818       .addImm(/*Disp=*/Offset)
20819       .addReg(/*Segment=*/0)
20820       .addReg(MI->getOperand(i).getReg())
20821       .addMemOperand(MMO);
20822   }
20823
20824   MI->eraseFromParent();   // The pseudo instruction is gone now.
20825
20826   return EndMBB;
20827 }
20828
20829 // The EFLAGS operand of SelectItr might be missing a kill marker
20830 // because there were multiple uses of EFLAGS, and ISel didn't know
20831 // which to mark. Figure out whether SelectItr should have had a
20832 // kill marker, and set it if it should. Returns the correct kill
20833 // marker value.
20834 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
20835                                      MachineBasicBlock* BB,
20836                                      const TargetRegisterInfo* TRI) {
20837   // Scan forward through BB for a use/def of EFLAGS.
20838   MachineBasicBlock::iterator miI(std::next(SelectItr));
20839   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
20840     const MachineInstr& mi = *miI;
20841     if (mi.readsRegister(X86::EFLAGS))
20842       return false;
20843     if (mi.definesRegister(X86::EFLAGS))
20844       break; // Should have kill-flag - update below.
20845   }
20846
20847   // If we hit the end of the block, check whether EFLAGS is live into a
20848   // successor.
20849   if (miI == BB->end()) {
20850     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
20851                                           sEnd = BB->succ_end();
20852          sItr != sEnd; ++sItr) {
20853       MachineBasicBlock* succ = *sItr;
20854       if (succ->isLiveIn(X86::EFLAGS))
20855         return false;
20856     }
20857   }
20858
20859   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
20860   // out. SelectMI should have a kill flag on EFLAGS.
20861   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
20862   return true;
20863 }
20864
20865 MachineBasicBlock *
20866 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
20867                                      MachineBasicBlock *BB) const {
20868   const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
20869   DebugLoc DL = MI->getDebugLoc();
20870
20871   // To "insert" a SELECT_CC instruction, we actually have to insert the
20872   // diamond control-flow pattern.  The incoming instruction knows the
20873   // destination vreg to set, the condition code register to branch on, the
20874   // true/false values to select between, and a branch opcode to use.
20875   const BasicBlock *LLVM_BB = BB->getBasicBlock();
20876   MachineFunction::iterator It = BB;
20877   ++It;
20878
20879   //  thisMBB:
20880   //  ...
20881   //   TrueVal = ...
20882   //   cmpTY ccX, r1, r2
20883   //   bCC copy1MBB
20884   //   fallthrough --> copy0MBB
20885   MachineBasicBlock *thisMBB = BB;
20886   MachineFunction *F = BB->getParent();
20887   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
20888   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
20889   F->insert(It, copy0MBB);
20890   F->insert(It, sinkMBB);
20891
20892   // If the EFLAGS register isn't dead in the terminator, then claim that it's
20893   // live into the sink and copy blocks.
20894   const TargetRegisterInfo *TRI =
20895       BB->getParent()->getSubtarget().getRegisterInfo();
20896   if (!MI->killsRegister(X86::EFLAGS) &&
20897       !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
20898     copy0MBB->addLiveIn(X86::EFLAGS);
20899     sinkMBB->addLiveIn(X86::EFLAGS);
20900   }
20901
20902   // Transfer the remainder of BB and its successor edges to sinkMBB.
20903   sinkMBB->splice(sinkMBB->begin(), BB,
20904                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
20905   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
20906
20907   // Add the true and fallthrough blocks as its successors.
20908   BB->addSuccessor(copy0MBB);
20909   BB->addSuccessor(sinkMBB);
20910
20911   // Create the conditional branch instruction.
20912   unsigned Opc =
20913     X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
20914   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
20915
20916   //  copy0MBB:
20917   //   %FalseValue = ...
20918   //   # fallthrough to sinkMBB
20919   copy0MBB->addSuccessor(sinkMBB);
20920
20921   //  sinkMBB:
20922   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
20923   //  ...
20924   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
20925           TII->get(X86::PHI), MI->getOperand(0).getReg())
20926     .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
20927     .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
20928
20929   MI->eraseFromParent();   // The pseudo instruction is gone now.
20930   return sinkMBB;
20931 }
20932
20933 MachineBasicBlock *
20934 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
20935                                         MachineBasicBlock *BB) const {
20936   MachineFunction *MF = BB->getParent();
20937   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
20938   DebugLoc DL = MI->getDebugLoc();
20939   const BasicBlock *LLVM_BB = BB->getBasicBlock();
20940
20941   assert(MF->shouldSplitStack());
20942
20943   const bool Is64Bit = Subtarget->is64Bit();
20944   const bool IsLP64 = Subtarget->isTarget64BitLP64();
20945
20946   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
20947   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
20948
20949   // BB:
20950   //  ... [Till the alloca]
20951   // If stacklet is not large enough, jump to mallocMBB
20952   //
20953   // bumpMBB:
20954   //  Allocate by subtracting from RSP
20955   //  Jump to continueMBB
20956   //
20957   // mallocMBB:
20958   //  Allocate by call to runtime
20959   //
20960   // continueMBB:
20961   //  ...
20962   //  [rest of original BB]
20963   //
20964
20965   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20966   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20967   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20968
20969   MachineRegisterInfo &MRI = MF->getRegInfo();
20970   const TargetRegisterClass *AddrRegClass =
20971     getRegClassFor(getPointerTy());
20972
20973   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
20974     bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
20975     tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
20976     SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
20977     sizeVReg = MI->getOperand(1).getReg(),
20978     physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
20979
20980   MachineFunction::iterator MBBIter = BB;
20981   ++MBBIter;
20982
20983   MF->insert(MBBIter, bumpMBB);
20984   MF->insert(MBBIter, mallocMBB);
20985   MF->insert(MBBIter, continueMBB);
20986
20987   continueMBB->splice(continueMBB->begin(), BB,
20988                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
20989   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
20990
20991   // Add code to the main basic block to check if the stack limit has been hit,
20992   // and if so, jump to mallocMBB otherwise to bumpMBB.
20993   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
20994   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
20995     .addReg(tmpSPVReg).addReg(sizeVReg);
20996   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
20997     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
20998     .addReg(SPLimitVReg);
20999   BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
21000
21001   // bumpMBB simply decreases the stack pointer, since we know the current
21002   // stacklet has enough space.
21003   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
21004     .addReg(SPLimitVReg);
21005   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
21006     .addReg(SPLimitVReg);
21007   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21008
21009   // Calls into a routine in libgcc to allocate more space from the heap.
21010   const uint32_t *RegMask = MF->getTarget()
21011                                 .getSubtargetImpl()
21012                                 ->getRegisterInfo()
21013                                 ->getCallPreservedMask(CallingConv::C);
21014   if (IsLP64) {
21015     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
21016       .addReg(sizeVReg);
21017     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21018       .addExternalSymbol("__morestack_allocate_stack_space")
21019       .addRegMask(RegMask)
21020       .addReg(X86::RDI, RegState::Implicit)
21021       .addReg(X86::RAX, RegState::ImplicitDefine);
21022   } else if (Is64Bit) {
21023     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
21024       .addReg(sizeVReg);
21025     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21026       .addExternalSymbol("__morestack_allocate_stack_space")
21027       .addRegMask(RegMask)
21028       .addReg(X86::EDI, RegState::Implicit)
21029       .addReg(X86::EAX, RegState::ImplicitDefine);
21030   } else {
21031     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
21032       .addImm(12);
21033     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
21034     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
21035       .addExternalSymbol("__morestack_allocate_stack_space")
21036       .addRegMask(RegMask)
21037       .addReg(X86::EAX, RegState::ImplicitDefine);
21038   }
21039
21040   if (!Is64Bit)
21041     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
21042       .addImm(16);
21043
21044   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
21045     .addReg(IsLP64 ? X86::RAX : X86::EAX);
21046   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21047
21048   // Set up the CFG correctly.
21049   BB->addSuccessor(bumpMBB);
21050   BB->addSuccessor(mallocMBB);
21051   mallocMBB->addSuccessor(continueMBB);
21052   bumpMBB->addSuccessor(continueMBB);
21053
21054   // Take care of the PHI nodes.
21055   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
21056           MI->getOperand(0).getReg())
21057     .addReg(mallocPtrVReg).addMBB(mallocMBB)
21058     .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
21059
21060   // Delete the original pseudo instruction.
21061   MI->eraseFromParent();
21062
21063   // And we're done.
21064   return continueMBB;
21065 }
21066
21067 MachineBasicBlock *
21068 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
21069                                         MachineBasicBlock *BB) const {
21070   const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
21071   DebugLoc DL = MI->getDebugLoc();
21072
21073   assert(!Subtarget->isTargetMachO());
21074
21075   // The lowering is pretty easy: we're just emitting the call to _alloca.  The
21076   // non-trivial part is impdef of ESP.
21077
21078   if (Subtarget->isTargetWin64()) {
21079     if (Subtarget->isTargetCygMing()) {
21080       // ___chkstk(Mingw64):
21081       // Clobbers R10, R11, RAX and EFLAGS.
21082       // Updates RSP.
21083       BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
21084         .addExternalSymbol("___chkstk")
21085         .addReg(X86::RAX, RegState::Implicit)
21086         .addReg(X86::RSP, RegState::Implicit)
21087         .addReg(X86::RAX, RegState::Define | RegState::Implicit)
21088         .addReg(X86::RSP, RegState::Define | RegState::Implicit)
21089         .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
21090     } else {
21091       // __chkstk(MSVCRT): does not update stack pointer.
21092       // Clobbers R10, R11 and EFLAGS.
21093       BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
21094         .addExternalSymbol("__chkstk")
21095         .addReg(X86::RAX, RegState::Implicit)
21096         .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
21097       // RAX has the offset to be subtracted from RSP.
21098       BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
21099         .addReg(X86::RSP)
21100         .addReg(X86::RAX);
21101     }
21102   } else {
21103     const char *StackProbeSymbol = (Subtarget->isTargetKnownWindowsMSVC() ||
21104                                     Subtarget->isTargetWindowsItanium())
21105                                        ? "_chkstk"
21106                                        : "_alloca";
21107
21108     BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
21109       .addExternalSymbol(StackProbeSymbol)
21110       .addReg(X86::EAX, RegState::Implicit)
21111       .addReg(X86::ESP, RegState::Implicit)
21112       .addReg(X86::EAX, RegState::Define | RegState::Implicit)
21113       .addReg(X86::ESP, RegState::Define | RegState::Implicit)
21114       .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
21115   }
21116
21117   MI->eraseFromParent();   // The pseudo instruction is gone now.
21118   return BB;
21119 }
21120
21121 MachineBasicBlock *
21122 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
21123                                       MachineBasicBlock *BB) const {
21124   // This is pretty easy.  We're taking the value that we received from
21125   // our load from the relocation, sticking it in either RDI (x86-64)
21126   // or EAX and doing an indirect call.  The return value will then
21127   // be in the normal return register.
21128   MachineFunction *F = BB->getParent();
21129   const X86InstrInfo *TII =
21130       static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo());
21131   DebugLoc DL = MI->getDebugLoc();
21132
21133   assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
21134   assert(MI->getOperand(3).isGlobal() && "This should be a global");
21135
21136   // Get a register mask for the lowered call.
21137   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
21138   // proper register mask.
21139   const uint32_t *RegMask = F->getTarget()
21140                                 .getSubtargetImpl()
21141                                 ->getRegisterInfo()
21142                                 ->getCallPreservedMask(CallingConv::C);
21143   if (Subtarget->is64Bit()) {
21144     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21145                                       TII->get(X86::MOV64rm), X86::RDI)
21146     .addReg(X86::RIP)
21147     .addImm(0).addReg(0)
21148     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21149                       MI->getOperand(3).getTargetFlags())
21150     .addReg(0);
21151     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
21152     addDirectMem(MIB, X86::RDI);
21153     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
21154   } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
21155     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21156                                       TII->get(X86::MOV32rm), X86::EAX)
21157     .addReg(0)
21158     .addImm(0).addReg(0)
21159     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21160                       MI->getOperand(3).getTargetFlags())
21161     .addReg(0);
21162     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21163     addDirectMem(MIB, X86::EAX);
21164     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21165   } else {
21166     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21167                                       TII->get(X86::MOV32rm), X86::EAX)
21168     .addReg(TII->getGlobalBaseReg(F))
21169     .addImm(0).addReg(0)
21170     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21171                       MI->getOperand(3).getTargetFlags())
21172     .addReg(0);
21173     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21174     addDirectMem(MIB, X86::EAX);
21175     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21176   }
21177
21178   MI->eraseFromParent(); // The pseudo instruction is gone now.
21179   return BB;
21180 }
21181
21182 MachineBasicBlock *
21183 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
21184                                     MachineBasicBlock *MBB) const {
21185   DebugLoc DL = MI->getDebugLoc();
21186   MachineFunction *MF = MBB->getParent();
21187   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
21188   MachineRegisterInfo &MRI = MF->getRegInfo();
21189
21190   const BasicBlock *BB = MBB->getBasicBlock();
21191   MachineFunction::iterator I = MBB;
21192   ++I;
21193
21194   // Memory Reference
21195   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21196   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21197
21198   unsigned DstReg;
21199   unsigned MemOpndSlot = 0;
21200
21201   unsigned CurOp = 0;
21202
21203   DstReg = MI->getOperand(CurOp++).getReg();
21204   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
21205   assert(RC->hasType(MVT::i32) && "Invalid destination!");
21206   unsigned mainDstReg = MRI.createVirtualRegister(RC);
21207   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
21208
21209   MemOpndSlot = CurOp;
21210
21211   MVT PVT = getPointerTy();
21212   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21213          "Invalid Pointer Size!");
21214
21215   // For v = setjmp(buf), we generate
21216   //
21217   // thisMBB:
21218   //  buf[LabelOffset] = restoreMBB
21219   //  SjLjSetup restoreMBB
21220   //
21221   // mainMBB:
21222   //  v_main = 0
21223   //
21224   // sinkMBB:
21225   //  v = phi(main, restore)
21226   //
21227   // restoreMBB:
21228   //  if base pointer being used, load it from frame
21229   //  v_restore = 1
21230
21231   MachineBasicBlock *thisMBB = MBB;
21232   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
21233   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
21234   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
21235   MF->insert(I, mainMBB);
21236   MF->insert(I, sinkMBB);
21237   MF->push_back(restoreMBB);
21238
21239   MachineInstrBuilder MIB;
21240
21241   // Transfer the remainder of BB and its successor edges to sinkMBB.
21242   sinkMBB->splice(sinkMBB->begin(), MBB,
21243                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
21244   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
21245
21246   // thisMBB:
21247   unsigned PtrStoreOpc = 0;
21248   unsigned LabelReg = 0;
21249   const int64_t LabelOffset = 1 * PVT.getStoreSize();
21250   Reloc::Model RM = MF->getTarget().getRelocationModel();
21251   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
21252                      (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
21253
21254   // Prepare IP either in reg or imm.
21255   if (!UseImmLabel) {
21256     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
21257     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
21258     LabelReg = MRI.createVirtualRegister(PtrRC);
21259     if (Subtarget->is64Bit()) {
21260       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
21261               .addReg(X86::RIP)
21262               .addImm(0)
21263               .addReg(0)
21264               .addMBB(restoreMBB)
21265               .addReg(0);
21266     } else {
21267       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
21268       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
21269               .addReg(XII->getGlobalBaseReg(MF))
21270               .addImm(0)
21271               .addReg(0)
21272               .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
21273               .addReg(0);
21274     }
21275   } else
21276     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
21277   // Store IP
21278   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
21279   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21280     if (i == X86::AddrDisp)
21281       MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
21282     else
21283       MIB.addOperand(MI->getOperand(MemOpndSlot + i));
21284   }
21285   if (!UseImmLabel)
21286     MIB.addReg(LabelReg);
21287   else
21288     MIB.addMBB(restoreMBB);
21289   MIB.setMemRefs(MMOBegin, MMOEnd);
21290   // Setup
21291   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
21292           .addMBB(restoreMBB);
21293
21294   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
21295       MF->getSubtarget().getRegisterInfo());
21296   MIB.addRegMask(RegInfo->getNoPreservedMask());
21297   thisMBB->addSuccessor(mainMBB);
21298   thisMBB->addSuccessor(restoreMBB);
21299
21300   // mainMBB:
21301   //  EAX = 0
21302   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
21303   mainMBB->addSuccessor(sinkMBB);
21304
21305   // sinkMBB:
21306   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21307           TII->get(X86::PHI), DstReg)
21308     .addReg(mainDstReg).addMBB(mainMBB)
21309     .addReg(restoreDstReg).addMBB(restoreMBB);
21310
21311   // restoreMBB:
21312   if (RegInfo->hasBasePointer(*MF)) {
21313     const X86Subtarget &STI = MF->getTarget().getSubtarget<X86Subtarget>();
21314     const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
21315     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
21316     X86FI->setRestoreBasePointer(MF);
21317     unsigned FramePtr = RegInfo->getFrameRegister(*MF);
21318     unsigned BasePtr = RegInfo->getBaseRegister();
21319     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
21320     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
21321                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
21322       .setMIFlag(MachineInstr::FrameSetup);
21323   }
21324   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
21325   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
21326   restoreMBB->addSuccessor(sinkMBB);
21327
21328   MI->eraseFromParent();
21329   return sinkMBB;
21330 }
21331
21332 MachineBasicBlock *
21333 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
21334                                      MachineBasicBlock *MBB) const {
21335   DebugLoc DL = MI->getDebugLoc();
21336   MachineFunction *MF = MBB->getParent();
21337   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
21338   MachineRegisterInfo &MRI = MF->getRegInfo();
21339
21340   // Memory Reference
21341   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21342   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21343
21344   MVT PVT = getPointerTy();
21345   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21346          "Invalid Pointer Size!");
21347
21348   const TargetRegisterClass *RC =
21349     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
21350   unsigned Tmp = MRI.createVirtualRegister(RC);
21351   // Since FP is only updated here but NOT referenced, it's treated as GPR.
21352   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
21353       MF->getSubtarget().getRegisterInfo());
21354   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
21355   unsigned SP = RegInfo->getStackRegister();
21356
21357   MachineInstrBuilder MIB;
21358
21359   const int64_t LabelOffset = 1 * PVT.getStoreSize();
21360   const int64_t SPOffset = 2 * PVT.getStoreSize();
21361
21362   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
21363   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
21364
21365   // Reload FP
21366   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
21367   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
21368     MIB.addOperand(MI->getOperand(i));
21369   MIB.setMemRefs(MMOBegin, MMOEnd);
21370   // Reload IP
21371   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
21372   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21373     if (i == X86::AddrDisp)
21374       MIB.addDisp(MI->getOperand(i), LabelOffset);
21375     else
21376       MIB.addOperand(MI->getOperand(i));
21377   }
21378   MIB.setMemRefs(MMOBegin, MMOEnd);
21379   // Reload SP
21380   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
21381   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21382     if (i == X86::AddrDisp)
21383       MIB.addDisp(MI->getOperand(i), SPOffset);
21384     else
21385       MIB.addOperand(MI->getOperand(i));
21386   }
21387   MIB.setMemRefs(MMOBegin, MMOEnd);
21388   // Jump
21389   BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
21390
21391   MI->eraseFromParent();
21392   return MBB;
21393 }
21394
21395 // Replace 213-type (isel default) FMA3 instructions with 231-type for
21396 // accumulator loops. Writing back to the accumulator allows the coalescer
21397 // to remove extra copies in the loop.
21398 MachineBasicBlock *
21399 X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
21400                                  MachineBasicBlock *MBB) const {
21401   MachineOperand &AddendOp = MI->getOperand(3);
21402
21403   // Bail out early if the addend isn't a register - we can't switch these.
21404   if (!AddendOp.isReg())
21405     return MBB;
21406
21407   MachineFunction &MF = *MBB->getParent();
21408   MachineRegisterInfo &MRI = MF.getRegInfo();
21409
21410   // Check whether the addend is defined by a PHI:
21411   assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
21412   MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
21413   if (!AddendDef.isPHI())
21414     return MBB;
21415
21416   // Look for the following pattern:
21417   // loop:
21418   //   %addend = phi [%entry, 0], [%loop, %result]
21419   //   ...
21420   //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
21421
21422   // Replace with:
21423   //   loop:
21424   //   %addend = phi [%entry, 0], [%loop, %result]
21425   //   ...
21426   //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
21427
21428   for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
21429     assert(AddendDef.getOperand(i).isReg());
21430     MachineOperand PHISrcOp = AddendDef.getOperand(i);
21431     MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
21432     if (&PHISrcInst == MI) {
21433       // Found a matching instruction.
21434       unsigned NewFMAOpc = 0;
21435       switch (MI->getOpcode()) {
21436         case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
21437         case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
21438         case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
21439         case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
21440         case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
21441         case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
21442         case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
21443         case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
21444         case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
21445         case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
21446         case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
21447         case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
21448         case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
21449         case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
21450         case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
21451         case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
21452         case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
21453         case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
21454         case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
21455         case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
21456
21457         case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
21458         case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
21459         case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
21460         case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
21461         case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
21462         case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
21463         case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
21464         case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
21465         case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
21466         case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
21467         case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
21468         case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
21469         default: llvm_unreachable("Unrecognized FMA variant.");
21470       }
21471
21472       const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
21473       MachineInstrBuilder MIB =
21474         BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
21475         .addOperand(MI->getOperand(0))
21476         .addOperand(MI->getOperand(3))
21477         .addOperand(MI->getOperand(2))
21478         .addOperand(MI->getOperand(1));
21479       MBB->insert(MachineBasicBlock::iterator(MI), MIB);
21480       MI->eraseFromParent();
21481     }
21482   }
21483
21484   return MBB;
21485 }
21486
21487 MachineBasicBlock *
21488 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
21489                                                MachineBasicBlock *BB) const {
21490   switch (MI->getOpcode()) {
21491   default: llvm_unreachable("Unexpected instr type to insert");
21492   case X86::TAILJMPd64:
21493   case X86::TAILJMPr64:
21494   case X86::TAILJMPm64:
21495     llvm_unreachable("TAILJMP64 would not be touched here.");
21496   case X86::TCRETURNdi64:
21497   case X86::TCRETURNri64:
21498   case X86::TCRETURNmi64:
21499     return BB;
21500   case X86::WIN_ALLOCA:
21501     return EmitLoweredWinAlloca(MI, BB);
21502   case X86::SEG_ALLOCA_32:
21503   case X86::SEG_ALLOCA_64:
21504     return EmitLoweredSegAlloca(MI, BB);
21505   case X86::TLSCall_32:
21506   case X86::TLSCall_64:
21507     return EmitLoweredTLSCall(MI, BB);
21508   case X86::CMOV_GR8:
21509   case X86::CMOV_FR32:
21510   case X86::CMOV_FR64:
21511   case X86::CMOV_V4F32:
21512   case X86::CMOV_V2F64:
21513   case X86::CMOV_V2I64:
21514   case X86::CMOV_V8F32:
21515   case X86::CMOV_V4F64:
21516   case X86::CMOV_V4I64:
21517   case X86::CMOV_V16F32:
21518   case X86::CMOV_V8F64:
21519   case X86::CMOV_V8I64:
21520   case X86::CMOV_GR16:
21521   case X86::CMOV_GR32:
21522   case X86::CMOV_RFP32:
21523   case X86::CMOV_RFP64:
21524   case X86::CMOV_RFP80:
21525     return EmitLoweredSelect(MI, BB);
21526
21527   case X86::FP32_TO_INT16_IN_MEM:
21528   case X86::FP32_TO_INT32_IN_MEM:
21529   case X86::FP32_TO_INT64_IN_MEM:
21530   case X86::FP64_TO_INT16_IN_MEM:
21531   case X86::FP64_TO_INT32_IN_MEM:
21532   case X86::FP64_TO_INT64_IN_MEM:
21533   case X86::FP80_TO_INT16_IN_MEM:
21534   case X86::FP80_TO_INT32_IN_MEM:
21535   case X86::FP80_TO_INT64_IN_MEM: {
21536     MachineFunction *F = BB->getParent();
21537     const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
21538     DebugLoc DL = MI->getDebugLoc();
21539
21540     // Change the floating point control register to use "round towards zero"
21541     // mode when truncating to an integer value.
21542     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
21543     addFrameReference(BuildMI(*BB, MI, DL,
21544                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
21545
21546     // Load the old value of the high byte of the control word...
21547     unsigned OldCW =
21548       F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
21549     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
21550                       CWFrameIdx);
21551
21552     // Set the high part to be round to zero...
21553     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
21554       .addImm(0xC7F);
21555
21556     // Reload the modified control word now...
21557     addFrameReference(BuildMI(*BB, MI, DL,
21558                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21559
21560     // Restore the memory image of control word to original value
21561     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
21562       .addReg(OldCW);
21563
21564     // Get the X86 opcode to use.
21565     unsigned Opc;
21566     switch (MI->getOpcode()) {
21567     default: llvm_unreachable("illegal opcode!");
21568     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
21569     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
21570     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
21571     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
21572     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
21573     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
21574     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
21575     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
21576     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
21577     }
21578
21579     X86AddressMode AM;
21580     MachineOperand &Op = MI->getOperand(0);
21581     if (Op.isReg()) {
21582       AM.BaseType = X86AddressMode::RegBase;
21583       AM.Base.Reg = Op.getReg();
21584     } else {
21585       AM.BaseType = X86AddressMode::FrameIndexBase;
21586       AM.Base.FrameIndex = Op.getIndex();
21587     }
21588     Op = MI->getOperand(1);
21589     if (Op.isImm())
21590       AM.Scale = Op.getImm();
21591     Op = MI->getOperand(2);
21592     if (Op.isImm())
21593       AM.IndexReg = Op.getImm();
21594     Op = MI->getOperand(3);
21595     if (Op.isGlobal()) {
21596       AM.GV = Op.getGlobal();
21597     } else {
21598       AM.Disp = Op.getImm();
21599     }
21600     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
21601                       .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
21602
21603     // Reload the original control word now.
21604     addFrameReference(BuildMI(*BB, MI, DL,
21605                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21606
21607     MI->eraseFromParent();   // The pseudo instruction is gone now.
21608     return BB;
21609   }
21610     // String/text processing lowering.
21611   case X86::PCMPISTRM128REG:
21612   case X86::VPCMPISTRM128REG:
21613   case X86::PCMPISTRM128MEM:
21614   case X86::VPCMPISTRM128MEM:
21615   case X86::PCMPESTRM128REG:
21616   case X86::VPCMPESTRM128REG:
21617   case X86::PCMPESTRM128MEM:
21618   case X86::VPCMPESTRM128MEM:
21619     assert(Subtarget->hasSSE42() &&
21620            "Target must have SSE4.2 or AVX features enabled");
21621     return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
21622
21623   // String/text processing lowering.
21624   case X86::PCMPISTRIREG:
21625   case X86::VPCMPISTRIREG:
21626   case X86::PCMPISTRIMEM:
21627   case X86::VPCMPISTRIMEM:
21628   case X86::PCMPESTRIREG:
21629   case X86::VPCMPESTRIREG:
21630   case X86::PCMPESTRIMEM:
21631   case X86::VPCMPESTRIMEM:
21632     assert(Subtarget->hasSSE42() &&
21633            "Target must have SSE4.2 or AVX features enabled");
21634     return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
21635
21636   // Thread synchronization.
21637   case X86::MONITOR:
21638     return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(),
21639                        Subtarget);
21640
21641   // xbegin
21642   case X86::XBEGIN:
21643     return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
21644
21645   case X86::VASTART_SAVE_XMM_REGS:
21646     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
21647
21648   case X86::VAARG_64:
21649     return EmitVAARG64WithCustomInserter(MI, BB);
21650
21651   case X86::EH_SjLj_SetJmp32:
21652   case X86::EH_SjLj_SetJmp64:
21653     return emitEHSjLjSetJmp(MI, BB);
21654
21655   case X86::EH_SjLj_LongJmp32:
21656   case X86::EH_SjLj_LongJmp64:
21657     return emitEHSjLjLongJmp(MI, BB);
21658
21659   case TargetOpcode::STATEPOINT:
21660     // As an implementation detail, STATEPOINT shares the STACKMAP format at
21661     // this point in the process.  We diverge later.
21662     return emitPatchPoint(MI, BB);
21663
21664   case TargetOpcode::STACKMAP:
21665   case TargetOpcode::PATCHPOINT:
21666     return emitPatchPoint(MI, BB);
21667
21668   case X86::VFMADDPDr213r:
21669   case X86::VFMADDPSr213r:
21670   case X86::VFMADDSDr213r:
21671   case X86::VFMADDSSr213r:
21672   case X86::VFMSUBPDr213r:
21673   case X86::VFMSUBPSr213r:
21674   case X86::VFMSUBSDr213r:
21675   case X86::VFMSUBSSr213r:
21676   case X86::VFNMADDPDr213r:
21677   case X86::VFNMADDPSr213r:
21678   case X86::VFNMADDSDr213r:
21679   case X86::VFNMADDSSr213r:
21680   case X86::VFNMSUBPDr213r:
21681   case X86::VFNMSUBPSr213r:
21682   case X86::VFNMSUBSDr213r:
21683   case X86::VFNMSUBSSr213r:
21684   case X86::VFMADDSUBPDr213r:
21685   case X86::VFMADDSUBPSr213r:
21686   case X86::VFMSUBADDPDr213r:
21687   case X86::VFMSUBADDPSr213r:
21688   case X86::VFMADDPDr213rY:
21689   case X86::VFMADDPSr213rY:
21690   case X86::VFMSUBPDr213rY:
21691   case X86::VFMSUBPSr213rY:
21692   case X86::VFNMADDPDr213rY:
21693   case X86::VFNMADDPSr213rY:
21694   case X86::VFNMSUBPDr213rY:
21695   case X86::VFNMSUBPSr213rY:
21696   case X86::VFMADDSUBPDr213rY:
21697   case X86::VFMADDSUBPSr213rY:
21698   case X86::VFMSUBADDPDr213rY:
21699   case X86::VFMSUBADDPSr213rY:
21700     return emitFMA3Instr(MI, BB);
21701   }
21702 }
21703
21704 //===----------------------------------------------------------------------===//
21705 //                           X86 Optimization Hooks
21706 //===----------------------------------------------------------------------===//
21707
21708 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
21709                                                       APInt &KnownZero,
21710                                                       APInt &KnownOne,
21711                                                       const SelectionDAG &DAG,
21712                                                       unsigned Depth) const {
21713   unsigned BitWidth = KnownZero.getBitWidth();
21714   unsigned Opc = Op.getOpcode();
21715   assert((Opc >= ISD::BUILTIN_OP_END ||
21716           Opc == ISD::INTRINSIC_WO_CHAIN ||
21717           Opc == ISD::INTRINSIC_W_CHAIN ||
21718           Opc == ISD::INTRINSIC_VOID) &&
21719          "Should use MaskedValueIsZero if you don't know whether Op"
21720          " is a target node!");
21721
21722   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
21723   switch (Opc) {
21724   default: break;
21725   case X86ISD::ADD:
21726   case X86ISD::SUB:
21727   case X86ISD::ADC:
21728   case X86ISD::SBB:
21729   case X86ISD::SMUL:
21730   case X86ISD::UMUL:
21731   case X86ISD::INC:
21732   case X86ISD::DEC:
21733   case X86ISD::OR:
21734   case X86ISD::XOR:
21735   case X86ISD::AND:
21736     // These nodes' second result is a boolean.
21737     if (Op.getResNo() == 0)
21738       break;
21739     // Fallthrough
21740   case X86ISD::SETCC:
21741     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
21742     break;
21743   case ISD::INTRINSIC_WO_CHAIN: {
21744     unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21745     unsigned NumLoBits = 0;
21746     switch (IntId) {
21747     default: break;
21748     case Intrinsic::x86_sse_movmsk_ps:
21749     case Intrinsic::x86_avx_movmsk_ps_256:
21750     case Intrinsic::x86_sse2_movmsk_pd:
21751     case Intrinsic::x86_avx_movmsk_pd_256:
21752     case Intrinsic::x86_mmx_pmovmskb:
21753     case Intrinsic::x86_sse2_pmovmskb_128:
21754     case Intrinsic::x86_avx2_pmovmskb: {
21755       // High bits of movmskp{s|d}, pmovmskb are known zero.
21756       switch (IntId) {
21757         default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
21758         case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
21759         case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
21760         case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
21761         case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
21762         case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
21763         case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
21764         case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
21765       }
21766       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
21767       break;
21768     }
21769     }
21770     break;
21771   }
21772   }
21773 }
21774
21775 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
21776   SDValue Op,
21777   const SelectionDAG &,
21778   unsigned Depth) const {
21779   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
21780   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
21781     return Op.getValueType().getScalarType().getSizeInBits();
21782
21783   // Fallback case.
21784   return 1;
21785 }
21786
21787 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
21788 /// node is a GlobalAddress + offset.
21789 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
21790                                        const GlobalValue* &GA,
21791                                        int64_t &Offset) const {
21792   if (N->getOpcode() == X86ISD::Wrapper) {
21793     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
21794       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
21795       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
21796       return true;
21797     }
21798   }
21799   return TargetLowering::isGAPlusOffset(N, GA, Offset);
21800 }
21801
21802 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
21803 /// same as extracting the high 128-bit part of 256-bit vector and then
21804 /// inserting the result into the low part of a new 256-bit vector
21805 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
21806   EVT VT = SVOp->getValueType(0);
21807   unsigned NumElems = VT.getVectorNumElements();
21808
21809   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
21810   for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
21811     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21812         SVOp->getMaskElt(j) >= 0)
21813       return false;
21814
21815   return true;
21816 }
21817
21818 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
21819 /// same as extracting the low 128-bit part of 256-bit vector and then
21820 /// inserting the result into the high part of a new 256-bit vector
21821 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
21822   EVT VT = SVOp->getValueType(0);
21823   unsigned NumElems = VT.getVectorNumElements();
21824
21825   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
21826   for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
21827     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21828         SVOp->getMaskElt(j) >= 0)
21829       return false;
21830
21831   return true;
21832 }
21833
21834 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
21835 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
21836                                         TargetLowering::DAGCombinerInfo &DCI,
21837                                         const X86Subtarget* Subtarget) {
21838   SDLoc dl(N);
21839   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
21840   SDValue V1 = SVOp->getOperand(0);
21841   SDValue V2 = SVOp->getOperand(1);
21842   EVT VT = SVOp->getValueType(0);
21843   unsigned NumElems = VT.getVectorNumElements();
21844
21845   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
21846       V2.getOpcode() == ISD::CONCAT_VECTORS) {
21847     //
21848     //                   0,0,0,...
21849     //                      |
21850     //    V      UNDEF    BUILD_VECTOR    UNDEF
21851     //     \      /           \           /
21852     //  CONCAT_VECTOR         CONCAT_VECTOR
21853     //         \                  /
21854     //          \                /
21855     //          RESULT: V + zero extended
21856     //
21857     if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
21858         V2.getOperand(1).getOpcode() != ISD::UNDEF ||
21859         V1.getOperand(1).getOpcode() != ISD::UNDEF)
21860       return SDValue();
21861
21862     if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
21863       return SDValue();
21864
21865     // To match the shuffle mask, the first half of the mask should
21866     // be exactly the first vector, and all the rest a splat with the
21867     // first element of the second one.
21868     for (unsigned i = 0; i != NumElems/2; ++i)
21869       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
21870           !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
21871         return SDValue();
21872
21873     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
21874     if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
21875       if (Ld->hasNUsesOfValue(1, 0)) {
21876         SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
21877         SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
21878         SDValue ResNode =
21879           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
21880                                   Ld->getMemoryVT(),
21881                                   Ld->getPointerInfo(),
21882                                   Ld->getAlignment(),
21883                                   false/*isVolatile*/, true/*ReadMem*/,
21884                                   false/*WriteMem*/);
21885
21886         // Make sure the newly-created LOAD is in the same position as Ld in
21887         // terms of dependency. We create a TokenFactor for Ld and ResNode,
21888         // and update uses of Ld's output chain to use the TokenFactor.
21889         if (Ld->hasAnyUseOfValue(1)) {
21890           SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
21891                              SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
21892           DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
21893           DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
21894                                  SDValue(ResNode.getNode(), 1));
21895         }
21896
21897         return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
21898       }
21899     }
21900
21901     // Emit a zeroed vector and insert the desired subvector on its
21902     // first half.
21903     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
21904     SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
21905     return DCI.CombineTo(N, InsV);
21906   }
21907
21908   //===--------------------------------------------------------------------===//
21909   // Combine some shuffles into subvector extracts and inserts:
21910   //
21911
21912   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
21913   if (isShuffleHigh128VectorInsertLow(SVOp)) {
21914     SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
21915     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
21916     return DCI.CombineTo(N, InsV);
21917   }
21918
21919   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
21920   if (isShuffleLow128VectorInsertHigh(SVOp)) {
21921     SDValue V = Extract128BitVector(V1, 0, DAG, dl);
21922     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
21923     return DCI.CombineTo(N, InsV);
21924   }
21925
21926   return SDValue();
21927 }
21928
21929 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
21930 /// possible.
21931 ///
21932 /// This is the leaf of the recursive combinine below. When we have found some
21933 /// chain of single-use x86 shuffle instructions and accumulated the combined
21934 /// shuffle mask represented by them, this will try to pattern match that mask
21935 /// into either a single instruction if there is a special purpose instruction
21936 /// for this operation, or into a PSHUFB instruction which is a fully general
21937 /// instruction but should only be used to replace chains over a certain depth.
21938 static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
21939                                    int Depth, bool HasPSHUFB, SelectionDAG &DAG,
21940                                    TargetLowering::DAGCombinerInfo &DCI,
21941                                    const X86Subtarget *Subtarget) {
21942   assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
21943
21944   // Find the operand that enters the chain. Note that multiple uses are OK
21945   // here, we're not going to remove the operand we find.
21946   SDValue Input = Op.getOperand(0);
21947   while (Input.getOpcode() == ISD::BITCAST)
21948     Input = Input.getOperand(0);
21949
21950   MVT VT = Input.getSimpleValueType();
21951   MVT RootVT = Root.getSimpleValueType();
21952   SDLoc DL(Root);
21953
21954   // Just remove no-op shuffle masks.
21955   if (Mask.size() == 1) {
21956     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
21957                   /*AddTo*/ true);
21958     return true;
21959   }
21960
21961   // Use the float domain if the operand type is a floating point type.
21962   bool FloatDomain = VT.isFloatingPoint();
21963
21964   // For floating point shuffles, we don't have free copies in the shuffle
21965   // instructions or the ability to load as part of the instruction, so
21966   // canonicalize their shuffles to UNPCK or MOV variants.
21967   //
21968   // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
21969   // vectors because it can have a load folded into it that UNPCK cannot. This
21970   // doesn't preclude something switching to the shorter encoding post-RA.
21971   if (FloatDomain) {
21972     if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
21973       bool Lo = Mask.equals(0, 0);
21974       unsigned Shuffle;
21975       MVT ShuffleVT;
21976       // Check if we have SSE3 which will let us use MOVDDUP. That instruction
21977       // is no slower than UNPCKLPD but has the option to fold the input operand
21978       // into even an unaligned memory load.
21979       if (Lo && Subtarget->hasSSE3()) {
21980         Shuffle = X86ISD::MOVDDUP;
21981         ShuffleVT = MVT::v2f64;
21982       } else {
21983         // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
21984         // than the UNPCK variants.
21985         Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
21986         ShuffleVT = MVT::v4f32;
21987       }
21988       if (Depth == 1 && Root->getOpcode() == Shuffle)
21989         return false; // Nothing to do!
21990       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
21991       DCI.AddToWorklist(Op.getNode());
21992       if (Shuffle == X86ISD::MOVDDUP)
21993         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
21994       else
21995         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
21996       DCI.AddToWorklist(Op.getNode());
21997       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
21998                     /*AddTo*/ true);
21999       return true;
22000     }
22001     if (Subtarget->hasSSE3() &&
22002         (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
22003       bool Lo = Mask.equals(0, 0, 2, 2);
22004       unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
22005       MVT ShuffleVT = MVT::v4f32;
22006       if (Depth == 1 && Root->getOpcode() == Shuffle)
22007         return false; // Nothing to do!
22008       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22009       DCI.AddToWorklist(Op.getNode());
22010       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
22011       DCI.AddToWorklist(Op.getNode());
22012       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22013                     /*AddTo*/ true);
22014       return true;
22015     }
22016     if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) {
22017       bool Lo = Mask.equals(0, 0, 1, 1);
22018       unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22019       MVT ShuffleVT = MVT::v4f32;
22020       if (Depth == 1 && Root->getOpcode() == Shuffle)
22021         return false; // Nothing to do!
22022       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22023       DCI.AddToWorklist(Op.getNode());
22024       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22025       DCI.AddToWorklist(Op.getNode());
22026       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22027                     /*AddTo*/ true);
22028       return true;
22029     }
22030   }
22031
22032   // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
22033   // variants as none of these have single-instruction variants that are
22034   // superior to the UNPCK formulation.
22035   if (!FloatDomain &&
22036       (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
22037        Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
22038        Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
22039        Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
22040                    15))) {
22041     bool Lo = Mask[0] == 0;
22042     unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22043     if (Depth == 1 && Root->getOpcode() == Shuffle)
22044       return false; // Nothing to do!
22045     MVT ShuffleVT;
22046     switch (Mask.size()) {
22047     case 8:
22048       ShuffleVT = MVT::v8i16;
22049       break;
22050     case 16:
22051       ShuffleVT = MVT::v16i8;
22052       break;
22053     default:
22054       llvm_unreachable("Impossible mask size!");
22055     };
22056     Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22057     DCI.AddToWorklist(Op.getNode());
22058     Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22059     DCI.AddToWorklist(Op.getNode());
22060     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22061                   /*AddTo*/ true);
22062     return true;
22063   }
22064
22065   // Don't try to re-form single instruction chains under any circumstances now
22066   // that we've done encoding canonicalization for them.
22067   if (Depth < 2)
22068     return false;
22069
22070   // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
22071   // can replace them with a single PSHUFB instruction profitably. Intel's
22072   // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but
22073   // in practice PSHUFB tends to be *very* fast so we're more aggressive.
22074   if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
22075     SmallVector<SDValue, 16> PSHUFBMask;
22076     assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
22077     int Ratio = 16 / Mask.size();
22078     for (unsigned i = 0; i < 16; ++i) {
22079       if (Mask[i / Ratio] == SM_SentinelUndef) {
22080         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
22081         continue;
22082       }
22083       int M = Mask[i / Ratio] != SM_SentinelZero
22084                   ? Ratio * Mask[i / Ratio] + i % Ratio
22085                   : 255;
22086       PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
22087     }
22088     Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
22089     DCI.AddToWorklist(Op.getNode());
22090     SDValue PSHUFBMaskOp =
22091         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
22092     DCI.AddToWorklist(PSHUFBMaskOp.getNode());
22093     Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
22094     DCI.AddToWorklist(Op.getNode());
22095     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22096                   /*AddTo*/ true);
22097     return true;
22098   }
22099
22100   // Failed to find any combines.
22101   return false;
22102 }
22103
22104 /// \brief Fully generic combining of x86 shuffle instructions.
22105 ///
22106 /// This should be the last combine run over the x86 shuffle instructions. Once
22107 /// they have been fully optimized, this will recursively consider all chains
22108 /// of single-use shuffle instructions, build a generic model of the cumulative
22109 /// shuffle operation, and check for simpler instructions which implement this
22110 /// operation. We use this primarily for two purposes:
22111 ///
22112 /// 1) Collapse generic shuffles to specialized single instructions when
22113 ///    equivalent. In most cases, this is just an encoding size win, but
22114 ///    sometimes we will collapse multiple generic shuffles into a single
22115 ///    special-purpose shuffle.
22116 /// 2) Look for sequences of shuffle instructions with 3 or more total
22117 ///    instructions, and replace them with the slightly more expensive SSSE3
22118 ///    PSHUFB instruction if available. We do this as the last combining step
22119 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
22120 ///    a suitable short sequence of other instructions. The PHUFB will either
22121 ///    use a register or have to read from memory and so is slightly (but only
22122 ///    slightly) more expensive than the other shuffle instructions.
22123 ///
22124 /// Because this is inherently a quadratic operation (for each shuffle in
22125 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
22126 /// This should never be an issue in practice as the shuffle lowering doesn't
22127 /// produce sequences of more than 8 instructions.
22128 ///
22129 /// FIXME: We will currently miss some cases where the redundant shuffling
22130 /// would simplify under the threshold for PSHUFB formation because of
22131 /// combine-ordering. To fix this, we should do the redundant instruction
22132 /// combining in this recursive walk.
22133 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
22134                                           ArrayRef<int> RootMask,
22135                                           int Depth, bool HasPSHUFB,
22136                                           SelectionDAG &DAG,
22137                                           TargetLowering::DAGCombinerInfo &DCI,
22138                                           const X86Subtarget *Subtarget) {
22139   // Bound the depth of our recursive combine because this is ultimately
22140   // quadratic in nature.
22141   if (Depth > 8)
22142     return false;
22143
22144   // Directly rip through bitcasts to find the underlying operand.
22145   while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
22146     Op = Op.getOperand(0);
22147
22148   MVT VT = Op.getSimpleValueType();
22149   if (!VT.isVector())
22150     return false; // Bail if we hit a non-vector.
22151   // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
22152   // version should be added.
22153   if (VT.getSizeInBits() != 128)
22154     return false;
22155
22156   assert(Root.getSimpleValueType().isVector() &&
22157          "Shuffles operate on vector types!");
22158   assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
22159          "Can only combine shuffles of the same vector register size.");
22160
22161   if (!isTargetShuffle(Op.getOpcode()))
22162     return false;
22163   SmallVector<int, 16> OpMask;
22164   bool IsUnary;
22165   bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
22166   // We only can combine unary shuffles which we can decode the mask for.
22167   if (!HaveMask || !IsUnary)
22168     return false;
22169
22170   assert(VT.getVectorNumElements() == OpMask.size() &&
22171          "Different mask size from vector size!");
22172   assert(((RootMask.size() > OpMask.size() &&
22173            RootMask.size() % OpMask.size() == 0) ||
22174           (OpMask.size() > RootMask.size() &&
22175            OpMask.size() % RootMask.size() == 0) ||
22176           OpMask.size() == RootMask.size()) &&
22177          "The smaller number of elements must divide the larger.");
22178   int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
22179   int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
22180   assert(((RootRatio == 1 && OpRatio == 1) ||
22181           (RootRatio == 1) != (OpRatio == 1)) &&
22182          "Must not have a ratio for both incoming and op masks!");
22183
22184   SmallVector<int, 16> Mask;
22185   Mask.reserve(std::max(OpMask.size(), RootMask.size()));
22186
22187   // Merge this shuffle operation's mask into our accumulated mask. Note that
22188   // this shuffle's mask will be the first applied to the input, followed by the
22189   // root mask to get us all the way to the root value arrangement. The reason
22190   // for this order is that we are recursing up the operation chain.
22191   for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
22192     int RootIdx = i / RootRatio;
22193     if (RootMask[RootIdx] < 0) {
22194       // This is a zero or undef lane, we're done.
22195       Mask.push_back(RootMask[RootIdx]);
22196       continue;
22197     }
22198
22199     int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
22200     int OpIdx = RootMaskedIdx / OpRatio;
22201     if (OpMask[OpIdx] < 0) {
22202       // The incoming lanes are zero or undef, it doesn't matter which ones we
22203       // are using.
22204       Mask.push_back(OpMask[OpIdx]);
22205       continue;
22206     }
22207
22208     // Ok, we have non-zero lanes, map them through.
22209     Mask.push_back(OpMask[OpIdx] * OpRatio +
22210                    RootMaskedIdx % OpRatio);
22211   }
22212
22213   // See if we can recurse into the operand to combine more things.
22214   switch (Op.getOpcode()) {
22215     case X86ISD::PSHUFB:
22216       HasPSHUFB = true;
22217     case X86ISD::PSHUFD:
22218     case X86ISD::PSHUFHW:
22219     case X86ISD::PSHUFLW:
22220       if (Op.getOperand(0).hasOneUse() &&
22221           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22222                                         HasPSHUFB, DAG, DCI, Subtarget))
22223         return true;
22224       break;
22225
22226     case X86ISD::UNPCKL:
22227     case X86ISD::UNPCKH:
22228       assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
22229       // We can't check for single use, we have to check that this shuffle is the only user.
22230       if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
22231           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22232                                         HasPSHUFB, DAG, DCI, Subtarget))
22233           return true;
22234       break;
22235   }
22236
22237   // Minor canonicalization of the accumulated shuffle mask to make it easier
22238   // to match below. All this does is detect masks with squential pairs of
22239   // elements, and shrink them to the half-width mask. It does this in a loop
22240   // so it will reduce the size of the mask to the minimal width mask which
22241   // performs an equivalent shuffle.
22242   SmallVector<int, 16> WidenedMask;
22243   while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
22244     Mask = std::move(WidenedMask);
22245     WidenedMask.clear();
22246   }
22247
22248   return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
22249                                 Subtarget);
22250 }
22251
22252 /// \brief Get the PSHUF-style mask from PSHUF node.
22253 ///
22254 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
22255 /// PSHUF-style masks that can be reused with such instructions.
22256 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
22257   SmallVector<int, 4> Mask;
22258   bool IsUnary;
22259   bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
22260   (void)HaveMask;
22261   assert(HaveMask);
22262
22263   switch (N.getOpcode()) {
22264   case X86ISD::PSHUFD:
22265     return Mask;
22266   case X86ISD::PSHUFLW:
22267     Mask.resize(4);
22268     return Mask;
22269   case X86ISD::PSHUFHW:
22270     Mask.erase(Mask.begin(), Mask.begin() + 4);
22271     for (int &M : Mask)
22272       M -= 4;
22273     return Mask;
22274   default:
22275     llvm_unreachable("No valid shuffle instruction found!");
22276   }
22277 }
22278
22279 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
22280 ///
22281 /// We walk up the chain and look for a combinable shuffle, skipping over
22282 /// shuffles that we could hoist this shuffle's transformation past without
22283 /// altering anything.
22284 static SDValue
22285 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
22286                              SelectionDAG &DAG,
22287                              TargetLowering::DAGCombinerInfo &DCI) {
22288   assert(N.getOpcode() == X86ISD::PSHUFD &&
22289          "Called with something other than an x86 128-bit half shuffle!");
22290   SDLoc DL(N);
22291
22292   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
22293   // of the shuffles in the chain so that we can form a fresh chain to replace
22294   // this one.
22295   SmallVector<SDValue, 8> Chain;
22296   SDValue V = N.getOperand(0);
22297   for (; V.hasOneUse(); V = V.getOperand(0)) {
22298     switch (V.getOpcode()) {
22299     default:
22300       return SDValue(); // Nothing combined!
22301
22302     case ISD::BITCAST:
22303       // Skip bitcasts as we always know the type for the target specific
22304       // instructions.
22305       continue;
22306
22307     case X86ISD::PSHUFD:
22308       // Found another dword shuffle.
22309       break;
22310
22311     case X86ISD::PSHUFLW:
22312       // Check that the low words (being shuffled) are the identity in the
22313       // dword shuffle, and the high words are self-contained.
22314       if (Mask[0] != 0 || Mask[1] != 1 ||
22315           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
22316         return SDValue();
22317
22318       Chain.push_back(V);
22319       continue;
22320
22321     case X86ISD::PSHUFHW:
22322       // Check that the high words (being shuffled) are the identity in the
22323       // dword shuffle, and the low words are self-contained.
22324       if (Mask[2] != 2 || Mask[3] != 3 ||
22325           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
22326         return SDValue();
22327
22328       Chain.push_back(V);
22329       continue;
22330
22331     case X86ISD::UNPCKL:
22332     case X86ISD::UNPCKH:
22333       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
22334       // shuffle into a preceding word shuffle.
22335       if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
22336         return SDValue();
22337
22338       // Search for a half-shuffle which we can combine with.
22339       unsigned CombineOp =
22340           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
22341       if (V.getOperand(0) != V.getOperand(1) ||
22342           !V->isOnlyUserOf(V.getOperand(0).getNode()))
22343         return SDValue();
22344       Chain.push_back(V);
22345       V = V.getOperand(0);
22346       do {
22347         switch (V.getOpcode()) {
22348         default:
22349           return SDValue(); // Nothing to combine.
22350
22351         case X86ISD::PSHUFLW:
22352         case X86ISD::PSHUFHW:
22353           if (V.getOpcode() == CombineOp)
22354             break;
22355
22356           Chain.push_back(V);
22357
22358           // Fallthrough!
22359         case ISD::BITCAST:
22360           V = V.getOperand(0);
22361           continue;
22362         }
22363         break;
22364       } while (V.hasOneUse());
22365       break;
22366     }
22367     // Break out of the loop if we break out of the switch.
22368     break;
22369   }
22370
22371   if (!V.hasOneUse())
22372     // We fell out of the loop without finding a viable combining instruction.
22373     return SDValue();
22374
22375   // Merge this node's mask and our incoming mask.
22376   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22377   for (int &M : Mask)
22378     M = VMask[M];
22379   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
22380                   getV4X86ShuffleImm8ForMask(Mask, DAG));
22381
22382   // Rebuild the chain around this new shuffle.
22383   while (!Chain.empty()) {
22384     SDValue W = Chain.pop_back_val();
22385
22386     if (V.getValueType() != W.getOperand(0).getValueType())
22387       V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V);
22388
22389     switch (W.getOpcode()) {
22390     default:
22391       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
22392
22393     case X86ISD::UNPCKL:
22394     case X86ISD::UNPCKH:
22395       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
22396       break;
22397
22398     case X86ISD::PSHUFD:
22399     case X86ISD::PSHUFLW:
22400     case X86ISD::PSHUFHW:
22401       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
22402       break;
22403     }
22404   }
22405   if (V.getValueType() != N.getValueType())
22406     V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V);
22407
22408   // Return the new chain to replace N.
22409   return V;
22410 }
22411
22412 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
22413 ///
22414 /// We walk up the chain, skipping shuffles of the other half and looking
22415 /// through shuffles which switch halves trying to find a shuffle of the same
22416 /// pair of dwords.
22417 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
22418                                         SelectionDAG &DAG,
22419                                         TargetLowering::DAGCombinerInfo &DCI) {
22420   assert(
22421       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
22422       "Called with something other than an x86 128-bit half shuffle!");
22423   SDLoc DL(N);
22424   unsigned CombineOpcode = N.getOpcode();
22425
22426   // Walk up a single-use chain looking for a combinable shuffle.
22427   SDValue V = N.getOperand(0);
22428   for (; V.hasOneUse(); V = V.getOperand(0)) {
22429     switch (V.getOpcode()) {
22430     default:
22431       return false; // Nothing combined!
22432
22433     case ISD::BITCAST:
22434       // Skip bitcasts as we always know the type for the target specific
22435       // instructions.
22436       continue;
22437
22438     case X86ISD::PSHUFLW:
22439     case X86ISD::PSHUFHW:
22440       if (V.getOpcode() == CombineOpcode)
22441         break;
22442
22443       // Other-half shuffles are no-ops.
22444       continue;
22445     }
22446     // Break out of the loop if we break out of the switch.
22447     break;
22448   }
22449
22450   if (!V.hasOneUse())
22451     // We fell out of the loop without finding a viable combining instruction.
22452     return false;
22453
22454   // Combine away the bottom node as its shuffle will be accumulated into
22455   // a preceding shuffle.
22456   DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22457
22458   // Record the old value.
22459   SDValue Old = V;
22460
22461   // Merge this node's mask and our incoming mask (adjusted to account for all
22462   // the pshufd instructions encountered).
22463   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22464   for (int &M : Mask)
22465     M = VMask[M];
22466   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
22467                   getV4X86ShuffleImm8ForMask(Mask, DAG));
22468
22469   // Check that the shuffles didn't cancel each other out. If not, we need to
22470   // combine to the new one.
22471   if (Old != V)
22472     // Replace the combinable shuffle with the combined one, updating all users
22473     // so that we re-evaluate the chain here.
22474     DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
22475
22476   return true;
22477 }
22478
22479 /// \brief Try to combine x86 target specific shuffles.
22480 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
22481                                            TargetLowering::DAGCombinerInfo &DCI,
22482                                            const X86Subtarget *Subtarget) {
22483   SDLoc DL(N);
22484   MVT VT = N.getSimpleValueType();
22485   SmallVector<int, 4> Mask;
22486
22487   switch (N.getOpcode()) {
22488   case X86ISD::PSHUFD:
22489   case X86ISD::PSHUFLW:
22490   case X86ISD::PSHUFHW:
22491     Mask = getPSHUFShuffleMask(N);
22492     assert(Mask.size() == 4);
22493     break;
22494   default:
22495     return SDValue();
22496   }
22497
22498   // Nuke no-op shuffles that show up after combining.
22499   if (isNoopShuffleMask(Mask))
22500     return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22501
22502   // Look for simplifications involving one or two shuffle instructions.
22503   SDValue V = N.getOperand(0);
22504   switch (N.getOpcode()) {
22505   default:
22506     break;
22507   case X86ISD::PSHUFLW:
22508   case X86ISD::PSHUFHW:
22509     assert(VT == MVT::v8i16);
22510     (void)VT;
22511
22512     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
22513       return SDValue(); // We combined away this shuffle, so we're done.
22514
22515     // See if this reduces to a PSHUFD which is no more expensive and can
22516     // combine with more operations. Note that it has to at least flip the
22517     // dwords as otherwise it would have been removed as a no-op.
22518     if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
22519       int DMask[] = {0, 1, 2, 3};
22520       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
22521       DMask[DOffset + 0] = DOffset + 1;
22522       DMask[DOffset + 1] = DOffset + 0;
22523       V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
22524       DCI.AddToWorklist(V.getNode());
22525       V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
22526                       getV4X86ShuffleImm8ForMask(DMask, DAG));
22527       DCI.AddToWorklist(V.getNode());
22528       return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
22529     }
22530
22531     // Look for shuffle patterns which can be implemented as a single unpack.
22532     // FIXME: This doesn't handle the location of the PSHUFD generically, and
22533     // only works when we have a PSHUFD followed by two half-shuffles.
22534     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
22535         (V.getOpcode() == X86ISD::PSHUFLW ||
22536          V.getOpcode() == X86ISD::PSHUFHW) &&
22537         V.getOpcode() != N.getOpcode() &&
22538         V.hasOneUse()) {
22539       SDValue D = V.getOperand(0);
22540       while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
22541         D = D.getOperand(0);
22542       if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
22543         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22544         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
22545         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22546         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22547         int WordMask[8];
22548         for (int i = 0; i < 4; ++i) {
22549           WordMask[i + NOffset] = Mask[i] + NOffset;
22550           WordMask[i + VOffset] = VMask[i] + VOffset;
22551         }
22552         // Map the word mask through the DWord mask.
22553         int MappedMask[8];
22554         for (int i = 0; i < 8; ++i)
22555           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
22556         const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
22557         const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
22558         if (std::equal(std::begin(MappedMask), std::end(MappedMask),
22559                        std::begin(UnpackLoMask)) ||
22560             std::equal(std::begin(MappedMask), std::end(MappedMask),
22561                        std::begin(UnpackHiMask))) {
22562           // We can replace all three shuffles with an unpack.
22563           V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
22564           DCI.AddToWorklist(V.getNode());
22565           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
22566                                                 : X86ISD::UNPCKH,
22567                              DL, MVT::v8i16, V, V);
22568         }
22569       }
22570     }
22571
22572     break;
22573
22574   case X86ISD::PSHUFD:
22575     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
22576       return NewN;
22577
22578     break;
22579   }
22580
22581   return SDValue();
22582 }
22583
22584 /// \brief Try to combine a shuffle into a target-specific add-sub node.
22585 ///
22586 /// We combine this directly on the abstract vector shuffle nodes so it is
22587 /// easier to generically match. We also insert dummy vector shuffle nodes for
22588 /// the operands which explicitly discard the lanes which are unused by this
22589 /// operation to try to flow through the rest of the combiner the fact that
22590 /// they're unused.
22591 static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
22592   SDLoc DL(N);
22593   EVT VT = N->getValueType(0);
22594
22595   // We only handle target-independent shuffles.
22596   // FIXME: It would be easy and harmless to use the target shuffle mask
22597   // extraction tool to support more.
22598   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
22599     return SDValue();
22600
22601   auto *SVN = cast<ShuffleVectorSDNode>(N);
22602   ArrayRef<int> Mask = SVN->getMask();
22603   SDValue V1 = N->getOperand(0);
22604   SDValue V2 = N->getOperand(1);
22605
22606   // We require the first shuffle operand to be the SUB node, and the second to
22607   // be the ADD node.
22608   // FIXME: We should support the commuted patterns.
22609   if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
22610     return SDValue();
22611
22612   // If there are other uses of these operations we can't fold them.
22613   if (!V1->hasOneUse() || !V2->hasOneUse())
22614     return SDValue();
22615
22616   // Ensure that both operations have the same operands. Note that we can
22617   // commute the FADD operands.
22618   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
22619   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
22620       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
22621     return SDValue();
22622
22623   // We're looking for blends between FADD and FSUB nodes. We insist on these
22624   // nodes being lined up in a specific expected pattern.
22625   if (!(isShuffleEquivalent(Mask, 0, 3) ||
22626         isShuffleEquivalent(Mask, 0, 5, 2, 7) ||
22627         isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
22628     return SDValue();
22629
22630   // Only specific types are legal at this point, assert so we notice if and
22631   // when these change.
22632   assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
22633           VT == MVT::v4f64) &&
22634          "Unknown vector type encountered!");
22635
22636   return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
22637 }
22638
22639 /// PerformShuffleCombine - Performs several different shuffle combines.
22640 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
22641                                      TargetLowering::DAGCombinerInfo &DCI,
22642                                      const X86Subtarget *Subtarget) {
22643   SDLoc dl(N);
22644   SDValue N0 = N->getOperand(0);
22645   SDValue N1 = N->getOperand(1);
22646   EVT VT = N->getValueType(0);
22647
22648   // Don't create instructions with illegal types after legalize types has run.
22649   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22650   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
22651     return SDValue();
22652
22653   // If we have legalized the vector types, look for blends of FADD and FSUB
22654   // nodes that we can fuse into an ADDSUB node.
22655   if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
22656     if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
22657       return AddSub;
22658
22659   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
22660   if (Subtarget->hasFp256() && VT.is256BitVector() &&
22661       N->getOpcode() == ISD::VECTOR_SHUFFLE)
22662     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
22663
22664   // During Type Legalization, when promoting illegal vector types,
22665   // the backend might introduce new shuffle dag nodes and bitcasts.
22666   //
22667   // This code performs the following transformation:
22668   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
22669   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
22670   //
22671   // We do this only if both the bitcast and the BINOP dag nodes have
22672   // one use. Also, perform this transformation only if the new binary
22673   // operation is legal. This is to avoid introducing dag nodes that
22674   // potentially need to be further expanded (or custom lowered) into a
22675   // less optimal sequence of dag nodes.
22676   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
22677       N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
22678       N0.getOpcode() == ISD::BITCAST) {
22679     SDValue BC0 = N0.getOperand(0);
22680     EVT SVT = BC0.getValueType();
22681     unsigned Opcode = BC0.getOpcode();
22682     unsigned NumElts = VT.getVectorNumElements();
22683
22684     if (BC0.hasOneUse() && SVT.isVector() &&
22685         SVT.getVectorNumElements() * 2 == NumElts &&
22686         TLI.isOperationLegal(Opcode, VT)) {
22687       bool CanFold = false;
22688       switch (Opcode) {
22689       default : break;
22690       case ISD::ADD :
22691       case ISD::FADD :
22692       case ISD::SUB :
22693       case ISD::FSUB :
22694       case ISD::MUL :
22695       case ISD::FMUL :
22696         CanFold = true;
22697       }
22698
22699       unsigned SVTNumElts = SVT.getVectorNumElements();
22700       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
22701       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
22702         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
22703       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
22704         CanFold = SVOp->getMaskElt(i) < 0;
22705
22706       if (CanFold) {
22707         SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
22708         SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
22709         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
22710         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
22711       }
22712     }
22713   }
22714
22715   // Only handle 128 wide vector from here on.
22716   if (!VT.is128BitVector())
22717     return SDValue();
22718
22719   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
22720   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
22721   // consecutive, non-overlapping, and in the right order.
22722   SmallVector<SDValue, 16> Elts;
22723   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
22724     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
22725
22726   SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
22727   if (LD.getNode())
22728     return LD;
22729
22730   if (isTargetShuffle(N->getOpcode())) {
22731     SDValue Shuffle =
22732         PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
22733     if (Shuffle.getNode())
22734       return Shuffle;
22735
22736     // Try recursively combining arbitrary sequences of x86 shuffle
22737     // instructions into higher-order shuffles. We do this after combining
22738     // specific PSHUF instruction sequences into their minimal form so that we
22739     // can evaluate how many specialized shuffle instructions are involved in
22740     // a particular chain.
22741     SmallVector<int, 1> NonceMask; // Just a placeholder.
22742     NonceMask.push_back(0);
22743     if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
22744                                       /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
22745                                       DCI, Subtarget))
22746       return SDValue(); // This routine will use CombineTo to replace N.
22747   }
22748
22749   return SDValue();
22750 }
22751
22752 /// PerformTruncateCombine - Converts truncate operation to
22753 /// a sequence of vector shuffle operations.
22754 /// It is possible when we truncate 256-bit vector to 128-bit vector
22755 static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
22756                                       TargetLowering::DAGCombinerInfo &DCI,
22757                                       const X86Subtarget *Subtarget)  {
22758   return SDValue();
22759 }
22760
22761 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
22762 /// specific shuffle of a load can be folded into a single element load.
22763 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
22764 /// shuffles have been custom lowered so we need to handle those here.
22765 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
22766                                          TargetLowering::DAGCombinerInfo &DCI) {
22767   if (DCI.isBeforeLegalizeOps())
22768     return SDValue();
22769
22770   SDValue InVec = N->getOperand(0);
22771   SDValue EltNo = N->getOperand(1);
22772
22773   if (!isa<ConstantSDNode>(EltNo))
22774     return SDValue();
22775
22776   EVT OriginalVT = InVec.getValueType();
22777
22778   if (InVec.getOpcode() == ISD::BITCAST) {
22779     // Don't duplicate a load with other uses.
22780     if (!InVec.hasOneUse())
22781       return SDValue();
22782     EVT BCVT = InVec.getOperand(0).getValueType();
22783     if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
22784       return SDValue();
22785     InVec = InVec.getOperand(0);
22786   }
22787
22788   EVT CurrentVT = InVec.getValueType();
22789
22790   if (!isTargetShuffle(InVec.getOpcode()))
22791     return SDValue();
22792
22793   // Don't duplicate a load with other uses.
22794   if (!InVec.hasOneUse())
22795     return SDValue();
22796
22797   SmallVector<int, 16> ShuffleMask;
22798   bool UnaryShuffle;
22799   if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
22800                             ShuffleMask, UnaryShuffle))
22801     return SDValue();
22802
22803   // Select the input vector, guarding against out of range extract vector.
22804   unsigned NumElems = CurrentVT.getVectorNumElements();
22805   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
22806   int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
22807   SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
22808                                          : InVec.getOperand(1);
22809
22810   // If inputs to shuffle are the same for both ops, then allow 2 uses
22811   unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
22812
22813   if (LdNode.getOpcode() == ISD::BITCAST) {
22814     // Don't duplicate a load with other uses.
22815     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
22816       return SDValue();
22817
22818     AllowedUses = 1; // only allow 1 load use if we have a bitcast
22819     LdNode = LdNode.getOperand(0);
22820   }
22821
22822   if (!ISD::isNormalLoad(LdNode.getNode()))
22823     return SDValue();
22824
22825   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
22826
22827   if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
22828     return SDValue();
22829
22830   EVT EltVT = N->getValueType(0);
22831   // If there's a bitcast before the shuffle, check if the load type and
22832   // alignment is valid.
22833   unsigned Align = LN0->getAlignment();
22834   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22835   unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
22836       EltVT.getTypeForEVT(*DAG.getContext()));
22837
22838   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
22839     return SDValue();
22840
22841   // All checks match so transform back to vector_shuffle so that DAG combiner
22842   // can finish the job
22843   SDLoc dl(N);
22844
22845   // Create shuffle node taking into account the case that its a unary shuffle
22846   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
22847                                    : InVec.getOperand(1);
22848   Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
22849                                  InVec.getOperand(0), Shuffle,
22850                                  &ShuffleMask[0]);
22851   Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
22852   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
22853                      EltNo);
22854 }
22855
22856 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
22857 /// generation and convert it from being a bunch of shuffles and extracts
22858 /// into a somewhat faster sequence. For i686, the best sequence is apparently
22859 /// storing the value and loading scalars back, while for x64 we should
22860 /// use 64-bit extracts and shifts.
22861 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
22862                                          TargetLowering::DAGCombinerInfo &DCI) {
22863   SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
22864   if (NewOp.getNode())
22865     return NewOp;
22866
22867   SDValue InputVector = N->getOperand(0);
22868
22869   // Detect whether we are trying to convert from mmx to i32 and the bitcast
22870   // from mmx to v2i32 has a single usage.
22871   if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
22872       InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
22873       InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
22874     return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
22875                        N->getValueType(0),
22876                        InputVector.getNode()->getOperand(0));
22877
22878   // Only operate on vectors of 4 elements, where the alternative shuffling
22879   // gets to be more expensive.
22880   if (InputVector.getValueType() != MVT::v4i32)
22881     return SDValue();
22882
22883   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
22884   // single use which is a sign-extend or zero-extend, and all elements are
22885   // used.
22886   SmallVector<SDNode *, 4> Uses;
22887   unsigned ExtractedElements = 0;
22888   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
22889        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
22890     if (UI.getUse().getResNo() != InputVector.getResNo())
22891       return SDValue();
22892
22893     SDNode *Extract = *UI;
22894     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22895       return SDValue();
22896
22897     if (Extract->getValueType(0) != MVT::i32)
22898       return SDValue();
22899     if (!Extract->hasOneUse())
22900       return SDValue();
22901     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
22902         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
22903       return SDValue();
22904     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
22905       return SDValue();
22906
22907     // Record which element was extracted.
22908     ExtractedElements |=
22909       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
22910
22911     Uses.push_back(Extract);
22912   }
22913
22914   // If not all the elements were used, this may not be worthwhile.
22915   if (ExtractedElements != 15)
22916     return SDValue();
22917
22918   // Ok, we've now decided to do the transformation.
22919   // If 64-bit shifts are legal, use the extract-shift sequence,
22920   // otherwise bounce the vector off the cache.
22921   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22922   SDValue Vals[4];
22923   SDLoc dl(InputVector);
22924
22925   if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
22926     SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
22927     EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
22928     SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
22929       DAG.getConstant(0, VecIdxTy));
22930     SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
22931       DAG.getConstant(1, VecIdxTy));
22932
22933     SDValue ShAmt = DAG.getConstant(32,
22934       DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
22935     Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
22936     Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
22937       DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
22938     Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
22939     Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
22940       DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
22941   } else {
22942     // Store the value to a temporary stack slot.
22943     SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
22944     SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
22945       MachinePointerInfo(), false, false, 0);
22946
22947     EVT ElementType = InputVector.getValueType().getVectorElementType();
22948     unsigned EltSize = ElementType.getSizeInBits() / 8;
22949
22950     // Replace each use (extract) with a load of the appropriate element.
22951     for (unsigned i = 0; i < 4; ++i) {
22952       uint64_t Offset = EltSize * i;
22953       SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
22954
22955       SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
22956                                        StackPtr, OffsetVal);
22957
22958       // Load the scalar.
22959       Vals[i] = DAG.getLoad(ElementType, dl, Ch,
22960                             ScalarAddr, MachinePointerInfo(),
22961                             false, false, false, 0);
22962
22963     }
22964   }
22965
22966   // Replace the extracts
22967   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
22968     UE = Uses.end(); UI != UE; ++UI) {
22969     SDNode *Extract = *UI;
22970
22971     SDValue Idx = Extract->getOperand(1);
22972     uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
22973     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
22974   }
22975
22976   // The replacement was made in place; don't return anything.
22977   return SDValue();
22978 }
22979
22980 /// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
22981 static std::pair<unsigned, bool>
22982 matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
22983                    SelectionDAG &DAG, const X86Subtarget *Subtarget) {
22984   if (!VT.isVector())
22985     return std::make_pair(0, false);
22986
22987   bool NeedSplit = false;
22988   switch (VT.getSimpleVT().SimpleTy) {
22989   default: return std::make_pair(0, false);
22990   case MVT::v4i64:
22991   case MVT::v2i64:
22992     if (!Subtarget->hasVLX())
22993       return std::make_pair(0, false);
22994     break;
22995   case MVT::v64i8:
22996   case MVT::v32i16:
22997     if (!Subtarget->hasBWI())
22998       return std::make_pair(0, false);
22999     break;
23000   case MVT::v16i32:
23001   case MVT::v8i64:
23002     if (!Subtarget->hasAVX512())
23003       return std::make_pair(0, false);
23004     break;
23005   case MVT::v32i8:
23006   case MVT::v16i16:
23007   case MVT::v8i32:
23008     if (!Subtarget->hasAVX2())
23009       NeedSplit = true;
23010     if (!Subtarget->hasAVX())
23011       return std::make_pair(0, false);
23012     break;
23013   case MVT::v16i8:
23014   case MVT::v8i16:
23015   case MVT::v4i32:
23016     if (!Subtarget->hasSSE2())
23017       return std::make_pair(0, false);
23018   }
23019
23020   // SSE2 has only a small subset of the operations.
23021   bool hasUnsigned = Subtarget->hasSSE41() ||
23022                      (Subtarget->hasSSE2() && VT == MVT::v16i8);
23023   bool hasSigned = Subtarget->hasSSE41() ||
23024                    (Subtarget->hasSSE2() && VT == MVT::v8i16);
23025
23026   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23027
23028   unsigned Opc = 0;
23029   // Check for x CC y ? x : y.
23030   if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23031       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23032     switch (CC) {
23033     default: break;
23034     case ISD::SETULT:
23035     case ISD::SETULE:
23036       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23037     case ISD::SETUGT:
23038     case ISD::SETUGE:
23039       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23040     case ISD::SETLT:
23041     case ISD::SETLE:
23042       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23043     case ISD::SETGT:
23044     case ISD::SETGE:
23045       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23046     }
23047   // Check for x CC y ? y : x -- a min/max with reversed arms.
23048   } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23049              DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23050     switch (CC) {
23051     default: break;
23052     case ISD::SETULT:
23053     case ISD::SETULE:
23054       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23055     case ISD::SETUGT:
23056     case ISD::SETUGE:
23057       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23058     case ISD::SETLT:
23059     case ISD::SETLE:
23060       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23061     case ISD::SETGT:
23062     case ISD::SETGE:
23063       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23064     }
23065   }
23066
23067   return std::make_pair(Opc, NeedSplit);
23068 }
23069
23070 static SDValue
23071 transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
23072                                       const X86Subtarget *Subtarget) {
23073   SDLoc dl(N);
23074   SDValue Cond = N->getOperand(0);
23075   SDValue LHS = N->getOperand(1);
23076   SDValue RHS = N->getOperand(2);
23077
23078   if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
23079     SDValue CondSrc = Cond->getOperand(0);
23080     if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
23081       Cond = CondSrc->getOperand(0);
23082   }
23083
23084   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
23085     return SDValue();
23086
23087   // A vselect where all conditions and data are constants can be optimized into
23088   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
23089   if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
23090       ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
23091     return SDValue();
23092
23093   unsigned MaskValue = 0;
23094   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
23095     return SDValue();
23096
23097   MVT VT = N->getSimpleValueType(0);
23098   unsigned NumElems = VT.getVectorNumElements();
23099   SmallVector<int, 8> ShuffleMask(NumElems, -1);
23100   for (unsigned i = 0; i < NumElems; ++i) {
23101     // Be sure we emit undef where we can.
23102     if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
23103       ShuffleMask[i] = -1;
23104     else
23105       ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
23106   }
23107
23108   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23109   if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
23110     return SDValue();
23111   return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
23112 }
23113
23114 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
23115 /// nodes.
      ///
      /// The operands of \p N are read as (condition, true value, false value).
      /// The following rewrites are attempted, in this order, and the first one
      /// that applies wins:
      ///   1. Floating-point select fed by a SETCC on the same two operands
      ///      -> X86ISD::FMIN / X86ISD::FMAX.
      ///   2. With AVX-512 but without both BWI and VLX: sign-extend a vector-of-i1
      ///      condition for 128/256-bit i8/i16 vector selects.
      ///   3. Select between two integer constants -> shift, add, or mul+add of the
      ///      zero-extended condition.
      ///   4. Scalar SELECT on SETLT/SETGT of its own operands -> SETLE/SETGE.
      ///   5. VSELECT patterns -> X86ISD::SUBUS (unsigned saturating subtract).
      ///   6. VSELECT integer min/max patterns (via matchIntegerMINMAX).
      ///   7. VSELECT with an all-ones true arm or all-zeros false arm -> OR/AND
      ///      with the condition.
      ///   8. SimplifyDemandedBits on the condition's sign bits, replacing the
      ///      affected VSELECTs by X86ISD::SHRUNKBLEND.
      ///   9. VSELECT/SHRUNKBLEND with a constant condition -> vector shuffle.
      ///
      /// \returns the replacement value, or an empty SDValue when no replacement is
      /// handed back. Step 8 rewrites uses in place through
      /// ReplaceAllUsesOfValueWith and then returns an empty SDValue.
23116 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
23117                                     TargetLowering::DAGCombinerInfo &DCI,
23118                                     const X86Subtarget *Subtarget) {
23119   SDLoc DL(N);
23120   SDValue Cond = N->getOperand(0);
23121   // Get the LHS/RHS of the select.
23122   SDValue LHS = N->getOperand(1);
23123   SDValue RHS = N->getOperand(2);
23124   EVT VT = LHS.getValueType();
23125   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23126
23127   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
23128   // instructions match the semantics of the common C idiom x<y?x:y but not
23129   // x<=y?x:y, because of how they handle negative zero (which can be
23130   // ignored in unsafe-math mode).
23131   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
23132   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
23133       VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
23134       (Subtarget->hasSSE2() ||
23135        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
23136     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23137
      // Stays 0 unless one of the cases below proves the rewrite is safe.
23138     unsigned Opcode = 0;
23139     // Check for x CC y ? x : y.
23140     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23141         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23142       switch (CC) {
23143       default: break;
23144       case ISD::SETULT:
23145         // Converting this to a min would handle NaNs incorrectly, and swapping
23146         // the operands would cause it to handle comparisons between positive
23147         // and negative zero incorrectly.
23148         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23149           if (!DAG.getTarget().Options.UnsafeFPMath &&
23150               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23151             break;
23152           std::swap(LHS, RHS);
23153         }
23154         Opcode = X86ISD::FMIN;
23155         break;
23156       case ISD::SETOLE:
23157         // Converting this to a min would handle comparisons between positive
23158         // and negative zero incorrectly.
23159         if (!DAG.getTarget().Options.UnsafeFPMath &&
23160             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23161           break;
23162         Opcode = X86ISD::FMIN;
23163         break;
23164       case ISD::SETULE:
23165         // Converting this to a min would handle both negative zeros and NaNs
23166         // incorrectly, but we can swap the operands to fix both.
23167         std::swap(LHS, RHS);
              // FALL THROUGH
23168       case ISD::SETOLT:
23169       case ISD::SETLT:
23170       case ISD::SETLE:
23171         Opcode = X86ISD::FMIN;
23172         break;
23173
23174       case ISD::SETOGE:
23175         // Converting this to a max would handle comparisons between positive
23176         // and negative zero incorrectly.
23177         if (!DAG.getTarget().Options.UnsafeFPMath &&
23178             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23179           break;
23180         Opcode = X86ISD::FMAX;
23181         break;
23182       case ISD::SETUGT:
23183         // Converting this to a max would handle NaNs incorrectly, and swapping
23184         // the operands would cause it to handle comparisons between positive
23185         // and negative zero incorrectly.
23186         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23187           if (!DAG.getTarget().Options.UnsafeFPMath &&
23188               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23189             break;
23190           std::swap(LHS, RHS);
23191         }
23192         Opcode = X86ISD::FMAX;
23193         break;
23194       case ISD::SETUGE:
23195         // Converting this to a max would handle both negative zeros and NaNs
23196         // incorrectly, but we can swap the operands to fix both.
23197         std::swap(LHS, RHS);
              // FALL THROUGH
23198       case ISD::SETOGT:
23199       case ISD::SETGT:
23200       case ISD::SETGE:
23201         Opcode = X86ISD::FMAX;
23202         break;
23203       }
23204     // Check for x CC y ? y : x -- a min/max with reversed arms.
23205     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23206                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23207       switch (CC) {
23208       default: break;
23209       case ISD::SETOGE:
23210         // Converting this to a min would handle comparisons between positive
23211         // and negative zero incorrectly, and swapping the operands would
23212         // cause it to handle NaNs incorrectly.
23213         if (!DAG.getTarget().Options.UnsafeFPMath &&
23214             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
23215           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23216             break;
23217           std::swap(LHS, RHS);
23218         }
23219         Opcode = X86ISD::FMIN;
23220         break;
23221       case ISD::SETUGT:
23222         // Converting this to a min would handle NaNs incorrectly.
23223         if (!DAG.getTarget().Options.UnsafeFPMath &&
23224             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
23225           break;
23226         Opcode = X86ISD::FMIN;
23227         break;
23228       case ISD::SETUGE:
23229         // Converting this to a min would handle both negative zeros and NaNs
23230         // incorrectly, but we can swap the operands to fix both.
23231         std::swap(LHS, RHS);
              // FALL THROUGH
23232       case ISD::SETOGT:
23233       case ISD::SETGT:
23234       case ISD::SETGE:
23235         Opcode = X86ISD::FMIN;
23236         break;
23237
23238       case ISD::SETULT:
23239         // Converting this to a max would handle NaNs incorrectly.
23240         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23241           break;
23242         Opcode = X86ISD::FMAX;
23243         break;
23244       case ISD::SETOLE:
23245         // Converting this to a max would handle comparisons between positive
23246         // and negative zero incorrectly, and swapping the operands would
23247         // cause it to handle NaNs incorrectly.
23248         if (!DAG.getTarget().Options.UnsafeFPMath &&
23249             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
23250           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23251             break;
23252           std::swap(LHS, RHS);
23253         }
23254         Opcode = X86ISD::FMAX;
23255         break;
23256       case ISD::SETULE:
23257         // Converting this to a max would handle both negative zeros and NaNs
23258         // incorrectly, but we can swap the operands to fix both.
23259         std::swap(LHS, RHS);
              // FALL THROUGH
23260       case ISD::SETOLT:
23261       case ISD::SETLT:
23262       case ISD::SETLE:
23263         Opcode = X86ISD::FMAX;
23264         break;
23265       }
23266     }
23267
      // LHS/RHS may have been swapped by the case that set Opcode.
23268     if (Opcode)
23269       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
23270   }
23271
23272   EVT CondVT = Cond.getValueType();
23273   if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
23274       CondVT.getVectorElementType() == MVT::i1) {
23275     // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
23276     // lowering on KNL. In this case we convert it to
23277     // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
23278     // The same situation for all 128 and 256-bit vectors of i8 and i16.
23279     // Since SKX these selects have a proper lowering.
23280     EVT OpVT = LHS.getValueType();
23281     if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
23282         (OpVT.getVectorElementType() == MVT::i8 ||
23283          OpVT.getVectorElementType() == MVT::i16) &&
23284         !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
23285       Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
23286       DCI.AddToWorklist(Cond.getNode());
23287       return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
23288     }
23289   }
23290   // If this is a select between two integer constants, try to do some
23291   // optimizations.
23292   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
23293     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
23294       // Don't do this for crazy integer types.
23295       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
23296         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
23297         // so that TrueC (the true value) is larger than FalseC.
23298         bool NeedsCondInvert = false;
23299
23300         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
23301             // Efficiently invertible.
23302             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
23303              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
23304               isa<ConstantSDNode>(Cond.getOperand(1))))) {
23305           NeedsCondInvert = true;
23306           std::swap(TrueC, FalseC);
23307         }
23308
23309         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
23310         if (FalseC->getAPIntValue() == 0 &&
23311             TrueC->getAPIntValue().isPowerOf2()) {
23312           if (NeedsCondInvert) // Invert the condition if needed.
23313             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23314                                DAG.getConstant(1, Cond.getValueType()));
23315
23316           // Zero extend the condition if needed.
23317           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
23318
23319           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23320           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
23321                              DAG.getConstant(ShAmt, MVT::i8));
23322         }
23323
23324         // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
23325         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23326           if (NeedsCondInvert) // Invert the condition if needed.
23327             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23328                                DAG.getConstant(1, Cond.getValueType()));
23329
23330           // Zero extend the condition if needed.
23331           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23332                              FalseC->getValueType(0), Cond);
23333           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23334                              SDValue(FalseC, 0));
23335         }
23336
23337         // Optimize cases that will turn into an LEA instruction.  This requires
23338         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
23339         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23340           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
                // For i32 selects keep only the 'unsigned'-sized low part of the
                // 64-bit difference.
23341           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23342
23343           bool isFastMultiplier = false;
23344           if (Diff < 10) {
23345             switch ((unsigned char)Diff) {
23346               default: break;
23347               case 1:  // result = add base, cond
23348               case 2:  // result = lea base(    , cond*2)
23349               case 3:  // result = lea base(cond, cond*2)
23350               case 4:  // result = lea base(    , cond*4)
23351               case 5:  // result = lea base(cond, cond*4)
23352               case 8:  // result = lea base(    , cond*8)
23353               case 9:  // result = lea base(cond, cond*8)
23354                 isFastMultiplier = true;
23355                 break;
23356             }
23357           }
23358
23359           if (isFastMultiplier) {
                  // NOTE: this APInt deliberately-or-not shadows the uint64_t Diff
                  // declared in the enclosing scope; it is recomputed at the
                  // constants' own bit width.
23360             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23361             if (NeedsCondInvert) // Invert the condition if needed.
23362               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23363                                  DAG.getConstant(1, Cond.getValueType()));
23364
23365             // Zero extend the condition if needed.
23366             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23367                                Cond);
23368             // Scale the condition by the difference.
23369             if (Diff != 1)
23370               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23371                                  DAG.getConstant(Diff, Cond.getValueType()));
23372
23373             // Add the base if non-zero.
23374             if (FalseC->getAPIntValue() != 0)
23375               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23376                                  SDValue(FalseC, 0));
23377             return Cond;
23378           }
23379         }
23380       }
23381   }
23382
23383   // Canonicalize max and min:
23384   // (x > y) ? x : y -> (x >= y) ? x : y
23385   // (x < y) ? x : y -> (x <= y) ? x : y
23386   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
23387   // the need for an extra compare
23388   // against zero. e.g.
23389   // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
23390   // subl   %esi, %edi
23391   // testl  %edi, %edi
23392   // movl   $0, %eax
23393   // cmovgl %edi, %eax
23394   // =>
23395   // xorl   %eax, %eax
23396   // subl   %esi, %edi
23397   // cmovsl %eax, %edi
23398   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
23399       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23400       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23401     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23402     switch (CC) {
23403     default: break;
23404     case ISD::SETLT:
23405     case ISD::SETGT: {
23406       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
23407       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
23408                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
23409       return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
23410     }
23411     }
23412   }
23413
23414   // Early exit check
      // Everything below this point is only attempted for legal value types.
23415   if (!TLI.isTypeLegal(VT))
23416     return SDValue();
23417
23418   // Match VSELECTs into subs with unsigned saturation.
23419   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
23420       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
23421       ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
23422        (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
23423     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23424
23425     // Check if one of the arms of the VSELECT is a zero vector. If it's on the
23426     // left side invert the predicate to simplify logic below.
      // 'Other' is the non-zero arm; it stays null when neither arm is zero.
23427     SDValue Other;
23428     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
23429       Other = RHS;
23430       CC = ISD::getSetCCInverse(CC, true);
23431     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
23432       Other = LHS;
23433     }
23434
23435     if (Other.getNode() && Other->getNumOperands() == 2 &&
23436         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
23437       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
23438       SDValue CondRHS = Cond->getOperand(1);
23439
23440       // Look for a general sub with unsigned saturation first.
23441       // x >= y ? x-y : 0 --> subus x, y
23442       // x >  y ? x-y : 0 --> subus x, y
23443       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
23444           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
23445         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
23446
23447       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
23448         if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
23449           if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
23450             if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
23451               // If the RHS is a constant we have to reverse the const
23452               // canonicalization.
23453               // x > C-1 ? x+-C : 0 --> subus x, C
23454               if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
23455                   CondRHSConst->getAPIntValue() ==
23456                       (-OpRHSConst->getAPIntValue() - 1))
23457                 return DAG.getNode(
23458                     X86ISD::SUBUS, DL, VT, OpLHS,
23459                     DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
23460
23461           // Another special case: If C was a sign bit, the sub has been
23462           // canonicalized into a xor.
23463           // FIXME: Would it be better to use computeKnownBits to determine
23464           //        whether it's safe to decanonicalize the xor?
23465           // x s< 0 ? x^C : 0 --> subus x, C
23466           if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
23467               ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
23468               OpRHSConst->getAPIntValue().isSignBit())
23469             // Note that we have to rebuild the RHS constant here to ensure we
23470             // don't rely on particular values of undef lanes.
23471             return DAG.getNode(
23472                 X86ISD::SUBUS, DL, VT, OpLHS,
23473                 DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
23474         }
23475     }
23476   }
23477
23478   // Try to match a min/max vector operation.
23479   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
      // ret.first is the min/max opcode (0 if none matched); ret.second says
      // whether the operation has to be done on two 128-bit halves.
23480     std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
23481     unsigned Opc = ret.first;
23482     bool NeedSplit = ret.second;
23483
23484     if (Opc && NeedSplit) {
23485       unsigned NumElems = VT.getVectorNumElements();
23486       // Extract the LHS vectors
23487       SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
23488       SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
23489
23490       // Extract the RHS vectors
23491       SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
23492       SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
23493
23494       // Create min/max for each subvector
            // (LHS and RHS are reused here to hold the results for the first and
            // second half respectively, not the original select arms.)
23495       LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
23496       RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
23497
23498       // Merge the result
23499       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
23500     } else if (Opc)
23501       return DAG.getNode(Opc, DL, VT, LHS, RHS);
23502   }
23503
23504   // Simplify vector selection if condition value type matches vselect
23505   // operand type
23506   if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
23507     assert(Cond.getValueType().isVector() &&
23508            "vector select expects a vector selector!");
23509
23510     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
23511     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
23512
23513     // Try invert the condition if true value is not all 1s and false value
23514     // is not all 0s.
23515     if (!TValIsAllOnes && !FValIsAllZeros &&
23516         // Check if the selector will be produced by CMPP*/PCMP*
23517         Cond.getOpcode() == ISD::SETCC &&
23518         // Check if SETCC has already been promoted
23519         TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
23520       bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
23521       bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
23522
23523       if (TValIsAllZeros || FValIsAllOnes) {
23524         SDValue CC = Cond.getOperand(2);
23525         ISD::CondCode NewCC =
23526           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
23527                                Cond.getOperand(0).getValueType().isInteger());
23528         Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
23529         std::swap(LHS, RHS);
              // The arms were exchanged, so the old false arm's property now
              // describes the true arm and vice versa.
23530         TValIsAllOnes = FValIsAllOnes;
23531         FValIsAllZeros = TValIsAllZeros;
23532       }
23533     }
23534
23535     if (TValIsAllOnes || FValIsAllZeros) {
23536       SDValue Ret;
23537
            // select(C, -1, 0) -> C;  select(C, -1, F) -> C | F;
            // select(C, T, 0) -> C & T.
            // NOTE(review): this presumably relies on every condition lane being
            // all-ones or all-zeros at this point -- confirm against the target's
            // vector boolean contents.
23538       if (TValIsAllOnes && FValIsAllZeros)
23539         Ret = Cond;
23540       else if (TValIsAllOnes)
23541         Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
23542                           DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
23543       else if (FValIsAllZeros)
23544         Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
23545                           DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
23546
23547       return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
23548     }
23549   }
23550
23551   // If we know that this node is legal then we know that it is going to be
23552   // matched by one of the SSE/AVX BLEND instructions. These instructions only
23553   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
23554   // to simplify previous instructions.
23555   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
23556       !DCI.isBeforeLegalize() &&
23557       // We explicitly check against v8i16 and v16i16 because, although
23558       // they're marked as Custom, they might only be legal when Cond is a
23559       // build_vector of constants. This will be taken care in a later
23560       // condition.
23561       (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
23562        VT != MVT::v8i16) &&
23563       // Don't optimize vector of constants. Those are handled by
23564       // the generic code and all the bits must be properly set for
23565       // the generic optimizer.
23566       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
23567     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
23568
23569     // Don't optimize vector selects that map to mask-registers.
      // Note that this leaves the whole function, so the constant-condition
      // shuffle transform at the end is not attempted for such selects either.
23570     if (BitWidth == 1)
23571       return SDValue();
23572
23573     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
      // Only the most significant bit of each condition element is demanded.
23574     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
23575
23576     APInt KnownZero, KnownOne;
23577     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
23578                                           DCI.isBeforeLegalizeOps());
23579     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
23580         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
23581                                  TLO)) {
23582       // If we changed the computation somewhere in the DAG, this change
23583       // will affect all users of Cond.
23584       // Make sure it is fine and update all the nodes so that we do not
23585       // use the generic VSELECT anymore. Otherwise, we may perform
23586       // wrong optimizations as we messed up with the actual expectation
23587       // for the vector boolean values.
23588       if (Cond != TLO.Old) {
23589         // Check all uses of that condition operand to check whether it will be
23590         // consumed by non-BLEND instructions, which may depend on all bits are
23591         // set properly.
23592         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23593              I != E; ++I)
23594           if (I->getOpcode() != ISD::VSELECT)
23595             // TODO: Add other opcodes eventually lowered into BLEND.
23596             return SDValue();
23597
23598         // Update all the users of the condition, before committing the change,
23599         // so that the VSELECT optimizations that expect the correct vector
23600         // boolean value will not be triggered.
23601         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23602              I != E; ++I)
23603           DAG.ReplaceAllUsesOfValueWith(
23604               SDValue(*I, 0),
23605               DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
23606                           Cond, I->getOperand(1), I->getOperand(2)));
23607         DCI.CommitTargetLoweringOpt(TLO);
              // The users were rewritten in place above; no replacement value is
              // handed back.
23608         return SDValue();
23609       }
23610       // At this point, only Cond is changed. Change the condition
23611       // just for N to keep the opportunity to optimize all other
23612       // users their own way.
23613       DAG.ReplaceAllUsesOfValueWith(
23614           SDValue(N, 0),
23615           DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
23616                       TLO.New, N->getOperand(1), N->getOperand(2)));
            // N's uses were replaced in place; no replacement value is handed
            // back.
23617       return SDValue();
23618     }
23619   }
23620
23621   // We should generate an X86ISD::BLENDI from a vselect if its argument
23622   // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
23623   // constants. This specific pattern gets generated when we split a
23624   // selector for a 512 bit vector in a machine without AVX512 (but with
23625   // 256-bit vectors), during legalization:
23626   //
23627   // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
23628   //
23629   // Iff we find this pattern and the build_vectors are built from
23630   // constants, we translate the vselect into a shuffle_vector that we
23631   // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
23632   if ((N->getOpcode() == ISD::VSELECT ||
23633        N->getOpcode() == X86ISD::SHRUNKBLEND) &&
23634       !DCI.isBeforeLegalize()) {
23635     SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
23636     if (Shuffle.getNode())
23637       return Shuffle;
23638   }
23639
23640   return SDValue();
23641 }
23642
23643 // Check whether a boolean test is testing a boolean value generated by
23644 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
23645 // code.
23646 //
23647 // Simplify the following patterns:
23648 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
23649 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
23650 // to (Op EFLAGS Cond)
23651 //
23652 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
23653 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
23654 // to (Op EFLAGS !Cond)
23655 //
23656 // where Op could be BRCOND or CMOV.
23657 //
23658 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
23659   // Quit if not CMP and SUB with its value result used.
23660   if (Cmp.getOpcode() != X86ISD::CMP &&
23661       (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
23662       return SDValue();
23663
23664   // Quit if not used as a boolean value.
23665   if (CC != X86::COND_E && CC != X86::COND_NE)
23666     return SDValue();
23667
23668   // Check CMP operands. One of them should be 0 or 1 and the other should be
23669   // an SetCC or extended from it.
23670   SDValue Op1 = Cmp.getOperand(0);
23671   SDValue Op2 = Cmp.getOperand(1);
23672
23673   SDValue SetCC;
23674   const ConstantSDNode* C = nullptr;
23675   bool needOppositeCond = (CC == X86::COND_E);
23676   bool checkAgainstTrue = false; // Is it a comparison against 1?
23677
23678   if ((C = dyn_cast<ConstantSDNode>(Op1)))
23679     SetCC = Op2;
23680   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
23681     SetCC = Op1;
23682   else // Quit if all operands are not constants.
23683     return SDValue();
23684
23685   if (C->getZExtValue() == 1) {
23686     needOppositeCond = !needOppositeCond;
23687     checkAgainstTrue = true;
23688   } else if (C->getZExtValue() != 0)
23689     // Quit if the constant is neither 0 or 1.
23690     return SDValue();
23691
23692   bool truncatedToBoolWithAnd = false;
23693   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
23694   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
23695          SetCC.getOpcode() == ISD::TRUNCATE ||
23696          SetCC.getOpcode() == ISD::AND) {
23697     if (SetCC.getOpcode() == ISD::AND) {
23698       int OpIdx = -1;
23699       ConstantSDNode *CS;
23700       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
23701           CS->getZExtValue() == 1)
23702         OpIdx = 1;
23703       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
23704           CS->getZExtValue() == 1)
23705         OpIdx = 0;
23706       if (OpIdx == -1)
23707         break;
23708       SetCC = SetCC.getOperand(OpIdx);
23709       truncatedToBoolWithAnd = true;
23710     } else
23711       SetCC = SetCC.getOperand(0);
23712   }
23713
23714   switch (SetCC.getOpcode()) {
23715   case X86ISD::SETCC_CARRY:
23716     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
23717     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
23718     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
23719     // truncated to i1 using 'and'.
23720     if (checkAgainstTrue && !truncatedToBoolWithAnd)
23721       break;
23722     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
23723            "Invalid use of SETCC_CARRY!");
23724     // FALL THROUGH
23725   case X86ISD::SETCC:
23726     // Set the condition code or opposite one if necessary.
23727     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
23728     if (needOppositeCond)
23729       CC = X86::GetOppositeBranchCondition(CC);
23730     return SetCC.getOperand(1);
23731   case X86ISD::CMOV: {
23732     // Check whether false/true value has canonical one, i.e. 0 or 1.
23733     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
23734     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
23735     // Quit if true value is not a constant.
23736     if (!TVal)
23737       return SDValue();
23738     // Quit if false value is not a constant.
23739     if (!FVal) {
23740       SDValue Op = SetCC.getOperand(0);
23741       // Skip 'zext' or 'trunc' node.
23742       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
23743           Op.getOpcode() == ISD::TRUNCATE)
23744         Op = Op.getOperand(0);
23745       // A special case for rdrand/rdseed, where 0 is set if false cond is
23746       // found.
23747       if ((Op.getOpcode() != X86ISD::RDRAND &&
23748            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
23749         return SDValue();
23750     }
23751     // Quit if false value is not the constant 0 or 1.
23752     bool FValIsFalse = true;
23753     if (FVal && FVal->getZExtValue() != 0) {
23754       if (FVal->getZExtValue() != 1)
23755         return SDValue();
23756       // If FVal is 1, opposite cond is needed.
23757       needOppositeCond = !needOppositeCond;
23758       FValIsFalse = false;
23759     }
23760     // Quit if TVal is not the constant opposite of FVal.
23761     if (FValIsFalse && TVal->getZExtValue() != 1)
23762       return SDValue();
23763     if (!FValIsFalse && TVal->getZExtValue() != 0)
23764       return SDValue();
23765     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
23766     if (needOppositeCond)
23767       CC = X86::GetOppositeBranchCondition(CC);
23768     return SetCC.getOperand(3);
23769   }
23770   }
23771
23772   return SDValue();
23773 }
23774
23775 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
23776 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
23777                                   TargetLowering::DAGCombinerInfo &DCI,
23778                                   const X86Subtarget *Subtarget) {
23779   SDLoc DL(N);
23780
23781   // If the flag operand isn't dead, don't touch this CMOV.
23782   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
23783     return SDValue();
23784
23785   SDValue FalseOp = N->getOperand(0);
23786   SDValue TrueOp = N->getOperand(1);
23787   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
23788   SDValue Cond = N->getOperand(3);
23789
23790   if (CC == X86::COND_E || CC == X86::COND_NE) {
23791     switch (Cond.getOpcode()) {
23792     default: break;
23793     case X86ISD::BSR:
23794     case X86ISD::BSF:
23795       // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
23796       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
23797         return (CC == X86::COND_E) ? FalseOp : TrueOp;
23798     }
23799   }
23800
23801   SDValue Flags;
23802
23803   Flags = checkBoolTestSetCCCombine(Cond, CC);
23804   if (Flags.getNode() &&
23805       // Extra check as FCMOV only supports a subset of X86 cond.
23806       (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
23807     SDValue Ops[] = { FalseOp, TrueOp,
23808                       DAG.getConstant(CC, MVT::i8), Flags };
23809     return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
23810   }
23811
23812   // If this is a select between two integer constants, try to do some
23813   // optimizations.  Note that the operands are ordered the opposite of SELECT
23814   // operands.
23815   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
23816     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
23817       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
23818       // larger than FalseC (the false value).
23819       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
23820         CC = X86::GetOppositeBranchCondition(CC);
23821         std::swap(TrueC, FalseC);
23822         std::swap(TrueOp, FalseOp);
23823       }
23824
23825       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
23826       // This is efficient for any integer data type (including i8/i16) and
23827       // shift amount.
23828       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
23829         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23830                            DAG.getConstant(CC, MVT::i8), Cond);
23831
23832         // Zero extend the condition if needed.
23833         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
23834
23835         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23836         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
23837                            DAG.getConstant(ShAmt, MVT::i8));
23838         if (N->getNumValues() == 2)  // Dead flag value?
23839           return DCI.CombineTo(N, Cond, SDValue());
23840         return Cond;
23841       }
23842
23843       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
23844       // for any integer data type, including i8/i16.
23845       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23846         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23847                            DAG.getConstant(CC, MVT::i8), Cond);
23848
23849         // Zero extend the condition if needed.
23850         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23851                            FalseC->getValueType(0), Cond);
23852         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23853                            SDValue(FalseC, 0));
23854
23855         if (N->getNumValues() == 2)  // Dead flag value?
23856           return DCI.CombineTo(N, Cond, SDValue());
23857         return Cond;
23858       }
23859
23860       // Optimize cases that will turn into an LEA instruction.  This requires
23861       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
23862       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23863         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
23864         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23865
23866         bool isFastMultiplier = false;
23867         if (Diff < 10) {
23868           switch ((unsigned char)Diff) {
23869           default: break;
23870           case 1:  // result = add base, cond
23871           case 2:  // result = lea base(    , cond*2)
23872           case 3:  // result = lea base(cond, cond*2)
23873           case 4:  // result = lea base(    , cond*4)
23874           case 5:  // result = lea base(cond, cond*4)
23875           case 8:  // result = lea base(    , cond*8)
23876           case 9:  // result = lea base(cond, cond*8)
23877             isFastMultiplier = true;
23878             break;
23879           }
23880         }
23881
23882         if (isFastMultiplier) {
23883           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23884           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23885                              DAG.getConstant(CC, MVT::i8), Cond);
23886           // Zero extend the condition if needed.
23887           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23888                              Cond);
23889           // Scale the condition by the difference.
23890           if (Diff != 1)
23891             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23892                                DAG.getConstant(Diff, Cond.getValueType()));
23893
23894           // Add the base if non-zero.
23895           if (FalseC->getAPIntValue() != 0)
23896             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23897                                SDValue(FalseC, 0));
23898           if (N->getNumValues() == 2)  // Dead flag value?
23899             return DCI.CombineTo(N, Cond, SDValue());
23900           return Cond;
23901         }
23902       }
23903     }
23904   }
23905
23906   // Handle these cases:
23907   //   (select (x != c), e, c) -> select (x != c), e, x),
23908   //   (select (x == c), c, e) -> select (x == c), x, e)
23909   // where the c is an integer constant, and the "select" is the combination
23910   // of CMOV and CMP.
23911   //
23912   // The rationale for this change is that the conditional-move from a constant
23913   // needs two instructions, however, conditional-move from a register needs
23914   // only one instruction.
23915   //
23916   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
23917   //  some instruction-combining opportunities. This opt needs to be
23918   //  postponed as late as possible.
23919   //
23920   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
23921     // the DCI.xxxx conditions are provided to postpone the optimization as
23922     // late as possible.
23923
23924     ConstantSDNode *CmpAgainst = nullptr;
23925     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
23926         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
23927         !isa<ConstantSDNode>(Cond.getOperand(0))) {
23928
23929       if (CC == X86::COND_NE &&
23930           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
23931         CC = X86::GetOppositeBranchCondition(CC);
23932         std::swap(TrueOp, FalseOp);
23933       }
23934
23935       if (CC == X86::COND_E &&
23936           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
23937         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
23938                           DAG.getConstant(CC, MVT::i8), Cond };
23939         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
23940       }
23941     }
23942   }
23943
23944   return SDValue();
23945 }
23946
23947 static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
23948                                                 const X86Subtarget *Subtarget) {
23949   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
23950   switch (IntNo) {
23951   default: return SDValue();
23952   // SSE/AVX/AVX2 blend intrinsics.
23953   case Intrinsic::x86_avx2_pblendvb:
23954   case Intrinsic::x86_avx2_pblendw:
23955   case Intrinsic::x86_avx2_pblendd_128:
23956   case Intrinsic::x86_avx2_pblendd_256:
23957     // Don't try to simplify this intrinsic if we don't have AVX2.
23958     if (!Subtarget->hasAVX2())
23959       return SDValue();
23960     // FALL-THROUGH
23961   case Intrinsic::x86_avx_blend_pd_256:
23962   case Intrinsic::x86_avx_blend_ps_256:
23963   case Intrinsic::x86_avx_blendv_pd_256:
23964   case Intrinsic::x86_avx_blendv_ps_256:
23965     // Don't try to simplify this intrinsic if we don't have AVX.
23966     if (!Subtarget->hasAVX())
23967       return SDValue();
23968     // FALL-THROUGH
23969   case Intrinsic::x86_sse41_pblendw:
23970   case Intrinsic::x86_sse41_blendpd:
23971   case Intrinsic::x86_sse41_blendps:
23972   case Intrinsic::x86_sse41_blendvps:
23973   case Intrinsic::x86_sse41_blendvpd:
23974   case Intrinsic::x86_sse41_pblendvb: {
23975     SDValue Op0 = N->getOperand(1);
23976     SDValue Op1 = N->getOperand(2);
23977     SDValue Mask = N->getOperand(3);
23978
23979     // Don't try to simplify this intrinsic if we don't have SSE4.1.
23980     if (!Subtarget->hasSSE41())
23981       return SDValue();
23982
23983     // fold (blend A, A, Mask) -> A
23984     if (Op0 == Op1)
23985       return Op0;
23986     // fold (blend A, B, allZeros) -> A
23987     if (ISD::isBuildVectorAllZeros(Mask.getNode()))
23988       return Op0;
23989     // fold (blend A, B, allOnes) -> B
23990     if (ISD::isBuildVectorAllOnes(Mask.getNode()))
23991       return Op1;
23992
23993     // Simplify the case where the mask is a constant i32 value.
23994     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
23995       if (C->isNullValue())
23996         return Op0;
23997       if (C->isAllOnesValue())
23998         return Op1;
23999     }
24000
24001     return SDValue();
24002   }
24003
24004   // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
24005   case Intrinsic::x86_sse2_psrai_w:
24006   case Intrinsic::x86_sse2_psrai_d:
24007   case Intrinsic::x86_avx2_psrai_w:
24008   case Intrinsic::x86_avx2_psrai_d:
24009   case Intrinsic::x86_sse2_psra_w:
24010   case Intrinsic::x86_sse2_psra_d:
24011   case Intrinsic::x86_avx2_psra_w:
24012   case Intrinsic::x86_avx2_psra_d: {
24013     SDValue Op0 = N->getOperand(1);
24014     SDValue Op1 = N->getOperand(2);
24015     EVT VT = Op0.getValueType();
24016     assert(VT.isVector() && "Expected a vector type!");
24017
24018     if (isa<BuildVectorSDNode>(Op1))
24019       Op1 = Op1.getOperand(0);
24020
24021     if (!isa<ConstantSDNode>(Op1))
24022       return SDValue();
24023
24024     EVT SVT = VT.getVectorElementType();
24025     unsigned SVTBits = SVT.getSizeInBits();
24026
24027     ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
24028     const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
24029     uint64_t ShAmt = C.getZExtValue();
24030
24031     // Don't try to convert this shift into a ISD::SRA if the shift
24032     // count is bigger than or equal to the element size.
24033     if (ShAmt >= SVTBits)
24034       return SDValue();
24035
24036     // Trivial case: if the shift count is zero, then fold this
24037     // into the first operand.
24038     if (ShAmt == 0)
24039       return Op0;
24040
24041     // Replace this packed shift intrinsic with a target independent
24042     // shift dag node.
24043     SDValue Splat = DAG.getConstant(C, VT);
24044     return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
24045   }
24046   }
24047 }
24048
24049 /// PerformMulCombine - Optimize a single multiply with constant into two
24050 /// in order to implement it with two cheaper instructions, e.g.
24051 /// LEA + SHL, LEA + LEA.
24052 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
24053                                  TargetLowering::DAGCombinerInfo &DCI) {
24054   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
24055     return SDValue();
24056
24057   EVT VT = N->getValueType(0);
24058   if (VT != MVT::i64)
24059     return SDValue();
24060
24061   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
24062   if (!C)
24063     return SDValue();
24064   uint64_t MulAmt = C->getZExtValue();
24065   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
24066     return SDValue();
24067
24068   uint64_t MulAmt1 = 0;
24069   uint64_t MulAmt2 = 0;
24070   if ((MulAmt % 9) == 0) {
24071     MulAmt1 = 9;
24072     MulAmt2 = MulAmt / 9;
24073   } else if ((MulAmt % 5) == 0) {
24074     MulAmt1 = 5;
24075     MulAmt2 = MulAmt / 5;
24076   } else if ((MulAmt % 3) == 0) {
24077     MulAmt1 = 3;
24078     MulAmt2 = MulAmt / 3;
24079   }
24080   if (MulAmt2 &&
24081       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
24082     SDLoc DL(N);
24083
24084     if (isPowerOf2_64(MulAmt2) &&
24085         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
24086       // If second multiplifer is pow2, issue it first. We want the multiply by
24087       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
24088       // is an add.
24089       std::swap(MulAmt1, MulAmt2);
24090
24091     SDValue NewMul;
24092     if (isPowerOf2_64(MulAmt1))
24093       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
24094                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
24095     else
24096       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
24097                            DAG.getConstant(MulAmt1, VT));
24098
24099     if (isPowerOf2_64(MulAmt2))
24100       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
24101                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
24102     else
24103       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
24104                            DAG.getConstant(MulAmt2, VT));
24105
24106     // Do not add new nodes to DAG combiner worklist.
24107     DCI.CombineTo(N, NewMul, false);
24108   }
24109   return SDValue();
24110 }
24111
24112 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
24113   SDValue N0 = N->getOperand(0);
24114   SDValue N1 = N->getOperand(1);
24115   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
24116   EVT VT = N0.getValueType();
24117
24118   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
24119   // since the result of setcc_c is all zero's or all ones.
24120   if (VT.isInteger() && !VT.isVector() &&
24121       N1C && N0.getOpcode() == ISD::AND &&
24122       N0.getOperand(1).getOpcode() == ISD::Constant) {
24123     SDValue N00 = N0.getOperand(0);
24124     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
24125         ((N00.getOpcode() == ISD::ANY_EXTEND ||
24126           N00.getOpcode() == ISD::ZERO_EXTEND) &&
24127          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
24128       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
24129       APInt ShAmt = N1C->getAPIntValue();
24130       Mask = Mask.shl(ShAmt);
24131       if (Mask != 0)
24132         return DAG.getNode(ISD::AND, SDLoc(N), VT,
24133                            N00, DAG.getConstant(Mask, VT));
24134     }
24135   }
24136
24137   // Hardware support for vector shifts is sparse which makes us scalarize the
24138   // vector operations in many cases. Also, on sandybridge ADD is faster than
24139   // shl.
24140   // (shl V, 1) -> add V,V
24141   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
24142     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
24143       assert(N0.getValueType().isVector() && "Invalid vector shift type");
24144       // We shift all of the values by one. In many cases we do not have
24145       // hardware support for this operation. This is better expressed as an ADD
24146       // of two values.
24147       if (N1SplatC->getZExtValue() == 1)
24148         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
24149     }
24150
24151   return SDValue();
24152 }
24153
24154 /// \brief Returns a vector of 0s if the node in input is a vector logical
24155 /// shift by a constant amount which is known to be bigger than or equal
24156 /// to the vector element size in bits.
24157 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
24158                                       const X86Subtarget *Subtarget) {
24159   EVT VT = N->getValueType(0);
24160
24161   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
24162       (!Subtarget->hasInt256() ||
24163        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
24164     return SDValue();
24165
24166   SDValue Amt = N->getOperand(1);
24167   SDLoc DL(N);
24168   if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
24169     if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
24170       APInt ShiftAmt = AmtSplat->getAPIntValue();
24171       unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
24172
24173       // SSE2/AVX2 logical shifts always return a vector of 0s
24174       // if the shift amount is bigger than or equal to
24175       // the element size. The constant shift amount will be
24176       // encoded as a 8-bit immediate.
24177       if (ShiftAmt.trunc(8).uge(MaxAmount))
24178         return getZeroVector(VT, Subtarget, DAG, DL);
24179     }
24180
24181   return SDValue();
24182 }
24183
24184 /// PerformShiftCombine - Combine shifts.
24185 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
24186                                    TargetLowering::DAGCombinerInfo &DCI,
24187                                    const X86Subtarget *Subtarget) {
24188   if (N->getOpcode() == ISD::SHL) {
24189     SDValue V = PerformSHLCombine(N, DAG);
24190     if (V.getNode()) return V;
24191   }
24192
24193   if (N->getOpcode() != ISD::SRA) {
24194     // Try to fold this logical shift into a zero vector.
24195     SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
24196     if (V.getNode()) return V;
24197   }
24198
24199   return SDValue();
24200 }
24201
24202 // CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
24203 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
24204 // and friends.  Likewise for OR -> CMPNEQSS.
24205 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
24206                             TargetLowering::DAGCombinerInfo &DCI,
24207                             const X86Subtarget *Subtarget) {
24208   unsigned opcode;
24209
24210   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
24211   // we're requiring SSE2 for both.
24212   if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
24213     SDValue N0 = N->getOperand(0);
24214     SDValue N1 = N->getOperand(1);
24215     SDValue CMP0 = N0->getOperand(1);
24216     SDValue CMP1 = N1->getOperand(1);
24217     SDLoc DL(N);
24218
24219     // The SETCCs should both refer to the same CMP.
24220     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
24221       return SDValue();
24222
24223     SDValue CMP00 = CMP0->getOperand(0);
24224     SDValue CMP01 = CMP0->getOperand(1);
24225     EVT     VT    = CMP00.getValueType();
24226
24227     if (VT == MVT::f32 || VT == MVT::f64) {
24228       bool ExpectingFlags = false;
24229       // Check for any users that want flags:
24230       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
24231            !ExpectingFlags && UI != UE; ++UI)
24232         switch (UI->getOpcode()) {
24233         default:
24234         case ISD::BR_CC:
24235         case ISD::BRCOND:
24236         case ISD::SELECT:
24237           ExpectingFlags = true;
24238           break;
24239         case ISD::CopyToReg:
24240         case ISD::SIGN_EXTEND:
24241         case ISD::ZERO_EXTEND:
24242         case ISD::ANY_EXTEND:
24243           break;
24244         }
24245
24246       if (!ExpectingFlags) {
24247         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
24248         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
24249
24250         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
24251           X86::CondCode tmp = cc0;
24252           cc0 = cc1;
24253           cc1 = tmp;
24254         }
24255
24256         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
24257             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
24258           // FIXME: need symbolic constants for these magic numbers.
24259           // See X86ATTInstPrinter.cpp:printSSECC().
24260           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
24261           if (Subtarget->hasAVX512()) {
24262             SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
24263                                          CMP01, DAG.getConstant(x86cc, MVT::i8));
24264             if (N->getValueType(0) != MVT::i1)
24265               return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
24266                                  FSetCC);
24267             return FSetCC;
24268           }
24269           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
24270                                               CMP00.getValueType(), CMP00, CMP01,
24271                                               DAG.getConstant(x86cc, MVT::i8));
24272
24273           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
24274           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
24275
24276           if (is64BitFP && !Subtarget->is64Bit()) {
24277             // On a 32-bit target, we cannot bitcast the 64-bit float to a
24278             // 64-bit integer, since that's not a legal type. Since
24279             // OnesOrZeroesF is all ones of all zeroes, we don't need all the
24280             // bits, but can do this little dance to extract the lowest 32 bits
24281             // and work with those going forward.
24282             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
24283                                            OnesOrZeroesF);
24284             SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
24285                                            Vector64);
24286             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
24287                                         Vector32, DAG.getIntPtrConstant(0));
24288             IntVT = MVT::i32;
24289           }
24290
24291           SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
24292           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
24293                                       DAG.getConstant(1, IntVT));
24294           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
24295           return OneBitOfTruth;
24296         }
24297       }
24298     }
24299   }
24300   return SDValue();
24301 }
24302
24303 /// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
24304 /// so it can be folded inside ANDNP.
24305 static bool CanFoldXORWithAllOnes(const SDNode *N) {
24306   EVT VT = N->getValueType(0);
24307
24308   // Match direct AllOnes for 128 and 256-bit vectors
24309   if (ISD::isBuildVectorAllOnes(N))
24310     return true;
24311
24312   // Look through a bit convert.
24313   if (N->getOpcode() == ISD::BITCAST)
24314     N = N->getOperand(0).getNode();
24315
24316   // Sometimes the operand may come from a insert_subvector building a 256-bit
24317   // allones vector
24318   if (VT.is256BitVector() &&
24319       N->getOpcode() == ISD::INSERT_SUBVECTOR) {
24320     SDValue V1 = N->getOperand(0);
24321     SDValue V2 = N->getOperand(1);
24322
24323     if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
24324         V1.getOperand(0).getOpcode() == ISD::UNDEF &&
24325         ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
24326         ISD::isBuildVectorAllOnes(V2.getNode()))
24327       return true;
24328   }
24329
24330   return false;
24331 }
24332
24333 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
24334 // register. In most cases we actually compare or select YMM-sized registers
24335 // and mixing the two types creates horrible code. This method optimizes
24336 // some of the transition sequences.
24337 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
24338                                  TargetLowering::DAGCombinerInfo &DCI,
24339                                  const X86Subtarget *Subtarget) {
24340   EVT VT = N->getValueType(0);
24341   if (!VT.is256BitVector())
24342     return SDValue();
24343
24344   assert((N->getOpcode() == ISD::ANY_EXTEND ||
24345           N->getOpcode() == ISD::ZERO_EXTEND ||
24346           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
24347
24348   SDValue Narrow = N->getOperand(0);
24349   EVT NarrowVT = Narrow->getValueType(0);
24350   if (!NarrowVT.is128BitVector())
24351     return SDValue();
24352
24353   if (Narrow->getOpcode() != ISD::XOR &&
24354       Narrow->getOpcode() != ISD::AND &&
24355       Narrow->getOpcode() != ISD::OR)
24356     return SDValue();
24357
24358   SDValue N0  = Narrow->getOperand(0);
24359   SDValue N1  = Narrow->getOperand(1);
24360   SDLoc DL(Narrow);
24361
24362   // The Left side has to be a trunc.
24363   if (N0.getOpcode() != ISD::TRUNCATE)
24364     return SDValue();
24365
24366   // The type of the truncated inputs.
24367   EVT WideVT = N0->getOperand(0)->getValueType(0);
24368   if (WideVT != VT)
24369     return SDValue();
24370
24371   // The right side has to be a 'trunc' or a constant vector.
24372   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
24373   ConstantSDNode *RHSConstSplat = nullptr;
24374   if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
24375     RHSConstSplat = RHSBV->getConstantSplatNode();
24376   if (!RHSTrunc && !RHSConstSplat)
24377     return SDValue();
24378
24379   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24380
24381   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
24382     return SDValue();
24383
24384   // Set N0 and N1 to hold the inputs to the new wide operation.
24385   N0 = N0->getOperand(0);
24386   if (RHSConstSplat) {
24387     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
24388                      SDValue(RHSConstSplat, 0));
24389     SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
24390     N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
24391   } else if (RHSTrunc) {
24392     N1 = N1->getOperand(0);
24393   }
24394
24395   // Generate the wide operation.
24396   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
24397   unsigned Opcode = N->getOpcode();
24398   switch (Opcode) {
24399   case ISD::ANY_EXTEND:
24400     return Op;
24401   case ISD::ZERO_EXTEND: {
24402     unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
24403     APInt Mask = APInt::getAllOnesValue(InBits);
24404     Mask = Mask.zext(VT.getScalarType().getSizeInBits());
24405     return DAG.getNode(ISD::AND, DL, VT,
24406                        Op, DAG.getConstant(Mask, VT));
24407   }
24408   case ISD::SIGN_EXTEND:
24409     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
24410                        Op, DAG.getValueType(NarrowVT));
24411   default:
24412     llvm_unreachable("Unexpected opcode");
24413   }
24414 }
24415
24416 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
24417                                  TargetLowering::DAGCombinerInfo &DCI,
24418                                  const X86Subtarget *Subtarget) {
24419   EVT VT = N->getValueType(0);
24420   if (DCI.isBeforeLegalizeOps())
24421     return SDValue();
24422
24423   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24424   if (R.getNode())
24425     return R;
24426
24427   // Create BEXTR instructions
24428   // BEXTR is ((X >> imm) & (2**size-1))
24429   if (VT == MVT::i32 || VT == MVT::i64) {
24430     SDValue N0 = N->getOperand(0);
24431     SDValue N1 = N->getOperand(1);
24432     SDLoc DL(N);
24433
24434     // Check for BEXTR.
24435     if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
24436         (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
24437       ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
24438       ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
24439       if (MaskNode && ShiftNode) {
24440         uint64_t Mask = MaskNode->getZExtValue();
24441         uint64_t Shift = ShiftNode->getZExtValue();
24442         if (isMask_64(Mask)) {
24443           uint64_t MaskSize = CountPopulation_64(Mask);
24444           if (Shift + MaskSize <= VT.getSizeInBits())
24445             return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
24446                                DAG.getConstant(Shift | (MaskSize << 8), VT));
24447         }
24448       }
24449     } // BEXTR
24450
24451     return SDValue();
24452   }
24453
24454   // Want to form ANDNP nodes:
24455   // 1) In the hopes of then easily combining them with OR and AND nodes
24456   //    to form PBLEND/PSIGN.
24457   // 2) To match ANDN packed intrinsics
24458   if (VT != MVT::v2i64 && VT != MVT::v4i64)
24459     return SDValue();
24460
24461   SDValue N0 = N->getOperand(0);
24462   SDValue N1 = N->getOperand(1);
24463   SDLoc DL(N);
24464
24465   // Check LHS for vnot
24466   if (N0.getOpcode() == ISD::XOR &&
24467       //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
24468       CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
24469     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
24470
24471   // Check RHS for vnot
24472   if (N1.getOpcode() == ISD::XOR &&
24473       //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
24474       CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
24475     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
24476
24477   return SDValue();
24478 }
24479
24480 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
24481                                 TargetLowering::DAGCombinerInfo &DCI,
24482                                 const X86Subtarget *Subtarget) {
24483   if (DCI.isBeforeLegalizeOps())
24484     return SDValue();
24485
24486   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24487   if (R.getNode())
24488     return R;
24489
24490   SDValue N0 = N->getOperand(0);
24491   SDValue N1 = N->getOperand(1);
24492   EVT VT = N->getValueType(0);
24493
24494   // look for psign/blend
24495   if (VT == MVT::v2i64 || VT == MVT::v4i64) {
24496     if (!Subtarget->hasSSSE3() ||
24497         (VT == MVT::v4i64 && !Subtarget->hasInt256()))
24498       return SDValue();
24499
24500     // Canonicalize pandn to RHS
24501     if (N0.getOpcode() == X86ISD::ANDNP)
24502       std::swap(N0, N1);
24503     // or (and (m, y), (pandn m, x))
24504     if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
24505       SDValue Mask = N1.getOperand(0);
24506       SDValue X    = N1.getOperand(1);
24507       SDValue Y;
24508       if (N0.getOperand(0) == Mask)
24509         Y = N0.getOperand(1);
24510       if (N0.getOperand(1) == Mask)
24511         Y = N0.getOperand(0);
24512
24513       // Check to see if the mask appeared in both the AND and ANDNP and
24514       if (!Y.getNode())
24515         return SDValue();
24516
24517       // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
24518       // Look through mask bitcast.
24519       if (Mask.getOpcode() == ISD::BITCAST)
24520         Mask = Mask.getOperand(0);
24521       if (X.getOpcode() == ISD::BITCAST)
24522         X = X.getOperand(0);
24523       if (Y.getOpcode() == ISD::BITCAST)
24524         Y = Y.getOperand(0);
24525
24526       EVT MaskVT = Mask.getValueType();
24527
24528       // Validate that the Mask operand is a vector sra node.
24529       // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
24530       // there is no psrai.b
24531       unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
24532       unsigned SraAmt = ~0;
24533       if (Mask.getOpcode() == ISD::SRA) {
24534         if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
24535           if (auto *AmtConst = AmtBV->getConstantSplatNode())
24536             SraAmt = AmtConst->getZExtValue();
24537       } else if (Mask.getOpcode() == X86ISD::VSRAI) {
24538         SDValue SraC = Mask.getOperand(1);
24539         SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
24540       }
24541       if ((SraAmt + 1) != EltBits)
24542         return SDValue();
24543
24544       SDLoc DL(N);
24545
24546       // Now we know we at least have a plendvb with the mask val.  See if
24547       // we can form a psignb/w/d.
24548       // psign = x.type == y.type == mask.type && y = sub(0, x);
24549       if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
24550           ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
24551           X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
24552         assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
24553                "Unsupported VT for PSIGN");
24554         Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
24555         return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24556       }
24557       // PBLENDVB only available on SSE 4.1
24558       if (!Subtarget->hasSSE41())
24559         return SDValue();
24560
24561       EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
24562
24563       X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
24564       Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
24565       Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
24566       Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
24567       return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24568     }
24569   }
24570
24571   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
24572     return SDValue();
24573
24574   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
24575   MachineFunction &MF = DAG.getMachineFunction();
24576   bool OptForSize = MF.getFunction()->getAttributes().
24577     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
24578
24579   // SHLD/SHRD instructions have lower register pressure, but on some
24580   // platforms they have higher latency than the equivalent
24581   // series of shifts/or that would otherwise be generated.
24582   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
24583   // have higher latencies and we are not optimizing for size.
24584   if (!OptForSize && Subtarget->isSHLDSlow())
24585     return SDValue();
24586
24587   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
24588     std::swap(N0, N1);
24589   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
24590     return SDValue();
24591   if (!N0.hasOneUse() || !N1.hasOneUse())
24592     return SDValue();
24593
24594   SDValue ShAmt0 = N0.getOperand(1);
24595   if (ShAmt0.getValueType() != MVT::i8)
24596     return SDValue();
24597   SDValue ShAmt1 = N1.getOperand(1);
24598   if (ShAmt1.getValueType() != MVT::i8)
24599     return SDValue();
24600   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
24601     ShAmt0 = ShAmt0.getOperand(0);
24602   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
24603     ShAmt1 = ShAmt1.getOperand(0);
24604
24605   SDLoc DL(N);
24606   unsigned Opc = X86ISD::SHLD;
24607   SDValue Op0 = N0.getOperand(0);
24608   SDValue Op1 = N1.getOperand(0);
24609   if (ShAmt0.getOpcode() == ISD::SUB) {
24610     Opc = X86ISD::SHRD;
24611     std::swap(Op0, Op1);
24612     std::swap(ShAmt0, ShAmt1);
24613   }
24614
24615   unsigned Bits = VT.getSizeInBits();
24616   if (ShAmt1.getOpcode() == ISD::SUB) {
24617     SDValue Sum = ShAmt1.getOperand(0);
24618     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
24619       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
24620       if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
24621         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
24622       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
24623         return DAG.getNode(Opc, DL, VT,
24624                            Op0, Op1,
24625                            DAG.getNode(ISD::TRUNCATE, DL,
24626                                        MVT::i8, ShAmt0));
24627     }
24628   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
24629     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
24630     if (ShAmt0C &&
24631         ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
24632       return DAG.getNode(Opc, DL, VT,
24633                          N0.getOperand(0), N1.getOperand(0),
24634                          DAG.getNode(ISD::TRUNCATE, DL,
24635                                        MVT::i8, ShAmt0));
24636   }
24637
24638   return SDValue();
24639 }
24640
24641 // Generate NEG and CMOV for integer abs.
24642 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
24643   EVT VT = N->getValueType(0);
24644
24645   // Since X86 does not have CMOV for 8-bit integer, we don't convert
24646   // 8-bit integer abs to NEG and CMOV.
24647   if (VT.isInteger() && VT.getSizeInBits() == 8)
24648     return SDValue();
24649
24650   SDValue N0 = N->getOperand(0);
24651   SDValue N1 = N->getOperand(1);
24652   SDLoc DL(N);
24653
24654   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
24655   // and change it to SUB and CMOV.
24656   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
24657       N0.getOpcode() == ISD::ADD &&
24658       N0.getOperand(1) == N1 &&
24659       N1.getOpcode() == ISD::SRA &&
24660       N1.getOperand(0) == N0.getOperand(0))
24661     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
24662       if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
24663         // Generate SUB & CMOV.
24664         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
24665                                   DAG.getConstant(0, VT), N0.getOperand(0));
24666
24667         SDValue Ops[] = { N0.getOperand(0), Neg,
24668                           DAG.getConstant(X86::COND_GE, MVT::i8),
24669                           SDValue(Neg.getNode(), 1) };
24670         return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
24671       }
24672   return SDValue();
24673 }
24674
24675 // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
24676 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
24677                                  TargetLowering::DAGCombinerInfo &DCI,
24678                                  const X86Subtarget *Subtarget) {
24679   if (DCI.isBeforeLegalizeOps())
24680     return SDValue();
24681
24682   if (Subtarget->hasCMov()) {
24683     SDValue RV = performIntegerAbsCombine(N, DAG);
24684     if (RV.getNode())
24685       return RV;
24686   }
24687
24688   return SDValue();
24689 }
24690
24691 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
24692 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
24693                                   TargetLowering::DAGCombinerInfo &DCI,
24694                                   const X86Subtarget *Subtarget) {
24695   LoadSDNode *Ld = cast<LoadSDNode>(N);
24696   EVT RegVT = Ld->getValueType(0);
24697   EVT MemVT = Ld->getMemoryVT();
24698   SDLoc dl(Ld);
24699   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24700
24701   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
24702   // into two 16-byte operations.
24703   ISD::LoadExtType Ext = Ld->getExtensionType();
24704   unsigned Alignment = Ld->getAlignment();
24705   bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
24706   if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
24707       !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
24708     unsigned NumElems = RegVT.getVectorNumElements();
24709     if (NumElems < 2)
24710       return SDValue();
24711
24712     SDValue Ptr = Ld->getBasePtr();
24713     SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
24714
24715     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
24716                                   NumElems/2);
24717     SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24718                                 Ld->getPointerInfo(), Ld->isVolatile(),
24719                                 Ld->isNonTemporal(), Ld->isInvariant(),
24720                                 Alignment);
24721     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
24722     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24723                                 Ld->getPointerInfo(), Ld->isVolatile(),
24724                                 Ld->isNonTemporal(), Ld->isInvariant(),
24725                                 std::min(16U, Alignment));
24726     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
24727                              Load1.getValue(1),
24728                              Load2.getValue(1));
24729
24730     SDValue NewVec = DAG.getUNDEF(RegVT);
24731     NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
24732     NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
24733     return DCI.CombineTo(N, NewVec, TF, true);
24734   }
24735
24736   return SDValue();
24737 }
24738
24739 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
24740 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
24741                                    const X86Subtarget *Subtarget) {
24742   StoreSDNode *St = cast<StoreSDNode>(N);
24743   EVT VT = St->getValue().getValueType();
24744   EVT StVT = St->getMemoryVT();
24745   SDLoc dl(St);
24746   SDValue StoredVal = St->getOperand(1);
24747   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24748
24749   // If we are saving a concatenation of two XMM registers and 32-byte stores
24750   // are slow, such as on Sandy Bridge, perform two 16-byte stores.
24751   unsigned Alignment = St->getAlignment();
24752   bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
24753   if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
24754       StVT == VT && !IsAligned) {
24755     unsigned NumElems = VT.getVectorNumElements();
24756     if (NumElems < 2)
24757       return SDValue();
24758
24759     SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
24760     SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
24761
24762     SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
24763     SDValue Ptr0 = St->getBasePtr();
24764     SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
24765
24766     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
24767                                 St->getPointerInfo(), St->isVolatile(),
24768                                 St->isNonTemporal(), Alignment);
24769     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
24770                                 St->getPointerInfo(), St->isVolatile(),
24771                                 St->isNonTemporal(),
24772                                 std::min(16U, Alignment));
24773     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
24774   }
24775
24776   // Optimize trunc store (of multiple scalars) to shuffle and store.
24777   // First, pack all of the elements in one place. Next, store to memory
24778   // in fewer chunks.
24779   if (St->isTruncatingStore() && VT.isVector()) {
24780     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24781     unsigned NumElems = VT.getVectorNumElements();
24782     assert(StVT != VT && "Cannot truncate to the same type");
24783     unsigned FromSz = VT.getVectorElementType().getSizeInBits();
24784     unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
24785
24786     // From, To sizes and ElemCount must be pow of two
24787     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
24788     // We are going to use the original vector elt for storing.
24789     // Accumulated smaller vector elements must be a multiple of the store size.
24790     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
24791
24792     unsigned SizeRatio  = FromSz / ToSz;
24793
24794     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
24795
24796     // Create a type on which we perform the shuffle
24797     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24798             StVT.getScalarType(), NumElems*SizeRatio);
24799
24800     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24801
24802     SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
24803     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
24804     for (unsigned i = 0; i != NumElems; ++i)
24805       ShuffleVec[i] = i * SizeRatio;
24806
24807     // Can't shuffle using an illegal type.
24808     if (!TLI.isTypeLegal(WideVecVT))
24809       return SDValue();
24810
24811     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
24812                                          DAG.getUNDEF(WideVecVT),
24813                                          &ShuffleVec[0]);
24814     // At this point all of the data is stored at the bottom of the
24815     // register. We now need to save it to mem.
24816
24817     // Find the largest store unit
24818     MVT StoreType = MVT::i8;
24819     for (MVT Tp : MVT::integer_valuetypes()) {
24820       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
24821         StoreType = Tp;
24822     }
24823
24824     // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
24825     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
24826         (64 <= NumElems * ToSz))
24827       StoreType = MVT::f64;
24828
24829     // Bitcast the original vector into a vector of store-size units
24830     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
24831             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
24832     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
24833     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
24834     SmallVector<SDValue, 8> Chains;
24835     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
24836                                         TLI.getPointerTy());
24837     SDValue Ptr = St->getBasePtr();
24838
24839     // Perform one or more big stores into memory.
24840     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
24841       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
24842                                    StoreType, ShuffWide,
24843                                    DAG.getIntPtrConstant(i));
24844       SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
24845                                 St->getPointerInfo(), St->isVolatile(),
24846                                 St->isNonTemporal(), St->getAlignment());
24847       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
24848       Chains.push_back(Ch);
24849     }
24850
24851     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
24852   }
24853
24854   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
24855   // the FP state in cases where an emms may be missing.
24856   // A preferable solution to the general problem is to figure out the right
24857   // places to insert EMMS.  This qualifies as a quick hack.
24858
24859   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
24860   if (VT.getSizeInBits() != 64)
24861     return SDValue();
24862
24863   const Function *F = DAG.getMachineFunction().getFunction();
24864   bool NoImplicitFloatOps = F->getAttributes().
24865     hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
24866   bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
24867                      && Subtarget->hasSSE2();
24868   if ((VT.isVector() ||
24869        (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
24870       isa<LoadSDNode>(St->getValue()) &&
24871       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
24872       St->getChain().hasOneUse() && !St->isVolatile()) {
24873     SDNode* LdVal = St->getValue().getNode();
24874     LoadSDNode *Ld = nullptr;
24875     int TokenFactorIndex = -1;
24876     SmallVector<SDValue, 8> Ops;
24877     SDNode* ChainVal = St->getChain().getNode();
24878     // Must be a store of a load.  We currently handle two cases:  the load
24879     // is a direct child, and it's under an intervening TokenFactor.  It is
24880     // possible to dig deeper under nested TokenFactors.
24881     if (ChainVal == LdVal)
24882       Ld = cast<LoadSDNode>(St->getChain());
24883     else if (St->getValue().hasOneUse() &&
24884              ChainVal->getOpcode() == ISD::TokenFactor) {
24885       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
24886         if (ChainVal->getOperand(i).getNode() == LdVal) {
24887           TokenFactorIndex = i;
24888           Ld = cast<LoadSDNode>(St->getValue());
24889         } else
24890           Ops.push_back(ChainVal->getOperand(i));
24891       }
24892     }
24893
24894     if (!Ld || !ISD::isNormalLoad(Ld))
24895       return SDValue();
24896
24897     // If this is not the MMX case, i.e. we are just turning i64 load/store
24898     // into f64 load/store, avoid the transformation if there are multiple
24899     // uses of the loaded value.
24900     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
24901       return SDValue();
24902
24903     SDLoc LdDL(Ld);
24904     SDLoc StDL(N);
24905     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
24906     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
24907     // pair instead.
24908     if (Subtarget->is64Bit() || F64IsLegal) {
24909       EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
24910       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
24911                                   Ld->getPointerInfo(), Ld->isVolatile(),
24912                                   Ld->isNonTemporal(), Ld->isInvariant(),
24913                                   Ld->getAlignment());
24914       SDValue NewChain = NewLd.getValue(1);
24915       if (TokenFactorIndex != -1) {
24916         Ops.push_back(NewChain);
24917         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
24918       }
24919       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
24920                           St->getPointerInfo(),
24921                           St->isVolatile(), St->isNonTemporal(),
24922                           St->getAlignment());
24923     }
24924
24925     // Otherwise, lower to two pairs of 32-bit loads / stores.
24926     SDValue LoAddr = Ld->getBasePtr();
24927     SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
24928                                  DAG.getConstant(4, MVT::i32));
24929
24930     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
24931                                Ld->getPointerInfo(),
24932                                Ld->isVolatile(), Ld->isNonTemporal(),
24933                                Ld->isInvariant(), Ld->getAlignment());
24934     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
24935                                Ld->getPointerInfo().getWithOffset(4),
24936                                Ld->isVolatile(), Ld->isNonTemporal(),
24937                                Ld->isInvariant(),
24938                                MinAlign(Ld->getAlignment(), 4));
24939
24940     SDValue NewChain = LoLd.getValue(1);
24941     if (TokenFactorIndex != -1) {
24942       Ops.push_back(LoLd);
24943       Ops.push_back(HiLd);
24944       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
24945     }
24946
24947     LoAddr = St->getBasePtr();
24948     HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
24949                          DAG.getConstant(4, MVT::i32));
24950
24951     SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
24952                                 St->getPointerInfo(),
24953                                 St->isVolatile(), St->isNonTemporal(),
24954                                 St->getAlignment());
24955     SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
24956                                 St->getPointerInfo().getWithOffset(4),
24957                                 St->isVolatile(),
24958                                 St->isNonTemporal(),
24959                                 MinAlign(St->getAlignment(), 4));
24960     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
24961   }
24962   return SDValue();
24963 }
24964
24965 /// Return 'true' if this vector operation is "horizontal"
24966 /// and return the operands for the horizontal operation in LHS and RHS.  A
24967 /// horizontal operation performs the binary operation on successive elements
24968 /// of its first operand, then on successive elements of its second operand,
24969 /// returning the resulting values in a vector.  For example, if
24970 ///   A = < float a0, float a1, float a2, float a3 >
24971 /// and
24972 ///   B = < float b0, float b1, float b2, float b3 >
24973 /// then the result of doing a horizontal operation on A and B is
24974 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
24975 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
24976 /// A horizontal-op B, for some already available A and B, and if so then LHS is
24977 /// set to A, RHS to B, and the routine returns 'true'.
24978 /// Note that the binary operation should have the property that if one of the
24979 /// operands is UNDEF then the result is UNDEF.
24980 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
24981   // Look for the following pattern: if
24982   //   A = < float a0, float a1, float a2, float a3 >
24983   //   B = < float b0, float b1, float b2, float b3 >
24984   // and
24985   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
24986   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
24987   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
24988   // which is A horizontal-op B.
24989
24990   // At least one of the operands should be a vector shuffle.
24991   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
24992       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
24993     return false;
24994
24995   MVT VT = LHS.getSimpleValueType();
24996
24997   assert((VT.is128BitVector() || VT.is256BitVector()) &&
24998          "Unsupported vector type for horizontal add/sub");
24999
25000   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
25001   // operate independently on 128-bit lanes.
25002   unsigned NumElts = VT.getVectorNumElements();
25003   unsigned NumLanes = VT.getSizeInBits()/128;
25004   unsigned NumLaneElts = NumElts / NumLanes;
25005   assert((NumLaneElts % 2 == 0) &&
25006          "Vector type should have an even number of elements in each lane");
25007   unsigned HalfLaneElts = NumLaneElts/2;
25008
25009   // View LHS in the form
25010   //   LHS = VECTOR_SHUFFLE A, B, LMask
25011   // If LHS is not a shuffle then pretend it is the shuffle
25012   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
25013   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
25014   // type VT.
25015   SDValue A, B;
25016   SmallVector<int, 16> LMask(NumElts);
25017   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25018     if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
25019       A = LHS.getOperand(0);
25020     if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
25021       B = LHS.getOperand(1);
25022     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
25023     std::copy(Mask.begin(), Mask.end(), LMask.begin());
25024   } else {
25025     if (LHS.getOpcode() != ISD::UNDEF)
25026       A = LHS;
25027     for (unsigned i = 0; i != NumElts; ++i)
25028       LMask[i] = i;
25029   }
25030
25031   // Likewise, view RHS in the form
25032   //   RHS = VECTOR_SHUFFLE C, D, RMask
25033   SDValue C, D;
25034   SmallVector<int, 16> RMask(NumElts);
25035   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25036     if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
25037       C = RHS.getOperand(0);
25038     if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
25039       D = RHS.getOperand(1);
25040     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
25041     std::copy(Mask.begin(), Mask.end(), RMask.begin());
25042   } else {
25043     if (RHS.getOpcode() != ISD::UNDEF)
25044       C = RHS;
25045     for (unsigned i = 0; i != NumElts; ++i)
25046       RMask[i] = i;
25047   }
25048
25049   // Check that the shuffles are both shuffling the same vectors.
25050   if (!(A == C && B == D) && !(A == D && B == C))
25051     return false;
25052
25053   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
25054   if (!A.getNode() && !B.getNode())
25055     return false;
25056
25057   // If A and B occur in reverse order in RHS, then "swap" them (which means
25058   // rewriting the mask).
25059   if (A != C)
25060     CommuteVectorShuffleMask(RMask, NumElts);
25061
25062   // At this point LHS and RHS are equivalent to
25063   //   LHS = VECTOR_SHUFFLE A, B, LMask
25064   //   RHS = VECTOR_SHUFFLE A, B, RMask
25065   // Check that the masks correspond to performing a horizontal operation.
25066   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
25067     for (unsigned i = 0; i != NumLaneElts; ++i) {
25068       int LIdx = LMask[i+l], RIdx = RMask[i+l];
25069
25070       // Ignore any UNDEF components.
25071       if (LIdx < 0 || RIdx < 0 ||
25072           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
25073           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
25074         continue;
25075
25076       // Check that successive elements are being operated on.  If not, this is
25077       // not a horizontal operation.
25078       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
25079       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
25080       if (!(LIdx == Index && RIdx == Index + 1) &&
25081           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
25082         return false;
25083     }
25084   }
25085
25086   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
25087   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
25088   return true;
25089 }
25090
25091 /// Do target-specific dag combines on floating point adds.
25092 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
25093                                   const X86Subtarget *Subtarget) {
25094   EVT VT = N->getValueType(0);
25095   SDValue LHS = N->getOperand(0);
25096   SDValue RHS = N->getOperand(1);
25097
25098   // Try to synthesize horizontal adds from adds of shuffles.
25099   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25100        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25101       isHorizontalBinOp(LHS, RHS, true))
25102     return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
25103   return SDValue();
25104 }
25105
25106 /// Do target-specific dag combines on floating point subs.
25107 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
25108                                   const X86Subtarget *Subtarget) {
25109   EVT VT = N->getValueType(0);
25110   SDValue LHS = N->getOperand(0);
25111   SDValue RHS = N->getOperand(1);
25112
25113   // Try to synthesize horizontal subs from subs of shuffles.
25114   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25115        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25116       isHorizontalBinOp(LHS, RHS, false))
25117     return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
25118   return SDValue();
25119 }
25120
25121 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
25122 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
25123   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
25124   // F[X]OR(0.0, x) -> x
25125   // F[X]OR(x, 0.0) -> x
25126   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25127     if (C->getValueAPF().isPosZero())
25128       return N->getOperand(1);
25129   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25130     if (C->getValueAPF().isPosZero())
25131       return N->getOperand(0);
25132   return SDValue();
25133 }
25134
25135 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
25136 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
25137   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
25138
25139   // Only perform optimizations if UnsafeMath is used.
25140   if (!DAG.getTarget().Options.UnsafeFPMath)
25141     return SDValue();
25142
25143   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
25144   // into FMINC and FMAXC, which are Commutative operations.
25145   unsigned NewOp = 0;
25146   switch (N->getOpcode()) {
25147     default: llvm_unreachable("unknown opcode");
25148     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
25149     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
25150   }
25151
25152   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
25153                      N->getOperand(0), N->getOperand(1));
25154 }
25155
25156 /// Do target-specific dag combines on X86ISD::FAND nodes.
25157 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
25158   // FAND(0.0, x) -> 0.0
25159   // FAND(x, 0.0) -> 0.0
25160   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25161     if (C->getValueAPF().isPosZero())
25162       return N->getOperand(0);
25163   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25164     if (C->getValueAPF().isPosZero())
25165       return N->getOperand(1);
25166   return SDValue();
25167 }
25168
25169 /// Do target-specific dag combines on X86ISD::FANDN nodes
25170 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
25171   // FANDN(x, 0.0) -> 0.0
25172   // FANDN(0.0, x) -> x
25173   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25174     if (C->getValueAPF().isPosZero())
25175       return N->getOperand(1);
25176   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25177     if (C->getValueAPF().isPosZero())
25178       return N->getOperand(1);
25179   return SDValue();
25180 }
25181
25182 static SDValue PerformBTCombine(SDNode *N,
25183                                 SelectionDAG &DAG,
25184                                 TargetLowering::DAGCombinerInfo &DCI) {
25185   // BT ignores high bits in the bit index operand.
25186   SDValue Op1 = N->getOperand(1);
25187   if (Op1.hasOneUse()) {
25188     unsigned BitWidth = Op1.getValueSizeInBits();
25189     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
25190     APInt KnownZero, KnownOne;
25191     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
25192                                           !DCI.isBeforeLegalizeOps());
25193     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25194     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
25195         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
25196       DCI.CommitTargetLoweringOpt(TLO);
25197   }
25198   return SDValue();
25199 }
25200
25201 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
25202   SDValue Op = N->getOperand(0);
25203   if (Op.getOpcode() == ISD::BITCAST)
25204     Op = Op.getOperand(0);
25205   EVT VT = N->getValueType(0), OpVT = Op.getValueType();
25206   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
25207       VT.getVectorElementType().getSizeInBits() ==
25208       OpVT.getVectorElementType().getSizeInBits()) {
25209     return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
25210   }
25211   return SDValue();
25212 }
25213
25214 static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
25215                                                const X86Subtarget *Subtarget) {
25216   EVT VT = N->getValueType(0);
25217   if (!VT.isVector())
25218     return SDValue();
25219
25220   SDValue N0 = N->getOperand(0);
25221   SDValue N1 = N->getOperand(1);
25222   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
25223   SDLoc dl(N);
25224
25225   // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
25226   // both SSE and AVX2 since there is no sign-extended shift right
25227   // operation on a vector with 64-bit elements.
25228   //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
25229   // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
25230   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
25231       N0.getOpcode() == ISD::SIGN_EXTEND)) {
25232     SDValue N00 = N0.getOperand(0);
25233
25234     // EXTLOAD has a better solution on AVX2,
25235     // it may be replaced with X86ISD::VSEXT node.
25236     if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
25237       if (!ISD::isNormalLoad(N00.getNode()))
25238         return SDValue();
25239
25240     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
25241         SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
25242                                   N00, N1);
25243       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
25244     }
25245   }
25246   return SDValue();
25247 }
25248
25249 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
25250                                   TargetLowering::DAGCombinerInfo &DCI,
25251                                   const X86Subtarget *Subtarget) {
25252   SDValue N0 = N->getOperand(0);
25253   EVT VT = N->getValueType(0);
25254
25255   // (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
25256   // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
25257   // This exposes the sext to the sdivrem lowering, so that it directly extends
25258   // from AH (which we otherwise need to do contortions to access).
25259   if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
25260       N0.getValueType() == MVT::i8 && VT == MVT::i32) {
25261     SDLoc dl(N);
25262     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25263     SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
25264                             N0.getOperand(0), N0.getOperand(1));
25265     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25266     return R.getValue(1);
25267   }
25268
25269   if (!DCI.isBeforeLegalizeOps())
25270     return SDValue();
25271
25272   if (!Subtarget->hasFp256())
25273     return SDValue();
25274
25275   if (VT.isVector() && VT.getSizeInBits() == 256) {
25276     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25277     if (R.getNode())
25278       return R;
25279   }
25280
25281   return SDValue();
25282 }
25283
25284 static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
25285                                  const X86Subtarget* Subtarget) {
25286   SDLoc dl(N);
25287   EVT VT = N->getValueType(0);
25288
25289   // Let legalize expand this if it isn't a legal type yet.
25290   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
25291     return SDValue();
25292
25293   EVT ScalarVT = VT.getScalarType();
25294   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
25295       (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
25296     return SDValue();
25297
25298   SDValue A = N->getOperand(0);
25299   SDValue B = N->getOperand(1);
25300   SDValue C = N->getOperand(2);
25301
25302   bool NegA = (A.getOpcode() == ISD::FNEG);
25303   bool NegB = (B.getOpcode() == ISD::FNEG);
25304   bool NegC = (C.getOpcode() == ISD::FNEG);
25305
25306   // Negative multiplication when NegA xor NegB
25307   bool NegMul = (NegA != NegB);
25308   if (NegA)
25309     A = A.getOperand(0);
25310   if (NegB)
25311     B = B.getOperand(0);
25312   if (NegC)
25313     C = C.getOperand(0);
25314
25315   unsigned Opcode;
25316   if (!NegMul)
25317     Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
25318   else
25319     Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
25320
25321   return DAG.getNode(Opcode, dl, VT, A, B, C);
25322 }
25323
25324 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
25325                                   TargetLowering::DAGCombinerInfo &DCI,
25326                                   const X86Subtarget *Subtarget) {
25327   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
25328   //           (and (i32 x86isd::setcc_carry), 1)
25329   // This eliminates the zext. This transformation is necessary because
25330   // ISD::SETCC is always legalized to i8.
25331   SDLoc dl(N);
25332   SDValue N0 = N->getOperand(0);
25333   EVT VT = N->getValueType(0);
25334
25335   if (N0.getOpcode() == ISD::AND &&
25336       N0.hasOneUse() &&
25337       N0.getOperand(0).hasOneUse()) {
25338     SDValue N00 = N0.getOperand(0);
25339     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25340       ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
25341       if (!C || C->getZExtValue() != 1)
25342         return SDValue();
25343       return DAG.getNode(ISD::AND, dl, VT,
25344                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25345                                      N00.getOperand(0), N00.getOperand(1)),
25346                          DAG.getConstant(1, VT));
25347     }
25348   }
25349
25350   if (N0.getOpcode() == ISD::TRUNCATE &&
25351       N0.hasOneUse() &&
25352       N0.getOperand(0).hasOneUse()) {
25353     SDValue N00 = N0.getOperand(0);
25354     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25355       return DAG.getNode(ISD::AND, dl, VT,
25356                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25357                                      N00.getOperand(0), N00.getOperand(1)),
25358                          DAG.getConstant(1, VT));
25359     }
25360   }
25361   if (VT.is256BitVector()) {
25362     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25363     if (R.getNode())
25364       return R;
25365   }
25366
25367   // (i8,i32 zext (udivrem (i8 x, i8 y)) ->
25368   // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)
25369   // This exposes the zext to the udivrem lowering, so that it directly extends
25370   // from AH (which we otherwise need to do contortions to access).
25371   if (N0.getOpcode() == ISD::UDIVREM &&
25372       N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
25373       (VT == MVT::i32 || VT == MVT::i64)) {
25374     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25375     SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
25376                             N0.getOperand(0), N0.getOperand(1));
25377     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25378     return R.getValue(1);
25379   }
25380
25381   return SDValue();
25382 }
25383
25384 // Optimize x == -y --> x+y == 0
25385 //          x != -y --> x+y != 0
25386 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
25387                                       const X86Subtarget* Subtarget) {
25388   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
25389   SDValue LHS = N->getOperand(0);
25390   SDValue RHS = N->getOperand(1);
25391   EVT VT = N->getValueType(0);
25392   SDLoc DL(N);
25393
25394   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
25395     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
25396       if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
25397         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25398                                    LHS.getValueType(), RHS, LHS.getOperand(1));
25399         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25400                             addV, DAG.getConstant(0, addV.getValueType()), CC);
25401       }
25402   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
25403     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
25404       if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
25405         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
25406                                    RHS.getValueType(), LHS, RHS.getOperand(1));
25407         return DAG.getSetCC(SDLoc(N), N->getValueType(0),
25408                             addV, DAG.getConstant(0, addV.getValueType()), CC);
25409       }
25410
25411   if (VT.getScalarType() == MVT::i1) {
25412     bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
25413       (LHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
25414     bool IsVZero0 = ISD::isBuildVectorAllZeros(LHS.getNode());
25415     if (!IsSEXT0 && !IsVZero0)
25416       return SDValue();
25417     bool IsSEXT1 = (RHS.getOpcode() == ISD::SIGN_EXTEND) &&
25418       (RHS.getOperand(0).getValueType().getScalarType() ==  MVT::i1);
25419     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
25420
25421     if (!IsSEXT1 && !IsVZero1)
25422       return SDValue();
25423
25424     if (IsSEXT0 && IsVZero1) {
25425       assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type");
25426       if (CC == ISD::SETEQ)
25427         return DAG.getNOT(DL, LHS.getOperand(0), VT);
25428       return LHS.getOperand(0);
25429     }
25430     if (IsSEXT1 && IsVZero0) {
25431       assert(VT == RHS.getOperand(0).getValueType() && "Uexpected operand type");
25432       if (CC == ISD::SETEQ)
25433         return DAG.getNOT(DL, RHS.getOperand(0), VT);
25434       return RHS.getOperand(0);
25435     }
25436   }
25437
25438   return SDValue();
25439 }
25440
25441 static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
25442                                       const X86Subtarget *Subtarget) {
25443   SDLoc dl(N);
25444   MVT VT = N->getOperand(1)->getSimpleValueType(0);
25445   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
25446          "X86insertps is only defined for v4x32");
25447
25448   SDValue Ld = N->getOperand(1);
25449   if (MayFoldLoad(Ld)) {
25450     // Extract the countS bits from the immediate so we can get the proper
25451     // address when narrowing the vector load to a specific element.
25452     // When the second source op is a memory address, interps doesn't use
25453     // countS and just gets an f32 from that address.
25454     unsigned DestIndex =
25455         cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
25456     Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
25457   } else
25458     return SDValue();
25459
25460   // Create this as a scalar to vector to match the instruction pattern.
25461   SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
25462   // countS bits are ignored when loading from memory on insertps, which
25463   // means we don't need to explicitly set them to 0.
25464   return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
25465                      LoadScalarToVector, N->getOperand(2));
25466 }
25467
25468 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
25469 // as "sbb reg,reg", since it can be extended without zext and produces
25470 // an all-ones bit which is more useful than 0/1 in some cases.
25471 static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
25472                                MVT VT) {
25473   if (VT == MVT::i8)
25474     return DAG.getNode(ISD::AND, DL, VT,
25475                        DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25476                                    DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
25477                        DAG.getConstant(1, VT));
25478   assert (VT == MVT::i1 && "Unexpected type for SECCC node");
25479   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
25480                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25481                                  DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
25482 }
25483
25484 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
25485 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
25486                                    TargetLowering::DAGCombinerInfo &DCI,
25487                                    const X86Subtarget *Subtarget) {
25488   SDLoc DL(N);
25489   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
25490   SDValue EFLAGS = N->getOperand(1);
25491
25492   if (CC == X86::COND_A) {
25493     // Try to convert COND_A into COND_B in an attempt to facilitate
25494     // materializing "setb reg".
25495     //
25496     // Do not flip "e > c", where "c" is a constant, because Cmp instruction
25497     // cannot take an immediate as its first operand.
25498     //
25499     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
25500         EFLAGS.getValueType().isInteger() &&
25501         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
25502       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
25503                                    EFLAGS.getNode()->getVTList(),
25504                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
25505       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
25506       return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
25507     }
25508   }
25509
25510   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
25511   // a zext and produces an all-ones bit which is more useful than 0/1 in some
25512   // cases.
25513   if (CC == X86::COND_B)
25514     return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
25515
25516   SDValue Flags;
25517
25518   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25519   if (Flags.getNode()) {
25520     SDValue Cond = DAG.getConstant(CC, MVT::i8);
25521     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
25522   }
25523
25524   return SDValue();
25525 }
25526
25527 // Optimize branch condition evaluation.
25528 //
25529 static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
25530                                     TargetLowering::DAGCombinerInfo &DCI,
25531                                     const X86Subtarget *Subtarget) {
25532   SDLoc DL(N);
25533   SDValue Chain = N->getOperand(0);
25534   SDValue Dest = N->getOperand(1);
25535   SDValue EFLAGS = N->getOperand(3);
25536   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
25537
25538   SDValue Flags;
25539
25540   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25541   if (Flags.getNode()) {
25542     SDValue Cond = DAG.getConstant(CC, MVT::i8);
25543     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
25544                        Flags);
25545   }
25546
25547   return SDValue();
25548 }
25549
25550 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
25551                                                          SelectionDAG &DAG) {
25552   // Take advantage of vector comparisons producing 0 or -1 in each lane to
25553   // optimize away operation when it's from a constant.
25554   //
25555   // The general transformation is:
25556   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
25557   //       AND(VECTOR_CMP(x,y), constant2)
25558   //    constant2 = UNARYOP(constant)
25559
25560   // Early exit if this isn't a vector operation, the operand of the
25561   // unary operation isn't a bitwise AND, or if the sizes of the operations
25562   // aren't the same.
25563   EVT VT = N->getValueType(0);
25564   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
25565       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
25566       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
25567     return SDValue();
25568
25569   // Now check that the other operand of the AND is a constant. We could
25570   // make the transformation for non-constant splats as well, but it's unclear
25571   // that would be a benefit as it would not eliminate any operations, just
25572   // perform one more step in scalar code before moving to the vector unit.
25573   if (BuildVectorSDNode *BV =
25574           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
25575     // Bail out if the vector isn't a constant.
25576     if (!BV->isConstant())
25577       return SDValue();
25578
25579     // Everything checks out. Build up the new and improved node.
25580     SDLoc DL(N);
25581     EVT IntVT = BV->getValueType(0);
25582     // Create a new constant of the appropriate type for the transformed
25583     // DAG.
25584     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
25585     // The AND node needs bitcasts to/from an integer vector type around it.
25586     SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
25587     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
25588                                  N->getOperand(0)->getOperand(0), MaskConst);
25589     SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
25590     return Res;
25591   }
25592
25593   return SDValue();
25594 }
25595
25596 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
25597                                         const X86TargetLowering *XTLI) {
25598   // First try to optimize away the conversion entirely when it's
25599   // conditionally from a constant. Vectors only.
25600   SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
25601   if (Res != SDValue())
25602     return Res;
25603
25604   // Now move on to more general possibilities.
25605   SDValue Op0 = N->getOperand(0);
25606   EVT InVT = Op0->getValueType(0);
25607
25608   // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
25609   if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
25610     SDLoc dl(N);
25611     MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
25612     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
25613     return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
25614   }
25615
25616   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
25617   // a 32-bit target where SSE doesn't support i64->FP operations.
25618   if (Op0.getOpcode() == ISD::LOAD) {
25619     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
25620     EVT VT = Ld->getValueType(0);
25621     if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
25622         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
25623         !XTLI->getSubtarget()->is64Bit() &&
25624         VT == MVT::i64) {
25625       SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
25626                                           Ld->getChain(), Op0, DAG);
25627       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
25628       return FILDChain;
25629     }
25630   }
25631   return SDValue();
25632 }
25633
25634 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
25635 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
25636                                  X86TargetLowering::DAGCombinerInfo &DCI) {
25637   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
25638   // the result is either zero or one (depending on the input carry bit).
25639   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
25640   if (X86::isZeroNode(N->getOperand(0)) &&
25641       X86::isZeroNode(N->getOperand(1)) &&
25642       // We don't have a good way to replace an EFLAGS use, so only do this when
25643       // dead right now.
25644       SDValue(N, 1).use_empty()) {
25645     SDLoc DL(N);
25646     EVT VT = N->getValueType(0);
25647     SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
25648     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
25649                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
25650                                            DAG.getConstant(X86::COND_B,MVT::i8),
25651                                            N->getOperand(2)),
25652                                DAG.getConstant(1, VT));
25653     return DCI.CombineTo(N, Res1, CarryOut);
25654   }
25655
25656   return SDValue();
25657 }
25658
25659 // fold (add Y, (sete  X, 0)) -> adc  0, Y
25660 //      (add Y, (setne X, 0)) -> sbb -1, Y
25661 //      (sub (sete  X, 0), Y) -> sbb  0, Y
25662 //      (sub (setne X, 0), Y) -> adc -1, Y
25663 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
25664   SDLoc DL(N);
25665
25666   // Look through ZExts.
25667   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
25668   if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
25669     return SDValue();
25670
25671   SDValue SetCC = Ext.getOperand(0);
25672   if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
25673     return SDValue();
25674
25675   X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
25676   if (CC != X86::COND_E && CC != X86::COND_NE)
25677     return SDValue();
25678
25679   SDValue Cmp = SetCC.getOperand(1);
25680   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
25681       !X86::isZeroNode(Cmp.getOperand(1)) ||
25682       !Cmp.getOperand(0).getValueType().isInteger())
25683     return SDValue();
25684
25685   SDValue CmpOp0 = Cmp.getOperand(0);
25686   SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
25687                                DAG.getConstant(1, CmpOp0.getValueType()));
25688
25689   SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
25690   if (CC == X86::COND_NE)
25691     return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
25692                        DL, OtherVal.getValueType(), OtherVal,
25693                        DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
25694   return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
25695                      DL, OtherVal.getValueType(), OtherVal,
25696                      DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
25697 }
25698
25699 /// PerformADDCombine - Do target-specific dag combines on integer adds.
25700 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
25701                                  const X86Subtarget *Subtarget) {
25702   EVT VT = N->getValueType(0);
25703   SDValue Op0 = N->getOperand(0);
25704   SDValue Op1 = N->getOperand(1);
25705
25706   // Try to synthesize horizontal adds from adds of shuffles.
25707   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
25708        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
25709       isHorizontalBinOp(Op0, Op1, true))
25710     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
25711
25712   return OptimizeConditionalInDecrement(N, DAG);
25713 }
25714
25715 static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
25716                                  const X86Subtarget *Subtarget) {
25717   SDValue Op0 = N->getOperand(0);
25718   SDValue Op1 = N->getOperand(1);
25719
25720   // X86 can't encode an immediate LHS of a sub. See if we can push the
25721   // negation into a preceding instruction.
25722   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
25723     // If the RHS of the sub is a XOR with one use and a constant, invert the
25724     // immediate. Then add one to the LHS of the sub so we can turn
25725     // X-Y -> X+~Y+1, saving one register.
25726     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
25727         isa<ConstantSDNode>(Op1.getOperand(1))) {
25728       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
25729       EVT VT = Op0.getValueType();
25730       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
25731                                    Op1.getOperand(0),
25732                                    DAG.getConstant(~XorC, VT));
25733       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
25734                          DAG.getConstant(C->getAPIntValue()+1, VT));
25735     }
25736   }
25737
25738   // Try to synthesize horizontal adds from adds of shuffles.
25739   EVT VT = N->getValueType(0);
25740   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
25741        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
25742       isHorizontalBinOp(Op0, Op1, true))
25743     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
25744
25745   return OptimizeConditionalInDecrement(N, DAG);
25746 }
25747
25748 /// performVZEXTCombine - Performs build vector combines
25749 static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
25750                                    TargetLowering::DAGCombinerInfo &DCI,
25751                                    const X86Subtarget *Subtarget) {
25752   SDLoc DL(N);
25753   MVT VT = N->getSimpleValueType(0);
25754   SDValue Op = N->getOperand(0);
25755   MVT OpVT = Op.getSimpleValueType();
25756   MVT OpEltVT = OpVT.getVectorElementType();
25757   unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
25758
25759   // (vzext (bitcast (vzext (x)) -> (vzext x)
25760   SDValue V = Op;
25761   while (V.getOpcode() == ISD::BITCAST)
25762     V = V.getOperand(0);
25763
25764   if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
25765     MVT InnerVT = V.getSimpleValueType();
25766     MVT InnerEltVT = InnerVT.getVectorElementType();
25767
25768     // If the element sizes match exactly, we can just do one larger vzext. This
25769     // is always an exact type match as vzext operates on integer types.
25770     if (OpEltVT == InnerEltVT) {
25771       assert(OpVT == InnerVT && "Types must match for vzext!");
25772       return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
25773     }
25774
25775     // The only other way we can combine them is if only a single element of the
25776     // inner vzext is used in the input to the outer vzext.
25777     if (InnerEltVT.getSizeInBits() < InputBits)
25778       return SDValue();
25779
25780     // In this case, the inner vzext is completely dead because we're going to
25781     // only look at bits inside of the low element. Just do the outer vzext on
25782     // a bitcast of the input to the inner.
25783     return DAG.getNode(X86ISD::VZEXT, DL, VT,
25784                        DAG.getNode(ISD::BITCAST, DL, OpVT, V));
25785   }
25786
25787   // Check if we can bypass extracting and re-inserting an element of an input
25788   // vector. Essentialy:
25789   // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
25790   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
25791       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
25792       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
25793     SDValue ExtractedV = V.getOperand(0);
25794     SDValue OrigV = ExtractedV.getOperand(0);
25795     if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
25796       if (ExtractIdx->getZExtValue() == 0) {
25797         MVT OrigVT = OrigV.getSimpleValueType();
25798         // Extract a subvector if necessary...
25799         if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
25800           int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
25801           OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
25802                                     OrigVT.getVectorNumElements() / Ratio);
25803           OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
25804                               DAG.getIntPtrConstant(0));
25805         }
25806         Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
25807         return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
25808       }
25809   }
25810
25811   return SDValue();
25812 }
25813
25814 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
25815                                              DAGCombinerInfo &DCI) const {
25816   SelectionDAG &DAG = DCI.DAG;
25817   switch (N->getOpcode()) {
25818   default: break;
25819   case ISD::EXTRACT_VECTOR_ELT:
25820     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
25821   case ISD::VSELECT:
25822   case ISD::SELECT:
25823   case X86ISD::SHRUNKBLEND:
25824     return PerformSELECTCombine(N, DAG, DCI, Subtarget);
25825   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
25826   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
25827   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
25828   case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
25829   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
25830   case ISD::SHL:
25831   case ISD::SRA:
25832   case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
25833   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
25834   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
25835   case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
25836   case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
25837   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
25838   case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
25839   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
25840   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
25841   case X86ISD::FXOR:
25842   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
25843   case X86ISD::FMIN:
25844   case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
25845   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
25846   case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
25847   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
25848   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
25849   case ISD::ANY_EXTEND:
25850   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
25851   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
25852   case ISD::SIGN_EXTEND_INREG:
25853     return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
25854   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
25855   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
25856   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
25857   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
25858   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
25859   case X86ISD::SHUFP:       // Handle all target specific shuffles
25860   case X86ISD::PALIGNR:
25861   case X86ISD::UNPCKH:
25862   case X86ISD::UNPCKL:
25863   case X86ISD::MOVHLPS:
25864   case X86ISD::MOVLHPS:
25865   case X86ISD::PSHUFB:
25866   case X86ISD::PSHUFD:
25867   case X86ISD::PSHUFHW:
25868   case X86ISD::PSHUFLW:
25869   case X86ISD::MOVSS:
25870   case X86ISD::MOVSD:
25871   case X86ISD::VPERMILPI:
25872   case X86ISD::VPERM2X128:
25873   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
25874   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
25875   case ISD::INTRINSIC_WO_CHAIN:
25876     return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
25877   case X86ISD::INSERTPS:
25878     return PerformINSERTPSCombine(N, DAG, Subtarget);
25879   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
25880   }
25881
25882   return SDValue();
25883 }
25884
25885 /// isTypeDesirableForOp - Return true if the target has native support for
25886 /// the specified value type and it is 'desirable' to use the type for the
25887 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
25888 /// instruction encodings are longer and some i16 instructions are slow.
25889 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
25890   if (!isTypeLegal(VT))
25891     return false;
25892   if (VT != MVT::i16)
25893     return true;
25894
25895   switch (Opc) {
25896   default:
25897     return true;
25898   case ISD::LOAD:
25899   case ISD::SIGN_EXTEND:
25900   case ISD::ZERO_EXTEND:
25901   case ISD::ANY_EXTEND:
25902   case ISD::SHL:
25903   case ISD::SRL:
25904   case ISD::SUB:
25905   case ISD::ADD:
25906   case ISD::MUL:
25907   case ISD::AND:
25908   case ISD::OR:
25909   case ISD::XOR:
25910     return false;
25911   }
25912 }
25913
25914 /// IsDesirableToPromoteOp - This method query the target whether it is
25915 /// beneficial for dag combiner to promote the specified node. If true, it
25916 /// should return the desired promotion type by reference.
25917 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
25918   EVT VT = Op.getValueType();
25919   if (VT != MVT::i16)
25920     return false;
25921
25922   bool Promote = false;
25923   bool Commute = false;
25924   switch (Op.getOpcode()) {
25925   default: break;
25926   case ISD::LOAD: {
25927     LoadSDNode *LD = cast<LoadSDNode>(Op);
25928     // If the non-extending load has a single use and it's not live out, then it
25929     // might be folded.
25930     if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
25931                                                      Op.hasOneUse()*/) {
25932       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
25933              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
25934         // The only case where we'd want to promote LOAD (rather then it being
25935         // promoted as an operand is when it's only use is liveout.
25936         if (UI->getOpcode() != ISD::CopyToReg)
25937           return false;
25938       }
25939     }
25940     Promote = true;
25941     break;
25942   }
25943   case ISD::SIGN_EXTEND:
25944   case ISD::ZERO_EXTEND:
25945   case ISD::ANY_EXTEND:
25946     Promote = true;
25947     break;
25948   case ISD::SHL:
25949   case ISD::SRL: {
25950     SDValue N0 = Op.getOperand(0);
25951     // Look out for (store (shl (load), x)).
25952     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
25953       return false;
25954     Promote = true;
25955     break;
25956   }
25957   case ISD::ADD:
25958   case ISD::MUL:
25959   case ISD::AND:
25960   case ISD::OR:
25961   case ISD::XOR:
25962     Commute = true;
25963     // fallthrough
25964   case ISD::SUB: {
25965     SDValue N0 = Op.getOperand(0);
25966     SDValue N1 = Op.getOperand(1);
25967     if (!Commute && MayFoldLoad(N1))
25968       return false;
25969     // Avoid disabling potential load folding opportunities.
25970     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
25971       return false;
25972     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
25973       return false;
25974     Promote = true;
25975   }
25976   }
25977
25978   PVT = MVT::i32;
25979   return Promote;
25980 }
25981
25982 //===----------------------------------------------------------------------===//
25983 //                           X86 Inline Assembly Support
25984 //===----------------------------------------------------------------------===//
25985
25986 namespace {
25987   // Helper to match a string separated by whitespace.
25988   bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
25989     s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
25990
25991     for (unsigned i = 0, e = args.size(); i != e; ++i) {
25992       StringRef piece(*args[i]);
25993       if (!s.startswith(piece)) // Check if the piece matches.
25994         return false;
25995
25996       s = s.substr(piece.size());
25997       StringRef::size_type pos = s.find_first_not_of(" \t");
25998       if (pos == 0) // We matched a prefix.
25999         return false;
26000
26001       s = s.substr(pos);
26002     }
26003
26004     return s.empty();
26005   }
26006   const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
26007 }
26008
26009 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
26010
26011   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
26012     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
26013         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
26014         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
26015
26016       if (AsmPieces.size() == 3)
26017         return true;
26018       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
26019         return true;
26020     }
26021   }
26022   return false;
26023 }
26024
26025 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
26026   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
26027
26028   std::string AsmStr = IA->getAsmString();
26029
26030   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
26031   if (!Ty || Ty->getBitWidth() % 16 != 0)
26032     return false;
26033
26034   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
26035   SmallVector<StringRef, 4> AsmPieces;
26036   SplitString(AsmStr, AsmPieces, ";\n");
26037
26038   switch (AsmPieces.size()) {
26039   default: return false;
26040   case 1:
26041     // FIXME: this should verify that we are targeting a 486 or better.  If not,
26042     // we will turn this bswap into something that will be lowered to logical
26043     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
26044     // lower so don't worry about this.
26045     // bswap $0
26046     if (matchAsm(AsmPieces[0], "bswap", "$0") ||
26047         matchAsm(AsmPieces[0], "bswapl", "$0") ||
26048         matchAsm(AsmPieces[0], "bswapq", "$0") ||
26049         matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
26050         matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
26051         matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
26052       // No need to check constraints, nothing other than the equivalent of
26053       // "=r,0" would be valid here.
26054       return IntrinsicLowering::LowerToByteSwap(CI);
26055     }
26056
26057     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
26058     if (CI->getType()->isIntegerTy(16) &&
26059         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26060         (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
26061          matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
26062       AsmPieces.clear();
26063       const std::string &ConstraintsStr = IA->getConstraintString();
26064       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26065       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26066       if (clobbersFlagRegisters(AsmPieces))
26067         return IntrinsicLowering::LowerToByteSwap(CI);
26068     }
26069     break;
26070   case 3:
26071     if (CI->getType()->isIntegerTy(32) &&
26072         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26073         matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
26074         matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
26075         matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
26076       AsmPieces.clear();
26077       const std::string &ConstraintsStr = IA->getConstraintString();
26078       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26079       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26080       if (clobbersFlagRegisters(AsmPieces))
26081         return IntrinsicLowering::LowerToByteSwap(CI);
26082     }
26083
26084     if (CI->getType()->isIntegerTy(64)) {
26085       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
26086       if (Constraints.size() >= 2 &&
26087           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
26088           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
26089         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
26090         if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
26091             matchAsm(AsmPieces[1], "bswap", "%edx") &&
26092             matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
26093           return IntrinsicLowering::LowerToByteSwap(CI);
26094       }
26095     }
26096     break;
26097   }
26098   return false;
26099 }
26100
26101 /// getConstraintType - Given a constraint letter, return the type of
26102 /// constraint it is for this target.
26103 X86TargetLowering::ConstraintType
26104 X86TargetLowering::getConstraintType(const std::string &Constraint) const {
26105   if (Constraint.size() == 1) {
26106     switch (Constraint[0]) {
26107     case 'R':
26108     case 'q':
26109     case 'Q':
26110     case 'f':
26111     case 't':
26112     case 'u':
26113     case 'y':
26114     case 'x':
26115     case 'Y':
26116     case 'l':
26117       return C_RegisterClass;
26118     case 'a':
26119     case 'b':
26120     case 'c':
26121     case 'd':
26122     case 'S':
26123     case 'D':
26124     case 'A':
26125       return C_Register;
26126     case 'I':
26127     case 'J':
26128     case 'K':
26129     case 'L':
26130     case 'M':
26131     case 'N':
26132     case 'G':
26133     case 'C':
26134     case 'e':
26135     case 'Z':
26136       return C_Other;
26137     default:
26138       break;
26139     }
26140   }
26141   return TargetLowering::getConstraintType(Constraint);
26142 }
26143
26144 /// Examine constraint type and operand type and determine a weight value.
26145 /// This object must already have been set up with the operand type
26146 /// and the current alternative constraint selected.
26147 TargetLowering::ConstraintWeight
26148   X86TargetLowering::getSingleConstraintMatchWeight(
26149     AsmOperandInfo &info, const char *constraint) const {
26150   ConstraintWeight weight = CW_Invalid;
26151   Value *CallOperandVal = info.CallOperandVal;
26152     // If we don't have a value, we can't do a match,
26153     // but allow it at the lowest weight.
26154   if (!CallOperandVal)
26155     return CW_Default;
26156   Type *type = CallOperandVal->getType();
26157   // Look at the constraint type.
26158   switch (*constraint) {
26159   default:
26160     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
26161   case 'R':
26162   case 'q':
26163   case 'Q':
26164   case 'a':
26165   case 'b':
26166   case 'c':
26167   case 'd':
26168   case 'S':
26169   case 'D':
26170   case 'A':
26171     if (CallOperandVal->getType()->isIntegerTy())
26172       weight = CW_SpecificReg;
26173     break;
26174   case 'f':
26175   case 't':
26176   case 'u':
26177     if (type->isFloatingPointTy())
26178       weight = CW_SpecificReg;
26179     break;
26180   case 'y':
26181     if (type->isX86_MMXTy() && Subtarget->hasMMX())
26182       weight = CW_SpecificReg;
26183     break;
26184   case 'x':
26185   case 'Y':
26186     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
26187         ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
26188       weight = CW_Register;
26189     break;
26190   case 'I':
26191     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
26192       if (C->getZExtValue() <= 31)
26193         weight = CW_Constant;
26194     }
26195     break;
26196   case 'J':
26197     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26198       if (C->getZExtValue() <= 63)
26199         weight = CW_Constant;
26200     }
26201     break;
26202   case 'K':
26203     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26204       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
26205         weight = CW_Constant;
26206     }
26207     break;
26208   case 'L':
26209     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26210       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
26211         weight = CW_Constant;
26212     }
26213     break;
26214   case 'M':
26215     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26216       if (C->getZExtValue() <= 3)
26217         weight = CW_Constant;
26218     }
26219     break;
26220   case 'N':
26221     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26222       if (C->getZExtValue() <= 0xff)
26223         weight = CW_Constant;
26224     }
26225     break;
26226   case 'G':
26227   case 'C':
26228     if (dyn_cast<ConstantFP>(CallOperandVal)) {
26229       weight = CW_Constant;
26230     }
26231     break;
26232   case 'e':
26233     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26234       if ((C->getSExtValue() >= -0x80000000LL) &&
26235           (C->getSExtValue() <= 0x7fffffffLL))
26236         weight = CW_Constant;
26237     }
26238     break;
26239   case 'Z':
26240     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26241       if (C->getZExtValue() <= 0xffffffff)
26242         weight = CW_Constant;
26243     }
26244     break;
26245   }
26246   return weight;
26247 }
26248
26249 /// LowerXConstraint - try to replace an X constraint, which matches anything,
26250 /// with another that has more specific requirements based on the type of the
26251 /// corresponding operand.
26252 const char *X86TargetLowering::
26253 LowerXConstraint(EVT ConstraintVT) const {
26254   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
26255   // 'f' like normal targets.
26256   if (ConstraintVT.isFloatingPoint()) {
26257     if (Subtarget->hasSSE2())
26258       return "Y";
26259     if (Subtarget->hasSSE1())
26260       return "x";
26261   }
26262
26263   return TargetLowering::LowerXConstraint(ConstraintVT);
26264 }
26265
26266 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
26267 /// vector.  If it is invalid, don't add anything to Ops.
26268 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
26269                                                      std::string &Constraint,
26270                                                      std::vector<SDValue>&Ops,
26271                                                      SelectionDAG &DAG) const {
26272   SDValue Result;
26273
26274   // Only support length 1 constraints for now.
26275   if (Constraint.length() > 1) return;
26276
26277   char ConstraintLetter = Constraint[0];
26278   switch (ConstraintLetter) {
26279   default: break;
26280   case 'I':
26281     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26282       if (C->getZExtValue() <= 31) {
26283         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26284         break;
26285       }
26286     }
26287     return;
26288   case 'J':
26289     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26290       if (C->getZExtValue() <= 63) {
26291         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26292         break;
26293       }
26294     }
26295     return;
26296   case 'K':
26297     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26298       if (isInt<8>(C->getSExtValue())) {
26299         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26300         break;
26301       }
26302     }
26303     return;
26304   case 'L':
26305     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26306       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
26307           (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
26308         Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
26309         break;
26310       }
26311     }
26312     return;
26313   case 'M':
26314     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26315       if (C->getZExtValue() <= 3) {
26316         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26317         break;
26318       }
26319     }
26320     return;
26321   case 'N':
26322     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26323       if (C->getZExtValue() <= 255) {
26324         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26325         break;
26326       }
26327     }
26328     return;
26329   case 'O':
26330     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26331       if (C->getZExtValue() <= 127) {
26332         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26333         break;
26334       }
26335     }
26336     return;
26337   case 'e': {
26338     // 32-bit signed value
26339     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26340       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26341                                            C->getSExtValue())) {
26342         // Widen to 64 bits here to get it sign extended.
26343         Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
26344         break;
26345       }
26346     // FIXME gcc accepts some relocatable values here too, but only in certain
26347     // memory models; it's complicated.
26348     }
26349     return;
26350   }
26351   case 'Z': {
26352     // 32-bit unsigned value
26353     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26354       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26355                                            C->getZExtValue())) {
26356         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26357         break;
26358       }
26359     }
26360     // FIXME gcc accepts some relocatable values here too, but only in certain
26361     // memory models; it's complicated.
26362     return;
26363   }
26364   case 'i': {
26365     // Literal immediates are always ok.
26366     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
26367       // Widen to 64 bits here to get it sign extended.
26368       Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
26369       break;
26370     }
26371
26372     // In any sort of PIC mode addresses need to be computed at runtime by
26373     // adding in a register or some sort of table lookup.  These can't
26374     // be used as immediates.
26375     if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
26376       return;
26377
26378     // If we are in non-pic codegen mode, we allow the address of a global (with
26379     // an optional displacement) to be used with 'i'.
26380     GlobalAddressSDNode *GA = nullptr;
26381     int64_t Offset = 0;
26382
26383     // Match either (GA), (GA+C), (GA+C1+C2), etc.
26384     while (1) {
26385       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
26386         Offset += GA->getOffset();
26387         break;
26388       } else if (Op.getOpcode() == ISD::ADD) {
26389         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26390           Offset += C->getZExtValue();
26391           Op = Op.getOperand(0);
26392           continue;
26393         }
26394       } else if (Op.getOpcode() == ISD::SUB) {
26395         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26396           Offset += -C->getZExtValue();
26397           Op = Op.getOperand(0);
26398           continue;
26399         }
26400       }
26401
26402       // Otherwise, this isn't something we can handle, reject it.
26403       return;
26404     }
26405
26406     const GlobalValue *GV = GA->getGlobal();
26407     // If we require an extra load to get this address, as in PIC mode, we
26408     // can't accept it.
26409     if (isGlobalStubReference(
26410             Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
26411       return;
26412
26413     Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
26414                                         GA->getValueType(0), Offset);
26415     break;
26416   }
26417   }
26418
26419   if (Result.getNode()) {
26420     Ops.push_back(Result);
26421     return;
26422   }
26423   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
26424 }
26425
26426 std::pair<unsigned, const TargetRegisterClass*>
26427 X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
26428                                                 MVT VT) const {
26429   // First, see if this is a constraint that directly corresponds to an LLVM
26430   // register class.
26431   if (Constraint.size() == 1) {
26432     // GCC Constraint Letters
26433     switch (Constraint[0]) {
26434     default: break;
26435       // TODO: Slight differences here in allocation order and leaving
26436       // RIP in the class. Do they matter any more here than they do
26437       // in the normal allocation?
26438     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
26439       if (Subtarget->is64Bit()) {
26440         if (VT == MVT::i32 || VT == MVT::f32)
26441           return std::make_pair(0U, &X86::GR32RegClass);
26442         if (VT == MVT::i16)
26443           return std::make_pair(0U, &X86::GR16RegClass);
26444         if (VT == MVT::i8 || VT == MVT::i1)
26445           return std::make_pair(0U, &X86::GR8RegClass);
26446         if (VT == MVT::i64 || VT == MVT::f64)
26447           return std::make_pair(0U, &X86::GR64RegClass);
26448         break;
26449       }
26450       // 32-bit fallthrough
26451     case 'Q':   // Q_REGS
26452       if (VT == MVT::i32 || VT == MVT::f32)
26453         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
26454       if (VT == MVT::i16)
26455         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
26456       if (VT == MVT::i8 || VT == MVT::i1)
26457         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
26458       if (VT == MVT::i64)
26459         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
26460       break;
26461     case 'r':   // GENERAL_REGS
26462     case 'l':   // INDEX_REGS
26463       if (VT == MVT::i8 || VT == MVT::i1)
26464         return std::make_pair(0U, &X86::GR8RegClass);
26465       if (VT == MVT::i16)
26466         return std::make_pair(0U, &X86::GR16RegClass);
26467       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
26468         return std::make_pair(0U, &X86::GR32RegClass);
26469       return std::make_pair(0U, &X86::GR64RegClass);
26470     case 'R':   // LEGACY_REGS
26471       if (VT == MVT::i8 || VT == MVT::i1)
26472         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
26473       if (VT == MVT::i16)
26474         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
26475       if (VT == MVT::i32 || !Subtarget->is64Bit())
26476         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
26477       return std::make_pair(0U, &X86::GR64_NOREXRegClass);
26478     case 'f':  // FP Stack registers.
26479       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
26480       // value to the correct fpstack register class.
26481       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
26482         return std::make_pair(0U, &X86::RFP32RegClass);
26483       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
26484         return std::make_pair(0U, &X86::RFP64RegClass);
26485       return std::make_pair(0U, &X86::RFP80RegClass);
26486     case 'y':   // MMX_REGS if MMX allowed.
26487       if (!Subtarget->hasMMX()) break;
26488       return std::make_pair(0U, &X86::VR64RegClass);
26489     case 'Y':   // SSE_REGS if SSE2 allowed
26490       if (!Subtarget->hasSSE2()) break;
26491       // FALL THROUGH.
26492     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
26493       if (!Subtarget->hasSSE1()) break;
26494
26495       switch (VT.SimpleTy) {
26496       default: break;
26497       // Scalar SSE types.
26498       case MVT::f32:
26499       case MVT::i32:
26500         return std::make_pair(0U, &X86::FR32RegClass);
26501       case MVT::f64:
26502       case MVT::i64:
26503         return std::make_pair(0U, &X86::FR64RegClass);
26504       // Vector types.
26505       case MVT::v16i8:
26506       case MVT::v8i16:
26507       case MVT::v4i32:
26508       case MVT::v2i64:
26509       case MVT::v4f32:
26510       case MVT::v2f64:
26511         return std::make_pair(0U, &X86::VR128RegClass);
26512       // AVX types.
26513       case MVT::v32i8:
26514       case MVT::v16i16:
26515       case MVT::v8i32:
26516       case MVT::v4i64:
26517       case MVT::v8f32:
26518       case MVT::v4f64:
26519         return std::make_pair(0U, &X86::VR256RegClass);
26520       case MVT::v8f64:
26521       case MVT::v16f32:
26522       case MVT::v16i32:
26523       case MVT::v8i64:
26524         return std::make_pair(0U, &X86::VR512RegClass);
26525       }
26526       break;
26527     }
26528   }
26529
26530   // Use the default implementation in TargetLowering to convert the register
26531   // constraint into a member of a register class.
26532   std::pair<unsigned, const TargetRegisterClass*> Res;
26533   Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
26534
26535   // Not found as a standard register?
26536   if (!Res.second) {
26537     // Map st(0) -> st(7) -> ST0
26538     if (Constraint.size() == 7 && Constraint[0] == '{' &&
26539         tolower(Constraint[1]) == 's' &&
26540         tolower(Constraint[2]) == 't' &&
26541         Constraint[3] == '(' &&
26542         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
26543         Constraint[5] == ')' &&
26544         Constraint[6] == '}') {
26545
26546       Res.first = X86::FP0+Constraint[4]-'0';
26547       Res.second = &X86::RFP80RegClass;
26548       return Res;
26549     }
26550
26551     // GCC allows "st(0)" to be called just plain "st".
26552     if (StringRef("{st}").equals_lower(Constraint)) {
26553       Res.first = X86::FP0;
26554       Res.second = &X86::RFP80RegClass;
26555       return Res;
26556     }
26557
26558     // flags -> EFLAGS
26559     if (StringRef("{flags}").equals_lower(Constraint)) {
26560       Res.first = X86::EFLAGS;
26561       Res.second = &X86::CCRRegClass;
26562       return Res;
26563     }
26564
26565     // 'A' means EAX + EDX.
26566     if (Constraint == "A") {
26567       Res.first = X86::EAX;
26568       Res.second = &X86::GR32_ADRegClass;
26569       return Res;
26570     }
26571     return Res;
26572   }
26573
26574   // Otherwise, check to see if this is a register class of the wrong value
26575   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
26576   // turn into {ax},{dx}.
26577   if (Res.second->hasType(VT))
26578     return Res;   // Correct type already, nothing to do.
26579
26580   // All of the single-register GCC register classes map their values onto
26581   // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
26582   // really want an 8-bit or 32-bit register, map to the appropriate register
26583   // class and return the appropriate register.
26584   if (Res.second == &X86::GR16RegClass) {
26585     if (VT == MVT::i8 || VT == MVT::i1) {
26586       unsigned DestReg = 0;
26587       switch (Res.first) {
26588       default: break;
26589       case X86::AX: DestReg = X86::AL; break;
26590       case X86::DX: DestReg = X86::DL; break;
26591       case X86::CX: DestReg = X86::CL; break;
26592       case X86::BX: DestReg = X86::BL; break;
26593       }
26594       if (DestReg) {
26595         Res.first = DestReg;
26596         Res.second = &X86::GR8RegClass;
26597       }
26598     } else if (VT == MVT::i32 || VT == MVT::f32) {
26599       unsigned DestReg = 0;
26600       switch (Res.first) {
26601       default: break;
26602       case X86::AX: DestReg = X86::EAX; break;
26603       case X86::DX: DestReg = X86::EDX; break;
26604       case X86::CX: DestReg = X86::ECX; break;
26605       case X86::BX: DestReg = X86::EBX; break;
26606       case X86::SI: DestReg = X86::ESI; break;
26607       case X86::DI: DestReg = X86::EDI; break;
26608       case X86::BP: DestReg = X86::EBP; break;
26609       case X86::SP: DestReg = X86::ESP; break;
26610       }
26611       if (DestReg) {
26612         Res.first = DestReg;
26613         Res.second = &X86::GR32RegClass;
26614       }
26615     } else if (VT == MVT::i64 || VT == MVT::f64) {
26616       unsigned DestReg = 0;
26617       switch (Res.first) {
26618       default: break;
26619       case X86::AX: DestReg = X86::RAX; break;
26620       case X86::DX: DestReg = X86::RDX; break;
26621       case X86::CX: DestReg = X86::RCX; break;
26622       case X86::BX: DestReg = X86::RBX; break;
26623       case X86::SI: DestReg = X86::RSI; break;
26624       case X86::DI: DestReg = X86::RDI; break;
26625       case X86::BP: DestReg = X86::RBP; break;
26626       case X86::SP: DestReg = X86::RSP; break;
26627       }
26628       if (DestReg) {
26629         Res.first = DestReg;
26630         Res.second = &X86::GR64RegClass;
26631       }
26632     }
26633   } else if (Res.second == &X86::FR32RegClass ||
26634              Res.second == &X86::FR64RegClass ||
26635              Res.second == &X86::VR128RegClass ||
26636              Res.second == &X86::VR256RegClass ||
26637              Res.second == &X86::FR32XRegClass ||
26638              Res.second == &X86::FR64XRegClass ||
26639              Res.second == &X86::VR128XRegClass ||
26640              Res.second == &X86::VR256XRegClass ||
26641              Res.second == &X86::VR512RegClass) {
26642     // Handle references to XMM physical registers that got mapped into the
26643     // wrong class.  This can happen with constraints like {xmm0} where the
26644     // target independent register mapper will just pick the first match it can
26645     // find, ignoring the required type.
26646
26647     if (VT == MVT::f32 || VT == MVT::i32)
26648       Res.second = &X86::FR32RegClass;
26649     else if (VT == MVT::f64 || VT == MVT::i64)
26650       Res.second = &X86::FR64RegClass;
26651     else if (X86::VR128RegClass.hasType(VT))
26652       Res.second = &X86::VR128RegClass;
26653     else if (X86::VR256RegClass.hasType(VT))
26654       Res.second = &X86::VR256RegClass;
26655     else if (X86::VR512RegClass.hasType(VT))
26656       Res.second = &X86::VR512RegClass;
26657   }
26658
26659   return Res;
26660 }
26661
26662 int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
26663                                             Type *Ty) const {
26664   // Scaling factors are not free at all.
26665   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
26666   // will take 2 allocations in the out of order engine instead of 1
26667   // for plain addressing mode, i.e. inst (reg1).
26668   // E.g.,
26669   // vaddps (%rsi,%drx), %ymm0, %ymm1
26670   // Requires two allocations (one for the load, one for the computation)
26671   // whereas:
26672   // vaddps (%rsi), %ymm0, %ymm1
26673   // Requires just 1 allocation, i.e., freeing allocations for other operations
26674   // and having less micro operations to execute.
26675   //
26676   // For some X86 architectures, this is even worse because for instance for
26677   // stores, the complex addressing mode forces the instruction to use the
26678   // "load" ports instead of the dedicated "store" port.
26679   // E.g., on Haswell:
26680   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
26681   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
26682   if (isLegalAddressingMode(AM, Ty))
26683     // Scale represents reg2 * scale, thus account for 1
26684     // as soon as we use a second register.
26685     return AM.Scale != 0;
26686   return -1;
26687 }
26688
26689 bool X86TargetLowering::isTargetFTOL() const {
26690   return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
26691 }