contrib/llvm/lib/Target/X86/X86ISelLowering.cpp

   1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the interfaces that X86 uses to lower LLVM code into a
  11 // selection DAG.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "X86ISelLowering.h"
  16 #include "Utils/X86ShuffleDecode.h"
  17 #include "X86CallingConv.h"
  18 #include "X86FrameLowering.h"
  19 #include "X86InstrBuilder.h"
  20 #include "X86MachineFunctionInfo.h"
  21 #include "X86TargetMachine.h"
  22 #include "X86TargetObjectFile.h"
  23 #include "llvm/ADT/SmallBitVector.h"
  24 #include "llvm/ADT/SmallSet.h"
  25 #include "llvm/ADT/Statistic.h"
  26 #include "llvm/ADT/StringExtras.h"
  27 #include "llvm/ADT/StringSwitch.h"
  28 #include "llvm/ADT/VariadicFunction.h"
  29 #include "llvm/CodeGen/IntrinsicLowering.h"
  30 #include "llvm/CodeGen/MachineFrameInfo.h"
  31 #include "llvm/CodeGen/MachineFunction.h"
  32 #include "llvm/CodeGen/MachineInstrBuilder.h"
  33 #include "llvm/CodeGen/MachineJumpTableInfo.h"
  34 #include "llvm/CodeGen/MachineModuleInfo.h"
  35 #include "llvm/CodeGen/MachineRegisterInfo.h"
  36 #include "llvm/IR/CallSite.h"
  37 #include "llvm/IR/CallingConv.h"
  38 #include "llvm/IR/Constants.h"
  39 #include "llvm/IR/DerivedTypes.h"
  40 #include "llvm/IR/Function.h"
  41 #include "llvm/IR/GlobalAlias.h"
  42 #include "llvm/IR/GlobalVariable.h"
  43 #include "llvm/IR/Instructions.h"
  44 #include "llvm/IR/Intrinsics.h"
  45 #include "llvm/MC/MCAsmInfo.h"
  46 #include "llvm/MC/MCContext.h"
  47 #include "llvm/MC/MCExpr.h"
  48 #include "llvm/MC/MCSymbol.h"
  49 #include "llvm/Support/CommandLine.h"
  50 #include "llvm/Support/Debug.h"
  51 #include "llvm/Support/ErrorHandling.h"
  52 #include "llvm/Support/MathExtras.h"
  53 #include "llvm/Target/TargetOptions.h"
  54 #include "X86IntrinsicsInfo.h"
  55 #include <bitset>
  56 #include <numeric>
  57 #include <cctype>
  58 using namespace llvm;
  59
  60 #define DEBUG_TYPE "x86-isel"
  61
  62 STATISTIC(NumTailCalls, "Number of tail calls");
  63
  64 static cl::opt<bool> ExperimentalVectorWideningLegalization(
  65     "x86-experimental-vector-widening-legalization", cl::init(false),
  66     cl::desc("Enable an experimental vector type legalization through widening "
  67              "rather than promotion."),
  68     cl::Hidden);
  69
  70 static cl::opt<bool> ExperimentalVectorShuffleLowering(
  71     "x86-experimental-vector-shuffle-lowering", cl::init(true),
  72     cl::desc("Enable an experimental vector shuffle lowering code path."),
  73     cl::Hidden);
  74
  75 static cl::opt<bool> ExperimentalVectorShuffleLegality(
  76     "x86-experimental-vector-shuffle-legality", cl::init(false),
  77     cl::desc("Enable experimental shuffle legality based on the experimental "
  78              "shuffle lowering. Should only be used with the experimental "
  79              "shuffle lowering."),
  80     cl::Hidden);
  81
  82 static cl::opt<int> ReciprocalEstimateRefinementSteps(
  83     "x86-recip-refinement-steps", cl::init(1),
  84     cl::desc("Specify the number of Newton-Raphson iterations applied to the "
  85              "result of the hardware reciprocal estimate instruction."),
  86     cl::NotHidden);
  87
  88 // Forward declarations.
  89 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
  90                        SDValue V2);
  91
  92 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
  93                                 SelectionDAG &DAG, SDLoc dl,
  94                                 unsigned vectorWidth) {
  95   assert((vectorWidth == 128 || vectorWidth == 256) &&
  96          "Unsupported vector width");
  97   EVT VT = Vec.getValueType();
  98   EVT ElVT = VT.getVectorElementType();
  99   unsigned Factor = VT.getSizeInBits()/vectorWidth;
 100   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
 101                                   VT.getVectorNumElements()/Factor);
 102
 103   // Extract from UNDEF is UNDEF.
 104   if (Vec.getOpcode() == ISD::UNDEF)
 105     return DAG.getUNDEF(ResultVT);
 106
 107   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
 108   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
 109
 110   // This is the index of the first element of the vectorWidth-bit chunk
 111   // we want.
 112   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
 113                                * ElemsPerChunk);
 114
 115   // If the input is a buildvector just emit a smaller one.
 116   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
 117     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
 118                        makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
 119                                     ElemsPerChunk));
 120
 121   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 122   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
 123 }
 124
 125 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
 126 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
 127 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
 128 /// instructions or a simple subregister reference. Idx is an index in the
 129 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
 130 /// lowering EXTRACT_VECTOR_ELT operations easier.
 131 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
 132                                    SelectionDAG &DAG, SDLoc dl) {
 133   assert((Vec.getValueType().is256BitVector() ||
 134           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
 135   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
 136 }
 137
 138 /// Generate a DAG to grab 256-bits from a 512-bit vector.
 139 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
 140                                    SelectionDAG &DAG, SDLoc dl) {
 141   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
 142   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
 143 }
 144
 145 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
 146                                unsigned IdxVal, SelectionDAG &DAG,
 147                                SDLoc dl, unsigned vectorWidth) {
 148   assert((vectorWidth == 128 || vectorWidth == 256) &&
 149          "Unsupported vector width");
 150   // Inserting UNDEF is Result
 151   if (Vec.getOpcode() == ISD::UNDEF)
 152     return Result;
 153   EVT VT = Vec.getValueType();
 154   EVT ElVT = VT.getVectorElementType();
 155   EVT ResultVT = Result.getValueType();
 156
 157   // Insert the relevant vectorWidth bits.
 158   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
 159
 160   // This is the index of the first element of the vectorWidth-bit chunk
 161   // we want.
 162   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
 163                                * ElemsPerChunk);
 164
 165   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
 166   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
 167 }
 168
 169 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
 170 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
 171 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
 172 /// simple superregister reference.  Idx is an index in the 128 bits
 173 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
 174 /// lowering INSERT_VECTOR_ELT operations easier.
 175 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 176                                   SelectionDAG &DAG,SDLoc dl) {
 177   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
 178   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
 179 }
 180
 181 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 182                                   SelectionDAG &DAG, SDLoc dl) {
 183   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
 184   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
 185 }
 186
 187 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
 188 /// instructions. This is used because creating CONCAT_VECTOR nodes of
 189 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
 190 /// large BUILD_VECTORS.
 191 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
 192                                    unsigned NumElems, SelectionDAG &DAG,
 193                                    SDLoc dl) {
 194   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
 195   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
 196 }
 197
 198 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
 199                                    unsigned NumElems, SelectionDAG &DAG,
 200                                    SDLoc dl) {
 201   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
 202   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
 203 }
 204
 205 // FIXME: This should stop caching the target machine as soon as
 206 // we can remove resetOperationActions et al.
 207 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
 208     : TargetLowering(TM) {
 209   Subtarget = &TM.getSubtarget<X86Subtarget>();
 210   X86ScalarSSEf64 = Subtarget->hasSSE2();
 211   X86ScalarSSEf32 = Subtarget->hasSSE1();
 212   TD = getDataLayout();
 213
 214   resetOperationActions();
 215 }
 216
 217 void X86TargetLowering::resetOperationActions() {
 218   const TargetMachine &TM = getTargetMachine();
 219   static bool FirstTimeThrough = true;
 220
 221   // If none of the target options have changed, then we don't need to reset the
 222   // operation actions.
 223   if (!FirstTimeThrough && TO == TM.Options) return;
 224
 225   if (!FirstTimeThrough) {
 226     // Reinitialize the actions.
 227     initActions();
 228     FirstTimeThrough = false;
 229   }
 230
 231   TO = TM.Options;
 232
 233   // Set up the TargetLowering object.
 234   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
 235
 236   // X86 is weird. It always uses i8 for shift amounts and setcc results.
 237   setBooleanContents(ZeroOrOneBooleanContent);
 238   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
 239   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 240
 241   // For 64-bit, since we have so many registers, use the ILP scheduler.
 242   // For 32-bit, use the register pressure specific scheduling.
 243   // For Atom, always use ILP scheduling.
 244   if (Subtarget->isAtom())
 245     setSchedulingPreference(Sched::ILP);
 246   else if (Subtarget->is64Bit())
 247     setSchedulingPreference(Sched::ILP);
 248   else
 249     setSchedulingPreference(Sched::RegPressure);
 250   const X86RegisterInfo *RegInfo =
 251       TM.getSubtarget<X86Subtarget>().getRegisterInfo();
 252   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
 253
 254   // Bypass expensive divides on Atom when compiling with O2.
 255   if (TM.getOptLevel() >= CodeGenOpt::Default) {
 256     if (Subtarget->hasSlowDivide32())
 257       addBypassSlowDiv(32, 8);
 258     if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
 259       addBypassSlowDiv(64, 16);
 260   }
 261
 262   if (Subtarget->isTargetKnownWindowsMSVC()) {
 263     // Setup Windows compiler runtime calls.
 264     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
 265     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
 266     setLibcallName(RTLIB::SREM_I64, "_allrem");
 267     setLibcallName(RTLIB::UREM_I64, "_aullrem");
 268     setLibcallName(RTLIB::MUL_I64, "_allmul");
 269     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
 270     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
 271     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
 272     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
 273     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
 274
 275     // The _ftol2 runtime function has an unusual calling conv, which
 276     // is modeled by a special pseudo-instruction.
 277     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
 278     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
 279     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
 280     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
 281   }
 282
 283   if (Subtarget->isTargetDarwin()) {
 284     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
 285     setUseUnderscoreSetJmp(false);
 286     setUseUnderscoreLongJmp(false);
 287   } else if (Subtarget->isTargetWindowsGNU()) {
 288     // MS runtime is weird: it exports _setjmp, but longjmp!
 289     setUseUnderscoreSetJmp(true);
 290     setUseUnderscoreLongJmp(false);
 291   } else {
 292     setUseUnderscoreSetJmp(true);
 293     setUseUnderscoreLongJmp(true);
 294   }
 295
 296   // Set up the register classes.
 297   addRegisterClass(MVT::i8, &X86::GR8RegClass);
 298   addRegisterClass(MVT::i16, &X86::GR16RegClass);
 299   addRegisterClass(MVT::i32, &X86::GR32RegClass);
 300   if (Subtarget->is64Bit())
 301     addRegisterClass(MVT::i64, &X86::GR64RegClass);
 302
 303   for (MVT VT : MVT::integer_valuetypes())
 304     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 305
 306   // We don't accept any truncstore of integer registers.
 307   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 308   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 309   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
 310   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
 311   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
 312   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
 313
 314   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 315
 316   // SETOEQ and SETUNE require checking two conditions.
 317   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
 318   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
 319   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
 320   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
 321   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
 322   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
 323
 324   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
 325   // operation.
 326   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
 327   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
 328   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
 329
 330   if (Subtarget->is64Bit()) {
 331     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
 332     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 333   } else if (!TM.Options.UseSoftFloat) {
 334     // We have an algorithm for SSE2->double, and we turn this into a
 335     // 64-bit FILD followed by conditional FADD for other targets.
 336     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 337     // We have an algorithm for SSE2, and we turn this into a 64-bit
 338     // FILD for other targets.
 339     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
 340   }
 341
 342   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
 343   // this operation.
 344   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
 345   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
 346
 347   if (!TM.Options.UseSoftFloat) {
 348     // SSE has no i16 to fp conversion, only i32
 349     if (X86ScalarSSEf32) {
 350       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 351       // f32 and f64 cases are Legal, f80 case is not
 352       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 353     } else {
 354       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
 355       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 356     }
 357   } else {
 358     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 359     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
 360   }
 361
 362   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
 363   // are Legal, f80 is custom lowered.
 364   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
 365   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
 366
 367   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
 368   // this operation.
 369   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
 370   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
 371
 372   if (X86ScalarSSEf32) {
 373     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
 374     // f32 and f64 cases are Legal, f80 case is not
 375     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 376   } else {
 377     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
 378     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 379   }
 380
 381   // Handle FP_TO_UINT by promoting the destination to a larger signed
 382   // conversion.
 383   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
 384   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
 385   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
 386
 387   if (Subtarget->is64Bit()) {
 388     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
 389     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
 390   } else if (!TM.Options.UseSoftFloat) {
 391     // Since AVX is a superset of SSE3, only check for SSE here.
 392     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
 393       // Expand FP_TO_UINT into a select.
 394       // FIXME: We would like to use a Custom expander here eventually to do
 395       // the optimal thing for SSE vs. the default expansion in the legalizer.
 396       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
 397     else
 398       // With SSE3 we can use fisttpll to convert to a signed i64; without
 399       // SSE, we're stuck with a fistpll.
 400       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
 401   }
 402
 403   if (isTargetFTOL()) {
 404     // Use the _ftol2 runtime function, which has a pseudo-instruction
 405     // to handle its weird calling convention.
 406     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
 407   }
 408
 409   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
 410   if (!X86ScalarSSEf64) {
 411     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
 412     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
 413     if (Subtarget->is64Bit()) {
 414       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
 415       // Without SSE, i64->f64 goes through memory.
 416       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
 417     }
 418   }
 419
 420   // Scalar integer divide and remainder are lowered to use operations that
 421   // produce two results, to match the available instructions. This exposes
 422   // the two-result form to trivial CSE, which is able to combine x/y and x%y
 423   // into a single instruction.
 424   //
 425   // Scalar integer multiply-high is also lowered to use two-result
 426   // operations, to match the available instructions. However, plain multiply
 427   // (low) operations are left as Legal, as there are single-result
 428   // instructions for this in x86. Using the two-result multiply instructions
 429   // when both high and low results are needed must be arranged by dagcombine.
 430   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 431     MVT VT = IntVTs[i];
 432     setOperationAction(ISD::MULHS, VT, Expand);
 433     setOperationAction(ISD::MULHU, VT, Expand);
 434     setOperationAction(ISD::SDIV, VT, Expand);
 435     setOperationAction(ISD::UDIV, VT, Expand);
 436     setOperationAction(ISD::SREM, VT, Expand);
 437     setOperationAction(ISD::UREM, VT, Expand);
 438
 439     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
 440     setOperationAction(ISD::ADDC, VT, Custom);
 441     setOperationAction(ISD::ADDE, VT, Custom);
 442     setOperationAction(ISD::SUBC, VT, Custom);
 443     setOperationAction(ISD::SUBE, VT, Custom);
 444   }
 445
 446   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
 447   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
 448   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
 449   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
 450   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
 451   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
 452   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
 453   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
 454   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
 455   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
 456   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
 457   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
 458   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
 459   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
 460   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
 461   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
 462   if (Subtarget->is64Bit())
 463     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 464   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
 465   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
 466   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
 467   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
 468   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
 469   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
 470   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
 471   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
 472
 473   // Promote the i8 variants and force them on up to i32 which has a shorter
 474   // encoding.
 475   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
 476   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
 477   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
 478   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
 479   if (Subtarget->hasBMI()) {
 480     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
 481     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
 482     if (Subtarget->is64Bit())
 483       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
 484   } else {
 485     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
 486     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
 487     if (Subtarget->is64Bit())
 488       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
 489   }
 490
 491   if (Subtarget->hasLZCNT()) {
 492     // When promoting the i8 variants, force them to i32 for a shorter
 493     // encoding.
 494     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
 495     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
 496     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
 497     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
 498     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
 499     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
 500     if (Subtarget->is64Bit())
 501       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
 502   } else {
 503     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
 504     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
 505     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
 506     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
 507     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
 508     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
 509     if (Subtarget->is64Bit()) {
 510       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
 511       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 512     }
 513   }
 514
 515   // Special handling for half-precision floating point conversions.
 516   // If we don't have F16C support, then lower half float conversions
 517   // into library calls.
 518   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
 519     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
 520     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
 521   }
 522
 523   // There's never any support for operations beyond MVT::f32.
 524   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 525   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
 526   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
 527   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
 528
 529   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
 530   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
 531   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
 532   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 533   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 534   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
 535
 536   if (Subtarget->hasPOPCNT()) {
 537     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
 538   } else {
 539     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
 540     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
 541     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
 542     if (Subtarget->is64Bit())
 543       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
 544   }
 545
 546   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
 547
 548   if (!Subtarget->hasMOVBE())
 549     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
 550
 551   // These should be promoted to a larger select which is supported.
 552   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
 553   // X86 wants to expand cmov itself.
 554   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
 555   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
 556   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
 557   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
 558   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
 559   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
 560   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
 561   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
 562   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
 563   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
 564   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
 565   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
 566   if (Subtarget->is64Bit()) {
 567     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
 568     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
 569   }
 570   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
 571   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
 572   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
 573   // support continuation, user-level threading, and etc.. As a result, no
 574   // other SjLj exception interfaces are implemented and please don't build
 575   // your own exception handling based on them.
 576   // LLVM/Clang supports zero-cost DWARF exception handling.
 577   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
 578   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
 579
 580   // Darwin ABI issue.
 581   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
 582   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
 583   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
 584   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
 585   if (Subtarget->is64Bit())
 586     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
 587   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
 588   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
 589   if (Subtarget->is64Bit()) {
 590     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
 591     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
 592     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
 593     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
 594     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
 595   }
 596   // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
 597   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
 598   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
 599   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
 600   if (Subtarget->is64Bit()) {
 601     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
 602     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
 603     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
 604   }
 605
 606   if (Subtarget->hasSSE1())
 607     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
 608
 609   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
 610
 611   // Expand certain atomics
 612   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
 613     MVT VT = IntVTs[i];
 614     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
 615     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
 616     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
 617   }
 618
 619   if (Subtarget->hasCmpxchg16b()) {
 620     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
 621   }
 622
 623   // FIXME - use subtarget debug flags
 624   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
 625       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
 626     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
 627   }
 628
 629   if (Subtarget->is64Bit()) {
 630     setExceptionPointerRegister(X86::RAX);
 631     setExceptionSelectorRegister(X86::RDX);
 632   } else {
 633     setExceptionPointerRegister(X86::EAX);
 634     setExceptionSelectorRegister(X86::EDX);
 635   }
 636   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
 637   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
 638
 639   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
 640   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
 641
 642   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 643   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 644
 645   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 646   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 647   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 648   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
 649     // TargetInfo::X86_64ABIBuiltinVaList
 650     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
 651     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
 652   } else {
 653     // TargetInfo::CharPtrBuiltinVaList
 654     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
 655     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
 656   }
 657
 658   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
 659   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 660
 661   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
 662
 663   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
 664     // f32 and f64 use SSE.
 665     // Set up the FP register classes.
 666     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 667     addRegisterClass(MVT::f64, &X86::FR64RegClass);
 668
 669     // Use ANDPD to simulate FABS.
 670     setOperationAction(ISD::FABS , MVT::f64, Custom);
 671     setOperationAction(ISD::FABS , MVT::f32, Custom);
 672
 673     // Use XORP to simulate FNEG.
 674     setOperationAction(ISD::FNEG , MVT::f64, Custom);
 675     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 676
 677     // Use ANDPD and ORPD to simulate FCOPYSIGN.
 678     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
 679     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 680
 681     // Lower this to FGETSIGNx86 plus an AND.
 682     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
 683     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
 684
 685     // We don't support sin/cos/fmod
 686     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 687     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 688     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 689     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 690     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 691     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 692
 693     // Expand FP immediates into loads from the stack, except for the special
 694     // cases we handle.
 695     addLegalFPImmediate(APFloat(+0.0)); // xorpd
 696     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 697   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
 698     // Use SSE for f32, x87 for f64.
 699     // Set up the FP register classes.
 700     addRegisterClass(MVT::f32, &X86::FR32RegClass);
 701     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 702
 703     // Use ANDPS to simulate FABS.
 704     setOperationAction(ISD::FABS , MVT::f32, Custom);
 705
 706     // Use XORP to simulate FNEG.
 707     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 708
 709     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 710
 711     // Use ANDPS and ORPS to simulate FCOPYSIGN.
 712     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 713     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 714
 715     // We don't support sin/cos/fmod
 716     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 717     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 718     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 719
 720     // Special cases we handle for FP constants.
 721     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 722     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 723     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 724     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 725     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 726
 727     if (!TM.Options.UnsafeFPMath) {
 728       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 729       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 730       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 731     }
 732   } else if (!TM.Options.UseSoftFloat) {
 733     // f32 and f64 in x87.
 734     // Set up the FP register classes.
 735     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 736     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
 737
 738     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 739     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
 740     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 741     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 742
 743     if (!TM.Options.UnsafeFPMath) {
 744       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 745       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 746       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 747       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 748       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 749       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 750     }
 751     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 752     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 753     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 754     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 755     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
 756     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
 757     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
 758     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
 759   }
 760
 761   // We don't support FMA.
 762   setOperationAction(ISD::FMA, MVT::f64, Expand);
 763   setOperationAction(ISD::FMA, MVT::f32, Expand);
 764
 765   // Long double always uses X87.
 766   if (!TM.Options.UseSoftFloat) {
 767     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
 768     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
 769     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
 770     {
 771       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
 772       addLegalFPImmediate(TmpFlt);  // FLD0
 773       TmpFlt.changeSign();
 774       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
 775
 776       bool ignored;
 777       APFloat TmpFlt2(+1.0);
 778       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
 779                       &ignored);
 780       addLegalFPImmediate(TmpFlt2);  // FLD1
 781       TmpFlt2.changeSign();
 782       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
 783     }
 784
 785     if (!TM.Options.UnsafeFPMath) {
 786       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
 787       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
 788       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
 789     }
 790
 791     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
 792     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
 793     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
 794     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
 795     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
 796     setOperationAction(ISD::FMA, MVT::f80, Expand);
 797   }
 798
 799   // Always use a library call for pow.
 800   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
 801   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
 802   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
 803
 804   setOperationAction(ISD::FLOG, MVT::f80, Expand);
 805   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
 806   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
 807   setOperationAction(ISD::FEXP, MVT::f80, Expand);
 808   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
 809   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
 810   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
 811
 812   // First set operation action for all vector types to either promote
 813   // (for widening) or expand (for scalarization). Then we will selectively
 814   // turn on ones that can be effectively codegen'd.
 815   for (MVT VT : MVT::vector_valuetypes()) {
 816     setOperationAction(ISD::ADD , VT, Expand);
 817     setOperationAction(ISD::SUB , VT, Expand);
 818     setOperationAction(ISD::FADD, VT, Expand);
 819     setOperationAction(ISD::FNEG, VT, Expand);
 820     setOperationAction(ISD::FSUB, VT, Expand);
 821     setOperationAction(ISD::MUL , VT, Expand);
 822     setOperationAction(ISD::FMUL, VT, Expand);
 823     setOperationAction(ISD::SDIV, VT, Expand);
 824     setOperationAction(ISD::UDIV, VT, Expand);
 825     setOperationAction(ISD::FDIV, VT, Expand);
 826     setOperationAction(ISD::SREM, VT, Expand);
 827     setOperationAction(ISD::UREM, VT, Expand);
 828     setOperationAction(ISD::LOAD, VT, Expand);
 829     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
 830     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
 831     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
 832     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
 833     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
 834     setOperationAction(ISD::FABS, VT, Expand);
 835     setOperationAction(ISD::FSIN, VT, Expand);
 836     setOperationAction(ISD::FSINCOS, VT, Expand);
 837     setOperationAction(ISD::FCOS, VT, Expand);
 838     setOperationAction(ISD::FSINCOS, VT, Expand);
 839     setOperationAction(ISD::FREM, VT, Expand);
 840     setOperationAction(ISD::FMA,  VT, Expand);
 841     setOperationAction(ISD::FPOWI, VT, Expand);
 842     setOperationAction(ISD::FSQRT, VT, Expand);
 843     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 844     setOperationAction(ISD::FFLOOR, VT, Expand);
 845     setOperationAction(ISD::FCEIL, VT, Expand);
 846     setOperationAction(ISD::FTRUNC, VT, Expand);
 847     setOperationAction(ISD::FRINT, VT, Expand);
 848     setOperationAction(ISD::FNEARBYINT, VT, Expand);
 849     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 850     setOperationAction(ISD::MULHS, VT, Expand);
 851     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 852     setOperationAction(ISD::MULHU, VT, Expand);
 853     setOperationAction(ISD::SDIVREM, VT, Expand);
 854     setOperationAction(ISD::UDIVREM, VT, Expand);
 855     setOperationAction(ISD::FPOW, VT, Expand);
 856     setOperationAction(ISD::CTPOP, VT, Expand);
 857     setOperationAction(ISD::CTTZ, VT, Expand);
 858     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
 859     setOperationAction(ISD::CTLZ, VT, Expand);
 860     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
 861     setOperationAction(ISD::SHL, VT, Expand);
 862     setOperationAction(ISD::SRA, VT, Expand);
 863     setOperationAction(ISD::SRL, VT, Expand);
 864     setOperationAction(ISD::ROTL, VT, Expand);
 865     setOperationAction(ISD::ROTR, VT, Expand);
 866     setOperationAction(ISD::BSWAP, VT, Expand);
 867     setOperationAction(ISD::SETCC, VT, Expand);
 868     setOperationAction(ISD::FLOG, VT, Expand);
 869     setOperationAction(ISD::FLOG2, VT, Expand);
 870     setOperationAction(ISD::FLOG10, VT, Expand);
 871     setOperationAction(ISD::FEXP, VT, Expand);
 872     setOperationAction(ISD::FEXP2, VT, Expand);
 873     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 874     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 875     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 876     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 877     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
 878     setOperationAction(ISD::TRUNCATE, VT, Expand);
 879     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
 880     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
 881     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
 882     setOperationAction(ISD::VSELECT, VT, Expand);
 883     setOperationAction(ISD::SELECT_CC, VT, Expand);
 884     for (MVT InnerVT : MVT::vector_valuetypes()) {
 885       setTruncStoreAction(InnerVT, VT, Expand);
 886
 887       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
 888       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
 889
 890       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
 891       // types, we have to deal with them whether we ask for Expansion or not.
 892       // Setting Expand causes its own optimisation problems though, so leave
 893       // them legal.
 894       if (VT.getVectorElementType() == MVT::i1)
 895         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 896     }
 897   }
 898
 899   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
 900   // with -msoft-float, disable use of MMX as well.
 901   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
 902     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
 903     // No operations on x86mmx supported, everything uses intrinsics.
 904   }
 905
 906   // MMX-sized vectors (other than x86mmx) are expected to be expanded
 907   // into smaller operations.
 908   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
 909   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
 910   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
 911   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
 912   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
 913   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
 914   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
 915   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
 916   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
 917   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
 918   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
 919   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
 920   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
 921   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
 922   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
 923   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
 924   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
 925   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
 926   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
 927   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
 928   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
 929   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
 930   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
 931   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
 932   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
 933   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
 934   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
 935   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
 936   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
 937
 938   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
 939     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
 940
 941     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
 942     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
 943     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
 944     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
 945     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
 946     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
 947     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
 948     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
 949     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
 950     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
 951     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 952     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
 953     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
 954   }
 955
 956   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
 957     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
 958
 959     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
 960     // registers cannot be used even for integer operations.
 961     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
 962     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
 963     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
 964     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
 965
 966     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
 967     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
 968     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
 969     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
 970     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
 971     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
 972     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
 973     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
 974     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
 975     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
 976     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
 977     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
 978     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
 979     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
 980     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
 981     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
 982     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
 983     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
 984     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
 985     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
 986     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
 987     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
 988
 989     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
 990     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
 991     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
 992     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
 993
 994     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
 995     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
 996     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
 997     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
 998     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 999
1000     // Only provide customized ctpop vector bit twiddling for vector types we
1001     // know to perform better than using the popcnt instructions on each vector
1002     // element. If popcnt isn't supported, always provide the custom version.
1003     if (!Subtarget->hasPOPCNT()) {
1004       setOperationAction(ISD::CTPOP,            MVT::v4i32, Custom);
1005       setOperationAction(ISD::CTPOP,            MVT::v2i64, Custom);
1006     }
1007
1008     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
1009     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
1010       MVT VT = (MVT::SimpleValueType)i;
1011       // Do not attempt to custom lower non-power-of-2 vectors
1012       if (!isPowerOf2_32(VT.getVectorNumElements()))
1013         continue;
1014       // Do not attempt to custom lower non-128-bit vectors
1015       if (!VT.is128BitVector())
1016         continue;
1017       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1018       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1019       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1020     }
1021
1022     // We support custom legalizing of sext and anyext loads for specific
1023     // memory vector types which we can load as a scalar (or sequence of
1024     // scalars) and extend in-register to a legal 128-bit vector type. For sext
1025     // loads these must work with a single scalar load.
1026     for (MVT VT : MVT::integer_vector_valuetypes()) {
1027       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
1028       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
1029       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
1030       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
1031       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
1032       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
1033       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
1034       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
1035       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
1036     }
1037
1038     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
1039     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
1040     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
1041     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
1042     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
1043     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
1044
1045     if (Subtarget->is64Bit()) {
1046       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1047       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1048     }
1049
1050     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
1051     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
1052       MVT VT = (MVT::SimpleValueType)i;
1053
1054       // Do not attempt to promote non-128-bit vectors
1055       if (!VT.is128BitVector())
1056         continue;
1057
1058       setOperationAction(ISD::AND,    VT, Promote);
1059       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
1060       setOperationAction(ISD::OR,     VT, Promote);
1061       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
1062       setOperationAction(ISD::XOR,    VT, Promote);
1063       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
1064       setOperationAction(ISD::LOAD,   VT, Promote);
1065       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
1066       setOperationAction(ISD::SELECT, VT, Promote);
1067       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
1068     }
1069
1070     // Custom lower v2i64 and v2f64 selects.
1071     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
1072     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
1073     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
1074     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
1075
1076     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
1077     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
1078
1079     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
1080     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
1081     // As there is no 64-bit GPR available, we need to build a special custom
1082     // sequence to convert from v2i32 to v2f32.
1083     if (!Subtarget->is64Bit())
1084       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
1085
1086     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
1087     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
1088
1089     for (MVT VT : MVT::fp_vector_valuetypes())
1090       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
1091
1092     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
1093     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
1094     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
1095   }
1096
1097   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
1098     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
1099     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
1100     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
1101     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
1102     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
1103     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
1104     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
1105     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
1106     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
1107     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
1108
1109     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
1110     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
1111     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
1112     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
1113     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
1114     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
1115     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
1116     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
1117     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
1118     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
1119
1120     // FIXME: Do we need to handle scalar-to-vector here?
1121     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
1122
1123     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
1124     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
1125     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
1126     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
1127     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
1128     // There is no BLENDI for byte vectors. We don't need to custom lower
1129     // some vselects for now.
1130     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
1131
1132     // SSE41 brings specific instructions for doing vector sign extend even in
1133     // cases where we don't have SRA.
1134     for (MVT VT : MVT::integer_vector_valuetypes()) {
1135       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
1136       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
1137       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
1138     }
1139
1140     // i8 and i16 vectors are custom because the source register and source
1141     // memory operand types are not the same width.  f32 vectors are
1142     // custom since the immediate controlling the insert encodes additional
1143     // information.
1144     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
1145     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
1146     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
1147     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
1148
1149     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
1150     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
1151     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
1152     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1153
1154     // FIXME: these should be Legal, but that's only for the case where
1155     // the index is constant.  For now custom expand to deal with that.
1156     if (Subtarget->is64Bit()) {
1157       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
1158       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
1159     }
1160   }
1161
1162   if (Subtarget->hasSSE2()) {
1163     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
1164     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
1165
1166     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
1167     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
1168
1169     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
1170     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
1171
1172     // In the customized shift lowering, the legal cases in AVX2 will be
1173     // recognized.
1174     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
1175     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
1176
1177     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
1178     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
1179
1180     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
1181   }
1182
1183   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
1184     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
1185     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
1186     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
1187     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
1188     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
1189     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
1190
1191     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
1192     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
1193     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
1194
1195     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
1196     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
1197     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
1198     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
1199     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
1200     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
1201     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
1202     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
1203     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
1204     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
1205     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
1206     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
1207
1208     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
1209     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
1210     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
1211     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
1212     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
1213     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
1214     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
1215     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
1216     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
1217     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
1218     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
1219     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
1220
1221     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1222     // even though v8i16 is a legal type.
1223     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
1224     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
1225     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1226
1227     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
1228     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1229     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1230
1231     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
1232     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
1233
1234     for (MVT VT : MVT::fp_vector_valuetypes())
1235       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1236
1237     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
1238     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
1239
1240     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
1241     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
1242
1243     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
1244     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
1245
1246     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
1247     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
1248     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
1249     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
1250
1251     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1252     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1253     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1254
1255     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
1256     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
1257     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
1258     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
1259
1260     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
1261     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
1262     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
1263     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
1264     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
1265     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
1266     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
1267     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
1268     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
1269     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1270     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1271     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1272
1273     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
1274       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
1275       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
1276       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
1277       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
1278       setOperationAction(ISD::FMA,             MVT::f32, Legal);
1279       setOperationAction(ISD::FMA,             MVT::f64, Legal);
1280     }
1281
1282     if (Subtarget->hasInt256()) {
1283       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
1284       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
1285       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
1286       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
1287
1288       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
1289       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
1290       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
1291       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
1292
1293       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1294       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
1295       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
1296       // Don't lower v32i8 because there is no 128-bit byte mul
1297
1298       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
1299       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
1300       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
1301       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
1302
1303       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
1304       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1305
1306       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1307       // when we have a 256bit-wide blend with immediate.
1308       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1309
1310       // Only provide customized ctpop vector bit twiddling for vector types we
1311       // know to perform better than using the popcnt instructions on each
1312       // vector element. If popcnt isn't supported, always provide the custom
1313       // version.
1314       if (!Subtarget->hasPOPCNT())
1315         setOperationAction(ISD::CTPOP,           MVT::v4i64, Custom);
1316
1317       // Custom CTPOP always performs better on natively supported v8i32
1318       setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
1319     } else {
1320       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
1321       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
1322       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
1323       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
1324
1325       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
1326       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
1327       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
1328       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
1329
1330       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
1331       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
1332       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
1333       // Don't lower v32i8 because there is no 128-bit byte mul
1334     }
1335
1336     // In the customized shift lowering, the legal cases in AVX2 will be
1337     // recognized.
1338     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
1339     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
1340
1341     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
1342     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
1343
1344     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
1345
1346     // Custom lower several nodes for 256-bit types.
1347     for (MVT VT : MVT::vector_valuetypes()) {
1348       if (VT.getScalarSizeInBits() >= 32) {
1349         setOperationAction(ISD::MLOAD,  VT, Legal);
1350         setOperationAction(ISD::MSTORE, VT, Legal);
1351       }
1352       // Extract subvector is special because the value type
1353       // (result) is 128-bit but the source is 256-bit wide.
1354       if (VT.is128BitVector()) {
1355         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1356       }
1357       // Do not attempt to custom lower other non-256-bit vectors
1358       if (!VT.is256BitVector())
1359         continue;
1360
1361       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1362       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1363       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1364       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1365       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1366       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1367       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1368     }
1369
1370     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1371     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
1372       MVT VT = (MVT::SimpleValueType)i;
1373
1374       // Do not attempt to promote non-256-bit vectors
1375       if (!VT.is256BitVector())
1376         continue;
1377
1378       setOperationAction(ISD::AND,    VT, Promote);
1379       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
1380       setOperationAction(ISD::OR,     VT, Promote);
1381       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
1382       setOperationAction(ISD::XOR,    VT, Promote);
1383       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
1384       setOperationAction(ISD::LOAD,   VT, Promote);
1385       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
1386       setOperationAction(ISD::SELECT, VT, Promote);
1387       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
1388     }
1389   }
1390
1391   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
1392     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1393     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1394     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1395     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1396
1397     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
1398     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1399     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1400
1401     for (MVT VT : MVT::fp_vector_valuetypes())
1402       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1403
1404     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
1405     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
1406     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
1407     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
1408     setOperationAction(ISD::AND,                MVT::i1,    Legal);
1409     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
1410     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
1411     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
1412     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
1413     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
1414
1415     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
1416     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
1417     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
1418     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
1419     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
1420     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
1421
1422     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
1423     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
1424     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
1425     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
1426     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
1427     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
1428     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
1429     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
1430
1431     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
1432     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
1433     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
1434     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
1435     if (Subtarget->is64Bit()) {
1436       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
1437       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
1438       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
1439       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
1440     }
1441     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
1442     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
1443     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
1444     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
1445     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
1446     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
1447     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
1448     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
1449     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
1450     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
1451     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
1452     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
1453     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
1454     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
1455
1456     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
1457     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
1458     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
1459     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
1460     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
1461     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
1462     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
1463     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
1464     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
1465     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
1466     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
1467     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
1468     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
1469
1470     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
1471     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
1472     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
1473     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
1474     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
1475     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
1476
1477     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
1478     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
1479
1480     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
1481
1482     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
1483     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1484     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
1485     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
1486     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
1487     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
1488     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
1489     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
1490     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
1491
1492     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
1493     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
1494
1495     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
1496     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
1497
1498     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
1499
1500     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
1501     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
1502
1503     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
1504     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
1505
1506     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
1507     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
1508
1509     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
1510     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
1511     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
1512     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
1513     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
1514     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
1515
1516     if (Subtarget->hasCDI()) {
1517       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
1518       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
1519     }
1520
1521     // Custom lower several nodes.
1522     for (MVT VT : MVT::vector_valuetypes()) {
1523       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1524       // Extract subvector is special because the value type
1525       // (result) is 256/128-bit but the source is 512-bit wide.
1526       if (VT.is128BitVector() || VT.is256BitVector()) {
1527         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1528       }
1529       if (VT.getVectorElementType() == MVT::i1)
1530         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1531
1532       // Do not attempt to custom lower other non-512-bit vectors
1533       if (!VT.is512BitVector())
1534         continue;
1535
1536       if ( EltSize >= 32) {
1537         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
1538         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
1539         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1540         setOperationAction(ISD::VSELECT,             VT, Legal);
1541         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
1542         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
1543         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
1544         setOperationAction(ISD::MLOAD,               VT, Legal);
1545         setOperationAction(ISD::MSTORE,              VT, Legal);
1546       }
1547     }
1548     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1549       MVT VT = (MVT::SimpleValueType)i;
1550
1551       // Do not attempt to promote non-512-bit vectors.
1552       if (!VT.is512BitVector())
1553         continue;
1554
1555       setOperationAction(ISD::SELECT, VT, Promote);
1556       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
1557     }
1558   }// has  AVX-512
1559
1560   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
1561     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1562     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1563
1564     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1565     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1566
1567     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
1568     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
1569     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
1570     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
1571     setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
1572     setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
1573     setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
1574     setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
1575     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
1576
1577     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
1578       const MVT VT = (MVT::SimpleValueType)i;
1579
1580       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
1581
1582       // Do not attempt to promote non-512-bit vectors.
1583       if (!VT.is512BitVector())
1584         continue;
1585
1586       if (EltSize < 32) {
1587         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1588         setOperationAction(ISD::VSELECT,             VT, Legal);
1589       }
1590     }
1591   }
1592
1593   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
1594     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1595     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1596
1597     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
1598     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
1599     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Legal);
1600
1601     setOperationAction(ISD::AND,                MVT::v8i32, Legal);
1602     setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
1603     setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
1604     setOperationAction(ISD::AND,                MVT::v4i32, Legal);
1605     setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
1606     setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
1607   }
1608
1609   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
1610   // of this type with custom code.
1611   for (MVT VT : MVT::vector_valuetypes())
1612     setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1613
1614   // We want to custom lower some of our intrinsics.
1615   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1616   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1617   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1618   if (!Subtarget->is64Bit())
1619     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1620
1621   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1622   // handle type legalization for these operations here.
1623   //
1624   // FIXME: We really should do custom legalization for addition and
1625   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1626   // than generic legalization for 64-bit multiplication-with-overflow, though.
1627   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
1628     // Add/Sub/Mul with overflow operations are custom lowered.
1629     MVT VT = IntVTs[i];
1630     setOperationAction(ISD::SADDO, VT, Custom);
1631     setOperationAction(ISD::UADDO, VT, Custom);
1632     setOperationAction(ISD::SSUBO, VT, Custom);
1633     setOperationAction(ISD::USUBO, VT, Custom);
1634     setOperationAction(ISD::SMULO, VT, Custom);
1635     setOperationAction(ISD::UMULO, VT, Custom);
1636   }
1637
1638
1639   if (!Subtarget->is64Bit()) {
1640     // These libcalls are not available in 32-bit.
1641     setLibcallName(RTLIB::SHL_I128, nullptr);
1642     setLibcallName(RTLIB::SRL_I128, nullptr);
1643     setLibcallName(RTLIB::SRA_I128, nullptr);
1644   }
1645
1646   // Combine sin / cos into one node or libcall if possible.
1647   if (Subtarget->hasSinCos()) {
1648     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1649     setLibcallName(RTLIB::SINCOS_F64, "sincos");
1650     if (Subtarget->isTargetDarwin()) {
1651       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1652       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1653       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1654       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1655     }
1656   }
1657
1658   if (Subtarget->isTargetWin64()) {
1659     setOperationAction(ISD::SDIV, MVT::i128, Custom);
1660     setOperationAction(ISD::UDIV, MVT::i128, Custom);
1661     setOperationAction(ISD::SREM, MVT::i128, Custom);
1662     setOperationAction(ISD::UREM, MVT::i128, Custom);
1663     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1664     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1665   }
1666
1667   // We have target-specific dag combine patterns for the following nodes:
1668   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1669   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1670   setTargetDAGCombine(ISD::VSELECT);
1671   setTargetDAGCombine(ISD::SELECT);
1672   setTargetDAGCombine(ISD::SHL);
1673   setTargetDAGCombine(ISD::SRA);
1674   setTargetDAGCombine(ISD::SRL);
1675   setTargetDAGCombine(ISD::OR);
1676   setTargetDAGCombine(ISD::AND);
1677   setTargetDAGCombine(ISD::ADD);
1678   setTargetDAGCombine(ISD::FADD);
1679   setTargetDAGCombine(ISD::FSUB);
1680   setTargetDAGCombine(ISD::FMA);
1681   setTargetDAGCombine(ISD::SUB);
1682   setTargetDAGCombine(ISD::LOAD);
1683   setTargetDAGCombine(ISD::MLOAD);
1684   setTargetDAGCombine(ISD::STORE);
1685   setTargetDAGCombine(ISD::MSTORE);
1686   setTargetDAGCombine(ISD::ZERO_EXTEND);
1687   setTargetDAGCombine(ISD::ANY_EXTEND);
1688   setTargetDAGCombine(ISD::SIGN_EXTEND);
1689   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1690   setTargetDAGCombine(ISD::TRUNCATE);
1691   setTargetDAGCombine(ISD::SINT_TO_FP);
1692   setTargetDAGCombine(ISD::SETCC);
1693   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1694   setTargetDAGCombine(ISD::BUILD_VECTOR);
1695   if (Subtarget->is64Bit())
1696     setTargetDAGCombine(ISD::MUL);
1697   setTargetDAGCombine(ISD::XOR);
1698
1699   computeRegisterProperties();
1700
1701   // On Darwin, -Os means optimize for size without hurting performance,
1702   // do not reduce the limit.
1703   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1704   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
1705   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1706   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1707   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1708   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
1709   setPrefLoopAlignment(4); // 2^4 bytes.
1710
1711   // Predictable cmov don't hurt on atom because it's in-order.
1712   PredictableSelectIsExpensive = !Subtarget->isAtom();
1713   EnableExtLdPromotion = true;
1714   setPrefFunctionAlignment(4); // 2^4 bytes.
1715
1716   verifyIntrinsicTables();
1717 }
1718
1719 // This has so far only been implemented for 64-bit MachO.
     // Accordingly, the hook returns true only when the subtarget reports both a
     // MachO target and 64-bit mode, and false for every other configuration.
1720 bool X86TargetLowering::useLoadStackGuardNode() const {
1721   return Subtarget->isTargetMachO() && Subtarget->is64Bit();
1722 }
1723
1724 TargetLoweringBase::LegalizeTypeAction
1725 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
       // Selects the type-legalization action for the vector type VT.
       //
       // When ExperimentalVectorWideningLegalization is set, every vector whose
       // element count is not 1 and whose element type is not i1 is widened
       // (TypeWidenVector).
       // NOTE(review): VT is queried with getVectorNumElements() without an
       // isVector() check, so callers presumably pass vector types only - confirm.
1726   if (ExperimentalVectorWideningLegalization &&
1727       VT.getVectorNumElements() != 1 &&
1728       VT.getVectorElementType().getSimpleVT() != MVT::i1)
1729     return TypeWidenVector;
1730
       // Everything else defers to the target-independent default.
1731   return TargetLoweringBase::getPreferredVectorAction(VT);
1732 }
1733
1734 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
       // Returns the result type to use for a SETCC whose operands have type VT
       // (as the function name indicates). The LLVMContext argument is unused.
       //
       // Scalar compares produce i1 when AVX-512 is available and i8 otherwise.
1735   if (!VT.isVector())
1736     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
1737
1738   const unsigned NumElts = VT.getVectorNumElements();
1739   const EVT EltVT = VT.getVectorElementType();
       // 512-bit vectors: pick a vNi1 type with one i1 per element.
       //  - AVX-512 with i32/i64/f32/f64 elements: v8i1 or v16i1.
       //  - BWI with i8/i16 elements: v32i1 or v64i1.
       // None of the switches in this function has a default case, so an
       // element count that is not listed simply falls through to the checks
       // (and finally the generic fallback) that follow.
1740   if (VT.is512BitVector()) {
1741     if (Subtarget->hasAVX512())
1742       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1743           EltVT == MVT::f32 || EltVT == MVT::f64)
1744         switch(NumElts) {
1745         case  8: return MVT::v8i1;
1746         case 16: return MVT::v16i1;
1747       } // closes the switch, despite its indentation
1748     if (Subtarget->hasBWI())
1749       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1750         switch(NumElts) {
1751         case 32: return MVT::v32i1;
1752         case 64: return MVT::v64i1;
1753       } // closes the switch, despite its indentation
1754   }
1755
       // 128-bit and 256-bit vectors: vNi1 results are used only with VLX.
       //  - VLX with i32/i64/f32/f64 elements: v2i1, v4i1 or v8i1.
       //  - BWI and VLX together with i8/i16 elements: v8i1, v16i1 or v32i1.
1756   if (VT.is256BitVector() || VT.is128BitVector()) {
1757     if (Subtarget->hasVLX())
1758       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1759           EltVT == MVT::f32 || EltVT == MVT::f64)
1760         switch(NumElts) {
1761         case 2: return MVT::v2i1;
1762         case 4: return MVT::v4i1;
1763         case 8: return MVT::v8i1;
1764       } // closes the switch, despite its indentation
1765     if (Subtarget->hasBWI() && Subtarget->hasVLX())
1766       if (EltVT == MVT::i8 || EltVT == MVT::i16)
1767         switch(NumElts) {
1768         case  8: return MVT::v8i1;
1769         case 16: return MVT::v16i1;
1770         case 32: return MVT::v32i1;
1771       } // closes the switch, despite its indentation
1772   }
1773
       // Fallback for every case not handled above: keep the vector shape of VT
       // and switch its element type to the corresponding integer type.
1774   return VT.changeVectorElementTypeToInteger();
1775 }
1776
1777 /// Helper for getByValTypeAlignment to determine
1778 /// the desired ByVal argument alignment.
     ///
     /// Recursively inspects Ty and raises MaxAlign (an in/out parameter) to 16
     /// when a 128-bit vector type is found, either directly or nested inside
     /// array or struct types. Any other type leaves MaxAlign untouched, so the
     /// caller's initial value is the result when no such vector is present.
1779 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
       // 16 is the largest value this helper ever produces; stop early.
1780   if (MaxAlign == 16)
1781     return;
1782   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
         // Only vectors that are exactly 128 bits wide request 16-byte alignment.
1783     if (VTy->getBitWidth() == 128)
1784       MaxAlign = 16;
1785   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
         // An array is judged by its element type. EltAlign starts at 0, so after
         // the recursive call it is either still 0 or has been raised to 16.
1786     unsigned EltAlign = 0;
1787     getMaxByValAlign(ATy->getElementType(), EltAlign);
1788     if (EltAlign > MaxAlign)
1789       MaxAlign = EltAlign;
1790   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
         // A struct takes the maximum over its members; the scan stops as soon
         // as 16 has been reached.
1791     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1792       unsigned EltAlign = 0;
1793       getMaxByValAlign(STy->getElementType(i), EltAlign);
1794       if (EltAlign > MaxAlign)
1795         MaxAlign = EltAlign;
1796       if (MaxAlign == 16)
1797         break;
1798     }
1799   }
1800 }
1801
1802 /// Return the desired alignment for ByVal aggregate
1803 /// function arguments in the caller parameter area. For X86, aggregates
1804 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1805 /// are at 4-byte boundaries.
     ///
     /// The 4/16-byte rule above is what the 32-bit path implements, and the
     /// 16-byte case is only considered when the subtarget has SSE1. In 64-bit
     /// mode the result is instead the larger of 8 and the ABI alignment of Ty.
1806 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
1807   if (Subtarget->is64Bit()) {
1808     // Max of 8 and alignment of type.
1809     unsigned TyAlign = TD->getABITypeAlignment(Ty);
1810     if (TyAlign > 8)
1811       return TyAlign;
1812     return 8;
1813   }
1814
       // 32-bit: start from 4 and let getMaxByValAlign raise it to 16 if Ty
       // contains a 128-bit vector (only checked when SSE1 is available).
1815   unsigned Align = 4;
1816   if (Subtarget->hasSSE1())
1817     getMaxByValAlign(Ty, Align);
1818   return Align;
1819 }
1820
1821 /// Returns the target specific optimal type for load
1822 /// and store operations as a result of memset, memcpy, and memmove
1823 /// lowering. If DstAlign is zero, the destination alignment can be
1824 /// assumed to satisfy any constraint. Similarly, if SrcAlign is zero it
1825 /// means there is no need to check it against the alignment requirement,
1826 /// probably because the source does not need to be loaded. If 'IsMemset' is
1827 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1828 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1829 /// source is constant so it does not need to be loaded.
1830 /// NOTE(review): this comment used to say EVT::Other is returned when generic
1831 /// target-independent logic should decide; the code below never does so and
     /// always returns a concrete type, falling back to i64 or i32.
1832 EVT
1833 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1834                                        unsigned DstAlign, unsigned SrcAlign,
1835                                        bool IsMemset, bool ZeroMemset,
1836                                        bool MemcpyStrSrc,
1837                                        MachineFunction &MF) const {
1838   const Function *F = MF.getFunction();
       // Vector/FP types are only considered for copies and zero memsets (a
       // non-zero memset is excluded), and only when the function is not marked
       // NoImplicitFloat.
1839   if ((!IsMemset || ZeroMemset) &&
1840       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
1841                                        Attribute::NoImplicitFloat)) {
         // At least 16 bytes, and either unaligned accesses are fast or both
         // alignments are unconstrained (0) or at least 16.
1842     if (Size >= 16 &&
1843         (Subtarget->isUnalignedMemAccessFast() ||
1844          ((DstAlign == 0 || DstAlign >= 16) &&
1845           (SrcAlign == 0 || SrcAlign >= 16)))) {
           // Prefer a 256-bit type for 32 bytes or more, integer before FP.
1846       if (Size >= 32) {
1847         if (Subtarget->hasInt256())
1848           return MVT::v8i32;
1849         if (Subtarget->hasFp256())
1850           return MVT::v8f32;
1851       }
           // Otherwise a 128-bit type. Without SSE1 nothing is returned here and
           // control reaches the scalar integer fallback at the end.
1852       if (Subtarget->hasSSE2())
1853         return MVT::v4i32;
1854       if (Subtarget->hasSSE1())
1855         return MVT::v4f32;
1856     } else if (!MemcpyStrSrc && Size >= 8 &&
1857                !Subtarget->is64Bit() &&
1858                Subtarget->hasSSE2()) {
1859       // Do not use f64 to lower memcpy if source is string constant. It's
1860       // better to use i32 to avoid the loads.
1861       return MVT::f64;
1862     }
1863   }
       // Scalar integer fallback: i64 in 64-bit mode for 8 bytes or more,
       // otherwise i32.
1864   if (Subtarget->is64Bit() && Size >= 8)
1865     return MVT::i64;
1866   return MVT::i32;
1867 }
1868
1869 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
       // Reports whether VT may be used as a memory-operation type (per the
       // function name). f32 and f64 are accepted only when the matching
       // X86ScalarSSEf32 / X86ScalarSSEf64 flag is set; every other type is
       // always accepted.
1870   if (VT == MVT::f32)
1871     return X86ScalarSSEf32;
1872   else if (VT == MVT::f64)
1873     return X86ScalarSSEf64;
1874   return true;
1875 }
1876
1877 bool
1878 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1879                                                   unsigned,
1880                                                   unsigned,
1881                                                   bool *Fast) const {
       // Always returns true, independent of VT and of the two unnamed unsigned
       // arguments, none of which is read here.
       // Fast is an optional out-parameter: when non-null it receives the
       // subtarget's isUnalignedMemAccessFast() answer.
1882   if (Fast)
1883     *Fast = Subtarget->isUnalignedMemAccessFast();
1884   return true;
1885 }
1886
1887 /// Return the entry encoding for a jump table in the
1888 /// current function.  The returned value is a member of the
1889 /// MachineJumpTableInfo::JTEntryKind enum.
     ///
     /// EK_Custom32 is chosen when the relocation model is PIC and the subtarget
     /// uses GOT-style PIC; all other configurations use the TargetLowering
     /// default.
1890 unsigned X86TargetLowering::getJumpTableEncoding() const {
1891   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1892   // symbol.
1893   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1894       Subtarget->isPICStyleGOT())
1895     return MachineJumpTableInfo::EK_Custom32;
1896
1897   // Otherwise, use the normal jump table encoding heuristics.
1898   return TargetLowering::getJumpTableEncoding();
1899 }
1900
1901 const MCExpr *
1902 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1903                                              const MachineBasicBlock *MBB,
1904                                              unsigned uid,MCContext &Ctx) const{
1905   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
1906          Subtarget->isPICStyleGOT());
1907   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1908   // entries.
1909   return MCSymbolRefExpr::Create(MBB->getSymbol(),
1910                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
1911 }
1912
1913 /// Returns relocation base for the given PIC jumptable.
1914 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1915                                                     SelectionDAG &DAG) const {
1916   if (!Subtarget->is64Bit())
1917     // This doesn't have SDLoc associated with it, but is not really the
1918     // same as a Register.
1919     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
1920   return Table;
1921 }
1922
1923 /// This returns the relocation base for the given PIC jumptable,
1924 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1925 const MCExpr *X86TargetLowering::
1926 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1927                              MCContext &Ctx) const {
1928   // X86-64 uses RIP relative addressing based on the jump table label.
1929   if (Subtarget->isPICStyleRIPRel())
1930     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1931
1932   // Otherwise, the reference is relative to the PIC base.
1933   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
1934 }
1935
1936 // FIXME: Why this routine is here? Move to RegInfo!
1937 std::pair<const TargetRegisterClass*, uint8_t>
1938 X86TargetLowering::findRepresentativeClass(MVT VT) const{
1939   const TargetRegisterClass *RRC = nullptr;
1940   uint8_t Cost = 1;
1941   switch (VT.SimpleTy) {
1942   default:
1943     return TargetLowering::findRepresentativeClass(VT);
1944   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1945     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1946     break;
1947   case MVT::x86mmx:
1948     RRC = &X86::VR64RegClass;
1949     break;
1950   case MVT::f32: case MVT::f64:
1951   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1952   case MVT::v4f32: case MVT::v2f64:
1953   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1954   case MVT::v4f64:
1955     RRC = &X86::VR128RegClass;
1956     break;
1957   }
1958   return std::make_pair(RRC, Cost);
1959 }
1960
1961 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
1962                                                unsigned &Offset) const {
1963   if (!Subtarget->isTargetLinux())
1964     return false;
1965
1966   if (Subtarget->is64Bit()) {
1967     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
1968     Offset = 0x28;
1969     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
1970       AddressSpace = 256;
1971     else
1972       AddressSpace = 257;
1973   } else {
1974     // %gs:0x14 on i386
1975     Offset = 0x14;
1976     AddressSpace = 256;
1977   }
1978   return true;
1979 }
1980
1981 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1982                                             unsigned DestAS) const {
1983   assert(SrcAS != DestAS && "Expected different address spaces!");
1984
1985   return SrcAS < 256 && DestAS < 256;
1986 }
1987
1988 //===----------------------------------------------------------------------===//
1989 //               Return Value Calling Convention Implementation
1990 //===----------------------------------------------------------------------===//
1991
1992 #include "X86GenCallingConv.inc"
1993
1994 bool
1995 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
1996                                   MachineFunction &MF, bool isVarArg,
1997                         const SmallVectorImpl<ISD::OutputArg> &Outs,
1998                         LLVMContext &Context) const {
1999   SmallVector<CCValAssign, 16> RVLocs;
2000   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2001   return CCInfo.CheckReturn(Outs, RetCC_X86);
2002 }
2003
2004 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2005   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2006   return ScratchRegs;
2007 }
2008
2009 SDValue
2010 X86TargetLowering::LowerReturn(SDValue Chain,
2011                                CallingConv::ID CallConv, bool isVarArg,
2012                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2013                                const SmallVectorImpl<SDValue> &OutVals,
2014                                SDLoc dl, SelectionDAG &DAG) const {
2015   MachineFunction &MF = DAG.getMachineFunction();
2016   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2017
2018   SmallVector<CCValAssign, 16> RVLocs;
2019   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2020   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2021
2022   SDValue Flag;
2023   SmallVector<SDValue, 6> RetOps;
2024   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2025   // Operand #1 = Bytes To Pop
2026   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
2027                    MVT::i16));
2028
2029   // Copy the result values into the output registers.
2030   for (unsigned i = 0; i != RVLocs.size(); ++i) {
2031     CCValAssign &VA = RVLocs[i];
2032     assert(VA.isRegLoc() && "Can only return in registers!");
2033     SDValue ValToCopy = OutVals[i];
2034     EVT ValVT = ValToCopy.getValueType();
2035
2036     // Promote values to the appropriate types.
2037     if (VA.getLocInfo() == CCValAssign::SExt)
2038       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2039     else if (VA.getLocInfo() == CCValAssign::ZExt)
2040       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2041     else if (VA.getLocInfo() == CCValAssign::AExt)
2042       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2043     else if (VA.getLocInfo() == CCValAssign::BCvt)
2044       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
2045
2046     assert(VA.getLocInfo() != CCValAssign::FPExt &&
2047            "Unexpected FP-extend for return value.");
2048
2049     // If this is x86-64, and we disabled SSE, we can't return FP values,
2050     // or SSE or MMX vectors.
2051     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2052          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2053           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
2054       report_fatal_error("SSE register return with SSE disabled");
2055     }
2056     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
2057     // llvm-gcc has never done it right and no one has noticed, so this
2058     // should be OK for now.
2059     if (ValVT == MVT::f64 &&
2060         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
2061       report_fatal_error("SSE2 register return with SSE2 disabled");
2062
2063     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2064     // the RET instruction and handled by the FP Stackifier.
2065     if (VA.getLocReg() == X86::FP0 ||
2066         VA.getLocReg() == X86::FP1) {
2067       // If this is a copy from an xmm register to ST(0), use an FPExtend to
2068       // change the value to the FP stack register class.
2069       if (isScalarFPTypeInSSEReg(VA.getValVT()))
2070         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2071       RetOps.push_back(ValToCopy);
2072       // Don't emit a copytoreg.
2073       continue;
2074     }
2075
2076     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2077     // which is returned in RAX / RDX.
2078     if (Subtarget->is64Bit()) {
2079       if (ValVT == MVT::x86mmx) {
2080         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2081           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
2082           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2083                                   ValToCopy);
2084           // If we don't have SSE2 available, convert to v4f32 so the generated
2085           // register is legal.
2086           if (!Subtarget->hasSSE2())
2087             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
2088         }
2089       }
2090     }
2091
2092     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2093     Flag = Chain.getValue(1);
2094     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2095   }
2096
2097   // The x86-64 ABIs require that for returning structs by value we copy
2098   // the sret argument into %rax/%eax (depending on ABI) for the return.
2099   // Win32 requires us to put the sret argument to %eax as well.
2100   // We saved the argument into a virtual register in the entry block,
2101   // so now we copy the value out and into %rax/%eax.
2102   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
2103       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
2104     MachineFunction &MF = DAG.getMachineFunction();
2105     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2106     unsigned Reg = FuncInfo->getSRetReturnReg();
2107     assert(Reg &&
2108            "SRetReturnReg should have been set in LowerFormalArguments().");
2109     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
2110
2111     unsigned RetValReg
2112         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
2113           X86::RAX : X86::EAX;
2114     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2115     Flag = Chain.getValue(1);
2116
2117     // RAX/EAX now acts like a return value.
2118     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
2119   }
2120
2121   RetOps[0] = Chain;  // Update chain.
2122
2123   // Add the flag if we have it.
2124   if (Flag.getNode())
2125     RetOps.push_back(Flag);
2126
2127   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
2128 }
2129
2130 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2131   if (N->getNumValues() != 1)
2132     return false;
2133   if (!N->hasNUsesOfValue(1, 0))
2134     return false;
2135
2136   SDValue TCChain = Chain;
2137   SDNode *Copy = *N->use_begin();
2138   if (Copy->getOpcode() == ISD::CopyToReg) {
2139     // If the copy has a glue operand, we conservatively assume it isn't safe to
2140     // perform a tail call.
2141     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2142       return false;
2143     TCChain = Copy->getOperand(0);
2144   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2145     return false;
2146
2147   bool HasRet = false;
2148   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2149        UI != UE; ++UI) {
2150     if (UI->getOpcode() != X86ISD::RET_FLAG)
2151       return false;
2152     // If we are returning more than one value, we can definitely
2153     // not make a tail call see PR19530
2154     if (UI->getNumOperands() > 4)
2155       return false;
2156     if (UI->getNumOperands() == 4 &&
2157         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2158       return false;
2159     HasRet = true;
2160   }
2161
2162   if (!HasRet)
2163     return false;
2164
2165   Chain = TCChain;
2166   return true;
2167 }
2168
2169 EVT
2170 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
2171                                             ISD::NodeType ExtendKind) const {
2172   MVT ReturnMVT;
2173   // TODO: Is this also valid on 32-bit?
2174   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
2175     ReturnMVT = MVT::i8;
2176   else
2177     ReturnMVT = MVT::i32;
2178
2179   EVT MinVT = getRegisterType(Context, ReturnMVT);
2180   return VT.bitsLT(MinVT) ? MinVT : VT;
2181 }
2182
2183 /// Lower the result values of a call into the
2184 /// appropriate copies out of appropriate physical registers.
2185 ///
2186 SDValue
2187 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
2188                                    CallingConv::ID CallConv, bool isVarArg,
2189                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2190                                    SDLoc dl, SelectionDAG &DAG,
2191                                    SmallVectorImpl<SDValue> &InVals) const {
2192
2193   // Assign locations to each value returned by this call.
2194   SmallVector<CCValAssign, 16> RVLocs;
2195   bool Is64Bit = Subtarget->is64Bit();
2196   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2197                  *DAG.getContext());
2198   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2199
2200   // Copy all of the result registers out of their specified physreg.
2201   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2202     CCValAssign &VA = RVLocs[i];
2203     EVT CopyVT = VA.getValVT();
2204
2205     // If this is x86-64, and we disabled SSE, we can't return FP values
2206     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
2207         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
2208       report_fatal_error("SSE register return with SSE disabled");
2209     }
2210
2211     // If we prefer to use the value in xmm registers, copy it out as f80 and
2212     // use a truncate to move it from fp stack reg to xmm reg.
2213     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2214         isScalarFPTypeInSSEReg(VA.getValVT()))
2215       CopyVT = MVT::f80;
2216
2217     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2218                                CopyVT, InFlag).getValue(1);
2219     SDValue Val = Chain.getValue(0);
2220
2221     if (CopyVT != VA.getValVT())
2222       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2223                         // This truncation won't change the value.
2224                         DAG.getIntPtrConstant(1));
2225
2226     InFlag = Chain.getValue(2);
2227     InVals.push_back(Val);
2228   }
2229
2230   return Chain;
2231 }
2232
2233 //===----------------------------------------------------------------------===//
2234 //                C & StdCall & Fast Calling Convention implementation
2235 //===----------------------------------------------------------------------===//
//  The StdCall calling convention seems to be standard for many Windows API
//  routines. It differs from the C calling convention just a little: the
//  callee should clean up the stack, not the caller. Symbols should also be
//  decorated in a convention-specific way. It doesn't support any vector
//  arguments. For info on the fast calling convention, see the Fast Calling
//  Convention (tail call) implementation, LowerX86_32FastCCCallTo.
2242
2243 /// CallIsStructReturn - Determines whether a call uses struct return
2244 /// semantics.
2245 enum StructReturnType {
2246   NotStructReturn,
2247   RegStructReturn,
2248   StackStructReturn
2249 };
2250 static StructReturnType
2251 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
2252   if (Outs.empty())
2253     return NotStructReturn;
2254
2255   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2256   if (!Flags.isSRet())
2257     return NotStructReturn;
2258   if (Flags.isInReg())
2259     return RegStructReturn;
2260   return StackStructReturn;
2261 }
2262
2263 /// Determines whether a function uses struct return semantics.
2264 static StructReturnType
2265 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
2266   if (Ins.empty())
2267     return NotStructReturn;
2268
2269   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2270   if (!Flags.isSRet())
2271     return NotStructReturn;
2272   if (Flags.isInReg())
2273     return RegStructReturn;
2274   return StackStructReturn;
2275 }
2276
2277 /// Make a copy of an aggregate at address specified by "Src" to address
2278 /// "Dst" with size and alignment information specified by the specific
2279 /// parameter attribute. The copy will be passed as a byval function parameter.
2280 static SDValue
2281 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
2282                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
2283                           SDLoc dl) {
2284   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
2285
2286   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2287                        /*isVolatile*/false, /*AlwaysInline=*/true,
2288                        MachinePointerInfo(), MachinePointerInfo());
2289 }
2290
2291 /// Return true if the calling convention is one that
2292 /// supports tail call optimization.
2293 static bool IsTailCallConvention(CallingConv::ID CC) {
2294   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2295           CC == CallingConv::HiPE);
2296 }
2297
2298 /// \brief Return true if the calling convention is a C calling convention.
2299 static bool IsCCallConvention(CallingConv::ID CC) {
2300   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
2301           CC == CallingConv::X86_64_SysV);
2302 }
2303
2304 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2305   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
2306     return false;
2307
2308   CallSite CS(CI);
2309   CallingConv::ID CalleeCC = CS.getCallingConv();
2310   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
2311     return false;
2312
2313   return true;
2314 }
2315
2316 /// Return true if the function is being made into
2317 /// a tailcall target by changing its ABI.
2318 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
2319                                    bool GuaranteedTailCallOpt) {
2320   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
2321 }
2322
2323 SDValue
2324 X86TargetLowering::LowerMemArgument(SDValue Chain,
2325                                     CallingConv::ID CallConv,
2326                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2327                                     SDLoc dl, SelectionDAG &DAG,
2328                                     const CCValAssign &VA,
2329                                     MachineFrameInfo *MFI,
2330                                     unsigned i) const {
2331   // Create the nodes corresponding to a load from this parameter slot.
2332   ISD::ArgFlagsTy Flags = Ins[i].Flags;
2333   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
2334       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2335   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2336   EVT ValVT;
2337
2338   // If value is passed by pointer we have address passed instead of the value
2339   // itself.
2340   if (VA.getLocInfo() == CCValAssign::Indirect)
2341     ValVT = VA.getLocVT();
2342   else
2343     ValVT = VA.getValVT();
2344
2345   // FIXME: For now, all byval parameter objects are marked mutable. This can be
2346   // changed with more analysis.
2347   // In case of tail call optimization mark all arguments mutable. Since they
2348   // could be overwritten by lowering of arguments in case of a tail call.
2349   if (Flags.isByVal()) {
2350     unsigned Bytes = Flags.getByValSize();
2351     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2352     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2353     return DAG.getFrameIndex(FI, getPointerTy());
2354   } else {
2355     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2356                                     VA.getLocMemOffset(), isImmutable);
2357     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
2358     return DAG.getLoad(ValVT, dl, Chain, FIN,
2359                        MachinePointerInfo::getFixedStack(FI),
2360                        false, false, false, 0);
2361   }
2362 }
2363
2364 // FIXME: Get this from tablegen.
2365 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2366                                                 const X86Subtarget *Subtarget) {
2367   assert(Subtarget->is64Bit());
2368
2369   if (Subtarget->isCallingConvWin64(CallConv)) {
2370     static const MCPhysReg GPR64ArgRegsWin64[] = {
2371       X86::RCX, X86::RDX, X86::R8,  X86::R9
2372     };
2373     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2374   }
2375
2376   static const MCPhysReg GPR64ArgRegs64Bit[] = {
2377     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2378   };
2379   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2380 }
2381
2382 // FIXME: Get this from tablegen.
2383 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2384                                                 CallingConv::ID CallConv,
2385                                                 const X86Subtarget *Subtarget) {
2386   assert(Subtarget->is64Bit());
2387   if (Subtarget->isCallingConvWin64(CallConv)) {
2388     // The XMM registers which might contain var arg parameters are shadowed
2389     // in their paired GPR.  So we only need to save the GPR to their home
2390     // slots.
2391     // TODO: __vectorcall will change this.
2392     return None;
2393   }
2394
2395   const Function *Fn = MF.getFunction();
2396   bool NoImplicitFloatOps = Fn->getAttributes().
2397       hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
2398   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
2399          "SSE register cannot be used when SSE is disabled!");
2400   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
2401       !Subtarget->hasSSE1())
2402     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2403     // registers.
2404     return None;
2405
2406   static const MCPhysReg XMMArgRegs64Bit[] = {
2407     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2408     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2409   };
2410   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2411 }
2412
2413 SDValue
2414 X86TargetLowering::LowerFormalArguments(SDValue Chain,
2415                                         CallingConv::ID CallConv,
2416                                         bool isVarArg,
2417                                       const SmallVectorImpl<ISD::InputArg> &Ins,
2418                                         SDLoc dl,
2419                                         SelectionDAG &DAG,
2420                                         SmallVectorImpl<SDValue> &InVals)
2421                                           const {
2422   MachineFunction &MF = DAG.getMachineFunction();
2423   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2424
2425   const Function* Fn = MF.getFunction();
2426   if (Fn->hasExternalLinkage() &&
2427       Subtarget->isTargetCygMing() &&
2428       Fn->getName() == "main")
2429     FuncInfo->setForceFramePointer(true);
2430
2431   MachineFrameInfo *MFI = MF.getFrameInfo();
2432   bool Is64Bit = Subtarget->is64Bit();
2433   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
2434
2435   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2436          "Var args not supported with calling convention fastcc, ghc or hipe");
2437
2438   // Assign locations to all of the incoming arguments.
2439   SmallVector<CCValAssign, 16> ArgLocs;
2440   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2441
2442   // Allocate shadow area for Win64
2443   if (IsWin64)
2444     CCInfo.AllocateStack(32, 8);
2445
2446   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2447
2448   unsigned LastVal = ~0U;
2449   SDValue ArgValue;
2450   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2451     CCValAssign &VA = ArgLocs[i];
2452     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2453     // places.
2454     assert(VA.getValNo() != LastVal &&
2455            "Don't support value assigned to multiple locs yet");
2456     (void)LastVal;
2457     LastVal = VA.getValNo();
2458
2459     if (VA.isRegLoc()) {
2460       EVT RegVT = VA.getLocVT();
2461       const TargetRegisterClass *RC;
2462       if (RegVT == MVT::i32)
2463         RC = &X86::GR32RegClass;
2464       else if (Is64Bit && RegVT == MVT::i64)
2465         RC = &X86::GR64RegClass;
2466       else if (RegVT == MVT::f32)
2467         RC = &X86::FR32RegClass;
2468       else if (RegVT == MVT::f64)
2469         RC = &X86::FR64RegClass;
2470       else if (RegVT.is512BitVector())
2471         RC = &X86::VR512RegClass;
2472       else if (RegVT.is256BitVector())
2473         RC = &X86::VR256RegClass;
2474       else if (RegVT.is128BitVector())
2475         RC = &X86::VR128RegClass;
2476       else if (RegVT == MVT::x86mmx)
2477         RC = &X86::VR64RegClass;
2478       else if (RegVT == MVT::i1)
2479         RC = &X86::VK1RegClass;
2480       else if (RegVT == MVT::v8i1)
2481         RC = &X86::VK8RegClass;
2482       else if (RegVT == MVT::v16i1)
2483         RC = &X86::VK16RegClass;
2484       else if (RegVT == MVT::v32i1)
2485         RC = &X86::VK32RegClass;
2486       else if (RegVT == MVT::v64i1)
2487         RC = &X86::VK64RegClass;
2488       else
2489         llvm_unreachable("Unknown argument type!");
2490
2491       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2492       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2493
2494       // If this is an 8 or 16-bit value, it is really passed promoted to 32
2495       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
2496       // right size.
2497       if (VA.getLocInfo() == CCValAssign::SExt)
2498         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2499                                DAG.getValueType(VA.getValVT()));
2500       else if (VA.getLocInfo() == CCValAssign::ZExt)
2501         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2502                                DAG.getValueType(VA.getValVT()));
2503       else if (VA.getLocInfo() == CCValAssign::BCvt)
2504         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
2505
2506       if (VA.isExtInLoc()) {
2507         // Handle MMX values passed in XMM regs.
2508         if (RegVT.isVector())
2509           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2510         else
2511           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2512       }
2513     } else {
2514       assert(VA.isMemLoc());
2515       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2516     }
2517
2518     // If value is passed via pointer - do a load.
2519     if (VA.getLocInfo() == CCValAssign::Indirect)
2520       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2521                              MachinePointerInfo(), false, false, false, 0);
2522
2523     InVals.push_back(ArgValue);
2524   }
2525
2526   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
2527     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2528       // The x86-64 ABIs require that for returning structs by value we copy
2529       // the sret argument into %rax/%eax (depending on ABI) for the return.
2530       // Win32 requires us to put the sret argument to %eax as well.
2531       // Save the argument into a virtual register so that we can access it
2532       // from the return points.
2533       if (Ins[i].Flags.isSRet()) {
2534         unsigned Reg = FuncInfo->getSRetReturnReg();
2535         if (!Reg) {
2536           MVT PtrTy = getPointerTy();
2537           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2538           FuncInfo->setSRetReturnReg(Reg);
2539         }
2540         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2541         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2542         break;
2543       }
2544     }
2545   }
2546
2547   unsigned StackSize = CCInfo.getNextStackOffset();
2548   // Align stack specially for tail calls.
2549   if (FuncIsMadeTailCallSafe(CallConv,
2550                              MF.getTarget().Options.GuaranteedTailCallOpt))
2551     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2552
2553   // If the function takes variable number of arguments, make a frame index for
2554   // the start of the first vararg value... for expansion of llvm.va_start. We
2555   // can skip this if there are no va_start calls.
2556   if (MFI->hasVAStart() &&
2557       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2558                    CallConv != CallingConv::X86_ThisCall))) {
2559     FuncInfo->setVarArgsFrameIndex(
2560         MFI->CreateFixedObject(1, StackSize, true));
2561   }
2562
2563   // Figure out if XMM registers are in use.
2564   assert(!(MF.getTarget().Options.UseSoftFloat &&
2565            Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
2566                                             Attribute::NoImplicitFloat)) &&
2567          "SSE register cannot be used when SSE is disabled!");
2568
2569   // 64-bit calling conventions support varargs and register parameters, so we
2570   // have to do extra work to spill them in the prologue.
2571   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2572     // Find the first unallocated argument registers.
2573     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2574     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2575     unsigned NumIntRegs =
2576         CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
2577     unsigned NumXMMRegs =
2578         CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
2579     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
2580            "SSE register cannot be used when SSE is disabled!");
2581
2582     // Gather all the live in physical registers.
2583     SmallVector<SDValue, 6> LiveGPRs;
2584     SmallVector<SDValue, 8> LiveXMMRegs;
2585     SDValue ALVal;
2586     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2587       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2588       LiveGPRs.push_back(
2589           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2590     }
2591     if (!ArgXMMs.empty()) {
2592       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2593       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2594       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2595         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2596         LiveXMMRegs.push_back(
2597             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2598       }
2599     }
2600
2601     if (IsWin64) {
2602       const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
2603       // Get to the caller-allocated home save location.  Add 8 to account
2604       // for the return address.
2605       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2606       FuncInfo->setRegSaveFrameIndex(
2607           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2608       // Fixup to set vararg frame on shadow area (4 x i64).
2609       if (NumIntRegs < 4)
2610         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2611     } else {
2612       // For X86-64, if there are vararg parameters that are passed via
2613       // registers, then we must store them to their spots on the stack so
2614       // they may be loaded by deferencing the result of va_next.
2615       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2616       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2617       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2618           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2619     }
2620
2621     // Store the integer parameter registers.
2622     SmallVector<SDValue, 8> MemOps;
2623     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2624                                       getPointerTy());
2625     unsigned Offset = FuncInfo->getVarArgsGPOffset();
2626     for (SDValue Val : LiveGPRs) {
2627       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
2628                                 DAG.getIntPtrConstant(Offset));
2629       SDValue Store =
2630         DAG.getStore(Val.getValue(1), dl, Val, FIN,
2631                      MachinePointerInfo::getFixedStack(
2632                        FuncInfo->getRegSaveFrameIndex(), Offset),
2633                      false, false, 0);
2634       MemOps.push_back(Store);
2635       Offset += 8;
2636     }
2637
2638     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2639       // Now store the XMM (fp + vector) parameter registers.
2640       SmallVector<SDValue, 12> SaveXMMOps;
2641       SaveXMMOps.push_back(Chain);
2642       SaveXMMOps.push_back(ALVal);
2643       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2644                              FuncInfo->getRegSaveFrameIndex()));
2645       SaveXMMOps.push_back(DAG.getIntPtrConstant(
2646                              FuncInfo->getVarArgsFPOffset()));
2647       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2648                         LiveXMMRegs.end());
2649       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2650                                    MVT::Other, SaveXMMOps));
2651     }
2652
2653     if (!MemOps.empty())
2654       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2655   }
2656
2657   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2658     // Find the largest legal vector type.
2659     MVT VecVT = MVT::Other;
2660     // FIXME: Only some x86_32 calling conventions support AVX512.
2661     if (Subtarget->hasAVX512() &&
2662         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2663                      CallConv == CallingConv::Intel_OCL_BI)))
2664       VecVT = MVT::v16f32;
2665     else if (Subtarget->hasAVX())
2666       VecVT = MVT::v8f32;
2667     else if (Subtarget->hasSSE2())
2668       VecVT = MVT::v4f32;
2669
2670     // We forward some GPRs and some vector types.
2671     SmallVector<MVT, 2> RegParmTypes;
2672     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2673     RegParmTypes.push_back(IntVT);
2674     if (VecVT != MVT::Other)
2675       RegParmTypes.push_back(VecVT);
2676
2677     // Compute the set of forwarded registers. The rest are scratch.
2678     SmallVectorImpl<ForwardedRegister> &Forwards =
2679         FuncInfo->getForwardedMustTailRegParms();
2680     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2681
2682     // Conservatively forward AL on x86_64, since it might be used for varargs.
2683     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2684       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2685       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2686     }
2687
2688     // Copy all forwards from physical to virtual registers.
2689     for (ForwardedRegister &F : Forwards) {
2690       // FIXME: Can we use a less constrained schedule?
2691       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2692       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2693       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2694     }
2695   }
2696
2697   // Some CCs need callee pop.
2698   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2699                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
2700     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2701   } else {
2702     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2703     // If this is an sret function, the return should pop the hidden pointer.
2704     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
2705         !Subtarget->getTargetTriple().isOSMSVCRT() &&
2706         argsAreStructReturn(Ins) == StackStructReturn)
2707       FuncInfo->setBytesToPopOnReturn(4);
2708   }
2709
2710   if (!Is64Bit) {
2711     // RegSaveFrameIndex is X86-64 only.
2712     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2713     if (CallConv == CallingConv::X86_FastCall ||
2714         CallConv == CallingConv::X86_ThisCall)
2715       // fastcc functions can't have varargs.
2716       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2717   }
2718
2719   FuncInfo->setArgumentStackSize(StackSize);
2720
2721   return Chain;
2722 }
2723
2724 SDValue
2725 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
2726                                     SDValue StackPtr, SDValue Arg,
2727                                     SDLoc dl, SelectionDAG &DAG,
2728                                     const CCValAssign &VA,
2729                                     ISD::ArgFlagsTy Flags) const {
2730   unsigned LocMemOffset = VA.getLocMemOffset();
2731   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
2732   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
2733   if (Flags.isByVal())
2734     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2735
2736   return DAG.getStore(Chain, dl, Arg, PtrOff,
2737                       MachinePointerInfo::getStack(LocMemOffset),
2738                       false, false, 0);
2739 }
2740
2741 /// Emit a load of return address if tail call
2742 /// optimization is performed and it is required.
2743 SDValue
2744 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
2745                                            SDValue &OutRetAddr, SDValue Chain,
2746                                            bool IsTailCall, bool Is64Bit,
2747                                            int FPDiff, SDLoc dl) const {
2748   // Adjust the Return address stack slot.
2749   EVT VT = getPointerTy();
2750   OutRetAddr = getReturnAddressFrameIndex(DAG);
2751
2752   // Load the "old" Return address.
2753   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2754                            false, false, false, 0);
2755   return SDValue(OutRetAddr.getNode(), 1);
2756 }
2757
2758 /// Emit a store of the return address if tail call
2759 /// optimization is performed and it is required (FPDiff!=0).
2760 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2761                                         SDValue Chain, SDValue RetAddrFrIdx,
2762                                         EVT PtrVT, unsigned SlotSize,
2763                                         int FPDiff, SDLoc dl) {
2764   // Store the return address to the appropriate stack slot.
2765   if (!FPDiff) return Chain;
2766   // Calculate the new stack slot for the return address.
2767   int NewReturnAddrFI =
2768     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2769                                          false);
2770   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2771   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2772                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
2773                        false, false, 0);
2774   return Chain;
2775 }
2776
     /// Lower the outgoing call described by CLI into SelectionDAG nodes.
     ///
     /// The work is done in the order the code is written: decide whether the
     /// call is a tail call (and whether it is a sibcall), assign a location to
     /// every operand with CC_X86, open the call sequence, materialize register
     /// and stack arguments, rewrite the callee into a target address node, emit
     /// X86ISD::CALL (or X86ISD::TC_RETURN for tail calls) and close the call
     /// sequence.
     ///
     /// CLI.IsTailCall is bound by reference and is updated to say whether the
     /// call was really lowered as a tail call.  InVals is handed on to
     /// LowerCallResult for calls that are not tail calls.
     ///
     /// Returns the X86ISD::TC_RETURN node for tail calls and the result of
     /// LowerCallResult otherwise.
2777 SDValue
2778 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2779                              SmallVectorImpl<SDValue> &InVals) const {
2780   SelectionDAG &DAG                     = CLI.DAG;
2781   SDLoc &dl                             = CLI.DL;
2782   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2783   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2784   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2785   SDValue Chain                         = CLI.Chain;
2786   SDValue Callee                        = CLI.Callee;
2787   CallingConv::ID CallConv              = CLI.CallConv;
2788   bool &isTailCall                      = CLI.IsTailCall;
2789   bool isVarArg                         = CLI.IsVarArg;
2790
2791   MachineFunction &MF = DAG.getMachineFunction();
2792   bool Is64Bit        = Subtarget->is64Bit();
2793   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
2794   StructReturnType SR = callIsStructReturn(Outs);
2795   bool IsSibcall      = false;
2796   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2797
2798   if (MF.getTarget().Options.DisableTailCalls)
2799     isTailCall = false;
2800
       // CLI.CS is checked for null before it is dereferenced, so a call without
       // a call site is never treated as musttail.
2801   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
2802   if (IsMustTail) {
2803     // Force this to be a tail call.  The verifier rules are enough to ensure
2804     // that we can lower this successfully without moving the return address
2805     // around.
2806     isTailCall = true;
2807   } else if (isTailCall) {
2808     // Check if it's really possible to do a tail call.
2809     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
2810                     isVarArg, SR != NotStructReturn,
2811                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
2812                     Outs, OutVals, Ins, DAG);
2813
2814     // Sibcalls are automatically detected tailcalls which do not require
2815     // ABI changes.
2816     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
2817       IsSibcall = true;
2818
2819     if (isTailCall)
2820       ++NumTailCalls;
2821   }
2822
2823   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
2824          "Var args not supported with calling convention fastcc, ghc or hipe");
2825
2826   // Analyze operands of the call, assigning locations to each operand.
2827   SmallVector<CCValAssign, 16> ArgLocs;
2828   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2829
2830   // Allocate shadow area for Win64
2831   if (IsWin64)
2832     CCInfo.AllocateStack(32, 8);
2833
2834   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
2835
2836   // Get a count of how many bytes are to be pushed on the stack.
2837   unsigned NumBytes = CCInfo.getNextStackOffset();
2838   if (IsSibcall)
2839     // This is a sibcall. The memory operands are available in caller's
2840     // own caller's stack.
2841     NumBytes = 0;
2842   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
2843            IsTailCallConvention(CallConv))
2844     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
2845
       // FPDiff is the number of bytes this function pops on return minus the
       // number of bytes the outgoing arguments need.  It is only computed for
       // tail calls that are neither sibcalls nor musttail calls, and stays 0
       // otherwise.
2846   int FPDiff = 0;
2847   if (isTailCall && !IsSibcall && !IsMustTail) {
2848     // Lower arguments at fp - stackoffset + fpdiff.
2849     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
2850
2851     FPDiff = NumBytesCallerPushed - NumBytes;
2852
2853     // Set the delta of movement of the returnaddr stackslot.
2854     // But only set it if the new delta is lower than the one recorded so far.
2855     if (FPDiff < X86Info->getTCReturnAddrDelta())
2856       X86Info->setTCReturnAddrDelta(FPDiff);
2857   }
2858
       // CALLSEQ_START is given NumBytesToPush and CALLSEQ_END NumBytesToPop;
       // only an inalloca call (below) makes the two differ.
2859   unsigned NumBytesToPush = NumBytes;
2860   unsigned NumBytesToPop = NumBytes;
2861
2862   // If we have an inalloca argument, all stack space has already been allocated
2863   // for us and is right at the top of the stack.  We don't support multiple
2864   // arguments passed in memory when using inalloca.
2865   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
2866     NumBytesToPush = 0;
2867     if (!ArgLocs.back().isMemLoc())
2868       report_fatal_error("cannot use inalloca attribute on a register "
2869                          "parameter");
2870     if (ArgLocs.back().getLocMemOffset() != 0)
2871       report_fatal_error("any parameter with the inalloca attribute must be "
2872                          "the only memory argument");
2873   }
2874
       // Sibcalls get no CALLSEQ_START / CALLSEQ_END bracket.
2875   if (!IsSibcall)
2876     Chain = DAG.getCALLSEQ_START(
2877         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
2878
2879   SDValue RetAddrFrIdx;
2880   // Load return address for tail calls.
2881   if (isTailCall && FPDiff)
2882     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
2883                                     Is64Bit, FPDiff, dl);
2884
2885   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2886   SmallVector<SDValue, 8> MemOpChains;
2887   SDValue StackPtr;
2888
2889   // Walk the register/memloc assignments, inserting copies/loads.  In the case
2890   // of tail call optimization arguments are handled later.
2891   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
2892       DAG.getSubtarget().getRegisterInfo());
2893   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2894     // Skip inalloca arguments, they have already been written.
2895     ISD::ArgFlagsTy Flags = Outs[i].Flags;
2896     if (Flags.isInAlloca())
2897       continue;
2898
2899     CCValAssign &VA = ArgLocs[i];
2900     EVT RegVT = VA.getLocVT();
2901     SDValue Arg = OutVals[i];
2902     bool isByVal = Flags.isByVal();
2903
2904     // Promote the value if needed.
2905     switch (VA.getLocInfo()) {
2906     default: llvm_unreachable("Unknown loc info!");
2907     case CCValAssign::Full: break;
2908     case CCValAssign::SExt:
2909       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
2910       break;
2911     case CCValAssign::ZExt:
2912       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
2913       break;
2914     case CCValAssign::AExt:
2915       if (RegVT.is128BitVector()) {
2916         // Special case: passing MMX values in XMM registers.
2917         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
2918         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
2919         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
2920       } else
2921         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
2922       break;
2923     case CCValAssign::BCvt:
2924       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
2925       break;
2926     case CCValAssign::Indirect: {
2927       // Store the argument.
2928       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
2929       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
2930       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
2931                            MachinePointerInfo::getFixedStack(FI),
2932                            false, false, 0);
2933       Arg = SpillSlot;
2934       break;
2935     }
2936     }
2937
2938     if (VA.isRegLoc()) {
2939       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2940       if (isVarArg && IsWin64) {
2941         // Win64 ABI requires argument XMM reg to be copied to the corresponding
2942         // shadow reg if callee is a varargs function.
2943         unsigned ShadowReg = 0;
2944         switch (VA.getLocReg()) {
2945         case X86::XMM0: ShadowReg = X86::RCX; break;
2946         case X86::XMM1: ShadowReg = X86::RDX; break;
2947         case X86::XMM2: ShadowReg = X86::R8; break;
2948         case X86::XMM3: ShadowReg = X86::R9; break;
2949         }
2950         if (ShadowReg)
2951           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
2952       }
2953     } else if (!IsSibcall && (!isTailCall || isByVal)) {
           // Memory argument that is written relative to the stack pointer right
           // away.  For tail calls only byval arguments take this path; the
           // tail call loop further down then copies them from here into their
           // final slot and stores the remaining memory arguments itself.
2954       assert(VA.isMemLoc());
2955       if (!StackPtr.getNode())
2956         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
2957                                       getPointerTy());
2958       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
2959                                              dl, DAG, VA, Flags));
2960     }
2961   }
2962
2963   if (!MemOpChains.empty())
2964     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2965
2966   if (Subtarget->isPICStyleGOT()) {
2967     // ELF / PIC requires GOT in the EBX register before function calls via PLT
2968     // GOT pointer.
2969     if (!isTailCall) {
2970       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
2971                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
2972     } else {
2973       // If we are tail calling and generating PIC/GOT style code load the
2974       // address of the callee into ECX. The value in ecx is used as target of
2975       // the tail jump. This is done to circumvent the ebx/callee-saved problem
2976       // for tail calls on PIC/GOT architectures. Normally we would just put the
2977       // address of GOT into ebx and then call target@PLT. But for tail calls
2978       // ebx would be restored (since ebx is callee saved) before jumping to the
2979       // target@PLT.
2980
2981       // Note: The actual moving to ECX is done further down.
2982       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2983       if (G && !G->getGlobal()->hasHiddenVisibility() &&
2984           !G->getGlobal()->hasProtectedVisibility())
2985         Callee = LowerGlobalAddress(Callee, DAG);
2986       else if (isa<ExternalSymbolSDNode>(Callee))
2987         Callee = LowerExternalSymbol(Callee, DAG);
2988     }
2989   }
2990
2991   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
2992     // From AMD64 ABI document:
2993     // For calls that may call functions that use varargs or stdargs
2994     // (prototype-less calls or calls to functions containing ellipsis (...) in
2995     // the declaration) %al is used as hidden argument to specify the number
2996     // of SSE registers used. The contents of %al do not need to match exactly
2997     // the number of registers, but must be an upper bound on the number of SSE
2998     // registers used and is in the range 0 - 8 inclusive.
2999
3000     // Count the number of XMM registers allocated.
3001     static const MCPhysReg XMMArgRegs[] = {
3002       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3003       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3004     };
3005     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
3006     assert((Subtarget->hasSSE1() || !NumXMMRegs)
3007            && "SSE registers cannot be used when SSE is disabled");
3008
3009     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3010                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
3011   }
3012
       // For a vararg musttail call, read back every register recorded in
       // getForwardedMustTailRegParms() from its virtual register and pass it on
       // in the recorded physical register.
3013   if (isVarArg && IsMustTail) {
3014     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3015     for (const auto &F : Forwards) {
3016       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3017       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3018     }
3019   }
3020
3021   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
3022   // don't need this because the eligibility check rejects calls that require
3023   // shuffling arguments passed in memory.
3024   if (!IsSibcall && isTailCall) {
3025     // Force all the incoming stack arguments to be loaded from the stack
3026     // before any new outgoing arguments are stored to the stack, because the
3027     // outgoing stack slots may alias the incoming argument stack slots, and
3028     // the alias isn't otherwise explicit. This is slightly more conservative
3029     // than necessary, because it means that each store effectively depends
3030     // on every argument instead of just those arguments it would clobber.
3031     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3032
3033     SmallVector<SDValue, 8> MemOpChains2;
3034     SDValue FIN;
3035     int FI = 0;
3036     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3037       CCValAssign &VA = ArgLocs[i];
3038       if (VA.isRegLoc())
3039         continue;
3040       assert(VA.isMemLoc());
3041       SDValue Arg = OutVals[i];
3042       ISD::ArgFlagsTy Flags = Outs[i].Flags;
3043       // Skip inalloca arguments.  They don't require any work.
3044       if (Flags.isInAlloca())
3045         continue;
3046       // Create frame index.
3047       int32_t Offset = VA.getLocMemOffset()+FPDiff;
3048       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3049       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
3050       FIN = DAG.getFrameIndex(FI, getPointerTy());
3051
3052       if (Flags.isByVal()) {
3053         // Copy relative to framepointer.
3054         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
3055         if (!StackPtr.getNode())
3056           StackPtr = DAG.getCopyFromReg(Chain, dl,
3057                                         RegInfo->getStackRegister(),
3058                                         getPointerTy());
3059         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
3060
3061         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3062                                                          ArgChain,
3063                                                          Flags, DAG, dl));
3064       } else {
3065         // Store relative to framepointer.
3066         MemOpChains2.push_back(
3067           DAG.getStore(ArgChain, dl, Arg, FIN,
3068                        MachinePointerInfo::getFixedStack(FI),
3069                        false, false, 0));
3070       }
3071     }
3072
3073     if (!MemOpChains2.empty())
3074       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3075
3076     // Store the return address to the appropriate stack slot.
3077     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3078                                      getPointerTy(), RegInfo->getSlotSize(),
3079                                      FPDiff, dl);
3080   }
3081
3082   // Build a sequence of copy-to-reg nodes chained together with token chain
3083   // and flag operands which copy the outgoing args into registers.
3084   SDValue InFlag;
3085   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3086     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3087                              RegsToPass[i].second, InFlag);
3088     InFlag = Chain.getValue(1);
3089   }
3090
3091   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3092     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3093     // In the 64-bit large code model, we have to make all calls
3094     // through a register, since the call instruction's 32-bit
3095     // pc-relative offset may not be large enough to hold the whole
3096     // address.
3097   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3098     // If the callee is a GlobalAddress node (quite common, every direct call
3099     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3100     // it.
3101     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3102
3103     // We should use extra load for direct calls to dllimported functions in
3104     // non-JIT mode.
         // Callees with dllimport storage class skip the rewrite below and keep
         // their GlobalAddress node.
3105     const GlobalValue *GV = G->getGlobal();
3106     if (!GV->hasDLLImportStorageClass()) {
3107       unsigned char OpFlags = 0;
3108       bool ExtraLoad = false;
3109       unsigned WrapperKind = ISD::DELETED_NODE;
3110
3111       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
3112       // external symbols must go through the PLT in PIC mode.  If the symbol
3113       // has hidden or protected visibility, or if it is static or local, then
3114       // we don't need to use the PLT - we can directly call it.
3115       if (Subtarget->isTargetELF() &&
3116           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
3117           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
3118         OpFlags = X86II::MO_PLT;
3119       } else if (Subtarget->isPICStyleStubAny() &&
3120                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
3121                  (!Subtarget->getTargetTriple().isMacOSX() ||
3122                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3123         // PC-relative references to external symbols should go through $stub,
3124         // unless we're building with the leopard linker or later, which
3125         // automatically synthesizes these stubs.
3126         OpFlags = X86II::MO_DARWIN_STUB;
3127       } else if (Subtarget->isPICStyleRIPRel() &&
3128                  isa<Function>(GV) &&
3129                  cast<Function>(GV)->getAttributes().
3130                    hasAttribute(AttributeSet::FunctionIndex,
3131                                 Attribute::NonLazyBind)) {
3132         // If the function is marked as non-lazy, generate an indirect call
3133         // which loads from the GOT directly. This avoids runtime overhead
3134         // at the cost of eager binding (and one extra byte of encoding).
3135         OpFlags = X86II::MO_GOTPCREL;
3136         WrapperKind = X86ISD::WrapperRIP;
3137         ExtraLoad = true;
3138       }
3139
3140       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
3141                                           G->getOffset(), OpFlags);
3142
3143       // Add a wrapper if needed.
           // NOTE: WrapperKind is only ever set to X86ISD::WrapperRIP above, so
           // the hard-coded opcode below matches it.
3144       if (WrapperKind != ISD::DELETED_NODE)
3145         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
3146       // Add extra indirection if needed.
3147       if (ExtraLoad)
3148         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
3149                              MachinePointerInfo::getGOT(),
3150                              false, false, false, 0);
3151     }
3152   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3153     unsigned char OpFlags = 0;
3154
3155     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
3156     // external symbols should go through the PLT.
3157     if (Subtarget->isTargetELF() &&
3158         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
3159       OpFlags = X86II::MO_PLT;
3160     } else if (Subtarget->isPICStyleStubAny() &&
3161                (!Subtarget->getTargetTriple().isMacOSX() ||
3162                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
3163       // PC-relative references to external symbols should go through $stub,
3164       // unless we're building with the leopard linker or later, which
3165       // automatically synthesizes these stubs.
3166       OpFlags = X86II::MO_DARWIN_STUB;
3167     }
3168
3169     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
3170                                          OpFlags);
3171   } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) {
3172     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3173     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3174   }
3175
3176   // Returns a chain & a flag for retval copy to use.
3177   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3178   SmallVector<SDValue, 8> Ops;
3179
       // A tail call that is not a sibcall closes its call sequence here, before
       // the TC_RETURN node is built.
3180   if (!IsSibcall && isTailCall) {
3181     Chain = DAG.getCALLSEQ_END(Chain,
3182                                DAG.getIntPtrConstant(NumBytesToPop, true),
3183                                DAG.getIntPtrConstant(0, true), InFlag, dl);
3184     InFlag = Chain.getValue(1);
3185   }
3186
3187   Ops.push_back(Chain);
3188   Ops.push_back(Callee);
3189
       // Tail calls carry FPDiff as an extra i32 operand.
3190   if (isTailCall)
3191     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
3192
3193   // Add argument registers to the end of the list so that they are known live
3194   // into the call.
3195   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3196     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3197                                   RegsToPass[i].second.getValueType()));
3198
3199   // Add a register mask operand representing the call-preserved registers.
3200   const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
3201   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
3202   assert(Mask && "Missing call preserved mask for calling convention");
3203   Ops.push_back(DAG.getRegisterMask(Mask));
3204
3205   if (InFlag.getNode())
3206     Ops.push_back(InFlag);
3207
3208   if (isTailCall) {
3209     // We used to do:
3210     //// If this is the first return lowered for this function, add the regs
3211     //// to the liveout set for the function.
3212     // This isn't right, although it's probably harmless on x86; liveouts
3213     // should be computed from returns not tail calls.  Consider a void
3214     // function making a tail call to a function returning int.
3215     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3216   }
3217
3218   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3219   InFlag = Chain.getValue(1);
3220
3221   // Create the CALLSEQ_END node.
3222   unsigned NumBytesForCalleeToPop;
3223   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3224                        DAG.getTarget().Options.GuaranteedTailCallOpt))
3225     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
3226   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
3227            !Subtarget->getTargetTriple().isOSMSVCRT() &&
3228            SR == StackStructReturn)
3229     // If this is a call to a struct-return function, the callee
3230     // pops the hidden struct pointer, so we have to push it back.
3231     // This is common for Darwin/X86, Linux & Mingw32 targets.
3232     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3233     NumBytesForCalleeToPop = 4;
3234   else
3235     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
3236
3237   // Returns a flag for retval copy to use.
3238   if (!IsSibcall) {
3239     Chain = DAG.getCALLSEQ_END(Chain,
3240                                DAG.getIntPtrConstant(NumBytesToPop, true),
3241                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
3242                                                      true),
3243                                InFlag, dl);
3244     InFlag = Chain.getValue(1);
3245   }
3246
3247   // Handle result values, copying them out of physregs into vregs that we
3248   // return.
3249   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3250                          Ins, dl, DAG, InVals);
3251 }
3252
3253 //===----------------------------------------------------------------------===//
3254 //                Fast Calling Convention (tail call) implementation
3255 //===----------------------------------------------------------------------===//
3256
3257 //  This convention is like stdcall (the callee cleans up the arguments),
3258 //  except that ECX is reserved for storing the address of the tail-called
3259 //  function. Only 2 registers are free for argument passing (inreg). Tail
3260 //  call optimization is performed provided:
3261 //                * tailcallopt is enabled
3262 //                * caller/callee are fastcc
3263 //  On X86_64 architecture with GOT-style position independent code only local
3264 //  (within module) calls are supported at the moment.
3265 //  To keep the stack aligned according to platform abi the function
3266 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
3267 //  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3268 //  If the tail-called callee has more arguments than the caller, the caller
3269 //  needs to make sure that there is room to move the RETADDR to. This is
3270 //  achieved by reserving an area the size of the argument delta right after the
3271 //  original RETADDR, but before the saved frame pointer or the spilled
3272 //  registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
3273 //  stack layout:
3274 //    arg1
3275 //    arg2
3276 //    RETADDR
3277 //    [ new RETADDR
3278 //      move area ]
3279 //    (possible EBP)
3280 //    ESI
3281 //    EDI
3282 //    local1 ..
3283
3284 /// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
3285 /// for a 16 byte align requirement.
3286 unsigned
3287 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3288                                                SelectionDAG& DAG) const {
3289   MachineFunction &MF = DAG.getMachineFunction();
3290   const TargetMachine &TM = MF.getTarget();
3291   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
3292       TM.getSubtargetImpl()->getRegisterInfo());
3293   const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
3294   unsigned StackAlignment = TFI.getStackAlignment();
3295   uint64_t AlignMask = StackAlignment - 1;
3296   int64_t Offset = StackSize;
3297   unsigned SlotSize = RegInfo->getSlotSize();
3298   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3299     // Number smaller than 12 so just add the difference.
3300     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3301   } else {
3302     // Mask out lower bits, add stackalignment once plus the 12 bytes.
3303     Offset = ((~AlignMask) & Offset) + StackAlignment +
3304       (StackAlignment-SlotSize);
3305   }
3306   return Offset;
3307 }
3308
3309 /// MatchingStackOffset - Return true if the given stack call argument is
3310 /// already available in the same position (relatively) of the caller's
3311 /// incoming argument stack.
3312 static
3313 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3314                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3315                          const X86InstrInfo *TII) {
3316   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3317   int FI = INT_MAX;
3318   if (Arg.getOpcode() == ISD::CopyFromReg) {
3319     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3320     if (!TargetRegisterInfo::isVirtualRegister(VR))
3321       return false;
3322     MachineInstr *Def = MRI->getVRegDef(VR);
3323     if (!Def)
3324       return false;
3325     if (!Flags.isByVal()) {
3326       if (!TII->isLoadFromStackSlot(Def, FI))
3327         return false;
3328     } else {
3329       unsigned Opcode = Def->getOpcode();
3330       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
3331           Def->getOperand(1).isFI()) {
3332         FI = Def->getOperand(1).getIndex();
3333         Bytes = Flags.getByValSize();
3334       } else
3335         return false;
3336     }
3337   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3338     if (Flags.isByVal())
3339       // ByVal argument is passed in as a pointer but it's now being
3340       // dereferenced. e.g.
3341       // define @foo(%struct.X* %A) {
3342       //   tail call @bar(%struct.X* byval %A)
3343       // }
3344       return false;
3345     SDValue Ptr = Ld->getBasePtr();
3346     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3347     if (!FINode)
3348       return false;
3349     FI = FINode->getIndex();
3350   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3351     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3352     FI = FINode->getIndex();
3353     Bytes = Flags.getByValSize();
3354   } else
3355     return false;
3356
3357   assert(FI != INT_MAX);
3358   if (!MFI->isFixedObjectIndex(FI))
3359     return false;
3360   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
3361 }
3362
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
///
/// With GuaranteedTailCallOpt set, the answer depends only on the calling
/// conventions. Otherwise a sequence of sibcall safety checks is applied
/// (stack realignment, struct return, callee-pop conventions, varargs, x87
/// results, return value locations, stack argument layout and, on 32-bit,
/// register pressure for the call target); any failing check returns false.
bool
X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                                     bool isCalleeStructRet,
                                                     bool isCallerStructRet,
                                                     Type *RetTy,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG &DAG) const {
  // Only tail-call conventions and C-style conventions are considered at all.
  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
    return false;

  const MachineFunction &MF = DAG.getMachineFunction();
  const Function *CallerF = MF.getFunction();

  // If the function return type is x86_fp80 and the callee return type is not,
  // then the FP_EXTEND of the call result is not a nop. It's not safe to
  // perform a tailcall optimization here.
  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
    return false;

  CallingConv::ID CallerCC = CallerF->getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;
  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
  bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);

  // If -tailcallopt is specified, make fastcc functions tail-callable: the
  // decision then rests solely on the callee using a tail-call convention
  // that matches the caller's.
  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    if (IsTailCallConvention(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
  // emit a special epilogue.
  const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
      DAG.getSubtarget().getRegisterInfo());
  if (RegInfo->needsStackRealignment(MF))
    return false;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // An stdcall/thiscall caller is expected to clean up its arguments; the
  // callee isn't going to do that.
  // FIXME: this is more restrictive than needed. We could produce a tailcall
  // when the stack adjustment matches. For example, with a thiscall that takes
  // only one argument.
  if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
                   CallerCC == CallingConv::X86_ThisCall))
    return false;

  // Do not sibcall optimize vararg calls unless all arguments are passed via
  // registers.
  if (isVarArg && !Outs.empty()) {

    // Optimizing for varargs on Win64 is unlikely to be safe without
    // additional testing.
    if (IsCalleeWin64 || IsCallerWin64)
      return false;

    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
                   *DAG.getContext());

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
      if (!ArgLocs[i].isRegLoc())
        return false;
  }

  // If the call result is in ST0 / ST1, it needs to be popped off the x87
  // stack.  Therefore, if it's not used by the call it is not safe to optimize
  // this into a sibcall.
  bool Unused = false;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    if (!Ins[i].Used) {
      Unused = true;
      break;
    }
  }
  if (Unused) {
    // At least one result is unused: reject if any result is assigned to an
    // x87 stack register.
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
                   *DAG.getContext());
    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
      CCValAssign &VA = RVLocs[i];
      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
        return false;
    }
  }

  // If the calling conventions do not match, then we'd better make sure the
  // results are returned in the same way as what the caller expects.
  if (!CCMatch) {
    // Result locations under the callee's convention.
    SmallVector<CCValAssign, 16> RVLocs1;
    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
                    *DAG.getContext());
    CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);

    // Result locations under the caller's convention.
    SmallVector<CCValAssign, 16> RVLocs2;
    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
                    *DAG.getContext());
    CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);

    // Both assignments must agree element by element: same kind of location,
    // same conversion info and same register or stack offset.
    if (RVLocs1.size() != RVLocs2.size())
      return false;
    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
        return false;
      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
        return false;
      if (RVLocs1[i].isRegLoc()) {
        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
          return false;
      } else {
        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
          return false;
      }
    }
  }

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
                   *DAG.getContext());

    // Allocate shadow area for Win64
    if (IsCalleeWin64)
      CCInfo.AllocateStack(32, 8);

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    if (CCInfo.getNextStackOffset()) {
      // NOTE: this non-const MF intentionally shadows the const reference
      // declared at the top of the function; the frame info pointer obtained
      // from it below is handed to MatchingStackOffset as a non-const pointer.
      MachineFunction &MF = DAG.getMachineFunction();
      // The caller itself pops bytes on return: give up.
      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
        return false;

      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo *MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const X86InstrInfo *TII =
          static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        SDValue Arg = OutVals[i];
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        // Indirectly passed arguments are never accepted here.
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII))
            return false;
        }
      }
    }

    // If the tailcall address may be in a register, then make sure it's
    // possible to register allocate for it. In 32-bit, the call address can
    // only target EAX, EDX, or ECX since the tail call must be scheduled after
    // callee-saved registers are restored. These happen to be the same
    // registers used to pass 'inreg' arguments so watch out for those.
    if (!Subtarget->is64Bit() &&
        ((!isa<GlobalAddressSDNode>(Callee) &&
          !isa<ExternalSymbolSDNode>(Callee)) ||
         DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
      unsigned NumInRegs = 0;
      // In PIC we need an extra register to formulate the address computation
      // for the callee.
      unsigned MaxInRegs =
        (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;

      // Count the arguments assigned to EAX/EDX/ECX and bail out as soon as
      // the count reaches MaxInRegs.
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (!VA.isRegLoc())
          continue;
        unsigned Reg = VA.getLocReg();
        switch (Reg) {
        default: break;
        case X86::EAX: case X86::EDX: case X86::ECX:
          if (++NumInRegs == MaxInRegs)
            return false;
          break;
        }
      }
    }
  }

  return true;
}
3568
3569 FastISel *
3570 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3571                                   const TargetLibraryInfo *libInfo) const {
3572   return X86::createFastISel(funcInfo, libInfo);
3573 }
3574
3575 //===----------------------------------------------------------------------===//
3576 //                           Other Lowering Hooks
3577 //===----------------------------------------------------------------------===//
3578
3579 static bool MayFoldLoad(SDValue Op) {
3580   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3581 }
3582
3583 static bool MayFoldIntoStore(SDValue Op) {
3584   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3585 }
3586
3587 static bool isTargetShuffle(unsigned Opcode) {
3588   switch(Opcode) {
3589   default: return false;
3590   case X86ISD::BLENDI:
3591   case X86ISD::PSHUFB:
3592   case X86ISD::PSHUFD:
3593   case X86ISD::PSHUFHW:
3594   case X86ISD::PSHUFLW:
3595   case X86ISD::SHUFP:
3596   case X86ISD::PALIGNR:
3597   case X86ISD::MOVLHPS:
3598   case X86ISD::MOVLHPD:
3599   case X86ISD::MOVHLPS:
3600   case X86ISD::MOVLPS:
3601   case X86ISD::MOVLPD:
3602   case X86ISD::MOVSHDUP:
3603   case X86ISD::MOVSLDUP:
3604   case X86ISD::MOVDDUP:
3605   case X86ISD::MOVSS:
3606   case X86ISD::MOVSD:
3607   case X86ISD::UNPCKL:
3608   case X86ISD::UNPCKH:
3609   case X86ISD::VPERMILPI:
3610   case X86ISD::VPERM2X128:
3611   case X86ISD::VPERMI:
3612     return true;
3613   }
3614 }
3615
3616 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3617                                     SDValue V1, SelectionDAG &DAG) {
3618   switch(Opc) {
3619   default: llvm_unreachable("Unknown x86 shuffle node");
3620   case X86ISD::MOVSHDUP:
3621   case X86ISD::MOVSLDUP:
3622   case X86ISD::MOVDDUP:
3623     return DAG.getNode(Opc, dl, VT, V1);
3624   }
3625 }
3626
3627 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3628                                     SDValue V1, unsigned TargetMask,
3629                                     SelectionDAG &DAG) {
3630   switch(Opc) {
3631   default: llvm_unreachable("Unknown x86 shuffle node");
3632   case X86ISD::PSHUFD:
3633   case X86ISD::PSHUFHW:
3634   case X86ISD::PSHUFLW:
3635   case X86ISD::VPERMILPI:
3636   case X86ISD::VPERMI:
3637     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
3638   }
3639 }
3640
3641 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3642                                     SDValue V1, SDValue V2, unsigned TargetMask,
3643                                     SelectionDAG &DAG) {
3644   switch(Opc) {
3645   default: llvm_unreachable("Unknown x86 shuffle node");
3646   case X86ISD::PALIGNR:
3647   case X86ISD::VALIGN:
3648   case X86ISD::SHUFP:
3649   case X86ISD::VPERM2X128:
3650     return DAG.getNode(Opc, dl, VT, V1, V2,
3651                        DAG.getConstant(TargetMask, MVT::i8));
3652   }
3653 }
3654
3655 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
3656                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
3657   switch(Opc) {
3658   default: llvm_unreachable("Unknown x86 shuffle node");
3659   case X86ISD::MOVLHPS:
3660   case X86ISD::MOVLHPD:
3661   case X86ISD::MOVHLPS:
3662   case X86ISD::MOVLPS:
3663   case X86ISD::MOVLPD:
3664   case X86ISD::MOVSS:
3665   case X86ISD::MOVSD:
3666   case X86ISD::UNPCKL:
3667   case X86ISD::UNPCKH:
3668     return DAG.getNode(Opc, dl, VT, V1, V2);
3669   }
3670 }
3671
3672 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3673   MachineFunction &MF = DAG.getMachineFunction();
3674   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
3675       DAG.getSubtarget().getRegisterInfo());
3676   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3677   int ReturnAddrIndex = FuncInfo->getRAIndex();
3678
3679   if (ReturnAddrIndex == 0) {
3680     // Set up a frame object for the return address.
3681     unsigned SlotSize = RegInfo->getSlotSize();
3682     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3683                                                            -(int64_t)SlotSize,
3684                                                            false);
3685     FuncInfo->setRAIndex(ReturnAddrIndex);
3686   }
3687
3688   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
3689 }
3690
3691 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3692                                        bool hasSymbolicDisplacement) {
3693   // Offset should fit into 32 bit immediate field.
3694   if (!isInt<32>(Offset))
3695     return false;
3696
3697   // If we don't have a symbolic displacement - we don't have any extra
3698   // restrictions.
3699   if (!hasSymbolicDisplacement)
3700     return true;
3701
3702   // FIXME: Some tweaks might be needed for medium code model.
3703   if (M != CodeModel::Small && M != CodeModel::Kernel)
3704     return false;
3705
3706   // For small code model we assume that latest object is 16MB before end of 31
3707   // bits boundary. We may also accept pretty large negative constants knowing
3708   // that all objects are in the positive half of address space.
3709   if (M == CodeModel::Small && Offset < 16*1024*1024)
3710     return true;
3711
3712   // For kernel code model we know that all object resist in the negative half
3713   // of 32bits address space. We may not accept negative offsets, since they may
3714   // be just off and we may accept pretty large positive ones.
3715   if (M == CodeModel::Kernel && Offset >= 0)
3716     return true;
3717
3718   return false;
3719 }
3720
3721 /// isCalleePop - Determines whether the callee is required to pop its
3722 /// own arguments. Callee pop is necessary to support tail calls.
3723 bool X86::isCalleePop(CallingConv::ID CallingConv,
3724                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
3725   switch (CallingConv) {
3726   default:
3727     return false;
3728   case CallingConv::X86_StdCall:
3729   case CallingConv::X86_FastCall:
3730   case CallingConv::X86_ThisCall:
3731     return !is64Bit;
3732   case CallingConv::Fast:
3733   case CallingConv::GHC:
3734   case CallingConv::HiPE:
3735     if (IsVarArg)
3736       return false;
3737     return TailCallOpt;
3738   }
3739 }
3740
3741 /// \brief Return true if the condition is an unsigned comparison operation.
3742 static bool isX86CCUnsigned(unsigned X86CC) {
3743   switch (X86CC) {
3744   default: llvm_unreachable("Invalid integer condition!");
3745   case X86::COND_E:     return true;
3746   case X86::COND_G:     return false;
3747   case X86::COND_GE:    return false;
3748   case X86::COND_L:     return false;
3749   case X86::COND_LE:    return false;
3750   case X86::COND_NE:    return true;
3751   case X86::COND_B:     return true;
3752   case X86::COND_A:     return true;
3753   case X86::COND_BE:    return true;
3754   case X86::COND_AE:    return true;
3755   }
3756   llvm_unreachable("covered switch fell through?!");
3757 }
3758
3759 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
3760 /// specific condition code, returning the condition code and the LHS/RHS of the
3761 /// comparison to make.
3762 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
3763                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
3764   if (!isFP) {
3765     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3766       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3767         // X > -1   -> X == 0, jump !sign.
3768         RHS = DAG.getConstant(0, RHS.getValueType());
3769         return X86::COND_NS;
3770       }
3771       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3772         // X < 0   -> X == 0, jump on sign.
3773         return X86::COND_S;
3774       }
3775       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3776         // X < 1   -> X <= 0
3777         RHS = DAG.getConstant(0, RHS.getValueType());
3778         return X86::COND_LE;
3779       }
3780     }
3781
3782     switch (SetCCOpcode) {
3783     default: llvm_unreachable("Invalid integer condition!");
3784     case ISD::SETEQ:  return X86::COND_E;
3785     case ISD::SETGT:  return X86::COND_G;
3786     case ISD::SETGE:  return X86::COND_GE;
3787     case ISD::SETLT:  return X86::COND_L;
3788     case ISD::SETLE:  return X86::COND_LE;
3789     case ISD::SETNE:  return X86::COND_NE;
3790     case ISD::SETULT: return X86::COND_B;
3791     case ISD::SETUGT: return X86::COND_A;
3792     case ISD::SETULE: return X86::COND_BE;
3793     case ISD::SETUGE: return X86::COND_AE;
3794     }
3795   }
3796
3797   // First determine if it is required or is profitable to flip the operands.
3798
3799   // If LHS is a foldable load, but RHS is not, flip the condition.
3800   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3801       !ISD::isNON_EXTLoad(RHS.getNode())) {
3802     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3803     std::swap(LHS, RHS);
3804   }
3805
3806   switch (SetCCOpcode) {
3807   default: break;
3808   case ISD::SETOLT:
3809   case ISD::SETOLE:
3810   case ISD::SETUGT:
3811   case ISD::SETUGE:
3812     std::swap(LHS, RHS);
3813     break;
3814   }
3815
3816   // On a floating point condition, the flags are set as follows:
3817   // ZF  PF  CF   op
3818   //  0 | 0 | 0 | X > Y
3819   //  0 | 0 | 1 | X < Y
3820   //  1 | 0 | 0 | X == Y
3821   //  1 | 1 | 1 | unordered
3822   switch (SetCCOpcode) {
3823   default: llvm_unreachable("Condcode should be pre-legalized away");
3824   case ISD::SETUEQ:
3825   case ISD::SETEQ:   return X86::COND_E;
3826   case ISD::SETOLT:              // flipped
3827   case ISD::SETOGT:
3828   case ISD::SETGT:   return X86::COND_A;
3829   case ISD::SETOLE:              // flipped
3830   case ISD::SETOGE:
3831   case ISD::SETGE:   return X86::COND_AE;
3832   case ISD::SETUGT:              // flipped
3833   case ISD::SETULT:
3834   case ISD::SETLT:   return X86::COND_B;
3835   case ISD::SETUGE:              // flipped
3836   case ISD::SETULE:
3837   case ISD::SETLE:   return X86::COND_BE;
3838   case ISD::SETONE:
3839   case ISD::SETNE:   return X86::COND_NE;
3840   case ISD::SETUO:   return X86::COND_P;
3841   case ISD::SETO:    return X86::COND_NP;
3842   case ISD::SETOEQ:
3843   case ISD::SETUNE:  return X86::COND_INVALID;
3844   }
3845 }
3846
3847 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
3848 /// code. Current x86 isa includes the following FP cmov instructions:
3849 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3850 static bool hasFPCMov(unsigned X86CC) {
3851   switch (X86CC) {
3852   default:
3853     return false;
3854   case X86::COND_B:
3855   case X86::COND_BE:
3856   case X86::COND_E:
3857   case X86::COND_P:
3858   case X86::COND_A:
3859   case X86::COND_AE:
3860   case X86::COND_NE:
3861   case X86::COND_NP:
3862     return true;
3863   }
3864 }
3865
3866 /// isFPImmLegal - Returns true if the target can instruction select the
3867 /// specified FP immediate natively. If false, the legalizer will
3868 /// materialize the FP immediate as a load from a constant pool.
3869 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
3870   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
3871     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
3872       return true;
3873   }
3874   return false;
3875 }
3876
3877 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
3878                                               ISD::LoadExtType ExtTy,
3879                                               EVT NewVT) const {
3880   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3881   // relocation target a movq or addq instruction: don't let the load shrink.
3882   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3883   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3884     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3885       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3886   return true;
3887 }
3888
3889 /// \brief Returns true if it is beneficial to convert a load of a constant
3890 /// to just the constant itself.
3891 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
3892                                                           Type *Ty) const {
3893   assert(Ty->isIntegerTy());
3894
3895   unsigned BitSize = Ty->getPrimitiveSizeInBits();
3896   if (BitSize == 0 || BitSize > 64)
3897     return false;
3898   return true;
3899 }
3900
3901 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
3902                                                 unsigned Index) const {
3903   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
3904     return false;
3905
3906   return (Index == 0 || Index == ResVT.getVectorNumElements());
3907 }
3908
3909 bool X86TargetLowering::isCheapToSpeculateCttz() const {
3910   // Speculate cttz only if we can directly use TZCNT.
3911   return Subtarget->hasBMI();
3912 }
3913
3914 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
3915   // Speculate ctlz only if we can directly use LZCNT.
3916   return Subtarget->hasLZCNT();
3917 }
3918
/// isUndefOrInRange - Return true if Val is undef (any negative value) or if
/// its value falls within the half-open range [Low, Hi): Low is included,
/// Hi is excluded.
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val < 0) || (Val >= Low && Val < Hi);
}
3924
/// isUndefOrEqual - Return true if Val is undef (less than zero) or is equal
/// to CmpVal.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val < 0)
    return true;
  return Val == CmpVal;
}
3930
3931 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
3932 /// from position Pos and ending in Pos+Size, falls within the specified
3933 /// sequential range (Low, Low+Size]. or is undef.
3934 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
3935                                        unsigned Pos, unsigned Size, int Low) {
3936   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
3937     if (!isUndefOrEqual(Mask[i], Low))
3938       return false;
3939   return true;
3940 }
3941
3942 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
3943 /// is suitable for input to PSHUFD. That is, it doesn't reference the other
3944 /// operand - by default will match for first operand.
3945 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
3946                          bool TestSecondOperand = false) {
3947   if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
3948       VT != MVT::v2f64 && VT != MVT::v2i64)
3949     return false;
3950
3951   unsigned NumElems = VT.getVectorNumElements();
3952   unsigned Lo = TestSecondOperand ? NumElems : 0;
3953   unsigned Hi = Lo + NumElems;
3954
3955   for (unsigned i = 0; i < NumElems; ++i)
3956     if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
3957       return false;
3958
3959   return true;
3960 }
3961
3962 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
3963 /// is suitable for input to PSHUFHW.
3964 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3965   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3966     return false;
3967
3968   // Lower quadword copied in order or undef.
3969   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
3970     return false;
3971
3972   // Upper quadword shuffled.
3973   for (unsigned i = 4; i != 8; ++i)
3974     if (!isUndefOrInRange(Mask[i], 4, 8))
3975       return false;
3976
3977   if (VT == MVT::v16i16) {
3978     // Lower quadword copied in order or undef.
3979     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
3980       return false;
3981
3982     // Upper quadword shuffled.
3983     for (unsigned i = 12; i != 16; ++i)
3984       if (!isUndefOrInRange(Mask[i], 12, 16))
3985         return false;
3986   }
3987
3988   return true;
3989 }
3990
3991 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
3992 /// is suitable for input to PSHUFLW.
3993 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
3994   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
3995     return false;
3996
3997   // Upper quadword copied in order.
3998   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
3999     return false;
4000
4001   // Lower quadword shuffled.
4002   for (unsigned i = 0; i != 4; ++i)
4003     if (!isUndefOrInRange(Mask[i], 0, 4))
4004       return false;
4005
4006   if (VT == MVT::v16i16) {
4007     // Upper quadword copied in order.
4008     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
4009       return false;
4010
4011     // Lower quadword shuffled.
4012     for (unsigned i = 8; i != 12; ++i)
4013       if (!isUndefOrInRange(Mask[i], 8, 12))
4014         return false;
4015   }
4016
4017   return true;
4018 }
4019
/// \brief Return true if the mask specifies a shuffle of elements that is
/// suitable for input to intralane (palignr) or interlane (valign) vector
/// right-shift.
///
/// With InterLane set the whole vector is treated as a single lane; otherwise
/// the vector is processed in 128-bit lanes. Within each lane the defined
/// mask elements must form one consecutive run over the concatenation of the
/// two sources, and every lane after the first must use the same pattern as
/// lane 0.
static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts/NumLanes;

  // Do not handle 64-bit element shuffles with palignr.
  if (NumLaneElts == 2)
    return false;

  // l is the index of the first element of the lane being examined.
  for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
    // Find the first defined (non-negative) mask element of this lane.
    unsigned i;
    for (i = 0; i != NumLaneElts; ++i) {
      if (Mask[i+l] >= 0)
        break;
    }

    // Lane is all undef, go to next lane
    if (i == NumLaneElts)
      continue;

    int Start = Mask[i+l];

    // Make sure its in this lane in one of the sources
    if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
        !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
      return false;

    // If not lane 0, then we must match lane 0
    if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
      return false;

    // Correct second source to be contiguous with first source
    if (Start >= (int)NumElts)
      Start -= NumElts - NumLaneElts;

    // Make sure we're shifting in the right direction.
    if (Start <= (int)(i+l))
      return false;

    // Rebase Start so that it is the value expected at lane position 0; the
    // element at lane position i must then equal Start+i.
    Start -= i;

    // Check the rest of the elements to see if they are consecutive.
    for (++i; i != NumLaneElts; ++i) {
      int Idx = Mask[i+l];

      // Make sure its in this lane
      if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
          !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
        return false;

      // If not lane 0, then we must match lane 0
      if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
        return false;

      // Same second-source correction as applied to Start above.
      if (Idx >= (int)NumElts)
        Idx -= NumElts - NumLaneElts;

      if (!isUndefOrEqual(Idx, Start+i))
        return false;

    }
  }

  return true;
}
4088
4089 /// \brief Return true if the node specifies a shuffle of elements that is
4090 /// suitable for input to PALIGNR.
4091 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
4092                           const X86Subtarget *Subtarget) {
4093   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
4094       (VT.is256BitVector() && !Subtarget->hasInt256()) ||
4095       VT.is512BitVector())
4096     // FIXME: Add AVX512BW.
4097     return false;
4098
4099   return isAlignrMask(Mask, VT, false);
4100 }
4101
4102 /// \brief Return true if the node specifies a shuffle of elements that is
4103 /// suitable for input to VALIGN.
4104 static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
4105                           const X86Subtarget *Subtarget) {
4106   // FIXME: Add AVX512VL.
4107   if (!VT.is512BitVector() || !Subtarget->hasAVX512())
4108     return false;
4109   return isAlignrMask(Mask, VT, true);
4110 }
4111
4112 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
4113 /// the two vector operands have swapped position.
4114 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
4115                                      unsigned NumElems) {
4116   for (unsigned i = 0; i != NumElems; ++i) {
4117     int idx = Mask[i];
4118     if (idx < 0)
4119       continue;
4120     else if (idx < (int)NumElems)
4121       Mask[i] = idx + NumElems;
4122     else
4123       Mask[i] = idx - NumElems;
4124   }
4125 }
4126
4127 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
4128 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
4129 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
4130 /// reverse of what x86 shuffles want.
4131 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
4132
4133   unsigned NumElems = VT.getVectorNumElements();
4134   unsigned NumLanes = VT.getSizeInBits()/128;
4135   unsigned NumLaneElems = NumElems/NumLanes;
4136
4137   if (NumLaneElems != 2 && NumLaneElems != 4)
4138     return false;
4139
4140   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4141   bool symetricMaskRequired =
4142     (VT.getSizeInBits() >= 256) && (EltSize == 32);
4143
4144   // VSHUFPSY divides the resulting vector into 4 chunks.
4145   // The sources are also splitted into 4 chunks, and each destination
4146   // chunk must come from a different source chunk.
4147   //
4148   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
4149   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y9
4150   //
4151   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
4152   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
4153   //
4154   // VSHUFPDY divides the resulting vector into 4 chunks.
4155   // The sources are also splitted into 4 chunks, and each destination
4156   // chunk must come from a different source chunk.
4157   //
4158   //  SRC1 =>      X3       X2       X1       X0
4159   //  SRC2 =>      Y3       Y2       Y1       Y0
4160   //
4161   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
4162   //
4163   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
4164   unsigned HalfLaneElems = NumLaneElems/2;
4165   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
4166     for (unsigned i = 0; i != NumLaneElems; ++i) {
4167       int Idx = Mask[i+l];
4168       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
4169       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
4170         return false;
4171       // For VSHUFPSY, the mask of the second half must be the same as the
4172       // first but with the appropriate offsets. This works in the same way as
4173       // VPERMILPS works with masks.
4174       if (!symetricMaskRequired || Idx < 0)
4175         continue;
4176       if (MaskVal[i] < 0) {
4177         MaskVal[i] = Idx - l;
4178         continue;
4179       }
4180       if ((signed)(Idx - l) != MaskVal[i])
4181         return false;
4182     }
4183   }
4184
4185   return true;
4186 }
4187
4188 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
4189 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
4190 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
4191   if (!VT.is128BitVector())
4192     return false;
4193
4194   unsigned NumElems = VT.getVectorNumElements();
4195
4196   if (NumElems != 4)
4197     return false;
4198
4199   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
4200   return isUndefOrEqual(Mask[0], 6) &&
4201          isUndefOrEqual(Mask[1], 7) &&
4202          isUndefOrEqual(Mask[2], 2) &&
4203          isUndefOrEqual(Mask[3], 3);
4204 }
4205
4206 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
4207 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
4208 /// <2, 3, 2, 3>
4209 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
4210   if (!VT.is128BitVector())
4211     return false;
4212
4213   unsigned NumElems = VT.getVectorNumElements();
4214
4215   if (NumElems != 4)
4216     return false;
4217
4218   return isUndefOrEqual(Mask[0], 2) &&
4219          isUndefOrEqual(Mask[1], 3) &&
4220          isUndefOrEqual(Mask[2], 2) &&
4221          isUndefOrEqual(Mask[3], 3);
4222 }
4223
4224 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
4225 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
4226 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
4227   if (!VT.is128BitVector())
4228     return false;
4229
4230   unsigned NumElems = VT.getVectorNumElements();
4231
4232   if (NumElems != 2 && NumElems != 4)
4233     return false;
4234
4235   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4236     if (!isUndefOrEqual(Mask[i], i + NumElems))
4237       return false;
4238
4239   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
4240     if (!isUndefOrEqual(Mask[i], i))
4241       return false;
4242
4243   return true;
4244 }
4245
4246 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
4247 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
4248 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
4249   if (!VT.is128BitVector())
4250     return false;
4251
4252   unsigned NumElems = VT.getVectorNumElements();
4253
4254   if (NumElems != 2 && NumElems != 4)
4255     return false;
4256
4257   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4258     if (!isUndefOrEqual(Mask[i], i))
4259       return false;
4260
4261   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
4262     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
4263       return false;
4264
4265   return true;
4266 }
4267
4268 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
4269 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
4270 /// i. e: If all but one element come from the same vector.
4271 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
4272   // TODO: Deal with AVX's VINSERTPS
4273   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
4274     return false;
4275
4276   unsigned CorrectPosV1 = 0;
4277   unsigned CorrectPosV2 = 0;
4278   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
4279     if (Mask[i] == -1) {
4280       ++CorrectPosV1;
4281       ++CorrectPosV2;
4282       continue;
4283     }
4284
4285     if (Mask[i] == i)
4286       ++CorrectPosV1;
4287     else if (Mask[i] == i + 4)
4288       ++CorrectPosV2;
4289   }
4290
4291   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
4292     // We have 3 elements (undefs count as elements from any vector) from one
4293     // vector, and one from another.
4294     return true;
4295
4296   return false;
4297 }
4298
4299 //
4300 // Some special combinations that can be optimized.
4301 //
4302 static
4303 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
4304                                SelectionDAG &DAG) {
4305   MVT VT = SVOp->getSimpleValueType(0);
4306   SDLoc dl(SVOp);
4307
4308   if (VT != MVT::v8i32 && VT != MVT::v8f32)
4309     return SDValue();
4310
4311   ArrayRef<int> Mask = SVOp->getMask();
4312
4313   // These are the special masks that may be optimized.
4314   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
4315   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
4316   bool MatchEvenMask = true;
4317   bool MatchOddMask  = true;
4318   for (int i=0; i<8; ++i) {
4319     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
4320       MatchEvenMask = false;
4321     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
4322       MatchOddMask = false;
4323   }
4324
4325   if (!MatchEvenMask && !MatchOddMask)
4326     return SDValue();
4327
4328   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
4329
4330   SDValue Op0 = SVOp->getOperand(0);
4331   SDValue Op1 = SVOp->getOperand(1);
4332
4333   if (MatchEvenMask) {
4334     // Shift the second operand right to 32 bits.
4335     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
4336     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
4337   } else {
4338     // Shift the first operand left to 32 bits.
4339     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
4340     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
4341   }
4342   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
4343   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
4344 }
4345
4346 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
4347 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
4348 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
4349                          bool HasInt256, bool V2IsSplat = false) {
4350
4351   assert(VT.getSizeInBits() >= 128 &&
4352          "Unsupported vector type for unpckl");
4353
4354   unsigned NumElts = VT.getVectorNumElements();
4355   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4356       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4357     return false;
4358
4359   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4360          "Unsupported vector type for unpckh");
4361
4362   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4363   unsigned NumLanes = VT.getSizeInBits()/128;
4364   unsigned NumLaneElts = NumElts/NumLanes;
4365
4366   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4367     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4368       int BitI  = Mask[l+i];
4369       int BitI1 = Mask[l+i+1];
4370       if (!isUndefOrEqual(BitI, j))
4371         return false;
4372       if (V2IsSplat) {
4373         if (!isUndefOrEqual(BitI1, NumElts))
4374           return false;
4375       } else {
4376         if (!isUndefOrEqual(BitI1, j + NumElts))
4377           return false;
4378       }
4379     }
4380   }
4381
4382   return true;
4383 }
4384
4385 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
4386 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
4387 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
4388                          bool HasInt256, bool V2IsSplat = false) {
4389   assert(VT.getSizeInBits() >= 128 &&
4390          "Unsupported vector type for unpckh");
4391
4392   unsigned NumElts = VT.getVectorNumElements();
4393   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4394       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4395     return false;
4396
4397   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
4398          "Unsupported vector type for unpckh");
4399
4400   // AVX defines UNPCK* to operate independently on 128-bit lanes.
4401   unsigned NumLanes = VT.getSizeInBits()/128;
4402   unsigned NumLaneElts = NumElts/NumLanes;
4403
4404   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4405     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4406       int BitI  = Mask[l+i];
4407       int BitI1 = Mask[l+i+1];
4408       if (!isUndefOrEqual(BitI, j))
4409         return false;
4410       if (V2IsSplat) {
4411         if (isUndefOrEqual(BitI1, NumElts))
4412           return false;
4413       } else {
4414         if (!isUndefOrEqual(BitI1, j+NumElts))
4415           return false;
4416       }
4417     }
4418   }
4419   return true;
4420 }
4421
4422 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
4423 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
4424 /// <0, 0, 1, 1>
4425 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4426   unsigned NumElts = VT.getVectorNumElements();
4427   bool Is256BitVec = VT.is256BitVector();
4428
4429   if (VT.is512BitVector())
4430     return false;
4431   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4432          "Unsupported vector type for unpckh");
4433
4434   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
4435       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4436     return false;
4437
4438   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
4439   // FIXME: Need a better way to get rid of this, there's no latency difference
4440   // between UNPCKLPD and MOVDDUP, the later should always be checked first and
4441   // the former later. We should also remove the "_undef" special mask.
4442   if (NumElts == 4 && Is256BitVec)
4443     return false;
4444
4445   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4446   // independently on 128-bit lanes.
4447   unsigned NumLanes = VT.getSizeInBits()/128;
4448   unsigned NumLaneElts = NumElts/NumLanes;
4449
4450   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4451     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
4452       int BitI  = Mask[l+i];
4453       int BitI1 = Mask[l+i+1];
4454
4455       if (!isUndefOrEqual(BitI, j))
4456         return false;
4457       if (!isUndefOrEqual(BitI1, j))
4458         return false;
4459     }
4460   }
4461
4462   return true;
4463 }
4464
4465 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
4466 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
4467 /// <2, 2, 3, 3>
4468 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
4469   unsigned NumElts = VT.getVectorNumElements();
4470
4471   if (VT.is512BitVector())
4472     return false;
4473
4474   assert((VT.is128BitVector() || VT.is256BitVector()) &&
4475          "Unsupported vector type for unpckh");
4476
4477   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
4478       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
4479     return false;
4480
4481   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
4482   // independently on 128-bit lanes.
4483   unsigned NumLanes = VT.getSizeInBits()/128;
4484   unsigned NumLaneElts = NumElts/NumLanes;
4485
4486   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
4487     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
4488       int BitI  = Mask[l+i];
4489       int BitI1 = Mask[l+i+1];
4490       if (!isUndefOrEqual(BitI, j))
4491         return false;
4492       if (!isUndefOrEqual(BitI1, j))
4493         return false;
4494     }
4495   }
4496   return true;
4497 }
4498
4499 // Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
4500 // (src1[0], src0[1]), manipulation with 256-bit sub-vectors
4501 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
4502   if (!VT.is512BitVector())
4503     return false;
4504
4505   unsigned NumElts = VT.getVectorNumElements();
4506   unsigned HalfSize = NumElts/2;
4507   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
4508     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
4509       *Imm = 1;
4510       return true;
4511     }
4512   }
4513   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
4514     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
4515       *Imm = 0;
4516       return true;
4517     }
4518   }
4519   return false;
4520 }
4521
4522 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
4523 /// specifies a shuffle of elements that is suitable for input to MOVSS,
4524 /// MOVSD, and MOVD, i.e. setting the lowest element.
4525 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
4526   if (VT.getVectorElementType().getSizeInBits() < 32)
4527     return false;
4528   if (!VT.is128BitVector())
4529     return false;
4530
4531   unsigned NumElts = VT.getVectorNumElements();
4532
4533   if (!isUndefOrEqual(Mask[0], NumElts))
4534     return false;
4535
4536   for (unsigned i = 1; i != NumElts; ++i)
4537     if (!isUndefOrEqual(Mask[i], i))
4538       return false;
4539
4540   return true;
4541 }
4542
4543 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
4544 /// as permutations between 128-bit chunks or halves. As an example: this
4545 /// shuffle bellow:
4546 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
4547 /// The first half comes from the second half of V1 and the second half from the
4548 /// the second half of V2.
4549 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4550   if (!HasFp256 || !VT.is256BitVector())
4551     return false;
4552
4553   // The shuffle result is divided into half A and half B. In total the two
4554   // sources have 4 halves, namely: C, D, E, F. The final values of A and
4555   // B must come from C, D, E or F.
4556   unsigned HalfSize = VT.getVectorNumElements()/2;
4557   bool MatchA = false, MatchB = false;
4558
4559   // Check if A comes from one of C, D, E, F.
4560   for (unsigned Half = 0; Half != 4; ++Half) {
4561     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
4562       MatchA = true;
4563       break;
4564     }
4565   }
4566
4567   // Check if B comes from one of C, D, E, F.
4568   for (unsigned Half = 0; Half != 4; ++Half) {
4569     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
4570       MatchB = true;
4571       break;
4572     }
4573   }
4574
4575   return MatchA && MatchB;
4576 }
4577
4578 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
4579 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
4580 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
4581   MVT VT = SVOp->getSimpleValueType(0);
4582
4583   unsigned HalfSize = VT.getVectorNumElements()/2;
4584
4585   unsigned FstHalf = 0, SndHalf = 0;
4586   for (unsigned i = 0; i < HalfSize; ++i) {
4587     if (SVOp->getMaskElt(i) > 0) {
4588       FstHalf = SVOp->getMaskElt(i)/HalfSize;
4589       break;
4590     }
4591   }
4592   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
4593     if (SVOp->getMaskElt(i) > 0) {
4594       SndHalf = SVOp->getMaskElt(i)/HalfSize;
4595       break;
4596     }
4597   }
4598
4599   return (FstHalf | (SndHalf << 4));
4600 }
4601
4602 // Symetric in-lane mask. Each lane has 4 elements (for imm8)
4603 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
4604   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4605   if (EltSize < 32)
4606     return false;
4607
4608   unsigned NumElts = VT.getVectorNumElements();
4609   Imm8 = 0;
4610   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
4611     for (unsigned i = 0; i != NumElts; ++i) {
4612       if (Mask[i] < 0)
4613         continue;
4614       Imm8 |= Mask[i] << (i*2);
4615     }
4616     return true;
4617   }
4618
4619   unsigned LaneSize = 4;
4620   SmallVector<int, 4> MaskVal(LaneSize, -1);
4621
4622   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4623     for (unsigned i = 0; i != LaneSize; ++i) {
4624       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4625         return false;
4626       if (Mask[i+l] < 0)
4627         continue;
4628       if (MaskVal[i] < 0) {
4629         MaskVal[i] = Mask[i+l] - l;
4630         Imm8 |= MaskVal[i] << (i*2);
4631         continue;
4632       }
4633       if (Mask[i+l] != (signed)(MaskVal[i]+l))
4634         return false;
4635     }
4636   }
4637   return true;
4638 }
4639
4640 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
4641 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
4642 /// Note that VPERMIL mask matching is different depending whether theunderlying
4643 /// type is 32 or 64. In the VPERMILPS the high half of the mask should point
4644 /// to the same elements of the low, but to the higher half of the source.
4645 /// In VPERMILPD the two lanes could be shuffled independently of each other
4646 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
4647 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
4648   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
4649   if (VT.getSizeInBits() < 256 || EltSize < 32)
4650     return false;
4651   bool symetricMaskRequired = (EltSize == 32);
4652   unsigned NumElts = VT.getVectorNumElements();
4653
4654   unsigned NumLanes = VT.getSizeInBits()/128;
4655   unsigned LaneSize = NumElts/NumLanes;
4656   // 2 or 4 elements in one lane
4657
4658   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
4659   for (unsigned l = 0; l != NumElts; l += LaneSize) {
4660     for (unsigned i = 0; i != LaneSize; ++i) {
4661       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
4662         return false;
4663       if (symetricMaskRequired) {
4664         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
4665           ExpectedMaskVal[i] = Mask[i+l] - l;
4666           continue;
4667         }
4668         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
4669           return false;
4670       }
4671     }
4672   }
4673   return true;
4674 }
4675
4676 /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
4677 /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
4678 /// element of vector 2 and the other elements to come from vector 1 in order.
4679 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
4680                                bool V2IsSplat = false, bool V2IsUndef = false) {
4681   if (!VT.is128BitVector())
4682     return false;
4683
4684   unsigned NumOps = VT.getVectorNumElements();
4685   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
4686     return false;
4687
4688   if (!isUndefOrEqual(Mask[0], 0))
4689     return false;
4690
4691   for (unsigned i = 1; i != NumOps; ++i)
4692     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
4693           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
4694           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
4695       return false;
4696
4697   return true;
4698 }
4699
4700 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4701 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
4702 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
4703 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
4704                            const X86Subtarget *Subtarget) {
4705   if (!Subtarget->hasSSE3())
4706     return false;
4707
4708   unsigned NumElems = VT.getVectorNumElements();
4709
4710   if ((VT.is128BitVector() && NumElems != 4) ||
4711       (VT.is256BitVector() && NumElems != 8) ||
4712       (VT.is512BitVector() && NumElems != 16))
4713     return false;
4714
4715   // "i+1" is the value the indexed mask element must have
4716   for (unsigned i = 0; i != NumElems; i += 2)
4717     if (!isUndefOrEqual(Mask[i], i+1) ||
4718         !isUndefOrEqual(Mask[i+1], i+1))
4719       return false;
4720
4721   return true;
4722 }
4723
4724 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4725 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
4726 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
4727 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
4728                            const X86Subtarget *Subtarget) {
4729   if (!Subtarget->hasSSE3())
4730     return false;
4731
4732   unsigned NumElems = VT.getVectorNumElements();
4733
4734   if ((VT.is128BitVector() && NumElems != 4) ||
4735       (VT.is256BitVector() && NumElems != 8) ||
4736       (VT.is512BitVector() && NumElems != 16))
4737     return false;
4738
4739   // "i" is the value the indexed mask element must have
4740   for (unsigned i = 0; i != NumElems; i += 2)
4741     if (!isUndefOrEqual(Mask[i], i) ||
4742         !isUndefOrEqual(Mask[i+1], i))
4743       return false;
4744
4745   return true;
4746 }
4747
4748 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
4749 /// specifies a shuffle of elements that is suitable for input to 256-bit
4750 /// version of MOVDDUP.
4751 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
4752   if (!HasFp256 || !VT.is256BitVector())
4753     return false;
4754
4755   unsigned NumElts = VT.getVectorNumElements();
4756   if (NumElts != 4)
4757     return false;
4758
4759   for (unsigned i = 0; i != NumElts/2; ++i)
4760     if (!isUndefOrEqual(Mask[i], 0))
4761       return false;
4762   for (unsigned i = NumElts/2; i != NumElts; ++i)
4763     if (!isUndefOrEqual(Mask[i], NumElts/2))
4764       return false;
4765   return true;
4766 }
4767
4768 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
4769 /// specifies a shuffle of elements that is suitable for input to 128-bit
4770 /// version of MOVDDUP.
4771 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
4772   if (!VT.is128BitVector())
4773     return false;
4774
4775   unsigned e = VT.getVectorNumElements() / 2;
4776   for (unsigned i = 0; i != e; ++i)
4777     if (!isUndefOrEqual(Mask[i], i))
4778       return false;
4779   for (unsigned i = 0; i != e; ++i)
4780     if (!isUndefOrEqual(Mask[e+i], i))
4781       return false;
4782   return true;
4783 }
4784
4785 /// isVEXTRACTIndex - Return true if the specified
4786 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
4787 /// suitable for instruction that extract 128 or 256 bit vectors
4788 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4789   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4790   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4791     return false;
4792
4793   // The index should be aligned on a vecWidth-bit boundary.
4794   uint64_t Index =
4795     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4796
4797   MVT VT = N->getSimpleValueType(0);
4798   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4799   bool Result = (Index * ElSize) % vecWidth == 0;
4800
4801   return Result;
4802 }
4803
4804 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
4805 /// operand specifies a subvector insert that is suitable for input to
4806 /// insertion of 128 or 256-bit subvectors
4807 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4808   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4809   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4810     return false;
4811   // The index should be aligned on a vecWidth-bit boundary.
4812   uint64_t Index =
4813     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4814
4815   MVT VT = N->getSimpleValueType(0);
4816   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4817   bool Result = (Index * ElSize) % vecWidth == 0;
4818
4819   return Result;
4820 }
4821
4822 bool X86::isVINSERT128Index(SDNode *N) {
4823   return isVINSERTIndex(N, 128);
4824 }
4825
4826 bool X86::isVINSERT256Index(SDNode *N) {
4827   return isVINSERTIndex(N, 256);
4828 }
4829
4830 bool X86::isVEXTRACT128Index(SDNode *N) {
4831   return isVEXTRACTIndex(N, 128);
4832 }
4833
4834 bool X86::isVEXTRACT256Index(SDNode *N) {
4835   return isVEXTRACTIndex(N, 256);
4836 }
4837
4838 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
4839 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
4840 /// Handles 128-bit and 256-bit.
4841 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
4842   MVT VT = N->getSimpleValueType(0);
4843
4844   assert((VT.getSizeInBits() >= 128) &&
4845          "Unsupported vector type for PSHUF/SHUFP");
4846
4847   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
4848   // independently on 128-bit lanes.
4849   unsigned NumElts = VT.getVectorNumElements();
4850   unsigned NumLanes = VT.getSizeInBits()/128;
4851   unsigned NumLaneElts = NumElts/NumLanes;
4852
4853   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
4854          "Only supports 2, 4 or 8 elements per lane");
4855
4856   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
4857   unsigned Mask = 0;
4858   for (unsigned i = 0; i != NumElts; ++i) {
4859     int Elt = N->getMaskElt(i);
4860     if (Elt < 0) continue;
4861     Elt &= NumLaneElts - 1;
4862     unsigned ShAmt = (i << Shift) % 8;
4863     Mask |= Elt << ShAmt;
4864   }
4865
4866   return Mask;
4867 }
4868
4869 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
4870 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
4871 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
4872   MVT VT = N->getSimpleValueType(0);
4873
4874   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4875          "Unsupported vector type for PSHUFHW");
4876
4877   unsigned NumElts = VT.getVectorNumElements();
4878
4879   unsigned Mask = 0;
4880   for (unsigned l = 0; l != NumElts; l += 8) {
4881     // 8 nodes per lane, but we only care about the last 4.
4882     for (unsigned i = 0; i < 4; ++i) {
4883       int Elt = N->getMaskElt(l+i+4);
4884       if (Elt < 0) continue;
4885       Elt &= 0x3; // only 2-bits.
4886       Mask |= Elt << (i * 2);
4887     }
4888   }
4889
4890   return Mask;
4891 }
4892
4893 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
4894 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
4895 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
4896   MVT VT = N->getSimpleValueType(0);
4897
4898   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
4899          "Unsupported vector type for PSHUFHW");
4900
4901   unsigned NumElts = VT.getVectorNumElements();
4902
4903   unsigned Mask = 0;
4904   for (unsigned l = 0; l != NumElts; l += 8) {
4905     // 8 nodes per lane, but we only care about the first 4.
4906     for (unsigned i = 0; i < 4; ++i) {
4907       int Elt = N->getMaskElt(l+i);
4908       if (Elt < 0) continue;
4909       Elt &= 0x3; // only 2-bits
4910       Mask |= Elt << (i * 2);
4911     }
4912   }
4913
4914   return Mask;
4915 }
4916
4917 /// \brief Return the appropriate immediate to shuffle the specified
4918 /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
4919 /// VALIGN (if Interlane is true) instructions.
4920 static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
4921                                            bool InterLane) {
4922   MVT VT = SVOp->getSimpleValueType(0);
4923   unsigned EltSize = InterLane ? 1 :
4924     VT.getVectorElementType().getSizeInBits() >> 3;
4925
4926   unsigned NumElts = VT.getVectorNumElements();
4927   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
4928   unsigned NumLaneElts = NumElts/NumLanes;
4929
4930   int Val = 0;
4931   unsigned i;
4932   for (i = 0; i != NumElts; ++i) {
4933     Val = SVOp->getMaskElt(i);
4934     if (Val >= 0)
4935       break;
4936   }
4937   if (Val >= (int)NumElts)
4938     Val -= NumElts - NumLaneElts;
4939
4940   assert(Val - i > 0 && "PALIGNR imm should be positive");
4941   return (Val - i) * EltSize;
4942 }
4943
4944 /// \brief Return the appropriate immediate to shuffle the specified
4945 /// VECTOR_SHUFFLE mask with the PALIGNR instruction.
4946 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
4947   return getShuffleAlignrImmediate(SVOp, false);
4948 }
4949
4950 /// \brief Return the appropriate immediate to shuffle the specified
4951 /// VECTOR_SHUFFLE mask with the VALIGN instruction.
4952 static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
4953   return getShuffleAlignrImmediate(SVOp, true);
4954 }
4955
4956
4957 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4958   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4959   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4960     llvm_unreachable("Illegal extract subvector for VEXTRACT");
4961
4962   uint64_t Index =
4963     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4964
4965   MVT VecVT = N->getOperand(0).getSimpleValueType();
4966   MVT ElVT = VecVT.getVectorElementType();
4967
4968   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4969   return Index / NumElemsPerChunk;
4970 }
4971
4972 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4973   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4974   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4975     llvm_unreachable("Illegal insert subvector for VINSERT");
4976
4977   uint64_t Index =
4978     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4979
4980   MVT VecVT = N->getSimpleValueType(0);
4981   MVT ElVT = VecVT.getVectorElementType();
4982
4983   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4984   return Index / NumElemsPerChunk;
4985 }
4986
4987 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
4988 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
4989 /// and VINSERTI128 instructions.
4990 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4991   return getExtractVEXTRACTImmediate(N, 128);
4992 }
4993
4994 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
4995 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
4996 /// and VINSERTI64x4 instructions.
4997 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4998   return getExtractVEXTRACTImmediate(N, 256);
4999 }
5000
5001 /// getInsertVINSERT128Immediate - Return the appropriate immediate
5002 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
5003 /// and VINSERTI128 instructions.
5004 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
5005   return getInsertVINSERTImmediate(N, 128);
5006 }
5007
5008 /// getInsertVINSERT256Immediate - Return the appropriate immediate
5009 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4
5010 /// and VINSERTI64x4 instructions.
5011 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
5012   return getInsertVINSERTImmediate(N, 256);
5013 }
5014
5015 /// isZero - Returns true if Elt is a constant integer zero
5016 static bool isZero(SDValue V) {
5017   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
5018   return C && C->isNullValue();
5019 }
5020
5021 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
5022 /// constant +0.0.
5023 bool X86::isZeroNode(SDValue Elt) {
5024   if (isZero(Elt))
5025     return true;
5026   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
5027     return CFP->getValueAPF().isPosZero();
5028   return false;
5029 }
5030
5031 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
5032 /// match movhlps. The lower half elements should come from upper half of
5033 /// V1 (and in order), and the upper half elements should come from the upper
5034 /// half of V2 (and in order).
5035 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
5036   if (!VT.is128BitVector())
5037     return false;
5038   if (VT.getVectorNumElements() != 4)
5039     return false;
5040   for (unsigned i = 0, e = 2; i != e; ++i)
5041     if (!isUndefOrEqual(Mask[i], i+2))
5042       return false;
5043   for (unsigned i = 2; i != 4; ++i)
5044     if (!isUndefOrEqual(Mask[i], i+4))
5045       return false;
5046   return true;
5047 }
5048
5049 /// isScalarLoadToVector - Returns true if the node is a scalar load that
5050 /// is promoted to a vector. It also returns the LoadSDNode by reference if
5051 /// required.
5052 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
5053   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
5054     return false;
5055   N = N->getOperand(0).getNode();
5056   if (!ISD::isNON_EXTLoad(N))
5057     return false;
5058   if (LD)
5059     *LD = cast<LoadSDNode>(N);
5060   return true;
5061 }
5062
5063 // Test whether the given value is a vector value which will be legalized
5064 // into a load.
5065 static bool WillBeConstantPoolLoad(SDNode *N) {
5066   if (N->getOpcode() != ISD::BUILD_VECTOR)
5067     return false;
5068
5069   // Check for any non-constant elements.
5070   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
5071     switch (N->getOperand(i).getNode()->getOpcode()) {
5072     case ISD::UNDEF:
5073     case ISD::ConstantFP:
5074     case ISD::Constant:
5075       break;
5076     default:
5077       return false;
5078     }
5079
5080   // Vectors of all-zeros and all-ones are materialized with special
5081   // instructions rather than being loaded.
5082   return !ISD::isBuildVectorAllZeros(N) &&
5083          !ISD::isBuildVectorAllOnes(N);
5084 }
5085
5086 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
5087 /// match movlp{s|d}. The lower half elements should come from lower half of
5088 /// V1 (and in order), and the upper half elements should come from the upper
5089 /// half of V2 (and in order). And since V1 will become the source of the
5090 /// MOVLP, it must be either a vector load or a scalar load to vector.
5091 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
5092                                ArrayRef<int> Mask, MVT VT) {
5093   if (!VT.is128BitVector())
5094     return false;
5095
5096   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
5097     return false;
5098   // Is V2 is a vector load, don't do this transformation. We will try to use
5099   // load folding shufps op.
5100   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
5101     return false;
5102
5103   unsigned NumElems = VT.getVectorNumElements();
5104
5105   if (NumElems != 2 && NumElems != 4)
5106     return false;
5107   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
5108     if (!isUndefOrEqual(Mask[i], i))
5109       return false;
5110   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
5111     if (!isUndefOrEqual(Mask[i], i+NumElems))
5112       return false;
5113   return true;
5114 }
5115
5116 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
5117 /// to an zero vector.
5118 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
5119 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
5120   SDValue V1 = N->getOperand(0);
5121   SDValue V2 = N->getOperand(1);
5122   unsigned NumElems = N->getValueType(0).getVectorNumElements();
5123   for (unsigned i = 0; i != NumElems; ++i) {
5124     int Idx = N->getMaskElt(i);
5125     if (Idx >= (int)NumElems) {
5126       unsigned Opc = V2.getOpcode();
5127       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
5128         continue;
5129       if (Opc != ISD::BUILD_VECTOR ||
5130           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
5131         return false;
5132     } else if (Idx >= 0) {
5133       unsigned Opc = V1.getOpcode();
5134       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
5135         continue;
5136       if (Opc != ISD::BUILD_VECTOR ||
5137           !X86::isZeroNode(V1.getOperand(Idx)))
5138         return false;
5139     }
5140   }
5141   return true;
5142 }
5143
5144 /// getZeroVector - Returns a vector of specified type with all zero elements.
5145 ///
5146 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
5147                              SelectionDAG &DAG, SDLoc dl) {
5148   assert(VT.isVector() && "Expected a vector type");
5149
5150   // Always build SSE zero vectors as <4 x i32> bitcasted
5151   // to their dest type. This ensures they get CSE'd.
5152   SDValue Vec;
5153   if (VT.is128BitVector()) {  // SSE
5154     if (Subtarget->hasSSE2()) {  // SSE2
5155       SDValue Cst = DAG.getConstant(0, MVT::i32);
5156       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5157     } else { // SSE1
5158       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5159       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
5160     }
5161   } else if (VT.is256BitVector()) { // AVX
5162     if (Subtarget->hasInt256()) { // AVX2
5163       SDValue Cst = DAG.getConstant(0, MVT::i32);
5164       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5165       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5166     } else {
5167       // 256-bit logic and arithmetic instructions in AVX are all
5168       // floating-point, no support for integer ops. Emit fp zeroed vectors.
5169       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
5170       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5171       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
5172     }
5173   } else if (VT.is512BitVector()) { // AVX-512
5174       SDValue Cst = DAG.getConstant(0, MVT::i32);
5175       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
5176                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5177       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
5178   } else if (VT.getScalarType() == MVT::i1) {
5179     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
5180     SDValue Cst = DAG.getConstant(0, MVT::i1);
5181     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
5182     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
5183   } else
5184     llvm_unreachable("Unexpected vector type");
5185
5186   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5187 }
5188
5189 /// getOnesVector - Returns a vector of specified type with all bits set.
5190 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
5191 /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
5192 /// Then bitcast to their original type, ensuring they get CSE'd.
5193 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
5194                              SDLoc dl) {
5195   assert(VT.isVector() && "Expected a vector type");
5196
5197   SDValue Cst = DAG.getConstant(~0U, MVT::i32);
5198   SDValue Vec;
5199   if (VT.is256BitVector()) {
5200     if (HasInt256) { // AVX2
5201       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
5202       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
5203     } else { // AVX
5204       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5205       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
5206     }
5207   } else if (VT.is128BitVector()) {
5208     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
5209   } else
5210     llvm_unreachable("Unexpected vector type");
5211
5212   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
5213 }
5214
5215 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
5216 /// that point to V2 points to its first element.
5217 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
5218   for (unsigned i = 0; i != NumElems; ++i) {
5219     if (Mask[i] > (int)NumElems) {
5220       Mask[i] = NumElems;
5221     }
5222   }
5223 }
5224
5225 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
5226 /// operation of specified width.
5227 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
5228                        SDValue V2) {
5229   unsigned NumElems = VT.getVectorNumElements();
5230   SmallVector<int, 8> Mask;
5231   Mask.push_back(NumElems);
5232   for (unsigned i = 1; i != NumElems; ++i)
5233     Mask.push_back(i);
5234   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5235 }
5236
5237 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
5238 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5239                           SDValue V2) {
5240   unsigned NumElems = VT.getVectorNumElements();
5241   SmallVector<int, 8> Mask;
5242   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
5243     Mask.push_back(i);
5244     Mask.push_back(i + NumElems);
5245   }
5246   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5247 }
5248
5249 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
5250 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
5251                           SDValue V2) {
5252   unsigned NumElems = VT.getVectorNumElements();
5253   SmallVector<int, 8> Mask;
5254   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
5255     Mask.push_back(i + Half);
5256     Mask.push_back(i + NumElems + Half);
5257   }
5258   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
5259 }
5260
5261 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
5262 // a generic shuffle instruction because the target has no such instructions.
5263 // Generate shuffles which repeat i16 and i8 several times until they can be
5264 // represented by v4f32 and then be manipulated by target suported shuffles.
5265 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
5266   MVT VT = V.getSimpleValueType();
5267   int NumElems = VT.getVectorNumElements();
5268   SDLoc dl(V);
5269
5270   while (NumElems > 4) {
5271     if (EltNo < NumElems/2) {
5272       V = getUnpackl(DAG, dl, VT, V, V);
5273     } else {
5274       V = getUnpackh(DAG, dl, VT, V, V);
5275       EltNo -= NumElems/2;
5276     }
5277     NumElems >>= 1;
5278   }
5279   return V;
5280 }
5281
5282 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
5283 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
5284   MVT VT = V.getSimpleValueType();
5285   SDLoc dl(V);
5286
5287   if (VT.is128BitVector()) {
5288     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
5289     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
5290     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
5291                              &SplatMask[0]);
5292   } else if (VT.is256BitVector()) {
5293     // To use VPERMILPS to splat scalars, the second half of indicies must
5294     // refer to the higher part, which is a duplication of the lower one,
5295     // because VPERMILPS can only handle in-lane permutations.
5296     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
5297                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
5298
5299     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
5300     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
5301                              &SplatMask[0]);
5302   } else
5303     llvm_unreachable("Vector size not supported");
5304
5305   return DAG.getNode(ISD::BITCAST, dl, VT, V);
5306 }
5307
5308 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
5309 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
5310   MVT SrcVT = SV->getSimpleValueType(0);
5311   SDValue V1 = SV->getOperand(0);
5312   SDLoc dl(SV);
5313
5314   int EltNo = SV->getSplatIndex();
5315   int NumElems = SrcVT.getVectorNumElements();
5316   bool Is256BitVec = SrcVT.is256BitVector();
5317
5318   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
5319          "Unknown how to promote splat for type");
5320
5321   // Extract the 128-bit part containing the splat element and update
5322   // the splat element index when it refers to the higher register.
5323   if (Is256BitVec) {
5324     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
5325     if (EltNo >= NumElems/2)
5326       EltNo -= NumElems/2;
5327   }
5328
5329   // All i16 and i8 vector types can't be used directly by a generic shuffle
5330   // instruction because the target has no such instruction. Generate shuffles
5331   // which repeat i16 and i8 several times until they fit in i32, and then can
5332   // be manipulated by target suported shuffles.
5333   MVT EltVT = SrcVT.getVectorElementType();
5334   if (EltVT == MVT::i8 || EltVT == MVT::i16)
5335     V1 = PromoteSplati8i16(V1, DAG, EltNo);
5336
5337   // Recreate the 256-bit vector and place the same 128-bit vector
5338   // into the low and high part. This is necessary because we want
5339   // to use VPERM* to shuffle the vectors
5340   if (Is256BitVec) {
5341     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
5342   }
5343
5344   return getLegalSplat(DAG, V1, EltNo);
5345 }
5346
5347 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
5348 /// vector of zero or undef vector.  This produces a shuffle where the low
5349 /// element of V2 is swizzled into the zero/undef vector, landing at element
5350 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
5351 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
5352                                            bool IsZero,
5353                                            const X86Subtarget *Subtarget,
5354                                            SelectionDAG &DAG) {
5355   MVT VT = V2.getSimpleValueType();
5356   SDValue V1 = IsZero
5357     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5358   unsigned NumElems = VT.getVectorNumElements();
5359   SmallVector<int, 16> MaskVec;
5360   for (unsigned i = 0; i != NumElems; ++i)
5361     // If this is the insertion idx, put the low elt of V2 here.
5362     MaskVec.push_back(i == Idx ? NumElems : i);
5363   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
5364 }
5365
5366 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
5367 /// target specific opcode. Returns true if the Mask could be calculated. Sets
5368 /// IsUnary to true if only uses one source. Note that this will set IsUnary for
5369 /// shuffles which use a single input multiple times, and in those cases it will
5370 /// adjust the mask to only have indices within that single input.
5371 static bool getTargetShuffleMask(SDNode *N, MVT VT,
5372                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
5373   unsigned NumElems = VT.getVectorNumElements();
5374   SDValue ImmN;
5375
5376   IsUnary = false;
5377   bool IsFakeUnary = false;
5378   switch(N->getOpcode()) {
5379   case X86ISD::BLENDI:
5380     ImmN = N->getOperand(N->getNumOperands()-1);
5381     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5382     break;
5383   case X86ISD::SHUFP:
5384     ImmN = N->getOperand(N->getNumOperands()-1);
5385     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5386     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5387     break;
5388   case X86ISD::UNPCKH:
5389     DecodeUNPCKHMask(VT, Mask);
5390     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5391     break;
5392   case X86ISD::UNPCKL:
5393     DecodeUNPCKLMask(VT, Mask);
5394     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5395     break;
5396   case X86ISD::MOVHLPS:
5397     DecodeMOVHLPSMask(NumElems, Mask);
5398     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5399     break;
5400   case X86ISD::MOVLHPS:
5401     DecodeMOVLHPSMask(NumElems, Mask);
5402     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5403     break;
5404   case X86ISD::PALIGNR:
5405     ImmN = N->getOperand(N->getNumOperands()-1);
5406     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5407     break;
5408   case X86ISD::PSHUFD:
5409   case X86ISD::VPERMILPI:
5410     ImmN = N->getOperand(N->getNumOperands()-1);
5411     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5412     IsUnary = true;
5413     break;
5414   case X86ISD::PSHUFHW:
5415     ImmN = N->getOperand(N->getNumOperands()-1);
5416     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5417     IsUnary = true;
5418     break;
5419   case X86ISD::PSHUFLW:
5420     ImmN = N->getOperand(N->getNumOperands()-1);
5421     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5422     IsUnary = true;
5423     break;
5424   case X86ISD::PSHUFB: {
5425     IsUnary = true;
5426     SDValue MaskNode = N->getOperand(1);
5427     while (MaskNode->getOpcode() == ISD::BITCAST)
5428       MaskNode = MaskNode->getOperand(0);
5429
5430     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
5431       // If we have a build-vector, then things are easy.
5432       EVT VT = MaskNode.getValueType();
5433       assert(VT.isVector() &&
5434              "Can't produce a non-vector with a build_vector!");
5435       if (!VT.isInteger())
5436         return false;
5437
5438       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
5439
5440       SmallVector<uint64_t, 32> RawMask;
5441       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
5442         SDValue Op = MaskNode->getOperand(i);
5443         if (Op->getOpcode() == ISD::UNDEF) {
5444           RawMask.push_back((uint64_t)SM_SentinelUndef);
5445           continue;
5446         }
5447         auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
5448         if (!CN)
5449           return false;
5450         APInt MaskElement = CN->getAPIntValue();
5451
5452         // We now have to decode the element which could be any integer size and
5453         // extract each byte of it.
5454         for (int j = 0; j < NumBytesPerElement; ++j) {
5455           // Note that this is x86 and so always little endian: the low byte is
5456           // the first byte of the mask.
5457           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
5458           MaskElement = MaskElement.lshr(8);
5459         }
5460       }
5461       DecodePSHUFBMask(RawMask, Mask);
5462       break;
5463     }
5464
5465     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
5466     if (!MaskLoad)
5467       return false;
5468
5469     SDValue Ptr = MaskLoad->getBasePtr();
5470     if (Ptr->getOpcode() == X86ISD::Wrapper)
5471       Ptr = Ptr->getOperand(0);
5472
5473     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
5474     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
5475       return false;
5476
5477     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
5478       DecodePSHUFBMask(C, Mask);
5479       if (Mask.empty())
5480         return false;
5481       break;
5482     }
5483
5484     return false;
5485   }
5486   case X86ISD::VPERMI:
5487     ImmN = N->getOperand(N->getNumOperands()-1);
5488     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5489     IsUnary = true;
5490     break;
5491   case X86ISD::MOVSS:
5492   case X86ISD::MOVSD: {
5493     // The index 0 always comes from the first element of the second source,
5494     // this is why MOVSS and MOVSD are used in the first place. The other
5495     // elements come from the other positions of the first source vector
5496     Mask.push_back(NumElems);
5497     for (unsigned i = 1; i != NumElems; ++i) {
5498       Mask.push_back(i);
5499     }
5500     break;
5501   }
5502   case X86ISD::VPERM2X128:
5503     ImmN = N->getOperand(N->getNumOperands()-1);
5504     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5505     if (Mask.empty()) return false;
5506     break;
5507   case X86ISD::MOVSLDUP:
5508     DecodeMOVSLDUPMask(VT, Mask);
5509     break;
5510   case X86ISD::MOVSHDUP:
5511     DecodeMOVSHDUPMask(VT, Mask);
5512     break;
5513   case X86ISD::MOVDDUP:
5514   case X86ISD::MOVLHPD:
5515   case X86ISD::MOVLPD:
5516   case X86ISD::MOVLPS:
5517     // Not yet implemented
5518     return false;
5519   default: llvm_unreachable("unknown target shuffle node");
5520   }
5521
5522   // If we have a fake unary shuffle, the shuffle mask is spread across two
5523   // inputs that are actually the same node. Re-map the mask to always point
5524   // into the first input.
5525   if (IsFakeUnary)
5526     for (int &M : Mask)
5527       if (M >= (int)Mask.size())
5528         M -= Mask.size();
5529
5530   return true;
5531 }
5532
5533 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
5534 /// element of the result of the vector shuffle.
5535 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5536                                    unsigned Depth) {
5537   if (Depth == 6)
5538     return SDValue();  // Limit search depth.
5539
5540   SDValue V = SDValue(N, 0);
5541   EVT VT = V.getValueType();
5542   unsigned Opcode = V.getOpcode();
5543
5544   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5545   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5546     int Elt = SV->getMaskElt(Index);
5547
5548     if (Elt < 0)
5549       return DAG.getUNDEF(VT.getVectorElementType());
5550
5551     unsigned NumElems = VT.getVectorNumElements();
5552     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5553                                          : SV->getOperand(1);
5554     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5555   }
5556
5557   // Recurse into target specific vector shuffles to find scalars.
5558   if (isTargetShuffle(Opcode)) {
5559     MVT ShufVT = V.getSimpleValueType();
5560     unsigned NumElems = ShufVT.getVectorNumElements();
5561     SmallVector<int, 16> ShuffleMask;
5562     bool IsUnary;
5563
5564     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
5565       return SDValue();
5566
5567     int Elt = ShuffleMask[Index];
5568     if (Elt < 0)
5569       return DAG.getUNDEF(ShufVT.getVectorElementType());
5570
5571     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
5572                                          : N->getOperand(1);
5573     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5574                                Depth+1);
5575   }
5576
5577   // Actual nodes that may contain scalar elements
5578   if (Opcode == ISD::BITCAST) {
5579     V = V.getOperand(0);
5580     EVT SrcVT = V.getValueType();
5581     unsigned NumElems = VT.getVectorNumElements();
5582
5583     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5584       return SDValue();
5585   }
5586
5587   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5588     return (Index == 0) ? V.getOperand(0)
5589                         : DAG.getUNDEF(VT.getVectorElementType());
5590
5591   if (V.getOpcode() == ISD::BUILD_VECTOR)
5592     return V.getOperand(Index);
5593
5594   return SDValue();
5595 }
5596
5597 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
5598 /// shuffle operation which come from a consecutively from a zero. The
5599 /// search can start in two different directions, from left or right.
5600 /// We count undefs as zeros until PreferredNum is reached.
5601 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
5602                                          unsigned NumElems, bool ZerosFromLeft,
5603                                          SelectionDAG &DAG,
5604                                          unsigned PreferredNum = -1U) {
5605   unsigned NumZeros = 0;
5606   for (unsigned i = 0; i != NumElems; ++i) {
5607     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
5608     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
5609     if (!Elt.getNode())
5610       break;
5611
5612     if (X86::isZeroNode(Elt))
5613       ++NumZeros;
5614     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
5615       NumZeros = std::min(NumZeros + 1, PreferredNum);
5616     else
5617       break;
5618   }
5619
5620   return NumZeros;
5621 }
5622
5623 /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
5624 /// correspond consecutively to elements from one of the vector operands,
5625 /// starting from its index OpIdx. Also tell OpNum which source vector operand.
5626 static
5627 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
5628                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
5629                               unsigned NumElems, unsigned &OpNum) {
5630   bool SeenV1 = false;
5631   bool SeenV2 = false;
5632
5633   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
5634     int Idx = SVOp->getMaskElt(i);
5635     // Ignore undef indicies
5636     if (Idx < 0)
5637       continue;
5638
5639     if (Idx < (int)NumElems)
5640       SeenV1 = true;
5641     else
5642       SeenV2 = true;
5643
5644     // Only accept consecutive elements from the same vector
5645     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
5646       return false;
5647   }
5648
5649   OpNum = SeenV1 ? 0 : 1;
5650   return true;
5651 }
5652
5653 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
5654 /// logical left shift of a vector.
5655 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5656                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5657   unsigned NumElems =
5658     SVOp->getSimpleValueType(0).getVectorNumElements();
5659   unsigned NumZeros = getNumOfConsecutiveZeros(
5660       SVOp, NumElems, false /* check zeros from right */, DAG,
5661       SVOp->getMaskElt(0));
5662   unsigned OpSrc;
5663
5664   if (!NumZeros)
5665     return false;
5666
5667   // Considering the elements in the mask that are not consecutive zeros,
5668   // check if they consecutively come from only one of the source vectors.
5669   //
5670   //               V1 = {X, A, B, C}     0
5671   //                         \  \  \    /
5672   //   vector_shuffle V1, V2 <1, 2, 3, X>
5673   //
5674   if (!isShuffleMaskConsecutive(SVOp,
5675             0,                   // Mask Start Index
5676             NumElems-NumZeros,   // Mask End Index(exclusive)
5677             NumZeros,            // Where to start looking in the src vector
5678             NumElems,            // Number of elements in vector
5679             OpSrc))              // Which source operand ?
5680     return false;
5681
5682   isLeft = false;
5683   ShAmt = NumZeros;
5684   ShVal = SVOp->getOperand(OpSrc);
5685   return true;
5686 }
5687
5688 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
5689 /// logical left shift of a vector.
     ///
     /// On success \p isLeft is set to true, \p ShAmt to the zero count reported by
     /// getNumOfConsecutiveZeros and \p ShVal to the shuffle operand selected via
     /// OpSrc. On failure none of the three outputs is written.
5690 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5691                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5692   unsigned NumElems =
5693     SVOp->getSimpleValueType(0).getVectorNumElements();
5694   unsigned NumZeros = getNumOfConsecutiveZeros(
5695       SVOp, NumElems, true /* check zeros from left */, DAG,
5696       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
5697   unsigned OpSrc; // presumably filled in by isShuffleMaskConsecutive below
5698
     // No leading zeros means there is nothing to shift in.
5699   if (!NumZeros)
5700     return false;
5701
5702   // Considering the elements in the mask that are not consecutive zeros,
5703   // check if they consecutively come from only one of the source vectors.
5704   //
5705   //                           0    { A, B, X, X } = V2
5706   //                          / \    /  /
5707   //   vector_shuffle V1, V2 <X, X, 4, 5>
5708   //
5709   if (!isShuffleMaskConsecutive(SVOp,
5710             NumZeros,     // Mask Start Index
5711             NumElems,     // Mask End Index(exclusive)
5712             0,            // Where to start looking in the src vector
5713             NumElems,     // Number of elements in vector
5714             OpSrc))       // Which source operand ?
5715     return false;
5716
     // Report the match to the caller.
5717   isLeft = true;
5718   ShAmt = NumZeros;
5719   ShVal = SVOp->getOperand(OpSrc);
5720   return true;
5721 }
5722
5723 /// isVectorShift - Returns true if the shuffle can be implemented as a
5724 /// logical left or right shift of a vector.
     ///
     /// Only 128-bit result types are considered. A left shift is tried first and
     /// a right shift only if that fails; the helper that succeeds fills in
     /// \p isLeft, \p ShVal and \p ShAmt.
5725 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
5726                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
5727   // Although the logic below supports any bit width, there are no
5728   // shift instructions which handle more than 128-bit vectors.
5729   if (!SVOp->getSimpleValueType(0).is128BitVector())
5730     return false;
5731
     // Short-circuit evaluation: the right-shift matcher only runs when the
     // left-shift matcher has rejected the shuffle.
5732   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
5733       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
5734     return true;
5735
5736   return false;
5737 }
5738
5739 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
5740 ///
     /// The sixteen byte operands are combined pairwise into i16 values which are
     /// inserted into a v8i16 vector; the result is bitcast back to v16i8.
     ///
     /// \p NonZeros is a bitmask: bit i set means operand i is treated as non-zero.
     /// \p NumNonZero is compared against 8 (more than 8 yields an empty SDValue)
     /// and \p NumZero decides whether the initial vector is zero or undef.
     /// \p TLI is not referenced in the body.
5741 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5742                                        unsigned NumNonZero, unsigned NumZero,
5743                                        SelectionDAG &DAG,
5744                                        const X86Subtarget* Subtarget,
5745                                        const TargetLowering &TLI) {
5746   if (NumNonZero > 8)
5747     return SDValue();
5748
5749   SDLoc dl(Op);
     // NOTE(review): V is only created when the first non-zero operand is seen;
     // presumably callers guarantee at least one bit is set in NonZeros, since V
     // is passed to the final BITCAST unconditionally.
5750   SDValue V;
5751   bool First = true;
5752   for (unsigned i = 0; i < 16; ++i) {
5753     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5754     if (ThisIsNonZero && First) {
       // Start from zero if any element must be zero, otherwise from undef.
5755       if (NumZero)
5756         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5757       else
5758         V = DAG.getUNDEF(MVT::v8i16);
5759       First = false;
5760     }
5761
     // Work is done on odd indices only: byte i-1 becomes the low half and
     // byte i the high half of 16-bit element i/2.
5762     if ((i & 1) != 0) {
5763       SDValue ThisElt, LastElt;
5764       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5765       if (LastIsNonZero) {
5766         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5767                               MVT::i16, Op.getOperand(i-1));
5768       }
5769       if (ThisIsNonZero) {
5770         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5771         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5772                               ThisElt, DAG.getConstant(8, MVT::i8));
5773         if (LastIsNonZero)
5774           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5775       } else
5776         ThisElt = LastElt;
5777
       // ThisElt is empty when both bytes of the pair are zero; nothing to insert.
5778       if (ThisElt.getNode())
5779         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5780                         DAG.getIntPtrConstant(i/2));
5781     }
5782   }
5783
5784   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
5785 }
5786
5787 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
5788 ///
     /// Emits one INSERT_VECTOR_ELT per operand whose bit is set in \p NonZeros,
     /// starting from a zero vector when \p NumZero is non-zero and from undef
     /// otherwise. Returns an empty SDValue when \p NumNonZero exceeds 4, and also
     /// when no bit of \p NonZeros is set (V is never created in that case).
     /// \p TLI is not referenced in the body.
5789 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5790                                      unsigned NumNonZero, unsigned NumZero,
5791                                      SelectionDAG &DAG,
5792                                      const X86Subtarget* Subtarget,
5793                                      const TargetLowering &TLI) {
5794   if (NumNonZero > 4)
5795     return SDValue();
5796
5797   SDLoc dl(Op);
5798   SDValue V;
5799   bool First = true;
5800   for (unsigned i = 0; i < 8; ++i) {
5801     bool isNonZero = (NonZeros & (1 << i)) != 0;
5802     if (isNonZero) {
       // Lazily create the base vector on the first non-zero operand.
5803       if (First) {
5804         if (NumZero)
5805           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5806         else
5807           V = DAG.getUNDEF(MVT::v8i16);
5808         First = false;
5809       }
5810       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5811                       MVT::v8i16, V, Op.getOperand(i),
5812                       DAG.getIntPtrConstant(i));
5813     }
5814   }
5815
5816   return V;
5817 }
5818
5819 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
     ///
     /// Two lowerings are attempted, in this order:
     ///  1. A shuffle of one source vector with a zero vector, when every
     ///     non-zeroable operand i is (extract_vector_elt V1, i).
     ///  2. An X86ISD::INSERTPS node (requires SSE4.1), when exactly one operand
     ///     breaks that pattern and the remaining ones agree on a base vector.
     /// An empty SDValue is returned when neither applies.
     /// \p TLI is not referenced in the body.
5820 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
5821                                      const X86Subtarget *Subtarget,
5822                                      const TargetLowering &TLI) {
5823   // Find all zeroable elements.
5824   bool Zeroable[4];
5825   for (int i=0; i < 4; ++i) {
5826     SDValue Elt = Op->getOperand(i);
5827     Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
5828   }
5829   assert(std::count_if(&Zeroable[0], &Zeroable[4],
5830                        [](bool M) { return !M; }) > 1 &&
5831          "We expect at least two non-zero elements!");
5832
5833   // We only know how to deal with build_vector nodes where elements are either
5834   // zeroable or extract_vector_elt with constant index.
5835   SDValue FirstNonZero;
5836   unsigned FirstNonZeroIdx; // only assigned together with FirstNonZero below
5837   for (unsigned i=0; i < 4; ++i) {
5838     if (Zeroable[i])
5839       continue;
5840     SDValue Elt = Op->getOperand(i);
5841     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5842         !isa<ConstantSDNode>(Elt.getOperand(1)))
5843       return SDValue();
5844     // Make sure that this node is extracting from a 128-bit vector.
5845     MVT VT = Elt.getOperand(0).getSimpleValueType();
5846     if (!VT.is128BitVector())
5847       return SDValue();
5848     if (!FirstNonZero.getNode()) {
5849       FirstNonZero = Elt;
5850       FirstNonZeroIdx = i;
5851     }
5852   }
5853
5854   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
     // V1 is the vector the first non-zeroable operand extracts from; VT is that
     // vector's type (not read from Op).
5855   SDValue V1 = FirstNonZero.getOperand(0);
5856   MVT VT = V1.getSimpleValueType();
5857
5858   // See if this build_vector can be lowered as a blend with zero.
     // The loop stops early (EltIdx < 4) at the first non-zeroable operand that
     // is not (extract_vector_elt V1, EltIdx); Elt and EltMaskIdx then describe
     // that offending operand and are reused by the INSERTPS path below.
5859   SDValue Elt;
5860   unsigned EltMaskIdx, EltIdx;
5861   int Mask[4];
5862   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
5863     if (Zeroable[EltIdx]) {
5864       // The zero vector will be on the right hand side.
5865       Mask[EltIdx] = EltIdx+4;
5866       continue;
5867     }
5868
5869     Elt = Op->getOperand(EltIdx);
5870     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
5871     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
5872     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
5873       break;
5874     Mask[EltIdx] = EltIdx;
5875   }
5876
5877   if (EltIdx == 4) {
5878     // Let the shuffle legalizer deal with blend operations.
5879     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
       // NOTE(review): VT was read from V1 above and V1 has not been reassigned
       // since, so this comparison looks like it can never be true here.
5880     if (V1.getSimpleValueType() != VT)
5881       V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
5882     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
5883   }
5884
5885   // See if we can lower this build_vector to a INSERTPS.
5886   if (!Subtarget->hasSSE41())
5887     return SDValue();
5888
     // V2 is the source of the element to insert. If the offending element is the
     // very first non-zeroable one, V1 was derived from it, so drop V1 and pick
     // the base vector from the remaining operands instead.
5889   SDValue V2 = Elt.getOperand(0);
5890   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
5891     V1 = SDValue();
5892
     // Every remaining non-zeroable operand i must be (extract_vector_elt V1, i).
5893   bool CanFold = true;
5894   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
5895     if (Zeroable[i])
5896       continue;
5897
5898     SDValue Current = Op->getOperand(i);
5899     SDValue SrcVector = Current->getOperand(0);
5900     if (!V1.getNode())
5901       V1 = SrcVector;
5902     CanFold = SrcVector == V1 &&
5903       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
5904   }
5905
5906   if (!CanFold)
5907     return SDValue();
5908
5909   assert(V1.getNode() && "Expected at least two non-zero elements!");
     // The INSERTPS node is built on v4f32; bitcast both inputs if needed.
5910   if (V1.getSimpleValueType() != MVT::v4f32)
5911     V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
5912   if (V2.getSimpleValueType() != MVT::v4f32)
5913     V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
5914
5915   // Ok, we can emit an INSERTPS instruction.
5916   unsigned ZMask = 0;
5917   for (int i = 0; i < 4; ++i)
5918     if (Zeroable[i])
5919       ZMask |= 1 << i;
5920
     // Immediate layout as assembled here: source lane (EltMaskIdx) in bits 7:6,
     // destination lane (EltIdx) in bits 5:4, zero mask in bits 3:0.
5921   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
5922   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
5923   SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
5924                                DAG.getIntPtrConstant(InsertPSMask));
5925   return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
5926 }
5927
5928 /// getVShift - Return a vector logical shift node.
5929 ///
     /// \p SrcOp is bitcast to v2i64, shifted with X86ISD::VSHLDQ (when \p isLeft)
     /// or X86ISD::VSRLDQ by the constant \p NumBits, and the result is bitcast to
     /// \p VT, which must be a 128-bit vector type.
5930 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
5931                          unsigned NumBits, SelectionDAG &DAG,
5932                          const TargetLowering &TLI, SDLoc dl) {
5933   assert(VT.is128BitVector() && "Unknown type for VShift");
5934   EVT ShVT = MVT::v2i64;
5935   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5936   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
     // The shift amount constant uses the target's scalar shift-amount type for
     // the (already bitcast) source type.
5937   return DAG.getNode(ISD::BITCAST, dl, VT,
5938                      DAG.getNode(Opc, dl, ShVT, SrcOp,
5939                              DAG.getConstant(NumBits,
5940                                   TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
5941 }
5942
5943 static SDValue
5944 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
5945
5946   // Check if the scalar load can be widened into a vector load. And if
5947   // the address is "base + cst" see if the cst can be "absorbed" into
5948   // the shuffle mask.
5949   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5950     SDValue Ptr = LD->getBasePtr();
5951     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5952       return SDValue();
5953     EVT PVT = LD->getValueType(0);
5954     if (PVT != MVT::i32 && PVT != MVT::f32)
5955       return SDValue();
5956
5957     int FI = -1;
5958     int64_t Offset = 0;
5959     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5960       FI = FINode->getIndex();
5961       Offset = 0;
5962     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5963                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5964       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5965       Offset = Ptr.getConstantOperandVal(1);
5966       Ptr = Ptr.getOperand(0);
5967     } else {
5968       return SDValue();
5969     }
5970
5971     // FIXME: 256-bit vector instructions don't require a strict alignment,
5972     // improve this code to support it better.
5973     unsigned RequiredAlign = VT.getSizeInBits()/8;
5974     SDValue Chain = LD->getChain();
5975     // Make sure the stack object alignment is at least 16 or 32.
5976     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5977     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5978       if (MFI->isFixedObjectIndex(FI)) {
5979         // Can't change the alignment. FIXME: It's possible to compute
5980         // the exact stack offset and reference FI + adjust offset instead.
5981         // If someone *really* cares about this. That's the way to implement it.
5982         return SDValue();
5983       } else {
5984         MFI->setObjectAlignment(FI, RequiredAlign);
5985       }
5986     }
5987
5988     // (Offset % 16 or 32) must be multiple of 4. Then address is then
5989     // Ptr + (Offset & ~15).
5990     if (Offset < 0)
5991       return SDValue();
5992     if ((Offset % RequiredAlign) & 3)
5993       return SDValue();
5994     int64_t StartOffset = Offset & ~(RequiredAlign-1);
5995     if (StartOffset)
5996       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
5997                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
5998
5999     int EltNo = (Offset - StartOffset) >> 2;
6000     unsigned NumElems = VT.getVectorNumElements();
6001
6002     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6003     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6004                              LD->getPointerInfo().getWithOffset(StartOffset),
6005                              false, false, false, 0);
6006
6007     SmallVector<int, 8> Mask;
6008     for (unsigned i = 0; i != NumElems; ++i)
6009       Mask.push_back(EltNo);
6010
6011     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
6012   }
6013
6014   return SDValue();
6015 }
6016
6017 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
6018 /// vector of type 'VT', see if the elements can be replaced by a single large
6019 /// load which has the same value as a build_vector whose operands are 'elts'.
6020 ///
6021 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
6022 ///
     /// The first element must be a non-extending load (a leading undef is
     /// rejected); later elements may be undef or loads consecutive to the first.
     /// When \p isAfterLegalize is true, the full-width load is only produced if
     /// ISD::LOAD is legal for \p VT. Returns an empty SDValue when no
     /// replacement is possible.
     ///
6023 /// FIXME: we'd also like to handle the case where the last elements are zero
6024 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
6025 /// There's even a handy isZeroNode for that purpose.
6026 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
6027                                         SDLoc &DL, SelectionDAG &DAG,
6028                                         bool isAfterLegalize) {
6029   EVT EltVT = VT.getVectorElementType();
6030   unsigned NumElems = Elts.size();
6031
6032   LoadSDNode *LDBase = nullptr;
6033   unsigned LastLoadedElt = -1U; // index of the last element that is a load
6034
6035   // For each element in the initializer, see if we've found a load or an undef.
6036   // If we don't find an initial load element, or later load elements are
6037   // non-consecutive, bail out.
6038   for (unsigned i = 0; i < NumElems; ++i) {
6039     SDValue Elt = Elts[i];
6040
6041     if (!Elt.getNode() ||
6042         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
6043       return SDValue();
6044     if (!LDBase) {
6045       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
6046         return SDValue();
6047       LDBase = cast<LoadSDNode>(Elt.getNode());
6048       LastLoadedElt = i;
6049       continue;
6050     }
6051     if (Elt.getOpcode() == ISD::UNDEF)
6052       continue;
6053
       // Element i must be loaded from i element-sizes past the base load.
6054     LoadSDNode *LD = cast<LoadSDNode>(Elt);
6055     if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
6056       return SDValue();
6057     LastLoadedElt = i;
6058   }
6059
6060   // If we have found an entire vector of loads and undefs, then return a large
6061   // load of the entire vector width starting at the base pointer.  If we found
6062   // consecutive loads for the low half, generate a vzext_load node.
6063   if (LastLoadedElt == NumElems - 1) {
6064
6065     if (isAfterLegalize &&
6066         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
6067       return SDValue();
6068
6069     SDValue NewLd = SDValue();
6070
       // The wide load reuses the base load's chain, pointer info, flags and
       // alignment.
6071     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6072                         LDBase->getPointerInfo(), LDBase->isVolatile(),
6073                         LDBase->isNonTemporal(), LDBase->isInvariant(),
6074                         LDBase->getAlignment());
6075
       // Redirect users of LDBase's output chain to a TokenFactor of both loads.
       // The replace-all-uses call also rewrites the TokenFactor's own LDBase
       // operand, so UpdateNodeOperands puts the intended operands back.
6076     if (LDBase->hasAnyUseOfValue(1)) {
6077       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6078                                      SDValue(LDBase, 1),
6079                                      SDValue(NewLd.getNode(), 1));
6080       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6081       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6082                              SDValue(NewLd.getNode(), 1));
6083     }
6084
6085     return NewLd;
6086   }
6087
6088   //TODO: The code below fires only for loading the low v2i32 / v2f32
6089   //of a v4i32 / v4f32. It's probably worth generalizing.
6090   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
6091       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
       // Emit X86ISD::VZEXT_LOAD producing v2i64 with an i64 memory type; it is
       // always created non-volatile, unlike the full-width path above.
6092     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
6093     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6094     SDValue ResNode =
6095         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
6096                                 LDBase->getPointerInfo(),
6097                                 LDBase->getAlignment(),
6098                                 false/*isVolatile*/, true/*ReadMem*/,
6099                                 false/*WriteMem*/);
6100
6101     // Make sure the newly-created LOAD is in the same position as LDBase in
6102     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
6103     // update uses of LDBase's output chain to use the TokenFactor.
6104     if (LDBase->hasAnyUseOfValue(1)) {
6105       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
6106                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
6107       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6108       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6109                              SDValue(ResNode.getNode(), 1));
6110     }
6111
6112     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
6113   }
6114   return SDValue();
6115 }
6116
6117 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
6118 /// to generate a splat value for the following cases:
6119 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
6120 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
6121 /// a scalar load, or a constant.
6122 /// The VBROADCAST node is returned when a pattern is found,
6123 /// or SDValue() otherwise.
     ///
     /// \p Op must have a 128-, 256- or 512-bit vector type (asserted). Opcodes
     /// other than BUILD_VECTOR and VECTOR_SHUFFLE are rejected.
6124 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
6125                                     SelectionDAG &DAG) {
6126   // VBROADCAST requires AVX.
6127   // TODO: Splats could be generated for non-AVX CPUs using SSE
6128   // instructions, but there's less potential gain for only 128-bit vectors.
6129   if (!Subtarget->hasAVX())
6130     return SDValue();
6131
6132   MVT VT = Op.getSimpleValueType();
6133   SDLoc dl(Op);
6134
6135   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6136          "Unsupported vector type for broadcast.");
6137
6138   SDValue Ld;         // the scalar being splatted (load, constant or other)
6139   bool ConstSplatVal; // true when Ld is an ISD::Constant or ISD::ConstantFP
6140
6141   switch (Op.getOpcode()) {
6142     default:
6143       // Unknown pattern found.
6144       return SDValue();
6145
6146     case ISD::BUILD_VECTOR: {
6147       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
6148       BitVector UndefElements;
6149       SDValue Splat = BVOp->getSplatValue(&UndefElements);
6150
6151       // We need a splat of a single value to use broadcast, and it doesn't
6152       // make any sense if the value is only in one element of the vector.
6153       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
6154         return SDValue();
6155
6156       Ld = Splat;
6157       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6158                        Ld.getOpcode() == ISD::ConstantFP);
6159
6160       // Make sure that all of the users of a non-constant load are from the
6161       // BUILD_VECTOR node.
6162       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6163         return SDValue();
6164       break;
6165     }
6166
6167     case ISD::VECTOR_SHUFFLE: {
6168       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
6169
6170       // Shuffles must have a splat mask where the first element is
6171       // broadcasted.
6172       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
6173         return SDValue();
6174
6175       SDValue Sc = Op.getOperand(0);
6176       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
6177           Sc.getOpcode() != ISD::BUILD_VECTOR) {
6178
           // The shuffle input is an arbitrary vector: only the in-register
           // broadcast guarded by hasInt256() can be used.
6179         if (!Subtarget->hasInt256())
6180           return SDValue();
6181
6182         // Use the register form of the broadcast instruction available on AVX2.
           // For 256-bit and wider types the broadcast source is the 128-bit
           // piece obtained with index 0.
6183         if (VT.getSizeInBits() >= 256)
6184           Sc = Extract128BitVector(Sc, 0, DAG, dl);
6185         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
6186       }
6187
         // Sc is a SCALAR_TO_VECTOR or BUILD_VECTOR: splat its first operand.
6188       Ld = Sc.getOperand(0);
6189       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
6190                        Ld.getOpcode() == ISD::ConstantFP);
6191
6192       // The scalar_to_vector node and the suspected
6193       // load node must have exactly one user.
6194       // Constants may have multiple users.
6195
6196       // AVX-512 has register version of the broadcast
6197       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
6198         Ld.getValueType().getSizeInBits() >= 32;
         // Bail out only if the value is non-constant, has extra users, and no
         // register form of the broadcast is available.
6199       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
6200           !hasRegVer))
6201         return SDValue();
6202       break;
6203     }
6204   }
6205
6206   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
6207   bool IsGE256 = (VT.getSizeInBits() >= 256);
6208
6209   // When optimizing for size, generate up to 5 extra bytes for a broadcast
6210   // instruction to save 8 or more bytes of constant pool data.
6211   // TODO: If multiple splats are generated to load the same constant,
6212   // it may be detrimental to overall size. There needs to be a way to detect
6213   // that condition to know if this is truly a size win.
6214   const Function *F = DAG.getMachineFunction().getFunction();
6215   bool OptForSize = F->getAttributes().
6216     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
6217
6218   // Handle broadcasting a single constant scalar from the constant pool
6219   // into a vector.
6220   // On Sandybridge (no AVX2), it is still better to load a constant vector
6221   // from the constant pool and not to broadcast it from a scalar.
6222   // But override that restriction when optimizing for size.
6223   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6224   if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
6225     EVT CVT = Ld.getValueType();
6226     assert(!CVT.isVector() && "Must not broadcast a vector type");
6227
6228     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6229     // For size optimization, also splat v2f64 and v2i64, and for size opt
6230     // with AVX2, also splat i8 and i16.
6231     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6232     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6233         (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
6234       const Constant *C = nullptr;
6235       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6236         C = CI->getConstantIntValue();
6237       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6238         C = CF->getConstantFPValue();
6239
6240       assert(C && "Invalid constant type");
6241
         // Place the scalar in the constant pool and broadcast a load of it.
6242       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6243       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
6244       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6245       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
6246                        MachinePointerInfo::getConstantPool(),
6247                        false, false, false, Alignment);
6248
6249       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6250     }
6251   }
6252
6253   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6254
6255   // Handle AVX2 in-register broadcasts.
6256   if (!IsLoad && Subtarget->hasInt256() &&
6257       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6258     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6259
6260   // The scalar source must be a normal load.
6261   if (!IsLoad)
6262     return SDValue();
6263
     // Loads of 32-bit scalars, of 64-bit scalars into >= 256-bit vectors, or of
     // 64-bit scalars when the subtarget reports VLX, are broadcast directly.
6264   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6265       (Subtarget->hasVLX() && ScalarSize == 64))
6266     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6267
6268   // The integer check is needed for the 64-bit into 128-bit so it doesn't match
6269   // double since there is no vbroadcastsd xmm
6270   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
6271     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6272       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6273   }
6274
6275   // Unsupported broadcast.
6276   return SDValue();
6277 }
6278
6279 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6280 /// underlying vector and index.
6281 ///
6282 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6283 /// index.
     ///
     /// \p ExtIdx must be a ConstantSDNode (it is cast unconditionally). When
     /// \p ExtractedFromVec is not a shuffle it is left unchanged and the
     /// constant index is returned as is.
6284 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6285                                          SDValue ExtIdx) {
6286   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6287   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6288     return Idx;
6289
6290   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6291   // lowered this:
6292   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6293   // to:
6294   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
6295   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
6296   //                           undef)
6297   //                       Constant<0>)
6298   // In this case the vector is the extract_subvector expression and the index
6299   // is 2, as specified by the shuffle.
6300   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6301   SDValue ShuffleVec = SVOp->getOperand(0);
6302   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6303   assert(ShuffleVecVT.getVectorElementType() ==
6304          ExtractedFromVec.getSimpleValueType().getVectorElementType());
6305
     // Look through the shuffle only when the mask entry passes
     // isUndefOrInRange for the shuffle's first operand; otherwise keep the
     // original vector and index.
6306   int ShuffleIdx = SVOp->getMaskElt(Idx);
6307   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6308     ExtractedFromVec = ShuffleVec;
6309     return ShuffleIdx;
6310   }
6311   return Idx;
6312 }
6313
6314 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
     // Lower a build_vector whose operands are mostly constant-index
     // extract_vector_elt nodes taken from at most two vectors of the result
     // type: emit one vector shuffle, then patch in the few remaining operands
     // with INSERT_VECTOR_ELT. Returns an empty SDValue when the pattern does
     // not apply.
6315   MVT VT = Op.getSimpleValueType();
6316
6317   // Skip if insert_vec_elt is not supported.
6318   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6319   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6320     return SDValue();
6321
6322   SDLoc DL(Op);
6323   unsigned NumElems = Op.getNumOperands();
6324
6325   SDValue VecIn1;
6326   SDValue VecIn2;
6327   SmallVector<unsigned, 4> InsertIndices; // operands that need an insert
6328   SmallVector<int, 8> Mask(NumElems, -1); // -1 entries stay undef
6329
6330   for (unsigned i = 0; i != NumElems; ++i) {
6331     unsigned Opc = Op.getOperand(i).getOpcode();
6332
6333     if (Opc == ISD::UNDEF)
6334       continue;
6335
6336     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6337       // Quit once two elements are already queued for inserting, i.e. at
         // most two operands may be handled via INSERT_VECTOR_ELT.
6338       if (InsertIndices.size() > 1)
6339         return SDValue();
6340
6341       InsertIndices.push_back(i);
6342       continue;
6343     }
6344
6345     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6346     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6347     // Quit if non-constant index.
6348     if (!isa<ConstantSDNode>(ExtIdx))
6349       return SDValue();
       // May replace ExtractedFromVec with the input of a shuffle it looks
       // through, returning the matching index.
6350     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6351
6352     // Quit if extracted from vector of different type.
6353     if (ExtractedFromVec.getValueType() != VT)
6354       return SDValue();
6355
6356     if (!VecIn1.getNode())
6357       VecIn1 = ExtractedFromVec;
6358     else if (VecIn1 != ExtractedFromVec) {
6359       if (!VecIn2.getNode())
6360         VecIn2 = ExtractedFromVec;
6361       else if (VecIn2 != ExtractedFromVec)
6362         // Quit if more than 2 vectors to shuffle
6363         return SDValue();
6364     }
6365
       // Lanes of the second input are numbered after those of the first.
6366     if (ExtractedFromVec == VecIn1)
6367       Mask[i] = Idx;
6368     else if (ExtractedFromVec == VecIn2)
6369       Mask[i] = Idx + NumElems;
6370   }
6371
     // Without any extract operand there is nothing to shuffle.
6372   if (!VecIn1.getNode())
6373     return SDValue();
6374
6375   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6376   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
6377   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6378     unsigned Idx = InsertIndices[i];
6379     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6380                      DAG.getIntPtrConstant(Idx));
6381   }
6382
6383   return NV;
6384 }
6385
6386 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
     //
     // Paths, in order: all-zeros / all-ones vectors are rebuilt from target
     // constants; all-constant vectors become a bitcast i16 immediate; a single
     // non-constant operand at a non-zero index is inserted into a constant (or
     // undef) vector; the remaining supported shapes are lowered to a scalar
     // SELECT on operand 0 that is bitcast to the mask type.
6387 SDValue
6388 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6389
6390   MVT VT = Op.getSimpleValueType();
6391   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
6392          "Unexpected type in LowerBUILD_VECTORvXi1!");
6393
6394   SDLoc dl(Op);
6395   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6396     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
6397     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6398     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6399   }
6400
6401   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
6402     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
6403     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
6404     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
6405   }
6406
     // Classify the operands. Undef operands are skipped entirely, so they
     // count neither as constants nor as non-constants and do not break IsSplat.
6407   bool AllContants = true; // no non-constant, non-undef operand seen
6408   uint64_t Immediate = 0;  // bit idx set when constant operand idx is non-zero
6409   int NonConstIdx = -1;    // index of the last non-constant operand seen
6410   bool IsSplat = true;     // every non-undef operand equals operand 0
6411   unsigned NumNonConsts = 0;
6412   unsigned NumConsts = 0;
6413   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6414     SDValue In = Op.getOperand(idx);
6415     if (In.getOpcode() == ISD::UNDEF)
6416       continue;
6417     if (!isa<ConstantSDNode>(In)) {
6418       AllContants = false;
6419       NonConstIdx = idx;
6420       NumNonConsts++;
6421     } else {
6422       NumConsts++;
6423       if (cast<ConstantSDNode>(In)->getZExtValue())
6424       Immediate |= (1ULL << idx); // body of the 'if' on the previous line
6425     }
6426     if (In != Op.getOperand(0))
6427       IsSplat = false;
6428   }
6429
6430   if (AllContants) {
       // Materialize as an i16 constant viewed as v16i1, then take the subvector
       // of type VT starting at element 0.
6431     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
6432       DAG.getConstant(Immediate, MVT::i16));
6433     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
6434                        DAG.getIntPtrConstant(0));
6435   }
6436
     // Exactly one non-constant operand, not at index 0: insert it into the
     // constant part (or into undef when there are no constants at all).
6437   if (NumNonConsts == 1 && NonConstIdx != 0) {
6438     SDValue DstVec;
6439     if (NumConsts) {
6440       SDValue VecAsImm = DAG.getConstant(Immediate,
6441                                          MVT::getIntegerVT(VT.getSizeInBits()));
6442       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
6443     }
6444     else
6445       DstVec = DAG.getUNDEF(VT);
6446     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6447                        Op.getOperand(NonConstIdx),
6448                        DAG.getIntPtrConstant(NonConstIdx));
6449   }
     // What remains must be a splat of operand 0, or have its last non-constant
     // operand at index 0; anything else is not handled.
6450   if (!IsSplat && (NonConstIdx != 0))
6451     llvm_unreachable("Unsupported BUILD_VECTOR operation");
6452   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
6453   SDValue Select;
6454   if (IsSplat)
       // Splat of operand 0: all-ones when it is true, zero otherwise.
6455     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6456                           DAG.getConstant(-1, SelectVT),
6457                           DAG.getConstant(0, SelectVT));
6458   else
       // Operand 0 selects between the constant bits with bit 0 set and the
       // constant bits as they are.
6459     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
6460                          DAG.getConstant((Immediate | 1), SelectVT),
6461                          DAG.getConstant(Immediate, SelectVT));
6462   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
6463 }
6464
6465 /// \brief Return true if \p N implements a horizontal binop and return the
6466 /// operands for the horizontal binop into V0 and V1.
6467 ///
6468 /// This is a helper function of PerformBUILD_VECTORCombine.
6469 /// This function checks that the build_vector \p N in input implements a
6470 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6471 /// operation to match.
6472 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6473 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6474 /// is equal to ISD::SUB, then this function checks if this is a horizontal
6475 /// arithmetic sub.
6476 ///
6477 /// This function only analyzes elements of \p N whose indices are
6478 /// in range [BaseIdx, LastIdx).
6479 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6480                               SelectionDAG &DAG,
6481                               unsigned BaseIdx, unsigned LastIdx,
6482                               SDValue &V0, SDValue &V1) {
6483   EVT VT = N->getValueType(0);
6484
6485   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6486   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6487          "Invalid Vector in input!");
6488
6489   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6490   bool CanFold = true;
6491   unsigned ExpectedVExtractIdx = BaseIdx;
6492   unsigned NumElts = LastIdx - BaseIdx;
6493   V0 = DAG.getUNDEF(VT);
6494   V1 = DAG.getUNDEF(VT);
6495
6496   // Check if N implements a horizontal binop.
6497   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6498     SDValue Op = N->getOperand(i + BaseIdx);
6499
6500     // Skip UNDEFs.
6501     if (Op->getOpcode() == ISD::UNDEF) {
6502       // Update the expected vector extract index.
6503       if (i * 2 == NumElts)
6504         ExpectedVExtractIdx = BaseIdx;
6505       ExpectedVExtractIdx += 2;
6506       continue;
6507     }
6508
6509     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6510
6511     if (!CanFold)
6512       break;
6513
6514     SDValue Op0 = Op.getOperand(0);
6515     SDValue Op1 = Op.getOperand(1);
6516
6517     // Try to match the following pattern:
6518     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6519     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6520         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6521         Op0.getOperand(0) == Op1.getOperand(0) &&
6522         isa<ConstantSDNode>(Op0.getOperand(1)) &&
6523         isa<ConstantSDNode>(Op1.getOperand(1)));
6524     if (!CanFold)
6525       break;
6526
6527     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6528     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6529
6530     if (i * 2 < NumElts) {
6531       if (V0.getOpcode() == ISD::UNDEF)
6532         V0 = Op0.getOperand(0);
6533     } else {
6534       if (V1.getOpcode() == ISD::UNDEF)
6535         V1 = Op0.getOperand(0);
6536       if (i * 2 == NumElts)
6537         ExpectedVExtractIdx = BaseIdx;
6538     }
6539
6540     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6541     if (I0 == ExpectedVExtractIdx)
6542       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6543     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6544       // Try to match the following dag sequence:
6545       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6546       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6547     } else
6548       CanFold = false;
6549
6550     ExpectedVExtractIdx += 2;
6551   }
6552
6553   return CanFold;
6554 }
6555
6556 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6557 /// a concat_vector.
6558 ///
6559 /// This is a helper function of PerformBUILD_VECTORCombine.
6560 /// This function expects two 256-bit vectors called V0 and V1.
6561 /// At first, each vector is split into two separate 128-bit vectors.
6562 /// Then, the resulting 128-bit vectors are used to implement two
6563 /// horizontal binary operations.
6564 ///
6565 /// The kind of horizontal binary operation is defined by \p X86Opcode.
6566 ///
6567 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
6568 /// the two new horizontal binop.
6569 /// When Mode is set, the first horizontal binop dag node would take as input
6570 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
6571 /// horizontal binop dag node would take as input the lower 128-bit of V1
6572 /// and the upper 128-bit of V1.
6573 ///   Example:
6574 ///     HADD V0_LO, V0_HI
6575 ///     HADD V1_LO, V1_HI
6576 ///
6577 /// Otherwise, the first horizontal binop dag node takes as input the lower
6578 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6579 /// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1.
6580 ///   Example:
6581 ///     HADD V0_LO, V1_LO
6582 ///     HADD V0_HI, V1_HI
6583 ///
6584 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6585 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6586 /// the upper 128-bits of the result.
6587 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6588                                      SDLoc DL, SelectionDAG &DAG,
6589                                      unsigned X86Opcode, bool Mode,
6590                                      bool isUndefLO, bool isUndefHI) {
6591   EVT VT = V0.getValueType();
6592   assert(VT.is256BitVector() && VT == V1.getValueType() &&
6593          "Invalid nodes in input!");
6594
6595   unsigned NumElts = VT.getVectorNumElements();
6596   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
6597   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
6598   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
6599   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
6600   EVT NewVT = V0_LO.getValueType();
6601
6602   SDValue LO = DAG.getUNDEF(NewVT);
6603   SDValue HI = DAG.getUNDEF(NewVT);
6604
6605   if (Mode) {
6606     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6607     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
6608       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6609     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
6610       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
6611   } else {
6612     // Don't emit a horizontal binop if the result is expected to be UNDEF.
6613     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
6614                        V1_LO->getOpcode() != ISD::UNDEF))
6615       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6616
6617     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
6618                        V1_HI->getOpcode() != ISD::UNDEF))
6619       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6620   }
6621
6622   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6623 }
6624
6625 /// \brief Try to fold a build_vector that performs an 'addsub' into the
6626 /// sequence of 'vadd + vsub + blendi'.
6627 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
6628                            const X86Subtarget *Subtarget) {
6629   SDLoc DL(BV);
6630   EVT VT = BV->getValueType(0);
6631   unsigned NumElts = VT.getVectorNumElements();
6632   SDValue InVec0 = DAG.getUNDEF(VT);
6633   SDValue InVec1 = DAG.getUNDEF(VT);
6634
6635   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6636           VT == MVT::v2f64) && "build_vector with an invalid type found!");
6637
6638   // Odd-numbered elements in the input build vector are obtained from
6639   // adding two integer/float elements.
6640   // Even-numbered elements in the input build vector are obtained from
6641   // subtracting two integer/float elements.
6642   unsigned ExpectedOpcode = ISD::FSUB;
6643   unsigned NextExpectedOpcode = ISD::FADD;
6644   bool AddFound = false;
6645   bool SubFound = false;
6646
6647   for (unsigned i = 0, e = NumElts; i != e; i++) {
6648     SDValue Op = BV->getOperand(i);
6649
6650     // Skip 'undef' values.
6651     unsigned Opcode = Op.getOpcode();
6652     if (Opcode == ISD::UNDEF) {
6653       std::swap(ExpectedOpcode, NextExpectedOpcode);
6654       continue;
6655     }
6656
6657     // Early exit if we found an unexpected opcode.
6658     if (Opcode != ExpectedOpcode)
6659       return SDValue();
6660
6661     SDValue Op0 = Op.getOperand(0);
6662     SDValue Op1 = Op.getOperand(1);
6663
6664     // Try to match the following pattern:
6665     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
6666     // Early exit if we cannot match that sequence.
6667     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6668         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6669         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
6670         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
6671         Op0.getOperand(1) != Op1.getOperand(1))
6672       return SDValue();
6673
6674     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6675     if (I0 != i)
6676       return SDValue();
6677
6678     // We found a valid add/sub node. Update the information accordingly.
6679     if (i & 1)
6680       AddFound = true;
6681     else
6682       SubFound = true;
6683
6684     // Update InVec0 and InVec1.
6685     if (InVec0.getOpcode() == ISD::UNDEF)
6686       InVec0 = Op0.getOperand(0);
6687     if (InVec1.getOpcode() == ISD::UNDEF)
6688       InVec1 = Op1.getOperand(0);
6689
6690     // Make sure that operands in input to each add/sub node always
6691     // come from a same pair of vectors.
6692     if (InVec0 != Op0.getOperand(0)) {
6693       if (ExpectedOpcode == ISD::FSUB)
6694         return SDValue();
6695
6696       // FADD is commutable. Try to commute the operands
6697       // and then test again.
6698       std::swap(Op0, Op1);
6699       if (InVec0 != Op0.getOperand(0))
6700         return SDValue();
6701     }
6702
6703     if (InVec1 != Op1.getOperand(0))
6704       return SDValue();
6705
6706     // Update the pair of expected opcodes.
6707     std::swap(ExpectedOpcode, NextExpectedOpcode);
6708   }
6709
6710   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
6711   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
6712       InVec1.getOpcode() != ISD::UNDEF)
6713     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
6714
6715   return SDValue();
6716 }
6717
6718 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
6719                                           const X86Subtarget *Subtarget) {
6720   SDLoc DL(N);
6721   EVT VT = N->getValueType(0);
6722   unsigned NumElts = VT.getVectorNumElements();
6723   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
6724   SDValue InVec0, InVec1;
6725
6726   // Try to match an ADDSUB.
6727   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
6728       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
6729     SDValue Value = matchAddSub(BV, DAG, Subtarget);
6730     if (Value.getNode())
6731       return Value;
6732   }
6733
6734   // Try to match horizontal ADD/SUB.
6735   unsigned NumUndefsLO = 0;
6736   unsigned NumUndefsHI = 0;
6737   unsigned Half = NumElts/2;
6738
6739   // Count the number of UNDEF operands in the build_vector in input.
6740   for (unsigned i = 0, e = Half; i != e; ++i)
6741     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6742       NumUndefsLO++;
6743
6744   for (unsigned i = Half, e = NumElts; i != e; ++i)
6745     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
6746       NumUndefsHI++;
6747
6748   // Early exit if this is either a build_vector of all UNDEFs or all the
6749   // operands but one are UNDEF.
6750   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
6751     return SDValue();
6752
6753   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
6754     // Try to match an SSE3 float HADD/HSUB.
6755     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6756       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6757
6758     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6759       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6760   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
6761     // Try to match an SSSE3 integer HADD/HSUB.
6762     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6763       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
6764
6765     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6766       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
6767   }
6768
6769   if (!Subtarget->hasAVX())
6770     return SDValue();
6771
6772   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
6773     // Try to match an AVX horizontal add/sub of packed single/double
6774     // precision floating point values from 256-bit vectors.
6775     SDValue InVec2, InVec3;
6776     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
6777         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
6778         ((InVec0.getOpcode() == ISD::UNDEF ||
6779           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6780         ((InVec1.getOpcode() == ISD::UNDEF ||
6781           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6782       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6783
6784     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
6785         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
6786         ((InVec0.getOpcode() == ISD::UNDEF ||
6787           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6788         ((InVec1.getOpcode() == ISD::UNDEF ||
6789           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6790       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6791   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
6792     // Try to match an AVX2 horizontal add/sub of signed integers.
6793     SDValue InVec2, InVec3;
6794     unsigned X86Opcode;
6795     bool CanFold = true;
6796
6797     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
6798         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
6799         ((InVec0.getOpcode() == ISD::UNDEF ||
6800           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6801         ((InVec1.getOpcode() == ISD::UNDEF ||
6802           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6803       X86Opcode = X86ISD::HADD;
6804     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
6805         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
6806         ((InVec0.getOpcode() == ISD::UNDEF ||
6807           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
6808         ((InVec1.getOpcode() == ISD::UNDEF ||
6809           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
6810       X86Opcode = X86ISD::HSUB;
6811     else
6812       CanFold = false;
6813
6814     if (CanFold) {
6815       // Fold this build_vector into a single horizontal add/sub.
6816       // Do this only if the target has AVX2.
6817       if (Subtarget->hasAVX2())
6818         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
6819
6820       // Do not try to expand this build_vector into a pair of horizontal
6821       // add/sub if we can emit a pair of scalar add/sub.
6822       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6823         return SDValue();
6824
6825       // Convert this build_vector into a pair of horizontal binop followed by
6826       // a concat vector.
6827       bool isUndefLO = NumUndefsLO == Half;
6828       bool isUndefHI = NumUndefsHI == Half;
6829       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
6830                                    isUndefLO, isUndefHI);
6831     }
6832   }
6833
6834   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
6835        VT == MVT::v16i16) && Subtarget->hasAVX()) {
6836     unsigned X86Opcode;
6837     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6838       X86Opcode = X86ISD::HADD;
6839     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6840       X86Opcode = X86ISD::HSUB;
6841     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6842       X86Opcode = X86ISD::FHADD;
6843     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6844       X86Opcode = X86ISD::FHSUB;
6845     else
6846       return SDValue();
6847
6848     // Don't try to expand this build_vector into a pair of horizontal add/sub
6849     // if we can simply emit a pair of scalar add/sub.
6850     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6851       return SDValue();
6852
6853     // Convert this build_vector into two horizontal add/sub followed by
6854     // a concat vector.
6855     bool isUndefLO = NumUndefsLO == Half;
6856     bool isUndefHI = NumUndefsHI == Half;
6857     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
6858                                  isUndefLO, isUndefHI);
6859   }
6860
6861   return SDValue();
6862 }
6863
6864 SDValue
6865 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6866   SDLoc dl(Op);
6867
6868   MVT VT = Op.getSimpleValueType();
6869   MVT ExtVT = VT.getVectorElementType();
6870   unsigned NumElems = Op.getNumOperands();
6871
6872   // Generate vectors for predicate vectors.
6873   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
6874     return LowerBUILD_VECTORvXi1(Op, DAG);
6875
6876   // Vectors containing all zeros can be matched by pxor and xorps later
6877   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6878     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
6879     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
6880     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
6881       return Op;
6882
6883     return getZeroVector(VT, Subtarget, DAG, dl);
6884   }
6885
6886   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
6887   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
6888   // vpcmpeqd on 256-bit vectors.
6889   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
6890     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
6891       return Op;
6892
6893     if (!VT.is512BitVector())
6894       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
6895   }
6896
6897   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
6898   if (Broadcast.getNode())
6899     return Broadcast;
6900
6901   unsigned EVTBits = ExtVT.getSizeInBits();
6902
6903   unsigned NumZero  = 0;
6904   unsigned NumNonZero = 0;
6905   unsigned NonZeros = 0;
6906   bool IsAllConstants = true;
6907   SmallSet<SDValue, 8> Values;
6908   for (unsigned i = 0; i < NumElems; ++i) {
6909     SDValue Elt = Op.getOperand(i);
6910     if (Elt.getOpcode() == ISD::UNDEF)
6911       continue;
6912     Values.insert(Elt);
6913     if (Elt.getOpcode() != ISD::Constant &&
6914         Elt.getOpcode() != ISD::ConstantFP)
6915       IsAllConstants = false;
6916     if (X86::isZeroNode(Elt))
6917       NumZero++;
6918     else {
6919       NonZeros |= (1 << i);
6920       NumNonZero++;
6921     }
6922   }
6923
6924   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
6925   if (NumNonZero == 0)
6926     return DAG.getUNDEF(VT);
6927
6928   // Special case for single non-zero, non-undef, element.
6929   if (NumNonZero == 1) {
6930     unsigned Idx = countTrailingZeros(NonZeros);
6931     SDValue Item = Op.getOperand(Idx);
6932
6933     // If this is an insertion of an i64 value on x86-32, and if the top bits of
6934     // the value are obviously zero, truncate the value to i32 and do the
6935     // insertion that way.  Only do this if the value is non-constant or if the
6936     // value is a constant being inserted into element 0.  It is cheaper to do
6937     // a constant pool load than it is to do a movd + shuffle.
6938     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
6939         (!IsAllConstants || Idx == 0)) {
6940       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
6941         // Handle SSE only.
6942         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
6943         EVT VecVT = MVT::v4i32;
6944         unsigned VecElts = 4;
6945
6946         // Truncate the value (which may itself be a constant) to i32, and
6947         // convert it to a vector with movd (S2V+shuffle to zero extend).
6948         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
6949         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
6950
6951         // If using the new shuffle lowering, just directly insert this.
6952         if (ExperimentalVectorShuffleLowering)
6953           return DAG.getNode(
6954               ISD::BITCAST, dl, VT,
6955               getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
6956
6957         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6958
6959         // Now we have our 32-bit value zero extended in the low element of
6960         // a vector.  If Idx != 0, swizzle it into place.
6961         if (Idx != 0) {
6962           SmallVector<int, 4> Mask;
6963           Mask.push_back(Idx);
6964           for (unsigned i = 1; i != VecElts; ++i)
6965             Mask.push_back(i);
6966           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
6967                                       &Mask[0]);
6968         }
6969         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
6970       }
6971     }
6972
6973     // If we have a constant or non-constant insertion into the low element of
6974     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
6975     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
6976     // depending on what the source datatype is.
6977     if (Idx == 0) {
6978       if (NumZero == 0)
6979         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6980
6981       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
6982           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
6983         if (VT.is256BitVector() || VT.is512BitVector()) {
6984           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
6985           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
6986                              Item, DAG.getIntPtrConstant(0));
6987         }
6988         assert(VT.is128BitVector() && "Expected an SSE value type!");
6989         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6990         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
6991         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6992       }
6993
6994       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
6995         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
6996         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
6997         if (VT.is256BitVector()) {
6998           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
6999           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7000         } else {
7001           assert(VT.is128BitVector() && "Expected an SSE value type!");
7002           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7003         }
7004         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
7005       }
7006     }
7007
7008     // Is it a vector logical left shift?
7009     if (NumElems == 2 && Idx == 1 &&
7010         X86::isZeroNode(Op.getOperand(0)) &&
7011         !X86::isZeroNode(Op.getOperand(1))) {
7012       unsigned NumBits = VT.getSizeInBits();
7013       return getVShift(true, VT,
7014                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7015                                    VT, Op.getOperand(1)),
7016                        NumBits/2, DAG, *this, dl);
7017     }
7018
7019     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7020       return SDValue();
7021
7022     // Otherwise, if this is a vector with i32 or f32 elements, and the element
7023     // is a non-constant being inserted into an element other than the low one,
7024     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
7025     // movd/movss) to move this into the low element, then shuffle it into
7026     // place.
7027     if (EVTBits == 32) {
7028       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7029
7030       // If using the new shuffle lowering, just directly insert this.
7031       if (ExperimentalVectorShuffleLowering)
7032         return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7033
7034       // Turn it into a shuffle of zero and zero-extended scalar to vector.
7035       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
7036       SmallVector<int, 8> MaskVec;
7037       for (unsigned i = 0; i != NumElems; ++i)
7038         MaskVec.push_back(i == Idx ? 0 : 1);
7039       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
7040     }
7041   }
7042
7043   // Splat is obviously ok. Let legalizer expand it to a shuffle.
7044   if (Values.size() == 1) {
7045     if (EVTBits == 32) {
7046       // Instead of a shuffle like this:
7047       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7048       // Check if it's possible to issue this instead.
7049       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7050       unsigned Idx = countTrailingZeros(NonZeros);
7051       SDValue Item = Op.getOperand(Idx);
7052       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7053         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7054     }
7055     return SDValue();
7056   }
7057
7058   // A vector full of immediates; various special cases are already
7059   // handled, so this is best done with a single constant-pool load.
7060   if (IsAllConstants)
7061     return SDValue();
7062
7063   // For AVX-length vectors, see if we can use a vector load to get all of the
7064   // elements, otherwise build the individual 128-bit pieces and use
7065   // shuffles to put them in place.
7066   if (VT.is256BitVector() || VT.is512BitVector()) {
7067     SmallVector<SDValue, 64> V;
7068     for (unsigned i = 0; i != NumElems; ++i)
7069       V.push_back(Op.getOperand(i));
7070
7071     // Check for a build vector of consecutive loads.
7072     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
7073       return LD;
7074
7075     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7076
7077     // Build both the lower and upper subvector.
7078     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7079                                 makeArrayRef(&V[0], NumElems/2));
7080     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
7081                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
7082
7083     // Recreate the wider vector with the lower and upper part.
7084     if (VT.is256BitVector())
7085       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7086     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7087   }
7088
7089   // Let legalizer expand 2-wide build_vectors.
7090   if (EVTBits == 64) {
7091     if (NumNonZero == 1) {
7092       // One half is zero or undef.
7093       unsigned Idx = countTrailingZeros(NonZeros);
7094       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7095                                  Op.getOperand(Idx));
7096       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7097     }
7098     return SDValue();
7099   }
7100
7101   // If element VT is < 32 bits, convert it to inserts into a zero vector.
7102   if (EVTBits == 8 && NumElems == 16) {
7103     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
7104                                         Subtarget, *this);
7105     if (V.getNode()) return V;
7106   }
7107
7108   if (EVTBits == 16 && NumElems == 8) {
7109     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
7110                                       Subtarget, *this);
7111     if (V.getNode()) return V;
7112   }
7113
7114   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7115   if (EVTBits == 32 && NumElems == 4) {
7116     SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
7117     if (V.getNode())
7118       return V;
7119   }
7120
7121   // If element VT is == 32 bits, turn it into a number of shuffles.
7122   SmallVector<SDValue, 8> V(NumElems);
7123   if (NumElems == 4 && NumZero > 0) {
7124     for (unsigned i = 0; i < 4; ++i) {
7125       bool isZero = !(NonZeros & (1 << i));
7126       if (isZero)
7127         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
7128       else
7129         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7130     }
7131
7132     for (unsigned i = 0; i < 2; ++i) {
7133       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7134         default: break;
7135         case 0:
7136           V[i] = V[i*2];  // Must be a zero vector.
7137           break;
7138         case 1:
7139           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
7140           break;
7141         case 2:
7142           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
7143           break;
7144         case 3:
7145           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
7146           break;
7147       }
7148     }
7149
7150     bool Reverse1 = (NonZeros & 0x3) == 2;
7151     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7152     int MaskVec[] = {
7153       Reverse1 ? 1 : 0,
7154       Reverse1 ? 0 : 1,
7155       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7156       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
7157     };
7158     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
7159   }
7160
7161   if (Values.size() > 1 && VT.is128BitVector()) {
7162     // Check for a build vector of consecutive loads.
7163     for (unsigned i = 0; i < NumElems; ++i)
7164       V[i] = Op.getOperand(i);
7165
7166     // Check for elements which are consecutive loads.
7167     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
7168     if (LD.getNode())
7169       return LD;
7170
7171     // Check for a build vector from mostly shuffle plus few inserting.
7172     SDValue Sh = buildFromShuffleMostly(Op, DAG);
7173     if (Sh.getNode())
7174       return Sh;
7175
7176     // For SSE 4.1, use insertps to put the high elements into the low element.
7177     if (getSubtarget()->hasSSE41()) {
7178       SDValue Result;
7179       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
7180         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7181       else
7182         Result = DAG.getUNDEF(VT);
7183
7184       for (unsigned i = 1; i < NumElems; ++i) {
7185         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
7186         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7187                              Op.getOperand(i), DAG.getIntPtrConstant(i));
7188       }
7189       return Result;
7190     }
7191
7192     // Otherwise, expand into a number of unpckl*, start by extending each of
7193     // our (non-undef) elements to the full vector width with the element in the
7194     // bottom slot of the vector (which generates no code for SSE).
7195     for (unsigned i = 0; i < NumElems; ++i) {
7196       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
7197         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7198       else
7199         V[i] = DAG.getUNDEF(VT);
7200     }
7201
7202     // Next, we iteratively mix elements, e.g. for v4f32:
7203     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7204     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7205     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
7206     unsigned EltStride = NumElems >> 1;
7207     while (EltStride != 0) {
7208       for (unsigned i = 0; i < EltStride; ++i) {
7209         // If V[i+EltStride] is undef and this is the first round of mixing,
7210         // then it is safe to just drop this shuffle: V[i] is already in the
7211         // right place, the one element (since it's the first round) being
7212         // inserted as undef can be dropped.  This isn't safe for successive
7213         // rounds because they will permute elements within both vectors.
7214         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
7215             EltStride == NumElems/2)
7216           continue;
7217
7218         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
7219       }
7220       EltStride >>= 1;
7221     }
7222     return V[0];
7223   }
7224   return SDValue();
7225 }
7226
7227 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
7228 // to create 256-bit vectors from two other 128-bit ones.
7229 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7230   SDLoc dl(Op);
7231   MVT ResVT = Op.getSimpleValueType();
7232
7233   assert((ResVT.is256BitVector() ||
7234           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7235
7236   SDValue V1 = Op.getOperand(0);
7237   SDValue V2 = Op.getOperand(1);
7238   unsigned NumElems = ResVT.getVectorNumElements();
7239   if(ResVT.is256BitVector())
7240     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7241
7242   if (Op.getNumOperands() == 4) {
7243     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
7244                                 ResVT.getVectorNumElements()/2);
7245     SDValue V3 = Op.getOperand(2);
7246     SDValue V4 = Op.getOperand(3);
7247     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
7248       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
7249   }
7250   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7251 }
7252
7253 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7254   MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
7255   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7256          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7257           Op.getNumOperands() == 4)));
7258
7259   // AVX can use the vinsertf128 instruction to create 256-bit vectors
7260   // from two other 128-bit ones.
7261
7262   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7263   return LowerAVXCONCAT_VECTORS(Op, DAG);
7264 }
7265
7266
7267 //===----------------------------------------------------------------------===//
7268 // Vector shuffle lowering
7269 //
7270 // This is an experimental code path for lowering vector shuffles on x86. It is
7271 // designed to handle arbitrary vector shuffles and blends, gracefully
7272 // degrading performance as necessary. It works hard to recognize idiomatic
7273 // shuffles and lower them to optimal instruction patterns without leaving
7274 // a framework that allows reasonably efficient handling of all vector shuffle
7275 // patterns.
7276 //===----------------------------------------------------------------------===//
7277
7278 /// \brief Tiny helper function to identify a no-op mask.
7279 ///
7280 /// This is a somewhat boring predicate function. It checks whether the mask
7281 /// array input, which is assumed to be a single-input shuffle mask of the kind
7282 /// used by the X86 shuffle instructions (not a fully general
7283 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7284 /// in-place shuffle are 'no-op's.
7285 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7286   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7287     if (Mask[i] != -1 && Mask[i] != i)
7288       return false;
7289   return true;
7290 }
7291
7292 /// \brief Helper function to classify a mask as a single-input mask.
7293 ///
7294 /// This isn't a generic single-input test because in the vector shuffle
7295 /// lowering we canonicalize single inputs to be the first input operand. This
7296 /// means we can more quickly test for a single input by only checking whether
7297 /// an input from the second operand exists. We also assume that the size of
7298 /// mask corresponds to the size of the input vectors which isn't true in the
7299 /// fully general case.
7300 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
7301   for (int M : Mask)
7302     if (M >= (int)Mask.size())
7303       return false;
7304   return true;
7305 }
7306
7307 /// \brief Test whether there are elements crossing 128-bit lanes in this
7308 /// shuffle mask.
7309 ///
7310 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7311 /// and we routinely test for these.
7312 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7313   int LaneSize = 128 / VT.getScalarSizeInBits();
7314   int Size = Mask.size();
7315   for (int i = 0; i < Size; ++i)
7316     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7317       return true;
7318   return false;
7319 }
7320
7321 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
7322 ///
7323 /// This checks a shuffle mask to see if it is performing the same
7324 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
7325 /// that it is also not lane-crossing. It may however involve a blend from the
7326 /// same lane of a second vector.
7327 ///
7328 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7329 /// non-trivial to compute in the face of undef lanes. The representation is
7330 /// *not* suitable for use with existing 128-bit shuffles as it will contain
7331 /// entries from both V1 and V2 inputs to the wider mask.
7332 static bool
7333 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7334                                 SmallVectorImpl<int> &RepeatedMask) {
7335   int LaneSize = 128 / VT.getScalarSizeInBits();
7336   RepeatedMask.resize(LaneSize, -1);
7337   int Size = Mask.size();
7338   for (int i = 0; i < Size; ++i) {
7339     if (Mask[i] < 0)
7340       continue;
7341     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7342       // This entry crosses lanes, so there is no way to model this shuffle.
7343       return false;
7344
7345     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7346     if (RepeatedMask[i % LaneSize] == -1)
7347       // This is the first non-undef entry in this slot of a 128-bit lane.
7348       RepeatedMask[i % LaneSize] =
7349           Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
7350     else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
7351       // Found a mismatch with the repeated mask.
7352       return false;
7353   }
7354   return true;
7355 }
7356
7357 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
7358 // 2013 will allow us to use it as a non-type template parameter.
7359 namespace {
7360
7361 /// \brief Implementation of the \c isShuffleEquivalent variadic functor.
7362 ///
7363 /// See its documentation for details.
7364 bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) {
7365   if (Mask.size() != Args.size())
7366     return false;
7367   for (int i = 0, e = Mask.size(); i < e; ++i) {
7368     assert(*Args[i] >= 0 && "Arguments must be positive integers!");
7369     if (Mask[i] != -1 && Mask[i] != *Args[i])
7370       return false;
7371   }
7372   return true;
7373 }
7374
7375 } // namespace
7376
7377 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
7378 /// arguments.
7379 ///
7380 /// This is a fast way to test a shuffle mask against a fixed pattern:
7381 ///
7382 ///   if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... }
7383 ///
7384 /// It returns true if the mask is exactly as wide as the argument list, and
7385 /// each element of the mask is either -1 (signifying undef) or the value given
7386 /// in the argument.
7387 static const VariadicFunction1<
7388     bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {};
7389
7390 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7391 ///
7392 /// This helper function produces an 8-bit shuffle immediate corresponding to
7393 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
7394 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7395 /// example.
7396 ///
7397 /// NB: We rely heavily on "undef" masks preserving the input lane.
7398 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
7399                                           SelectionDAG &DAG) {
7400   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7401   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7402   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7403   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7404   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7405
7406   unsigned Imm = 0;
7407   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
7408   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
7409   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
7410   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
7411   return DAG.getConstant(Imm, MVT::i8);
7412 }
7413
7414 /// \brief Try to emit a blend instruction for a shuffle.
7415 ///
7416 /// This doesn't do any checks for the availability of instructions for blending
7417 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
7418 /// be matched in the backend with the type given. What it does check for is
7419 /// that the shuffle mask is in fact a blend.
7420 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
7421                                          SDValue V2, ArrayRef<int> Mask,
7422                                          const X86Subtarget *Subtarget,
7423                                          SelectionDAG &DAG) {
7424
7425   unsigned BlendMask = 0;
7426   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7427     if (Mask[i] >= Size) {
7428       if (Mask[i] != i + Size)
7429         return SDValue(); // Shuffled V2 input!
7430       BlendMask |= 1u << i;
7431       continue;
7432     }
7433     if (Mask[i] >= 0 && Mask[i] != i)
7434       return SDValue(); // Shuffled V1 input!
7435   }
7436   switch (VT.SimpleTy) {
7437   case MVT::v2f64:
7438   case MVT::v4f32:
7439   case MVT::v4f64:
7440   case MVT::v8f32:
7441     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
7442                        DAG.getConstant(BlendMask, MVT::i8));
7443
7444   case MVT::v4i64:
7445   case MVT::v8i32:
7446     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7447     // FALLTHROUGH
7448   case MVT::v2i64:
7449   case MVT::v4i32:
7450     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
7451     // that instruction.
7452     if (Subtarget->hasAVX2()) {
7453       // Scale the blend by the number of 32-bit dwords per element.
7454       int Scale =  VT.getScalarSizeInBits() / 32;
7455       BlendMask = 0;
7456       for (int i = 0, Size = Mask.size(); i < Size; ++i)
7457         if (Mask[i] >= Size)
7458           for (int j = 0; j < Scale; ++j)
7459             BlendMask |= 1u << (i * Scale + j);
7460
7461       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
7462       V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
7463       V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
7464       return DAG.getNode(ISD::BITCAST, DL, VT,
7465                          DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
7466                                      DAG.getConstant(BlendMask, MVT::i8)));
7467     }
7468     // FALLTHROUGH
7469   case MVT::v8i16: {
7470     // For integer shuffles we need to expand the mask and cast the inputs to
7471     // v8i16s prior to blending.
7472     int Scale = 8 / VT.getVectorNumElements();
7473     BlendMask = 0;
7474     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7475       if (Mask[i] >= Size)
7476         for (int j = 0; j < Scale; ++j)
7477           BlendMask |= 1u << (i * Scale + j);
7478
7479     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
7480     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
7481     return DAG.getNode(ISD::BITCAST, DL, VT,
7482                        DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
7483                                    DAG.getConstant(BlendMask, MVT::i8)));
7484   }
7485
7486   case MVT::v16i16: {
7487     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7488     SmallVector<int, 8> RepeatedMask;
7489     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
7490       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
7491       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
7492       BlendMask = 0;
7493       for (int i = 0; i < 8; ++i)
7494         if (RepeatedMask[i] >= 16)
7495           BlendMask |= 1u << i;
7496       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
7497                          DAG.getConstant(BlendMask, MVT::i8));
7498     }
7499   }
7500     // FALLTHROUGH
7501   case MVT::v32i8: {
7502     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
7503     // Scale the blend by the number of bytes per element.
7504     int Scale =  VT.getScalarSizeInBits() / 8;
7505     assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
7506
7507     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
7508     // mix of LLVM's code generator and the x86 backend. We tell the code
7509     // generator that boolean values in the elements of an x86 vector register
7510     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
7511     // mapping a select to operand #1, and 'false' mapping to operand #2. The
7512     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
7513     // of the element (the remaining are ignored) and 0 in that high bit would
7514     // mean operand #1 while 1 in the high bit would mean operand #2. So while
7515     // the LLVM model for boolean values in vector elements gets the relevant
7516     // bit set, it is set backwards and over constrained relative to x86's
7517     // actual model.
7518     SDValue VSELECTMask[32];
7519     for (int i = 0, Size = Mask.size(); i < Size; ++i)
7520       for (int j = 0; j < Scale; ++j)
7521         VSELECTMask[Scale * i + j] =
7522             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7523                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8);
7524
7525     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
7526     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
7527     return DAG.getNode(
7528         ISD::BITCAST, DL, VT,
7529         DAG.getNode(ISD::VSELECT, DL, MVT::v32i8,
7530                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask),
7531                     V1, V2));
7532   }
7533
7534   default:
7535     llvm_unreachable("Not a supported integer vector type!");
7536   }
7537 }
7538
7539 /// \brief Generic routine to lower a shuffle and blend as a decomposed set of
7540 /// unblended shuffles followed by an unshuffled blend.
7541 ///
7542 /// This matches the extremely common pattern for handling combined
7543 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
7544 /// operations.
7545 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
7546                                                           SDValue V1,
7547                                                           SDValue V2,
7548                                                           ArrayRef<int> Mask,
7549                                                           SelectionDAG &DAG) {
7550   // Shuffle the input elements into the desired positions in V1 and V2 and
7551   // blend them together.
7552   SmallVector<int, 32> V1Mask(Mask.size(), -1);
7553   SmallVector<int, 32> V2Mask(Mask.size(), -1);
7554   SmallVector<int, 32> BlendMask(Mask.size(), -1);
7555   for (int i = 0, Size = Mask.size(); i < Size; ++i)
7556     if (Mask[i] >= 0 && Mask[i] < Size) {
7557       V1Mask[i] = Mask[i];
7558       BlendMask[i] = i;
7559     } else if (Mask[i] >= Size) {
7560       V2Mask[i] = Mask[i] - Size;
7561       BlendMask[i] = i + Size;
7562     }
7563
7564   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7565   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7566   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7567 }
7568
7569 /// \brief Try to lower a vector shuffle as a byte rotation.
7570 ///
7571 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7572 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7573 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
7574 /// try to generically lower a vector shuffle through such an pattern. It
7575 /// does not check for the profitability of lowering either as PALIGNR or
7576 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7577 /// This matches shuffle vectors that look like:
7578 ///
7579 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7580 ///
7581 /// Essentially it concatenates V1 and V2, shifts right by some number of
7582 /// elements, and takes the low elements as the result. Note that while this is
7583 /// specified as a *right shift* because x86 is little-endian, it is a *left
7584 /// rotate* of the vector lanes.
7585 ///
7586 /// Note that this only handles 128-bit vector widths currently.
7587 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
7588                                               SDValue V2,
7589                                               ArrayRef<int> Mask,
7590                                               const X86Subtarget *Subtarget,
7591                                               SelectionDAG &DAG) {
7592   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7593
7594   // We need to detect various ways of spelling a rotation:
7595   //   [11, 12, 13, 14, 15,  0,  1,  2]
7596   //   [-1, 12, 13, 14, -1, -1,  1, -1]
7597   //   [-1, -1, -1, -1, -1, -1,  1,  2]
7598   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
7599   //   [-1,  4,  5,  6, -1, -1,  9, -1]
7600   //   [-1,  4,  5,  6, -1, -1, -1, -1]
7601   int Rotation = 0;
7602   SDValue Lo, Hi;
7603   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7604     if (Mask[i] == -1)
7605       continue;
7606     assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!");
7607
7608     // Based on the mod-Size value of this mask element determine where
7609     // a rotated vector would have started.
7610     int StartIdx = i - (Mask[i] % Size);
7611     if (StartIdx == 0)
7612       // The identity rotation isn't interesting, stop.
7613       return SDValue();
7614
7615     // If we found the tail of a vector the rotation must be the missing
7616     // front. If we found the head of a vector, it must be how much of the head.
7617     int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx;
7618
7619     if (Rotation == 0)
7620       Rotation = CandidateRotation;
7621     else if (Rotation != CandidateRotation)
7622       // The rotations don't match, so we can't match this mask.
7623       return SDValue();
7624
7625     // Compute which value this mask is pointing at.
7626     SDValue MaskV = Mask[i] < Size ? V1 : V2;
7627
7628     // Compute which of the two target values this index should be assigned to.
7629     // This reflects whether the high elements are remaining or the low elements
7630     // are remaining.
7631     SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7632
7633     // Either set up this value if we've not encountered it before, or check
7634     // that it remains consistent.
7635     if (!TargetV)
7636       TargetV = MaskV;
7637     else if (TargetV != MaskV)
7638       // This may be a rotation, but it pulls from the inputs in some
7639       // unsupported interleaving.
7640       return SDValue();
7641   }
7642
7643   // Check that we successfully analyzed the mask, and normalize the results.
7644   assert(Rotation != 0 && "Failed to locate a viable rotation!");
7645   assert((Lo || Hi) && "Failed to find a rotated input vector!");
7646   if (!Lo)
7647     Lo = Hi;
7648   else if (!Hi)
7649     Hi = Lo;
7650
7651   assert(VT.getSizeInBits() == 128 &&
7652          "Rotate-based lowering only supports 128-bit lowering!");
7653   assert(Mask.size() <= 16 &&
7654          "Can shuffle at most 16 bytes in a 128-bit vector!");
7655
7656   // The actual rotate instruction rotates bytes, so we need to scale the
7657   // rotation based on how many bytes are in the vector.
7658   int Scale = 16 / Mask.size();
7659
7660   // SSSE3 targets can use the palignr instruction
7661   if (Subtarget->hasSSSE3()) {
7662     // Cast the inputs to v16i8 to match PALIGNR.
7663     Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo);
7664     Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi);
7665
7666     return DAG.getNode(ISD::BITCAST, DL, VT,
7667                        DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo,
7668                                    DAG.getConstant(Rotation * Scale, MVT::i8)));
7669   }
7670
7671   // Default SSE2 implementation
7672   int LoByteShift = 16 - Rotation * Scale;
7673   int HiByteShift = Rotation * Scale;
7674
7675   // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
7676   Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
7677   Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
7678
7679   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
7680                                 DAG.getConstant(8 * LoByteShift, MVT::i8));
7681   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
7682                                 DAG.getConstant(8 * HiByteShift, MVT::i8));
7683   return DAG.getNode(ISD::BITCAST, DL, VT,
7684                      DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
7685 }
7686
7687 /// \brief Compute whether each element of a shuffle is zeroable.
7688 ///
7689 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7690 /// Either it is an undef element in the shuffle mask, the element of the input
7691 /// referenced is undef, or the element of the input referenced is known to be
7692 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7693 /// as many lanes with this technique as possible to simplify the remaining
7694 /// shuffle.
7695 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7696                                                      SDValue V1, SDValue V2) {
7697   SmallBitVector Zeroable(Mask.size(), false);
7698
7699   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7700   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7701
7702   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7703     int M = Mask[i];
7704     // Handle the easy cases.
7705     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7706       Zeroable[i] = true;
7707       continue;
7708     }
7709
7710     // If this is an index into a build_vector node, dig out the input value and
7711     // use it.
7712     SDValue V = M < Size ? V1 : V2;
7713     if (V.getOpcode() != ISD::BUILD_VECTOR)
7714       continue;
7715
7716     SDValue Input = V.getOperand(M % Size);
7717     // The UNDEF opcode check really should be dead code here, but not quite
7718     // worth asserting on (it isn't invalid, just unexpected).
7719     if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
7720       Zeroable[i] = true;
7721   }
7722
7723   return Zeroable;
7724 }
7725
7726 /// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
7727 ///
7728 /// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
7729 /// byte-shift instructions. The mask must consist of a shifted sequential
7730 /// shuffle from one of the input vectors and zeroable elements for the
7731 /// remaining 'shifted in' elements.
7732 ///
7733 /// Note that this only handles 128-bit vector widths currently.
7734 static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1,
7735                                              SDValue V2, ArrayRef<int> Mask,
7736                                              SelectionDAG &DAG) {
7737   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7738
7739   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7740
7741   int Size = Mask.size();
7742   int Scale = 16 / Size;
7743
7744   for (int Shift = 1; Shift < Size; Shift++) {
7745     int ByteShift = Shift * Scale;
7746
7747     // PSRLDQ : (little-endian) right byte shift
7748     // [ 5,  6,  7, zz, zz, zz, zz, zz]
7749     // [ -1, 5,  6,  7, zz, zz, zz, zz]
7750     // [  1, 2, -1, -1, -1, -1, zz, zz]
7751     bool ZeroableRight = true;
7752     for (int i = Size - Shift; i < Size; i++) {
7753       ZeroableRight &= Zeroable[i];
7754     }
7755
7756     if (ZeroableRight) {
7757       bool ValidShiftRight1 =
7758           isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift);
7759       bool ValidShiftRight2 =
7760           isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift);
7761
7762       if (ValidShiftRight1 || ValidShiftRight2) {
7763         // Cast the inputs to v2i64 to match PSRLDQ.
7764         SDValue &TargetV = ValidShiftRight1 ? V1 : V2;
7765         SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7766         SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V,
7767                                       DAG.getConstant(ByteShift * 8, MVT::i8));
7768         return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7769       }
7770     }
7771
7772     // PSLLDQ : (little-endian) left byte shift
7773     // [ zz,  0,  1,  2,  3,  4,  5,  6]
7774     // [ zz, zz, -1, -1,  2,  3,  4, -1]
7775     // [ zz, zz, zz, zz, zz, zz, -1,  1]
7776     bool ZeroableLeft = true;
7777     for (int i = 0; i < Shift; i++) {
7778       ZeroableLeft &= Zeroable[i];
7779     }
7780
7781     if (ZeroableLeft) {
7782       bool ValidShiftLeft1 =
7783           isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0);
7784       bool ValidShiftLeft2 =
7785           isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size);
7786
7787       if (ValidShiftLeft1 || ValidShiftLeft2) {
7788         // Cast the inputs to v2i64 to match PSLLDQ.
7789         SDValue &TargetV = ValidShiftLeft1 ? V1 : V2;
7790         SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV);
7791         SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V,
7792                                       DAG.getConstant(ByteShift * 8, MVT::i8));
7793         return DAG.getNode(ISD::BITCAST, DL, VT, Shifted);
7794       }
7795     }
7796   }
7797
7798   return SDValue();
7799 }
7800
7801 /// \brief Lower a vector shuffle as a zero or any extension.
7802 ///
7803 /// Given a specific number of elements, element bit width, and extension
7804 /// stride, produce either a zero or any extension based on the available
7805 /// features of the subtarget.
7806 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7807     SDLoc DL, MVT VT, int NumElements, int Scale, bool AnyExt, SDValue InputV,
7808     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7809   assert(Scale > 1 && "Need a scale to extend.");
7810   int EltBits = VT.getSizeInBits() / NumElements;
7811   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
7812          "Only 8, 16, and 32 bit elements can be extended.");
7813   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
7814
7815   // Found a valid zext mask! Try various lowering strategies based on the
7816   // input type and available ISA extensions.
7817   if (Subtarget->hasSSE41()) {
7818     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
7819     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
7820                                  NumElements / Scale);
7821     InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
7822     return DAG.getNode(ISD::BITCAST, DL, VT,
7823                        DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
7824   }
7825
7826   // For any extends we can cheat for larger element sizes and use shuffle
7827   // instructions that can fold with a load and/or copy.
7828   if (AnyExt && EltBits == 32) {
7829     int PSHUFDMask[4] = {0, -1, 1, -1};
7830     return DAG.getNode(
7831         ISD::BITCAST, DL, VT,
7832         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7833                     DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7834                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
7835   }
7836   if (AnyExt && EltBits == 16 && Scale > 2) {
7837     int PSHUFDMask[4] = {0, -1, 0, -1};
7838     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
7839                          DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
7840                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
7841     int PSHUFHWMask[4] = {1, -1, -1, -1};
7842     return DAG.getNode(
7843         ISD::BITCAST, DL, VT,
7844         DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16,
7845                     DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV),
7846                     getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG)));
7847   }
7848
7849   // If this would require more than 2 unpack instructions to expand, use
7850   // pshufb when available. We can only use more than 2 unpack instructions
7851   // when zero extending i8 elements which also makes it easier to use pshufb.
7852   if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) {
7853     assert(NumElements == 16 && "Unexpected byte vector width!");
7854     SDValue PSHUFBMask[16];
7855     for (int i = 0; i < 16; ++i)
7856       PSHUFBMask[i] =
7857           DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, MVT::i8);
7858     InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV);
7859     return DAG.getNode(ISD::BITCAST, DL, VT,
7860                        DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
7861                                    DAG.getNode(ISD::BUILD_VECTOR, DL,
7862                                                MVT::v16i8, PSHUFBMask)));
7863   }
7864
7865   // Otherwise emit a sequence of unpacks.
7866   do {
7867     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
7868     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
7869                          : getZeroVector(InputVT, Subtarget, DAG, DL);
7870     InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV);
7871     InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext);
7872     Scale /= 2;
7873     EltBits *= 2;
7874     NumElements /= 2;
7875   } while (Scale > 1);
7876   return DAG.getNode(ISD::BITCAST, DL, VT, InputV);
7877 }
7878
/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
7880 ///
7881 /// This routine will try to do everything in its power to cleverly lower
7882 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
7883 /// check for the profitability of this lowering,  it tries to aggressively
7884 /// match this pattern. It will use all of the micro-architectural details it
7885 /// can to emit an efficient lowering. It handles both blends with all-zero
7886 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
7887 /// masking out later).
7888 ///
7889 /// The reason we have dedicated lowering for zext-style shuffles is that they
7890 /// are both incredibly common and often quite performance sensitive.
7891 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
7892     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
7893     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7894   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7895
7896   int Bits = VT.getSizeInBits();
7897   int NumElements = Mask.size();
7898
7899   // Define a helper function to check a particular ext-scale and lower to it if
7900   // valid.
7901   auto Lower = [&](int Scale) -> SDValue {
7902     SDValue InputV;
7903     bool AnyExt = true;
7904     for (int i = 0; i < NumElements; ++i) {
7905       if (Mask[i] == -1)
7906         continue; // Valid anywhere but doesn't tell us anything.
7907       if (i % Scale != 0) {
7908         // Each of the extend elements needs to be zeroable.
7909         if (!Zeroable[i])
7910           return SDValue();
7911
7912         // We no lorger are in the anyext case.
7913         AnyExt = false;
7914         continue;
7915       }
7916
7917       // Each of the base elements needs to be consecutive indices into the
7918       // same input vector.
7919       SDValue V = Mask[i] < NumElements ? V1 : V2;
7920       if (!InputV)
7921         InputV = V;
7922       else if (InputV != V)
7923         return SDValue(); // Flip-flopping inputs.
7924
7925       if (Mask[i] % NumElements != i / Scale)
7926         return SDValue(); // Non-consecutive strided elemenst.
7927     }
7928
7929     // If we fail to find an input, we have a zero-shuffle which should always
7930     // have already been handled.
7931     // FIXME: Maybe handle this here in case during blending we end up with one?
7932     if (!InputV)
7933       return SDValue();
7934
7935     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
7936         DL, VT, NumElements, Scale, AnyExt, InputV, Subtarget, DAG);
7937   };
7938
7939   // The widest scale possible for extending is to a 64-bit integer.
7940   assert(Bits % 64 == 0 &&
7941          "The number of bits in a vector must be divisible by 64 on x86!");
7942   int NumExtElements = Bits / 64;
7943
7944   // Each iteration, try extending the elements half as much, but into twice as
7945   // many elements.
7946   for (; NumExtElements < NumElements; NumExtElements *= 2) {
7947     assert(NumElements % NumExtElements == 0 &&
7948            "The input vector size must be divisble by the extended size.");
7949     if (SDValue V = Lower(NumElements / NumExtElements))
7950       return V;
7951   }
7952
7953   // No viable ext lowering found.
7954   return SDValue();
7955 }
7956
7957 /// \brief Try to get a scalar value for a specific element of a vector.
7958 ///
7959 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
7960 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
7961                                               SelectionDAG &DAG) {
7962   MVT VT = V.getSimpleValueType();
7963   MVT EltVT = VT.getVectorElementType();
7964   while (V.getOpcode() == ISD::BITCAST)
7965     V = V.getOperand(0);
7966   // If the bitcasts shift the element size, we can't extract an equivalent
7967   // element from it.
7968   MVT NewVT = V.getSimpleValueType();
7969   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
7970     return SDValue();
7971
7972   if (V.getOpcode() == ISD::BUILD_VECTOR ||
7973       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR))
7974     return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx));
7975
7976   return SDValue();
7977 }
7978
7979 /// \brief Helper to test for a load that can be folded with x86 shuffles.
7980 ///
7981 /// This is particularly important because the set of instructions varies
7982 /// significantly based on whether the operand is a load or not.
7983 static bool isShuffleFoldableLoad(SDValue V) {
7984   while (V.getOpcode() == ISD::BITCAST)
7985     V = V.getOperand(0);
7986
7987   return ISD::isNON_EXTLoad(V.getNode());
7988 }
7989
7990 /// \brief Try to lower insertion of a single element into a zero vector.
7991 ///
7992 /// This is a common pattern that we have especially efficient patterns to lower
7993 /// across all subtarget feature sets.
7994 static SDValue lowerVectorShuffleAsElementInsertion(
7995     MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask,
7996     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
7997   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7998   MVT ExtVT = VT;
7999   MVT EltVT = VT.getVectorElementType();
8000
8001   int V2Index = std::find_if(Mask.begin(), Mask.end(),
8002                              [&Mask](int M) { return M >= (int)Mask.size(); }) -
8003                 Mask.begin();
8004   bool IsV1Zeroable = true;
8005   for (int i = 0, Size = Mask.size(); i < Size; ++i)
8006     if (i != V2Index && !Zeroable[i]) {
8007       IsV1Zeroable = false;
8008       break;
8009     }
8010
8011   // Check for a single input from a SCALAR_TO_VECTOR node.
8012   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
8013   // all the smarts here sunk into that routine. However, the current
8014   // lowering of BUILD_VECTOR makes that nearly impossible until the old
8015   // vector shuffle lowering is dead.
8016   if (SDValue V2S = getScalarValueForVectorElement(
8017           V2, Mask[V2Index] - Mask.size(), DAG)) {
8018     // We need to zext the scalar if it is smaller than an i32.
8019     V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S);
8020     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
8021       // Using zext to expand a narrow element won't work for non-zero
8022       // insertions.
8023       if (!IsV1Zeroable)
8024         return SDValue();
8025
8026       // Zero-extend directly to i32.
8027       ExtVT = MVT::v4i32;
8028       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
8029     }
8030     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
8031   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
8032              EltVT == MVT::i16) {
8033     // Either not inserting from the low element of the input or the input
8034     // element size is too small to use VZEXT_MOVL to clear the high bits.
8035     return SDValue();
8036   }
8037
8038   if (!IsV1Zeroable) {
8039     // If V1 can't be treated as a zero vector we have fewer options to lower
8040     // this. We can't support integer vectors or non-zero targets cheaply, and
8041     // the V1 elements can't be permuted in any way.
8042     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
8043     if (!VT.isFloatingPoint() || V2Index != 0)
8044       return SDValue();
8045     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
8046     V1Mask[V2Index] = -1;
8047     if (!isNoopShuffleMask(V1Mask))
8048       return SDValue();
8049     // This is essentially a special case blend operation, but if we have
8050     // general purpose blend operations, they are always faster. Bail and let
8051     // the rest of the lowering handle these as blends.
8052     if (Subtarget->hasSSE41())
8053       return SDValue();
8054
8055     // Otherwise, use MOVSD or MOVSS.
8056     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
8057            "Only two types of floating point element types to handle!");
8058     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
8059                        ExtVT, V1, V2);
8060   }
8061
8062   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
8063   if (ExtVT != VT)
8064     V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8065
8066   if (V2Index != 0) {
8067     // If we have 4 or fewer lanes we can cheaply shuffle the element into
8068     // the desired position. Otherwise it is more efficient to do a vector
8069     // shift left. We know that we can do a vector shift left because all
8070     // the inputs are zero.
8071     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
8072       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
8073       V2Shuffle[V2Index] = 0;
8074       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
8075     } else {
8076       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2);
8077       V2 = DAG.getNode(
8078           X86ISD::VSHLDQ, DL, MVT::v2i64, V2,
8079           DAG.getConstant(
8080               V2Index * EltVT.getSizeInBits(),
8081               DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64)));
8082       V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2);
8083     }
8084   }
8085   return V2;
8086 }
8087
8088 /// \brief Try to lower broadcast of a single element.
8089 ///
8090 /// For convenience, this code also bundles all of the subtarget feature set
8091 /// filtering. While a little annoying to re-dispatch on type here, there isn't
8092 /// a convenient way to factor it out.
8093 static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V,
8094                                              ArrayRef<int> Mask,
8095                                              const X86Subtarget *Subtarget,
8096                                              SelectionDAG &DAG) {
8097   if (!Subtarget->hasAVX())
8098     return SDValue();
8099   if (VT.isInteger() && !Subtarget->hasAVX2())
8100     return SDValue();
8101
8102   // Check that the mask is a broadcast.
8103   int BroadcastIdx = -1;
8104   for (int M : Mask)
8105     if (M >= 0 && BroadcastIdx == -1)
8106       BroadcastIdx = M;
8107     else if (M >= 0 && M != BroadcastIdx)
8108       return SDValue();
8109
8110   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
8111                                             "a sorted mask where the broadcast "
8112                                             "comes from V1.");
8113
8114   // Go up the chain of (vector) values to try and find a scalar load that
8115   // we can combine with the broadcast.
8116   for (;;) {
8117     switch (V.getOpcode()) {
8118     case ISD::CONCAT_VECTORS: {
8119       int OperandSize = Mask.size() / V.getNumOperands();
8120       V = V.getOperand(BroadcastIdx / OperandSize);
8121       BroadcastIdx %= OperandSize;
8122       continue;
8123     }
8124
8125     case ISD::INSERT_SUBVECTOR: {
8126       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
8127       auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
8128       if (!ConstantIdx)
8129         break;
8130
8131       int BeginIdx = (int)ConstantIdx->getZExtValue();
8132       int EndIdx =
8133           BeginIdx + (int)VInner.getValueType().getVectorNumElements();
8134       if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
8135         BroadcastIdx -= BeginIdx;
8136         V = VInner;
8137       } else {
8138         V = VOuter;
8139       }
8140       continue;
8141     }
8142     }
8143     break;
8144   }
8145
8146   // Check if this is a broadcast of a scalar. We special case lowering
8147   // for scalars so that we can more effectively fold with loads.
8148   if (V.getOpcode() == ISD::BUILD_VECTOR ||
8149       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
8150     V = V.getOperand(BroadcastIdx);
8151
8152     // If the scalar isn't a load we can't broadcast from it in AVX1, only with
8153     // AVX2.
8154     if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
8155       return SDValue();
8156   } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
8157     // We can't broadcast from a vector register w/o AVX2, and we can only
8158     // broadcast from the zero-element of a vector register.
8159     return SDValue();
8160   }
8161
8162   return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
8163 }
8164
8165 // Check for whether we can use INSERTPS to perform the shuffle. We only use
8166 // INSERTPS when the V1 elements are already in the correct locations
8167 // because otherwise we can just always use two SHUFPS instructions which
8168 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8169 // perform INSERTPS if a single V1 element is out of place and all V2
8170 // elements are zeroable.
8171 static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
8172                                             ArrayRef<int> Mask,
8173                                             SelectionDAG &DAG) {
8174   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8175   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8176   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8177   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8178
8179   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8180
8181   unsigned ZMask = 0;
8182   int V1DstIndex = -1;
8183   int V2DstIndex = -1;
8184   bool V1UsedInPlace = false;
8185
8186   for (int i = 0; i < 4; i++) {
8187     // Synthesize a zero mask from the zeroable elements (includes undefs).
8188     if (Zeroable[i]) {
8189       ZMask |= 1 << i;
8190       continue;
8191     }
8192
8193     // Flag if we use any V1 inputs in place.
8194     if (i == Mask[i]) {
8195       V1UsedInPlace = true;
8196       continue;
8197     }
8198
8199     // We can only insert a single non-zeroable element.
8200     if (V1DstIndex != -1 || V2DstIndex != -1)
8201       return SDValue();
8202
8203     if (Mask[i] < 4) {
8204       // V1 input out of place for insertion.
8205       V1DstIndex = i;
8206     } else {
8207       // V2 input for insertion.
8208       V2DstIndex = i;
8209     }
8210   }
8211
8212   // Don't bother if we have no (non-zeroable) element for insertion.
8213   if (V1DstIndex == -1 && V2DstIndex == -1)
8214     return SDValue();
8215
8216   // Determine element insertion src/dst indices. The src index is from the
8217   // start of the inserted vector, not the start of the concatenated vector.
8218   unsigned V2SrcIndex = 0;
8219   if (V1DstIndex != -1) {
8220     // If we have a V1 input out of place, we use V1 as the V2 element insertion
8221     // and don't use the original V2 at all.
8222     V2SrcIndex = Mask[V1DstIndex];
8223     V2DstIndex = V1DstIndex;
8224     V2 = V1;
8225   } else {
8226     V2SrcIndex = Mask[V2DstIndex] - 4;
8227   }
8228
8229   // If no V1 inputs are used in place, then the result is created only from
8230   // the zero mask and the V2 insertion - so remove V1 dependency.
8231   if (!V1UsedInPlace)
8232     V1 = DAG.getUNDEF(MVT::v4f32);
8233
8234   unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
8235   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8236
8237   // Insert the V2 element into the desired position.
8238   SDLoc DL(Op);
8239   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8240                      DAG.getConstant(InsertPSMask, MVT::i8));
8241 }
8242
8243 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8244 ///
8245 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
8246 /// support for floating point shuffles but not integer shuffles. These
8247 /// instructions will incur a domain crossing penalty on some chips though so
8248 /// it is better to avoid lowering through this for integer vectors where
8249 /// possible.
8250 static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8251                                        const X86Subtarget *Subtarget,
8252                                        SelectionDAG &DAG) {
8253   SDLoc DL(Op);
8254   assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!");
8255   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8256   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8257   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8258   ArrayRef<int> Mask = SVOp->getMask();
8259   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8260
8261   if (isSingleInputShuffleMask(Mask)) {
8262     // Straight shuffle of a single input vector. Simulate this by using the
8263     // single input as both of the "inputs" to this instruction..
8264     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
8265
8266     if (Subtarget->hasAVX()) {
8267       // If we have AVX, we can use VPERMILPS which will allow folding a load
8268       // into the shuffle.
8269       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
8270                          DAG.getConstant(SHUFPDMask, MVT::i8));
8271     }
8272
8273     return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1,
8274                        DAG.getConstant(SHUFPDMask, MVT::i8));
8275   }
8276   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
8277   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
8278
8279   // Use dedicated unpack instructions for masks that match their pattern.
8280   if (isShuffleEquivalent(Mask, 0, 2))
8281     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2);
8282   if (isShuffleEquivalent(Mask, 1, 3))
8283     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
8284
8285   // If we have a single input, insert that into V1 if we can do so cheaply.
8286   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8287     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8288             MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG))
8289       return Insertion;
8290     // Try inverting the insertion since for v2 masks it is easy to do and we
8291     // can't reliably sort the mask one way or the other.
8292     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8293                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8294     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8295             MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG))
8296       return Insertion;
8297   }
8298
8299   // Try to use one of the special instruction patterns to handle two common
8300   // blend patterns if a zero-blend above didn't work.
8301   if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3))
8302     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
8303       // We can either use a special instruction to load over the low double or
8304       // to move just the low double.
8305       return DAG.getNode(
8306           isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
8307           DL, MVT::v2f64, V2,
8308           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
8309
8310   if (Subtarget->hasSSE41())
8311     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
8312                                                   Subtarget, DAG))
8313       return Blend;
8314
8315   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
8316   return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
8317                      DAG.getConstant(SHUFPDMask, MVT::i8));
8318 }
8319
8320 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
8321 ///
8322 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
8323 /// the integer unit to minimize domain crossing penalties. However, for blends
8324 /// it falls back to the floating point shuffle operation with appropriate bit
8325 /// casting.
8326 static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8327                                        const X86Subtarget *Subtarget,
8328                                        SelectionDAG &DAG) {
8329   SDLoc DL(Op);
8330   assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!");
8331   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8332   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8333   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8334   ArrayRef<int> Mask = SVOp->getMask();
8335   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8336
8337   if (isSingleInputShuffleMask(Mask)) {
8338     // Check for being able to broadcast a single element.
8339     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1,
8340                                                           Mask, Subtarget, DAG))
8341       return Broadcast;
8342
8343     // Straight shuffle of a single input vector. For everything from SSE2
8344     // onward this has a single fast instruction with no scary immediates.
8345     // We have to map the mask as it is actually a v4i32 shuffle instruction.
8346     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1);
8347     int WidenedMask[4] = {
8348         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
8349         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
8350     return DAG.getNode(
8351         ISD::BITCAST, DL, MVT::v2i64,
8352         DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1,
8353                     getV4X86ShuffleImm8ForMask(WidenedMask, DAG)));
8354   }
8355
8356   // Try to use byte shift instructions.
8357   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8358           DL, MVT::v2i64, V1, V2, Mask, DAG))
8359     return Shift;
8360
8361   // If we have a single input from V2 insert that into V1 if we can do so
8362   // cheaply.
8363   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8364     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8365             MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG))
8366       return Insertion;
8367     // Try inverting the insertion since for v2 masks it is easy to do and we
8368     // can't reliably sort the mask one way or the other.
8369     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8370                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8371     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8372             MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG))
8373       return Insertion;
8374   }
8375
8376   // Use dedicated unpack instructions for masks that match their pattern.
8377   if (isShuffleEquivalent(Mask, 0, 2))
8378     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2);
8379   if (isShuffleEquivalent(Mask, 1, 3))
8380     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2);
8381
8382   if (Subtarget->hasSSE41())
8383     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
8384                                                   Subtarget, DAG))
8385       return Blend;
8386
8387   // Try to use byte rotation instructions.
8388   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
8389   if (Subtarget->hasSSSE3())
8390     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8391             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
8392       return Rotate;
8393
8394   // We implement this with SHUFPD which is pretty lame because it will likely
8395   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
8396   // However, all the alternatives are still more cycles and newer chips don't
8397   // have this problem. It would be really nice if x86 had better shuffles here.
8398   V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1);
8399   V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2);
8400   return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
8401                      DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
8402 }
8403
8404 /// \brief Lower a vector shuffle using the SHUFPS instruction.
8405 ///
8406 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
8407 /// It makes no assumptions about whether this is the *best* lowering, it simply
8408 /// uses it.
8409 static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT,
8410                                             ArrayRef<int> Mask, SDValue V1,
8411                                             SDValue V2, SelectionDAG &DAG) {
8412   SDValue LowV = V1, HighV = V2;
8413   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
8414
8415   int NumV2Elements =
8416       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8417
8418   if (NumV2Elements == 1) {
8419     int V2Index =
8420         std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
8421         Mask.begin();
8422
8423     // Compute the index adjacent to V2Index and in the same half by toggling
8424     // the low bit.
8425     int V2AdjIndex = V2Index ^ 1;
8426
8427     if (Mask[V2AdjIndex] == -1) {
8428       // Handles all the cases where we have a single V2 element and an undef.
8429       // This will only ever happen in the high lanes because we commute the
8430       // vector otherwise.
8431       if (V2Index < 2)
8432         std::swap(LowV, HighV);
8433       NewMask[V2Index] -= 4;
8434     } else {
8435       // Handle the case where the V2 element ends up adjacent to a V1 element.
8436       // To make this work, blend them together as the first step.
8437       int V1Index = V2AdjIndex;
8438       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
8439       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
8440                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8441
8442       // Now proceed to reconstruct the final blend as we have the necessary
8443       // high or low half formed.
8444       if (V2Index < 2) {
8445         LowV = V2;
8446         HighV = V1;
8447       } else {
8448         HighV = V2;
8449       }
8450       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
8451       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
8452     }
8453   } else if (NumV2Elements == 2) {
8454     if (Mask[0] < 4 && Mask[1] < 4) {
8455       // Handle the easy case where we have V1 in the low lanes and V2 in the
8456       // high lanes.
8457       NewMask[2] -= 4;
8458       NewMask[3] -= 4;
8459     } else if (Mask[2] < 4 && Mask[3] < 4) {
8460       // We also handle the reversed case because this utility may get called
8461       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
8462       // arrange things in the right direction.
8463       NewMask[0] -= 4;
8464       NewMask[1] -= 4;
8465       HighV = V1;
8466       LowV = V2;
8467     } else {
8468       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
8469       // trying to place elements directly, just blend them and set up the final
8470       // shuffle to place them.
8471
8472       // The first two blend mask elements are for V1, the second two are for
8473       // V2.
8474       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
8475                           Mask[2] < 4 ? Mask[2] : Mask[3],
8476                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
8477                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
8478       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
8479                        getV4X86ShuffleImm8ForMask(BlendMask, DAG));
8480
8481       // Now we do a normal shuffle of V1 by giving V1 as both operands to
8482       // a blend.
8483       LowV = HighV = V1;
8484       NewMask[0] = Mask[0] < 4 ? 0 : 2;
8485       NewMask[1] = Mask[0] < 4 ? 2 : 0;
8486       NewMask[2] = Mask[2] < 4 ? 1 : 3;
8487       NewMask[3] = Mask[2] < 4 ? 3 : 1;
8488     }
8489   }
8490   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
8491                      getV4X86ShuffleImm8ForMask(NewMask, DAG));
8492 }
8493
8494 /// \brief Lower 4-lane 32-bit floating point shuffles.
8495 ///
8496 /// Uses instructions exclusively from the floating point unit to minimize
8497 /// domain crossing penalties, as these are sufficient to implement all v4f32
8498 /// shuffles.
8499 static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8500                                        const X86Subtarget *Subtarget,
8501                                        SelectionDAG &DAG) {
8502   SDLoc DL(Op);
8503   assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
8504   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8505   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8506   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8507   ArrayRef<int> Mask = SVOp->getMask();
8508   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8509
8510   int NumV2Elements =
8511       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8512
8513   if (NumV2Elements == 0) {
8514     // Check for being able to broadcast a single element.
8515     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1,
8516                                                           Mask, Subtarget, DAG))
8517       return Broadcast;
8518
8519     if (Subtarget->hasAVX()) {
8520       // If we have AVX, we can use VPERMILPS which will allow folding a load
8521       // into the shuffle.
8522       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
8523                          getV4X86ShuffleImm8ForMask(Mask, DAG));
8524     }
8525
8526     // Otherwise, use a straight shuffle of a single input vector. We pass the
8527     // input vector to both operands to simulate this with a SHUFPS.
8528     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
8529                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8530   }
8531
8532   // Use dedicated unpack instructions for masks that match their pattern.
8533   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8534     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2);
8535   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8536     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
8537
8538   // There are special ways we can lower some single-element blends. However, we
8539   // have custom ways we can lower more complex single-element blends below that
8540   // we defer to if both this and BLENDPS fail to match, so restrict this to
8541   // when the V2 input is targeting element 0 of the mask -- that is the fast
8542   // case here.
8543   if (NumV2Elements == 1 && Mask[0] >= 4)
8544     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2,
8545                                                          Mask, Subtarget, DAG))
8546       return V;
8547
8548   if (Subtarget->hasSSE41()) {
8549     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
8550                                                   Subtarget, DAG))
8551       return Blend;
8552
8553     // Use INSERTPS if we can complete the shuffle efficiently.
8554     if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
8555       return V;
8556   }
8557
8558   // Otherwise fall back to a SHUFPS lowering strategy.
8559   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
8560 }
8561
8562 /// \brief Lower 4-lane i32 vector shuffles.
8563 ///
8564 /// We try to handle these with integer-domain shuffles where we can, but for
8565 /// blends we use the floating point domain blend instructions.
8566 static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
8567                                        const X86Subtarget *Subtarget,
8568                                        SelectionDAG &DAG) {
8569   SDLoc DL(Op);
8570   assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!");
8571   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8572   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
8573   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
8574   ArrayRef<int> Mask = SVOp->getMask();
8575   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8576
8577   // Whenever we can lower this as a zext, that instruction is strictly faster
8578   // than any alternative. It also allows us to fold memory operands into the
8579   // shuffle in many cases.
8580   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
8581                                                          Mask, Subtarget, DAG))
8582     return ZExt;
8583
8584   int NumV2Elements =
8585       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
8586
8587   if (NumV2Elements == 0) {
8588     // Check for being able to broadcast a single element.
8589     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1,
8590                                                           Mask, Subtarget, DAG))
8591       return Broadcast;
8592
8593     // Straight shuffle of a single input vector. For everything from SSE2
8594     // onward this has a single fast instruction with no scary immediates.
8595     // We coerce the shuffle pattern to be compatible with UNPCK instructions
8596     // but we aren't actually going to use the UNPCK instruction because doing
8597     // so prevents folding a load into this instruction or making a copy.
8598     const int UnpackLoMask[] = {0, 0, 1, 1};
8599     const int UnpackHiMask[] = {2, 2, 3, 3};
8600     if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
8601       Mask = UnpackLoMask;
8602     else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
8603       Mask = UnpackHiMask;
8604
8605     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
8606                        getV4X86ShuffleImm8ForMask(Mask, DAG));
8607   }
8608
8609   // Try to use byte shift instructions.
8610   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8611           DL, MVT::v4i32, V1, V2, Mask, DAG))
8612     return Shift;
8613
8614   // There are special ways we can lower some single-element blends.
8615   if (NumV2Elements == 1)
8616     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2,
8617                                                          Mask, Subtarget, DAG))
8618       return V;
8619
8620   // Use dedicated unpack instructions for masks that match their pattern.
8621   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
8622     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
8623   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
8624     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
8625
8626   if (Subtarget->hasSSE41())
8627     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
8628                                                   Subtarget, DAG))
8629       return Blend;
8630
8631   // Try to use byte rotation instructions.
8632   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
8633   if (Subtarget->hasSSSE3())
8634     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8635             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
8636       return Rotate;
8637
8638   // We implement this with SHUFPS because it can blend from two vectors.
8639   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
8640   // up the inputs, bypassing domain shift penalties that we would encur if we
8641   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
8642   // relevant.
8643   return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32,
8644                      DAG.getVectorShuffle(
8645                          MVT::v4f32, DL,
8646                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1),
8647                          DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask));
8648 }
8649
8650 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
8651 /// shuffle lowering, and the most complex part.
8652 ///
8653 /// The lowering strategy is to try to form pairs of input lanes which are
8654 /// targeted at the same half of the final vector, and then use a dword shuffle
8655 /// to place them onto the right half, and finally unpack the paired lanes into
8656 /// their final position.
8657 ///
8658 /// The exact breakdown of how to form these dword pairs and align them on the
8659 /// correct sides is really tricky. See the comments within the function for
8660 /// more of the details.
     ///
     /// \param DL        Location attached to every node created here.
     /// \param V         The single input vector; asserted to be v8i16.
     /// \param Mask      Word shuffle mask over \p V. Elements 0-3 and 4-7 are
     ///                  treated as the low and high halves. Negative entries
     ///                  are ignored when collecting inputs. The mask is
     ///                  rewritten in place as the lowering proceeds.
     /// \param Subtarget Forwarded to the broadcast and byte-rotate helpers.
     /// \param DAG       The DAG in which new nodes are created.
     /// \returns One of: the result of an early specialized lowering (broadcast,
     ///          byte shift, UNPCKL/UNPCKH, byte rotate), a fresh v8i16 vector
     ///          shuffle node built by balanceSides, or \p V after the computed
     ///          PSHUFLW / PSHUFHW / PSHUFD sequence has been applied.
8661 static SDValue lowerV8I16SingleInputVectorShuffle(
8662     SDLoc DL, SDValue V, MutableArrayRef<int> Mask,
8663     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
8664   assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
       // LoMask / HiMask are slices over elements 0-3 and 4-7 of Mask.
8665   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
8666   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
8667
       // For each half of the mask, gather the sorted, de-duplicated list of
       // input word indices it references (negative mask entries are dropped).
8668   SmallVector<int, 4> LoInputs;
8669   std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
8670                [](int M) { return M >= 0; });
8671   std::sort(LoInputs.begin(), LoInputs.end());
8672   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
8673   SmallVector<int, 4> HiInputs;
8674   std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
8675                [](int M) { return M >= 0; });
8676   std::sort(HiInputs.begin(), HiInputs.end());
8677   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
       // Because the lists are sorted, lower_bound on 4 splits each one into the
       // inputs that come from the low half (< 4) and from the high half (>= 4).
       // "XToY" means "inputs living in half X that are used by half Y".
8678   int NumLToL =
8679       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
8680   int NumHToL = LoInputs.size() - NumLToL;
8681   int NumLToH =
8682       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
8683   int NumHToH = HiInputs.size() - NumLToH;
       // These are views onto the storage of LoInputs / HiInputs, so writes
       // through them (see moveInputsToRightHalf) update those vectors.
8684   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
8685   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
8686   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
8687   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
8688
8689   // Check for being able to broadcast a single element.
8690   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V,
8691                                                         Mask, Subtarget, DAG))
8692     return Broadcast;
8693
8694   // Try to use byte shift instructions.
8695   if (SDValue Shift = lowerVectorShuffleAsByteShift(
8696           DL, MVT::v8i16, V, V, Mask, DAG))
8697     return Shift;
8698
8699   // Use dedicated unpack instructions for masks that match their pattern.
8700   if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3))
8701     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V);
8702   if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7))
8703     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V);
8704
8705   // Try to use byte rotation instructions.
8706   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
8707           DL, MVT::v8i16, V, V, Mask, Subtarget, DAG))
8708     return Rotate;
8709
8710   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
8711   // such inputs we can swap two of the dwords across the half mark and end up
8712   // with <=2 inputs to each half in each half. Once there, we can fall through
8713   // to the generic code below. For example:
8714   //
8715   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8716   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
8717   //
8718   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
8719   // and an existing 2-into-2 on the other half. In this case we may have to
8720   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
8721   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
8722   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
8723   // because any other situation (including a 3-into-1 or 1-into-3 in the other
8724   // half than the one we target for fixing) will be fixed when we re-enter this
8725   // path. We will also combine away any sequence of PSHUFD instructions that
8726   // result into a single instruction. Here is an example of the tricky case:
8727   //
8728   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
8729   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
8730   //
8731   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
8732   //
8733   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
8734   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
8735   //
8736   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
8737   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
8738   //
8739   // The result is fine to be handled by the generic logic.
       //
       // balanceSides parameters: "A" is the half being fixed and "B" the other
       // half; AOffset / BOffset are the word index at which each half starts
       // (0 or 4, see the call sites below). It updates V and Mask in place and
       // returns a new vector shuffle node over the updated V.
8740   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
8741                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
8742                           int AOffset, int BOffset) {
8743     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
8744            "Must call this with A having 3 or 1 inputs from the A half.");
8745     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
8746            "Must call this with B having 1 or 3 inputs from the B half.");
8747     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
8748            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
8749
8750     // Compute the index of dword with only one word among the three inputs in
8751     // a half by taking the sum of the half with three inputs and subtracting
8752     // the sum of the actual three inputs. The difference is the remaining
8753     // slot.
8754     int ADWord, BDWord;
         // TripleDWord / OneInputDWord alias ADWord / BDWord depending on which
         // side supplies three inputs, so assigning through them fills in both.
8755     int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord;
8756     int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord;
8757     int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset;
8758     ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs;
8759     int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0];
8760     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
8761     int TripleNonInputIdx =
8762         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
8763     TripleDWord = TripleNonInputIdx / 2;
8764
8765     // We use xor with one to compute the adjacent DWord to whichever one the
8766     // OneInput is in.
8767     OneInputDWord = (OneInput / 2) ^ 1;
8768
8769     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
8770     // and BToA inputs. If there is also such a problem with the BToB and AToB
8771     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
8772     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
8773     // is essential that we don't *create* a 3<-1 as then we might oscillate.
8774     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
8775       // Compute how many inputs will be flipped by swapping these DWords. We
8776       // need
8777       // to balance this to ensure we don't form a 3-1 shuffle in the other
8778       // half.
8779       int NumFlippedAToBInputs =
8780           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
8781           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
8782       int NumFlippedBToBInputs =
8783           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
8784           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
8785       if ((NumFlippedAToBInputs == 1 &&
8786            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
8787           (NumFlippedBToBInputs == 1 &&
8788            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
8789         // We choose whether to fix the A half or B half based on whether that
8790         // half has zero flipped inputs. At zero, we may not be able to fix it
8791         // with that half. We also bias towards fixing the B half because that
8792         // will more commonly be the high half, and we have to bias one way.
8793         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
8794                                                        ArrayRef<int> Inputs) {
8795           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
8796           bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
8797                                          PinnedIdx ^ 1) != Inputs.end();
8798           // Determine whether the free index is in the flipped dword or the
8799           // unflipped dword based on where the pinned index is. We use this bit
8800           // in an xor to conditionally select the adjacent dword.
8801           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
8802           bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8803                                              FixFreeIdx) != Inputs.end();
               // If both candidate words have the same "is an input" status,
               // swapping them would change nothing; use the odd word of that
               // dword instead.
8804           if (IsFixIdxInput == IsFixFreeIdxInput)
8805             FixFreeIdx += 1;
8806           IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
8807                                         FixFreeIdx) != Inputs.end();
8808           assert(IsFixIdxInput != IsFixFreeIdxInput &&
8809                  "We need to be changing the number of flipped inputs!");
               // Swap the two words within their half; FixIdx < 4 selects the
               // low-half (PSHUFLW) versus high-half (PSHUFHW) word shuffle.
8810           int PSHUFHalfMask[] = {0, 1, 2, 3};
8811           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
8812           V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
8813                           MVT::v8i16, V,
8814                           getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG));
8815
               // Mirror the word swap in the shuffle mask so that it keeps
               // referring to the same data.
8816           for (int &M : Mask)
8817             if (M != -1 && M == FixIdx)
8818               M = FixFreeIdx;
8819             else if (M != -1 && M == FixFreeIdx)
8820               M = FixIdx;
8821         };
8822         if (NumFlippedBToBInputs != 0) {
8823           int BPinnedIdx =
8824               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
8825           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
8826         } else {
8827           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
8828           int APinnedIdx =
8829               AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
8830           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
8831         }
8832       }
8833     }
8834
         // Build a PSHUFD mask that exchanges dwords ADWord and BDWord and leaves
         // the remaining two dwords where they are.
8835     int PSHUFDMask[] = {0, 1, 2, 3};
8836     PSHUFDMask[ADWord] = BDWord;
8837     PSHUFDMask[BDWord] = ADWord;
8838     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
8839                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8840                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
8841                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
8842
8843     // Adjust the mask to match the new locations of A and B.
8844     for (int &M : Mask)
8845       if (M != -1 && M/2 == ADWord)
8846         M = 2 * BDWord + M % 2;
8847       else if (M != -1 && M/2 == BDWord)
8848         M = 2 * ADWord + M % 2;
8849
8850     // Recurse back into this routine to re-compute state now that this isn't
8851     // a 3 and 1 problem.
8852     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
8853                                 Mask);
8854   };
       // The low half is checked first; in each call the "A" half is the one
       // whose inputs are split 3:1 or 1:3 between the two source halves.
8855   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
8856     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
8857   else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
8858     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
8859
8860   // At this point there are at most two inputs to the low and high halves from
8861   // each half. That means the inputs can always be grouped into dwords and
8862   // those dwords can then be moved to the correct half with a dword shuffle.
8863   // We use at most one low and one high word shuffle to collect these paired
8864   // inputs into dwords, and finally a dword shuffle to place them.
       // In all three masks -1 marks a slot that has not been assigned yet.
8865   int PSHUFLMask[4] = {-1, -1, -1, -1};
8866   int PSHUFHMask[4] = {-1, -1, -1, -1};
8867   int PSHUFDMask[4] = {-1, -1, -1, -1};
8868
8869   // First fix the masks for all the inputs that are staying in their
8870   // original halves. This will then dictate the targets of the cross-half
8871   // shuffles.
       //
       // SourceHalfMask is the word shuffle mask (PSHUFLMask or PSHUFHMask) for
       // the half in question, HalfMask is that half's slice of the shuffle mask,
       // and HalfOffset is the word index at which the half starts (0 or 4).
8872   auto fixInPlaceInputs =
8873       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
8874                     MutableArrayRef<int> SourceHalfMask,
8875                     MutableArrayRef<int> HalfMask, int HalfOffset) {
8876     if (InPlaceInputs.empty())
8877       return;
         // A single in-place input is simply pinned where it already is.
8878     if (InPlaceInputs.size() == 1) {
8879       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
8880           InPlaceInputs[0] - HalfOffset;
8881       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
8882       return;
8883     }
8884     if (IncomingInputs.empty()) {
8885       // Just fix all of the in place inputs.
8886       for (int Input : InPlaceInputs) {
8887         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
8888         PSHUFDMask[Input / 2] = Input / 2;
8889       }
8890       return;
8891     }
8892
8893     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
8894     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
8895         InPlaceInputs[0] - HalfOffset;
8896     // Put the second input next to the first so that they are packed into
8897     // a dword. We find the adjacent index by toggling the low bit.
8898     int AdjIndex = InPlaceInputs[0] ^ 1;
8899     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
8900     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
8901     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
8902   };
8903   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
8904   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
8905
8906   // Now gather the cross-half inputs and place them into a free dword of
8907   // their target half.
8908   // FIXME: This operation could almost certainly be simplified dramatically to
8909   // look more like the 3-1 fixing operation.
       //
       // IncomingInputs are the words that must cross from the source half into
       // the destination half and ExistingInputs are the destination half's own
       // inputs. SourceHalfMask is the word shuffle mask of the *source* half,
       // HalfMask is the destination half's slice of the shuffle mask, and
       // FinalSourceHalfMask is the source half's slice of the shuffle mask.
8910   auto moveInputsToRightHalf = [&PSHUFDMask](
8911       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
8912       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
8913       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
8914       int DestOffset) {
         // A word is "clobbered" when the source half mask already places some
         // other word into its slot.
8915     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
8916       return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word;
8917     };
         // A dword is clobbered when either of its two words is.
8918     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
8919                                                int Word) {
8920       int LowWord = Word & ~1;
8921       int HighWord = Word | 1;
8922       return isWordClobbered(SourceHalfMask, LowWord) ||
8923              isWordClobbered(SourceHalfMask, HighWord);
8924     };
8925
8926     if (IncomingInputs.empty())
8927       return;
8928
8929     if (ExistingInputs.empty()) {
8930       // Map any dwords with inputs from them into the right half.
8931       for (int Input : IncomingInputs) {
8932         // If the source half mask maps over the inputs, turn those into
8933         // swaps and use the swapped lane.
8934         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
8935           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) {
8936             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
8937                 Input - SourceOffset;
8938             // We have to swap the uses in our half mask in one sweep.
8939             for (int &M : HalfMask)
8940               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
8941                 M = Input;
8942               else if (M == Input)
8943                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
8944           } else {
8945             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
8946                        Input - SourceOffset &&
8947                    "Previous placement doesn't match!");
8948           }
8949           // Note that this correctly re-maps both when we do a swap and when
8950           // we observe the other side of the swap above. We rely on that to
8951           // avoid swapping the members of the input list directly.
8952           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
8953         }
8954
8955         // Map the input's dword into the correct half.
8956         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1)
8957           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
8958         else
8959           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
8960                      Input / 2 &&
8961                  "Previous placement doesn't match!");
8962       }
8963
8964       // And just directly shift any other-half mask elements to be same-half
8965       // as we will have mirrored the dword containing the element into the
8966       // same position within that half.
8967       for (int &M : HalfMask)
8968         if (M >= SourceOffset && M < SourceOffset + 4) {
8969           M = M - SourceOffset + DestOffset;
8970           assert(M >= 0 && "This should never wrap below zero!");
8971         }
8972       return;
8973     }
8974
8975     // Ensure we have the input in a viable dword of its current half. This
8976     // is particularly tricky because the original position may be clobbered
8977     // by inputs being moved and *staying* in that half.
8978     if (IncomingInputs.size() == 1) {
8979       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
             // Relocate the input to the first slot of the source half mask that
             // is still unassigned (-1).
8980         int InputFixed = std::find(std::begin(SourceHalfMask),
8981                                    std::end(SourceHalfMask), -1) -
8982                          std::begin(SourceHalfMask) + SourceOffset;
8983         SourceHalfMask[InputFixed - SourceOffset] =
8984             IncomingInputs[0] - SourceOffset;
8985         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
8986                      InputFixed);
8987         IncomingInputs[0] = InputFixed;
8988       }
8989     } else if (IncomingInputs.size() == 2) {
8990       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
8991           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
8992         // We have two non-adjacent or clobbered inputs we need to extract from
8993         // the source half. To do this, we need to map them into some adjacent
8994         // dword slot in the source mask.
8995         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
8996                               IncomingInputs[1] - SourceOffset};
8997
8998         // If there is a free slot in the source half mask adjacent to one of
8999         // the inputs, place the other input in it. We use (Index XOR 1) to
9000         // compute an adjacent index.
9001         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
9002             SourceHalfMask[InputsFixed[0] ^ 1] == -1) {
9003           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
9004           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9005           InputsFixed[1] = InputsFixed[0] ^ 1;
9006         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
9007                    SourceHalfMask[InputsFixed[1] ^ 1] == -1) {
9008           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
9009           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
9010           InputsFixed[0] = InputsFixed[1] ^ 1;
9011         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 &&
9012                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) {
9013           // The two inputs are in the same DWord but it is clobbered and the
9014           // adjacent DWord isn't used at all. Move both inputs to the free
9015           // slot.
9016           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
9017           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
9018           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
               // NOTE(review): InputsFixed[0] was overwritten on the previous
               // line, so the expression below flips back and yields the odd
               // word of the *original* dword, not of the free dword the inputs
               // were just moved to. It looks like InputsFixed[0] + 1 was
               // intended -- confirm. This appears to be benign: the hoisting
               // step at the end of this lambda only reads IncomingInputs[0] / 2
               // and Input % 2, and the parity is the same either way.
9019           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
9020         } else {
9021           // The only way we hit this point is if there is no clobbering
9022           // (because there are no off-half inputs to this half) and there is no
9023           // free slot adjacent to one of the inputs. In this case, we have to
9024           // swap an input with a non-input.
9025           for (int i = 0; i < 4; ++i)
9026             assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) &&
9027                    "We can't handle any clobbers here!");
9028           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
9029                  "Cannot have adjacent inputs here!");
9030
9031           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9032           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
9033
9034           // We also have to update the final source mask in this case because
9035           // it may need to undo the above swap.
9036           for (int &M : FinalSourceHalfMask)
9037             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
9038               M = InputsFixed[1] + SourceOffset;
9039             else if (M == InputsFixed[1] + SourceOffset)
9040               M = (InputsFixed[0] ^ 1) + SourceOffset;
9041
9042           InputsFixed[1] = InputsFixed[0] ^ 1;
9043         }
9044
9045         // Point everything at the fixed inputs.
9046         for (int &M : HalfMask)
9047           if (M == IncomingInputs[0])
9048             M = InputsFixed[0] + SourceOffset;
9049           else if (M == IncomingInputs[1])
9050             M = InputsFixed[1] + SourceOffset;
9051
9052         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
9053         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
9054       }
9055     } else {
9056       llvm_unreachable("Unhandled input size!");
9057     }
9058
9059     // Now hoist the DWord down to the right half.
         // Use the first dword of the destination half if PSHUFDMask has not
         // assigned it yet, otherwise the second one.
9060     int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2;
9061     assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free");
9062     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
9063     for (int &M : HalfMask)
9064       for (int Input : IncomingInputs)
9065         if (M == Input)
9066           M = FreeDWord * 2 + Input % 2;
9067   };
9068   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
9069                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
9070   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
9071                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
9072
9073   // Now enact all the shuffles we've computed to move the inputs into their
9074   // target half.
9075   if (!isNoopShuffleMask(PSHUFLMask))
9076     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9077                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG));
9078   if (!isNoopShuffleMask(PSHUFHMask))
9079     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9080                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG));
9081   if (!isNoopShuffleMask(PSHUFDMask))
9082     V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9083                     DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9084                                 DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V),
9085                                 getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
9086
9087   // At this point, each half should contain all its inputs, and we can then
9088   // just shuffle them into their final position.
9089   assert(std::count_if(LoMask.begin(), LoMask.end(),
9090                        [](int M) { return M >= 4; }) == 0 &&
9091          "Failed to lift all the high half inputs to the low mask!");
9092   assert(std::count_if(HiMask.begin(), HiMask.end(),
9093                        [](int M) { return M >= 0 && M < 4; }) == 0 &&
9094          "Failed to lift all the low half inputs to the high mask!");
9095
9096   // Do a half shuffle for the low mask.
9097   if (!isNoopShuffleMask(LoMask))
9098     V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V,
9099                     getV4X86ShuffleImm8ForMask(LoMask, DAG));
9100
9101   // Do a half shuffle with the high mask after shifting its values down.
9102   for (int &M : HiMask)
9103     if (M >= 0)
9104       M -= 4;
9105   if (!isNoopShuffleMask(HiMask))
9106     V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V,
9107                     getV4X86ShuffleImm8ForMask(HiMask, DAG));
9108
9109   return V;
9110 }
9111
9112 /// \brief Detect whether the mask pattern should be lowered through
9113 /// interleaving.
9114 ///
9115 /// This essentially tests whether viewing the mask as an interleaving of two
9116 /// sub-sequences reduces the cross-input traffic of a blend operation. If so,
9117 /// lowering it through interleaving is a significantly better strategy.
9118 static bool shouldLowerAsInterleaving(ArrayRef<int> Mask) {
9119   int NumEvenInputs[2] = {0, 0};
9120   int NumOddInputs[2] = {0, 0};
9121   int NumLoInputs[2] = {0, 0};
9122   int NumHiInputs[2] = {0, 0};
9123   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9124     if (Mask[i] < 0)
9125       continue;
9126
9127     int InputIdx = Mask[i] >= Size;
9128
9129     if (i < Size / 2)
9130       ++NumLoInputs[InputIdx];
9131     else
9132       ++NumHiInputs[InputIdx];
9133
9134     if ((i % 2) == 0)
9135       ++NumEvenInputs[InputIdx];
9136     else
9137       ++NumOddInputs[InputIdx];
9138   }
9139
9140   // The minimum number of cross-input results for both the interleaved and
9141   // split cases. If interleaving results in fewer cross-input results, return
9142   // true.
9143   int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0],
9144                                     NumEvenInputs[0] + NumOddInputs[1]);
9145   int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0],
9146                               NumLoInputs[0] + NumHiInputs[1]);
9147   return InterleavedCrosses < SplitCrosses;
9148 }
9149
9150 /// \brief Blend two v8i16 vectors using a naive unpack strategy.
9151 ///
9152 /// This strategy only works when the inputs from each vector fit into a single
9153 /// half of that vector, and generally there are not so many inputs as to leave
9154 /// the in-place shuffles required highly constrained (and thus expensive). It
9155 /// shifts all the inputs into a single side of both input vectors and then
9156 /// uses an unpack to interleave these inputs in a single vector. At that
9157 /// point, we will fall back on the generic single input shuffle lowering.
9158 static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1,
9159                                                  SDValue V2,
9160                                                  MutableArrayRef<int> Mask,
9161                                                  const X86Subtarget *Subtarget,
9162                                                  SelectionDAG &DAG) {
9163   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9164   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!");
9165   SmallVector<int, 3> LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs;
9166   for (int i = 0; i < 8; ++i)
9167     if (Mask[i] >= 0 && Mask[i] < 4)
9168       LoV1Inputs.push_back(i);
9169     else if (Mask[i] >= 4 && Mask[i] < 8)
9170       HiV1Inputs.push_back(i);
9171     else if (Mask[i] >= 8 && Mask[i] < 12)
9172       LoV2Inputs.push_back(i);
9173     else if (Mask[i] >= 12)
9174       HiV2Inputs.push_back(i);
9175
9176   int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size();
9177   int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size();
9178   (void)NumV1Inputs;
9179   (void)NumV2Inputs;
9180   assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported");
9181   assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported");
9182   assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs");
9183
9184   bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >=
9185                      HiV1Inputs.size() + HiV2Inputs.size();
9186
9187   auto moveInputsToHalf = [&](SDValue V, ArrayRef<int> LoInputs,
9188                               ArrayRef<int> HiInputs, bool MoveToLo,
9189                               int MaskOffset) {
9190     ArrayRef<int> GoodInputs = MoveToLo ? LoInputs : HiInputs;
9191     ArrayRef<int> BadInputs = MoveToLo ? HiInputs : LoInputs;
9192     if (BadInputs.empty())
9193       return V;
9194
9195     int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9196     int MoveOffset = MoveToLo ? 0 : 4;
9197
9198     if (GoodInputs.empty()) {
9199       for (int BadInput : BadInputs) {
9200         MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset;
9201         Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset;
9202       }
9203     } else {
9204       if (GoodInputs.size() == 2) {
9205         // If the low inputs are spread across two dwords, pack them into
9206         // a single dword.
9207         MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset;
9208         MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset;
9209         Mask[GoodInputs[0]] = MoveOffset + MaskOffset;
9210         Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset;
9211       } else {
9212         // Otherwise pin the good inputs.
9213         for (int GoodInput : GoodInputs)
9214           MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset;
9215       }
9216
9217       if (BadInputs.size() == 2) {
9218         // If we have two bad inputs then there may be either one or two good
9219         // inputs fixed in place. Find a fixed input, and then find the *other*
9220         // two adjacent indices by using modular arithmetic.
9221         int GoodMaskIdx =
9222             std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask),
9223                          [](int M) { return M >= 0; }) -
9224             std::begin(MoveMask);
9225         int MoveMaskIdx =
9226             ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset;
9227         assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot");
9228         assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot");
9229         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9230         MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset;
9231         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9232         Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset;
9233       } else {
9234         assert(BadInputs.size() == 1 && "All sizes handled");
9235         int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset,
9236                                     std::end(MoveMask), -1) -
9237                           std::begin(MoveMask);
9238         MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset;
9239         Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset;
9240       }
9241     }
9242
9243     return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16),
9244                                 MoveMask);
9245   };
9246   V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo,
9247                         /*MaskOffset*/ 0);
9248   V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo,
9249                         /*MaskOffset*/ 8);
9250
9251   // FIXME: Select an interleaving of the merge of V1 and V2 that minimizes
9252   // cross-half traffic in the final shuffle.
9253
9254   // Munge the mask to be a single-input mask after the unpack merges the
9255   // results.
9256   for (int &M : Mask)
9257     if (M != -1)
9258       M = 2 * (M % 4) + (M / 8);
9259
9260   return DAG.getVectorShuffle(
9261       MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH,
9262                                   DL, MVT::v8i16, V1, V2),
9263       DAG.getUNDEF(MVT::v8i16), Mask);
9264 }
9265
9266 /// \brief Generic lowering of 8-lane i16 shuffles.
9267 ///
9268 /// This handles both single-input shuffles and combined shuffle/blends with
9269 /// two inputs. The single input shuffles are immediately delegated to
9270 /// a dedicated lowering routine.
9271 ///
9272 /// The blends are lowered in one of three fundamental ways. If there are few
9273 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
9274 /// of the input is significantly cheaper when lowered as an interleaving of
9275 /// the two inputs, try to interleave them. Otherwise, blend the low and high
9276 /// halves of the inputs separately (making them have relatively few inputs)
9277 /// and then concatenate them.
9278 static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9279                                        const X86Subtarget *Subtarget,
9280                                        SelectionDAG &DAG) {
9281   SDLoc DL(Op);
9282   assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!");
9283   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9284   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9285   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9286   ArrayRef<int> OrigMask = SVOp->getMask();
9287   int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3],
9288                         OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]};
9289   MutableArrayRef<int> Mask(MaskStorage);
9290
9291   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9292
9293   // Whenever we can lower this as a zext, that instruction is strictly faster
9294   // than any alternative.
9295   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9296           DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG))
9297     return ZExt;
9298
9299   auto isV1 = [](int M) { return M >= 0 && M < 8; };
9300   auto isV2 = [](int M) { return M >= 8; };
9301
9302   int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1);
9303   int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2);
9304
9305   if (NumV2Inputs == 0)
9306     return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG);
9307
9308   assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized "
9309                             "to be V1-input shuffles.");
9310
9311   // Try to use byte shift instructions.
9312   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9313           DL, MVT::v8i16, V1, V2, Mask, DAG))
9314     return Shift;
9315
9316   // There are special ways we can lower some single-element blends.
9317   if (NumV2Inputs == 1)
9318     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2,
9319                                                          Mask, Subtarget, DAG))
9320       return V;
9321
9322   // Use dedicated unpack instructions for masks that match their pattern.
9323   if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
9324     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
9325   if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
9326     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
9327
9328   if (Subtarget->hasSSE41())
9329     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
9330                                                   Subtarget, DAG))
9331       return Blend;
9332
9333   // Try to use byte rotation instructions.
9334   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9335           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9336     return Rotate;
9337
9338   if (NumV1Inputs + NumV2Inputs <= 4)
9339     return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG);
9340
9341   // Check whether an interleaving lowering is likely to be more efficient.
9342   // This isn't perfect but it is a strong heuristic that tends to work well on
9343   // the kinds of shuffles that show up in practice.
9344   //
9345   // FIXME: Handle 1x, 2x, and 4x interleaving.
9346   if (shouldLowerAsInterleaving(Mask)) {
9347     // FIXME: Figure out whether we should pack these into the low or high
9348     // halves.
9349
9350     int EMask[8], OMask[8];
9351     for (int i = 0; i < 4; ++i) {
9352       EMask[i] = Mask[2*i];
9353       OMask[i] = Mask[2*i + 1];
9354       EMask[i + 4] = -1;
9355       OMask[i + 4] = -1;
9356     }
9357
9358     SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask);
9359     SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask);
9360
9361     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds);
9362   }
9363
9364   int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9365   int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9366
9367   for (int i = 0; i < 4; ++i) {
9368     LoBlendMask[i] = Mask[i];
9369     HiBlendMask[i] = Mask[i + 4];
9370   }
9371
9372   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9373   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9374   LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV);
9375   HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV);
9376
9377   return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9378                      DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV));
9379 }
9380
9381 /// \brief Check whether a compaction lowering can be done by dropping even
9382 /// elements and compute how many times even elements must be dropped.
9383 ///
9384 /// This handles shuffles which take every Nth element where N is a power of
9385 /// two. Example shuffle masks:
9386 ///
9387 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
9388 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
9389 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
9390 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
9391 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
9392 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
9393 ///
9394 /// Any of these lanes can of course be undef.
9395 ///
9396 /// This routine only supports N <= 3.
9397 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
9398 /// for larger N.
9399 ///
9400 /// \returns N above, or the number of times even elements must be dropped if
9401 /// there is such a number. Otherwise returns zero.
9402 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) {
9403   // Figure out whether we're looping over two inputs or just one.
9404   bool IsSingleInput = isSingleInputShuffleMask(Mask);
9405
9406   // The modulus for the shuffle vector entries is based on whether this is
9407   // a single input or not.
9408   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
9409   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
9410          "We should only be called with masks with a power-of-2 size!");
9411
9412   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
9413
9414   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
9415   // and 2^3 simultaneously. This is because we may have ambiguity with
9416   // partially undef inputs.
9417   bool ViableForN[3] = {true, true, true};
9418
9419   for (int i = 0, e = Mask.size(); i < e; ++i) {
9420     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
9421     // want.
9422     if (Mask[i] == -1)
9423       continue;
9424
9425     bool IsAnyViable = false;
9426     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9427       if (ViableForN[j]) {
9428         uint64_t N = j + 1;
9429
9430         // The shuffle mask must be equal to (i * 2^N) % M.
9431         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
9432           IsAnyViable = true;
9433         else
9434           ViableForN[j] = false;
9435       }
9436     // Early exit if we exhaust the possible powers of two.
9437     if (!IsAnyViable)
9438       break;
9439   }
9440
9441   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
9442     if (ViableForN[j])
9443       return j + 1;
9444
9445   // Return 0 as there is no viable power of two.
9446   return 0;
9447 }
9448
9449 /// \brief Generic lowering of v16i8 shuffles.
9450 ///
9451 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
9452 /// detect any complexity reducing interleaving. If that doesn't help, it uses
9453 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
9454 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
9455 /// back together.
9456 static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9457                                        const X86Subtarget *Subtarget,
9458                                        SelectionDAG &DAG) {
9459   SDLoc DL(Op);
9460   assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!");
9461   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9462   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
9463   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
9464   ArrayRef<int> OrigMask = SVOp->getMask();
9465   assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!");
9466
9467   // Try to use byte shift instructions.
9468   if (SDValue Shift = lowerVectorShuffleAsByteShift(
9469           DL, MVT::v16i8, V1, V2, OrigMask, DAG))
9470     return Shift;
9471
9472   // Try to use byte rotation instructions.
9473   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9474           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9475     return Rotate;
9476
9477   // Try to use a zext lowering.
9478   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9479           DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG))
9480     return ZExt;
9481
9482   int MaskStorage[16] = {
9483       OrigMask[0],  OrigMask[1],  OrigMask[2],  OrigMask[3],
9484       OrigMask[4],  OrigMask[5],  OrigMask[6],  OrigMask[7],
9485       OrigMask[8],  OrigMask[9],  OrigMask[10], OrigMask[11],
9486       OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]};
9487   MutableArrayRef<int> Mask(MaskStorage);
9488   MutableArrayRef<int> LoMask = Mask.slice(0, 8);
9489   MutableArrayRef<int> HiMask = Mask.slice(8, 8);
9490
9491   int NumV2Elements =
9492       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; });
9493
9494   // For single-input shuffles, there are some nicer lowering tricks we can use.
9495   if (NumV2Elements == 0) {
9496     // Check for being able to broadcast a single element.
9497     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1,
9498                                                           Mask, Subtarget, DAG))
9499       return Broadcast;
9500
9501     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
9502     // Notably, this handles splat and partial-splat shuffles more efficiently.
9503     // However, it only makes sense if the pre-duplication shuffle simplifies
9504     // things significantly. Currently, this means we need to be able to
9505     // express the pre-duplication shuffle as an i16 shuffle.
9506     //
9507     // FIXME: We should check for other patterns which can be widened into an
9508     // i16 shuffle as well.
9509     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
9510       for (int i = 0; i < 16; i += 2)
9511         if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1])
9512           return false;
9513
9514       return true;
9515     };
9516     auto tryToWidenViaDuplication = [&]() -> SDValue {
9517       if (!canWidenViaDuplication(Mask))
9518         return SDValue();
9519       SmallVector<int, 4> LoInputs;
9520       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
9521                    [](int M) { return M >= 0 && M < 8; });
9522       std::sort(LoInputs.begin(), LoInputs.end());
9523       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
9524                      LoInputs.end());
9525       SmallVector<int, 4> HiInputs;
9526       std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
9527                    [](int M) { return M >= 8; });
9528       std::sort(HiInputs.begin(), HiInputs.end());
9529       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
9530                      HiInputs.end());
9531
9532       bool TargetLo = LoInputs.size() >= HiInputs.size();
9533       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
9534       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
9535
9536       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
9537       SmallDenseMap<int, int, 8> LaneMap;
9538       for (int I : InPlaceInputs) {
9539         PreDupI16Shuffle[I/2] = I/2;
9540         LaneMap[I] = I;
9541       }
9542       int j = TargetLo ? 0 : 4, je = j + 4;
9543       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
9544         // Check if j is already a shuffle of this input. This happens when
9545         // there are two adjacent bytes after we move the low one.
9546         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
9547           // If we haven't yet mapped the input, search for a slot into which
9548           // we can map it.
9549           while (j < je && PreDupI16Shuffle[j] != -1)
9550             ++j;
9551
9552           if (j == je)
9553             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
9554             return SDValue();
9555
9556           // Map this input with the i16 shuffle.
9557           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
9558         }
9559
9560         // Update the lane map based on the mapping we ended up with.
9561         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
9562       }
9563       V1 = DAG.getNode(
9564           ISD::BITCAST, DL, MVT::v16i8,
9565           DAG.getVectorShuffle(MVT::v8i16, DL,
9566                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9567                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
9568
9569       // Unpack the bytes to form the i16s that will be shuffled into place.
9570       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9571                        MVT::v16i8, V1, V1);
9572
9573       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9574       for (int i = 0; i < 16; ++i)
9575         if (Mask[i] != -1) {
9576           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
9577           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
9578           if (PostDupI16Shuffle[i / 2] == -1)
9579             PostDupI16Shuffle[i / 2] = MappedMask;
9580           else
9581             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
9582                    "Conflicting entrties in the original shuffle!");
9583         }
9584       return DAG.getNode(
9585           ISD::BITCAST, DL, MVT::v16i8,
9586           DAG.getVectorShuffle(MVT::v8i16, DL,
9587                                DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1),
9588                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
9589     };
9590     if (SDValue V = tryToWidenViaDuplication())
9591       return V;
9592   }
9593
9594   // Check whether an interleaving lowering is likely to be more efficient.
9595   // This isn't perfect but it is a strong heuristic that tends to work well on
9596   // the kinds of shuffles that show up in practice.
9597   //
9598   // FIXME: We need to handle other interleaving widths (i16, i32, ...).
9599   if (shouldLowerAsInterleaving(Mask)) {
9600     int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9601       return (M >= 0 && M < 8) || (M >= 16 && M < 24);
9602     });
9603     int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) {
9604       return (M >= 8 && M < 16) || M >= 24;
9605     });
9606     int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9607                      -1, -1, -1, -1, -1, -1, -1, -1};
9608     int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1,
9609                      -1, -1, -1, -1, -1, -1, -1, -1};
9610     bool UnpackLo = NumLoHalf >= NumHiHalf;
9611     MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8);
9612     MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8);
9613     for (int i = 0; i < 8; ++i) {
9614       TargetEMask[i] = Mask[2 * i];
9615       TargetOMask[i] = Mask[2 * i + 1];
9616     }
9617
9618     SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask);
9619     SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask);
9620
9621     return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9622                        MVT::v16i8, Evens, Odds);
9623   }
9624
9625   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
9626   // with PSHUFB. It is important to do this before we attempt to generate any
9627   // blends but after all of the single-input lowerings. If the single input
9628   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
9629   // want to preserve that and we can DAG combine any longer sequences into
9630   // a PSHUFB in the end. But once we start blending from multiple inputs,
9631   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
9632   // and there are *very* few patterns that would actually be faster than the
9633   // PSHUFB approach because of its ability to zero lanes.
9634   //
9635   // FIXME: The only exceptions to the above are blends which are exact
9636   // interleavings with direct instructions supporting them. We currently don't
9637   // handle those well here.
9638   if (Subtarget->hasSSSE3()) {
9639     SDValue V1Mask[16];
9640     SDValue V2Mask[16];
9641     bool V1InUse = false;
9642     bool V2InUse = false;
9643     SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
9644
9645     for (int i = 0; i < 16; ++i) {
9646       if (Mask[i] == -1) {
9647         V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
9648       } else {
9649         const int ZeroMask = 0x80;
9650         int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask);
9651         int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16);
9652         if (Zeroable[i])
9653           V1Idx = V2Idx = ZeroMask;
9654         V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8);
9655         V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8);
9656         V1InUse |= (ZeroMask != V1Idx);
9657         V2InUse |= (ZeroMask != V2Idx);
9658       }
9659     }
9660
9661     if (V1InUse)
9662       V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1,
9663                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask));
9664     if (V2InUse)
9665       V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2,
9666                        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask));
9667
9668     // If we need shuffled inputs from both, blend the two.
9669     if (V1InUse && V2InUse)
9670       return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
9671     if (V1InUse)
9672       return V1; // Single inputs are easy.
9673     if (V2InUse)
9674       return V2; // Single inputs are easy.
9675     // Shuffling to a zeroable vector.
9676     return getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
9677   }
9678
9679   // There are special ways we can lower some single-element blends.
9680   if (NumV2Elements == 1)
9681     if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2,
9682                                                          Mask, Subtarget, DAG))
9683       return V;
9684
9685   // Check whether a compaction lowering can be done. This handles shuffles
9686   // which take every Nth element for some even N. See the helper function for
9687   // details.
9688   //
9689   // We special case these as they can be particularly efficiently handled with
9690   // the PACKUSB instruction on x86 and they show up in common patterns of
9691   // rearranging bytes to truncate wide elements.
9692   if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) {
9693     // NumEvenDrops is the power of two stride of the elements. Another way of
9694     // thinking about it is that we need to drop the even elements this many
9695     // times to get the original input.
9696     bool IsSingleInput = isSingleInputShuffleMask(Mask);
9697
9698     // First we need to zero all the dropped bytes.
9699     assert(NumEvenDrops <= 3 &&
9700            "No support for dropping even elements more than 3 times.");
9701     // We use the mask type to pick which bytes are preserved based on how many
9702     // elements are dropped.
9703     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
9704     SDValue ByteClearMask =
9705         DAG.getNode(ISD::BITCAST, DL, MVT::v16i8,
9706                     DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1]));
9707     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
9708     if (!IsSingleInput)
9709       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
9710
9711     // Now pack things back together.
9712     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
9713     V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
9714     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
9715     for (int i = 1; i < NumEvenDrops; ++i) {
9716       Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result);
9717       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
9718     }
9719
9720     return Result;
9721   }
9722
9723   int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9724   int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9725   int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9726   int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
9727
9728   auto buildBlendMasks = [](MutableArrayRef<int> HalfMask,
9729                             MutableArrayRef<int> V1HalfBlendMask,
9730                             MutableArrayRef<int> V2HalfBlendMask) {
9731     for (int i = 0; i < 8; ++i)
9732       if (HalfMask[i] >= 0 && HalfMask[i] < 16) {
9733         V1HalfBlendMask[i] = HalfMask[i];
9734         HalfMask[i] = i;
9735       } else if (HalfMask[i] >= 16) {
9736         V2HalfBlendMask[i] = HalfMask[i] - 16;
9737         HalfMask[i] = i + 8;
9738       }
9739   };
9740   buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask);
9741   buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask);
9742
9743   SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
9744
9745   auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef<int> LoBlendMask,
9746                              MutableArrayRef<int> HiBlendMask) {
9747     SDValue V1, V2;
9748     // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
9749     // them out and avoid using UNPCK{L,H} to extract the elements of V as
9750     // i16s.
9751     if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(),
9752                      [](int M) { return M >= 0 && M % 2 == 1; }) &&
9753         std::none_of(HiBlendMask.begin(), HiBlendMask.end(),
9754                      [](int M) { return M >= 0 && M % 2 == 1; })) {
9755       // Use a mask to drop the high bytes.
9756       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
9757       V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1,
9758                        DAG.getConstant(0x00FF, MVT::v8i16));
9759
9760       // This will be a single vector shuffle instead of a blend so nuke V2.
9761       V2 = DAG.getUNDEF(MVT::v8i16);
9762
9763       // Squash the masks to point directly into V1.
9764       for (int &M : LoBlendMask)
9765         if (M >= 0)
9766           M /= 2;
9767       for (int &M : HiBlendMask)
9768         if (M >= 0)
9769           M /= 2;
9770     } else {
9771       // Otherwise just unpack the low half of V into V1 and the high half into
9772       // V2 so that we can blend them as i16s.
9773       V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9774                        DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
9775       V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16,
9776                        DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
9777     }
9778
9779     SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask);
9780     SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask);
9781     return std::make_pair(BlendedLo, BlendedHi);
9782   };
9783   SDValue V1Lo, V1Hi, V2Lo, V2Hi;
9784   std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask);
9785   std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask);
9786
9787   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask);
9788   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask);
9789
9790   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
9791 }
9792
9793 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
9794 ///
9795 /// This routine breaks down the specific type of 128-bit shuffle and
9796 /// dispatches to the lowering routines accordingly.
9797 static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
9798                                         MVT VT, const X86Subtarget *Subtarget,
9799                                         SelectionDAG &DAG) {
9800   switch (VT.SimpleTy) {
9801   case MVT::v2i64:
9802     return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9803   case MVT::v2f64:
9804     return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
9805   case MVT::v4i32:
9806     return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
9807   case MVT::v4f32:
9808     return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
9809   case MVT::v8i16:
9810     return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
9811   case MVT::v16i8:
9812     return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
9813
9814   default:
9815     llvm_unreachable("Unimplemented!");
9816   }
9817 }
9818
9819 /// \brief Helper function to test whether a shuffle mask could be
9820 /// simplified by widening the elements being shuffled.
9821 ///
9822 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
9823 /// leaves it in an unspecified state.
9824 ///
9825 /// NOTE: This must handle normal vector shuffle masks and *target* vector
9826 /// shuffle masks. The latter have the special property of a '-2' representing
9827 /// a zero-ed lane of a vector.
9828 static bool canWidenShuffleElements(ArrayRef<int> Mask,
9829                                     SmallVectorImpl<int> &WidenedMask) {
9830   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
9831     // If both elements are undef, its trivial.
9832     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
9833       WidenedMask.push_back(SM_SentinelUndef);
9834       continue;
9835     }
9836
9837     // Check for an undef mask and a mask value properly aligned to fit with
9838     // a pair of values. If we find such a case, use the non-undef mask's value.
9839     if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) {
9840       WidenedMask.push_back(Mask[i + 1] / 2);
9841       continue;
9842     }
9843     if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
9844       WidenedMask.push_back(Mask[i] / 2);
9845       continue;
9846     }
9847
9848     // When zeroing, we need to spread the zeroing across both lanes to widen.
9849     if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
9850       if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
9851           (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
9852         WidenedMask.push_back(SM_SentinelZero);
9853         continue;
9854       }
9855       return false;
9856     }
9857
9858     // Finally check if the two mask values are adjacent and aligned with
9859     // a pair.
9860     if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) {
9861       WidenedMask.push_back(Mask[i] / 2);
9862       continue;
9863     }
9864
9865     // Otherwise we can't safely widen the elements used in this shuffle.
9866     return false;
9867   }
9868   assert(WidenedMask.size() == Mask.size() / 2 &&
9869          "Incorrect size of mask after widening the elements!");
9870
9871   return true;
9872 }
9873
9874 /// \brief Generic routine to split a vector shuffle into half-sized shuffles.
9875 ///
9876 /// This routine extracts the low and high halves of both inputs, builds each
9877 /// half of the result from independent half-width shuffles, and concatenates
9878 /// the two halves. This should work effectively with all AVX shuffle types.
9879 static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1,
9880                                           SDValue V2, ArrayRef<int> Mask,
9881                                           SelectionDAG &DAG) {
9882   assert(VT.getSizeInBits() >= 256 &&
9883          "Only for 256-bit or wider vector shuffles!");
9884   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
9885   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
9886
9887   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2); // Low result half.
9888   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);    // High result half.
9889
9890   int NumElements = VT.getVectorNumElements();
9891   int SplitNumElements = NumElements / 2;
9892   MVT ScalarVT = VT.getScalarType();
9893   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
9894
9895   SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
9896                              DAG.getIntPtrConstant(0));
9897   SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1,
9898                              DAG.getIntPtrConstant(SplitNumElements));
9899   SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
9900                              DAG.getIntPtrConstant(0));
9901   SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2,
9902                              DAG.getIntPtrConstant(SplitNumElements));
9903
9904   // Now create two 4-way blends (one per result half) of these half vectors.
9905   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
9906     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
9907     SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask;
9908     for (int i = 0; i < SplitNumElements; ++i) {
9909       int M = HalfMask[i];
9910       if (M >= NumElements) { // Element comes from V2.
9911         if (M >= NumElements + SplitNumElements)
9912           UseHiV2 = true;
9913         else
9914           UseLoV2 = true;
9915         V2BlendMask.push_back(M - NumElements); // Index relative to V2.
9916         V1BlendMask.push_back(-1);
9917         BlendMask.push_back(SplitNumElements + i); // Slot i of V2Blend.
9918       } else if (M >= 0) { // Element comes from V1.
9919         if (M >= SplitNumElements)
9920           UseHiV1 = true;
9921         else
9922           UseLoV1 = true;
9923         V2BlendMask.push_back(-1);
9924         V1BlendMask.push_back(M);
9925         BlendMask.push_back(i); // Slot i of V1Blend.
9926       } else { // Negative mask entry: no input is read for this slot.
9927         V2BlendMask.push_back(-1);
9928         V1BlendMask.push_back(-1);
9929         BlendMask.push_back(-1);
9930       }
9931     }
9932
9933     // Because the lowering happens after all combining takes place, we need to
9934     // manually combine these blend masks as much as possible so that we create
9935     // a minimal number of high-level vector shuffle nodes.
9936
9937     // First try just blending the halves of V1 or V2.
9938     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
9939       return DAG.getUNDEF(SplitVT);
9940     if (!UseLoV2 && !UseHiV2)
9941       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
9942     if (!UseLoV1 && !UseHiV1)
9943       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
9944
9945     SDValue V1Blend, V2Blend;
9946     if (UseLoV1 && UseHiV1) {
9947       V1Blend =
9948         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
9949     } else {
9950       // We only use half of V1 so map the usage down into the final blend mask.
9951       V1Blend = UseLoV1 ? LoV1 : HiV1;
9952       for (int i = 0; i < SplitNumElements; ++i)
9953         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
9954           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
9955     }
9956     if (UseLoV2 && UseHiV2) {
9957       V2Blend =
9958         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
9959     } else {
9960       // We only use half of V2 so map the usage down into the final blend mask.
9961       V2Blend = UseLoV2 ? LoV2 : HiV2;
9962       for (int i = 0; i < SplitNumElements; ++i)
9963         if (BlendMask[i] >= SplitNumElements)
9964           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
9965     }
9966     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
9967   };
9968   SDValue Lo = HalfBlend(LoMask);
9969   SDValue Hi = HalfBlend(HiMask);
9970   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
9971 }
9972
9973 /// \brief Either split a vector in halves or decompose the shuffles and the
9974 /// blend.
9975 ///
9976 /// This is provided as a good fallback for many lowerings of non-single-input
9977 /// shuffles with more than one 128-bit lane. In those cases, we want to select
9978 /// between splitting the shuffle into 128-bit components and stitching those
9979 /// back together vs. extracting the single-input shuffles and blending those
9980 /// results.
9981 static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1,
9982                                                 SDValue V2, ArrayRef<int> Mask,
9983                                                 SelectionDAG &DAG) {
9984   assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to "
9985                                             "lower single-input shuffles as it "
9986                                             "could then recurse on itself.");
9987   int Size = Mask.size();
9988
9989   // If this can be modeled as a broadcast of two elements followed by a blend,
9990   // prefer that lowering. This is especially important because broadcasts can
9991   // often fold with memory operands.
9992   auto DoBothBroadcast = [&] {
9993     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
9994     for (int M : Mask)
9995       if (M >= Size) {
9996         if (V2BroadcastIdx == -1)
9997           V2BroadcastIdx = M - Size;
9998         else if (M - Size != V2BroadcastIdx)
9999           return false;
10000       } else if (M >= 0) {
10001         if (V1BroadcastIdx == -1)
10002           V1BroadcastIdx = M;
10003         else if (M != V1BroadcastIdx)
10004           return false;
10005       }
10006     return true;
10007   };
10008   if (DoBothBroadcast())
10009     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
10010                                                       DAG);
10011
10012   // If the inputs all stem from a single 128-bit lane of each input, then we
10013   // split them rather than blending because the split will decompose to
10014   // unusually few instructions.
10015   int LaneCount = VT.getSizeInBits() / 128;
10016   int LaneSize = Size / LaneCount;
10017   SmallBitVector LaneInputs[2];
10018   LaneInputs[0].resize(LaneCount, false);
10019   LaneInputs[1].resize(LaneCount, false);
10020   for (int i = 0; i < Size; ++i)
10021     if (Mask[i] >= 0)
10022       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
10023   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
10024     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10025
10026   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
10027   // that the decomposed single-input shuffles don't end up here.
10028   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10029 }
10030
10031 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
10032 /// a permutation and blend of those lanes.
10033 ///
10034 /// This essentially blends the out-of-lane inputs to each lane into the lane
10035 /// from a permuted copy of the vector. This lowering strategy results in four
10036 /// instructions in the worst case for a single-input cross lane shuffle which
10037 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
10038 /// of. Special cases for each particular shuffle pattern should be handled
10039 /// prior to trying this lowering.
10040 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT,
10041                                                        SDValue V1, SDValue V2,
10042                                                        ArrayRef<int> Mask,
10043                                                        SelectionDAG &DAG) {
10044   // FIXME: This should probably be generalized for 512-bit vectors as well.
10045   assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!");
10046   int LaneSize = Mask.size() / 2; // Two 128-bit lanes in a 256-bit vector.
10047
10048   // If there are only inputs from one 128-bit lane, splitting will in fact be
10049   // less expensive. The flags track whether the given lane contains an element
10050   // that crosses to another lane.
10051   bool LaneCrossing[2] = {false, false};
10052   for (int i = 0, Size = Mask.size(); i < Size; ++i)
10053     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10054       LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
10055   if (!LaneCrossing[0] || !LaneCrossing[1])
10056     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10057
10058   if (isSingleInputShuffleMask(Mask)) {
10059     SmallVector<int, 32> FlippedBlendMask; // In-lane: V1; crossing: Flipped.
10060     for (int i = 0, Size = Mask.size(); i < Size; ++i)
10061       FlippedBlendMask.push_back(
10062           Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
10063                                   ? Mask[i]
10064                                   : Mask[i] % LaneSize +
10065                                         (i / LaneSize) * LaneSize + Size));
10066
10067     // Flip the vector, and blend the results which should now be in-lane. The
10068     // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
10069     // 5 for the high source. The value 3 selects the high half of source 2 and
10070     // the value 2 selects the low half of source 2. We only use source 2 to
10071     // allow folding it into a memory operand.
10072     unsigned PERMMask = 3 | 2 << 4; // Parses as 3 | (2 << 4).
10073     SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
10074                                   V1, DAG.getConstant(PERMMask, MVT::i8));
10075     return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
10076   }
10077
10078   // This now reduces to two single-input shuffles of V1 and V2 which at worst
10079   // will be handled by the above logic and a blend of the results, much like
10080   // other patterns in AVX.
10081   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10082 }
10083
10084 /// \brief Handle lowering 2-lane 128-bit shuffles given as a 4-element mask.
10085 static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
10086                                         SDValue V2, ArrayRef<int> Mask,
10087                                         const X86Subtarget *Subtarget,
10088                                         SelectionDAG &DAG) {
10089   // Blends are faster and handle all the non-lane-crossing cases.
10090   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
10091                                                 Subtarget, DAG))
10092     return Blend;
10093
10094   MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
10095                                VT.getVectorNumElements() / 2);
10096   // Check for patterns which can be matched with a single insert of a 128-bit
10097   // subvector: the low half of V1 followed by the low half of V1 or of V2.
10098   bool OnlyUsesV1 = isShuffleEquivalent(Mask, 0, 1, 0, 1);
10099   if (OnlyUsesV1 || isShuffleEquivalent(Mask, 0, 1, 4, 5)) {
10100     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10101                               DAG.getIntPtrConstant(0));
10102     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
10103                               OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0));
10104     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10105   }
10106   if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) { // Low of V1, then high of V2.
10107     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10108                               DAG.getIntPtrConstant(0));
10109     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
10110                               DAG.getIntPtrConstant(2));
10111     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10112   }
10113
10114   // Otherwise form a 128-bit permutation.
10115   // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half.
10116   int MaskLO = Mask[0];
10117   if (MaskLO == SM_SentinelUndef) // Use the pair's other entry, or 0 if undef.
10118     MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
10119
10120   int MaskHI = Mask[2];
10121   if (MaskHI == SM_SentinelUndef) // Same fallback for the high pair.
10122     MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
10123
10124   unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4; // Low sel | high sel<<4.
10125   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
10126                      DAG.getConstant(PermMask, MVT::i8));
10127 }
10128
10129 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
10130 /// shuffling each lane.
10131 ///
10132 /// This only succeeds (returning an empty SDValue otherwise) when fixing the
10133 /// 128-bit lanes results in a single-input non-lane-crossing shuffle with a
10134 /// shuffle mask repeated in each 128-bit lane. This handles many cases where we
10135 /// can blend away the lane crosses early and then use simpler in-lane shuffles.
10136 ///
10137 /// FIXME: It might be worthwhile at some point to support this without
10138 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
10139 /// in x86 only floating point has interesting non-repeating shuffles, and even
10140 /// those are still *marginally* more expensive.
10141 static SDValue lowerVectorShuffleByMerging128BitLanes(
10142     SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10143     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
10144   assert(!isSingleInputShuffleMask(Mask) &&
10145          "This is only useful with multiple inputs.");
10146
10147   int Size = Mask.size();
10148   int LaneSize = 128 / VT.getScalarSizeInBits(); // Elements per 128-bit lane.
10149   int NumLanes = Size / LaneSize;
10150   assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
10151
10152   // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
10153   // check whether the in-128-bit lane shuffles share a repeating pattern.
10154   SmallVector<int, 4> Lanes;
10155   Lanes.resize(NumLanes, -1);
10156   SmallVector<int, 4> InLaneMask;
10157   InLaneMask.resize(LaneSize, -1);
10158   for (int i = 0; i < Size; ++i) {
10159     if (Mask[i] < 0)
10160       continue;
10161
10162     int j = i / LaneSize; // Destination lane of element i.
10163
10164     if (Lanes[j] < 0) {
10165       // First entry we've seen for this lane.
10166       Lanes[j] = Mask[i] / LaneSize;
10167     } else if (Lanes[j] != Mask[i] / LaneSize) {
10168       // This doesn't match the lane selected previously!
10169       return SDValue();
10170     }
10171
10172     // Check that within each lane we have a consistent shuffle mask.
10173     int k = i % LaneSize; // Position of element i within its lane.
10174     if (InLaneMask[k] < 0) {
10175       InLaneMask[k] = Mask[i] % LaneSize;
10176     } else if (InLaneMask[k] != Mask[i] % LaneSize) {
10177       // This doesn't fit a repeating in-lane mask.
10178       return SDValue();
10179     }
10180   }
10181
10182   // First shuffle the lanes into place, using a type with 64-bit elements.
10183   MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
10184                                 VT.getSizeInBits() / 64);
10185   SmallVector<int, 8> LaneMask;
10186   LaneMask.resize(NumLanes * 2, -1); // Two 64-bit elements per 128-bit lane.
10187   for (int i = 0; i < NumLanes; ++i)
10188     if (Lanes[i] >= 0) {
10189       LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
10190       LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
10191     }
10192
10193   V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1);
10194   V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2);
10195   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
10196
10197   // Cast it back to the type we actually want.
10198   LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle);
10199
10200   // Now do a simple shuffle that isn't lane crossing.
10201   SmallVector<int, 8> NewMask;
10202   NewMask.resize(Size, -1);
10203   for (int i = 0; i < Size; ++i)
10204     if (Mask[i] >= 0)
10205       NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
10206   assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
10207          "Must not introduce lane crosses at this point!");
10208
10209   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
10210 }
10211
10212 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
10213 /// given mask.
10214 ///
10215 /// This returns true if the elements from a particular input are already in the
10216 /// slot required by the given mask and require no permutation.
10217 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
10218   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
10219   int Size = Mask.size();
10220   for (int i = 0; i < Size; ++i)
10221     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
10222       return false;
10223
10224   return true;
10225 }
10226
10227 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
10228 ///
10229 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
10230 /// isn't available.
10231 static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10232                                        const X86Subtarget *Subtarget,
10233                                        SelectionDAG &DAG) {
10234   SDLoc DL(Op);
10235   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10236   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
10237   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10238   ArrayRef<int> Mask = SVOp->getMask();
10239   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10240
10241   SmallVector<int, 4> WidenedMask; // Only a widenability test; Mask is lowered.
10242   if (canWidenShuffleElements(Mask, WidenedMask))
10243     return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget,
10244                                     DAG);
10245
10246   if (isSingleInputShuffleMask(Mask)) {
10247     // Check for being able to broadcast a single element.
10248     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1,
10249                                                           Mask, Subtarget, DAG))
10250       return Broadcast;
10251
10252     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
10253       // Non-half-crossing single input shuffles can be lowered with an
10254       // interleaved permutation. Bit i is set when slot i takes an odd element.
10255       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
10256                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
10257       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
10258                          DAG.getConstant(VPERMILPMask, MVT::i8));
10259     }
10260
10261     // With AVX2 we have direct support for this permutation.
10262     if (Subtarget->hasAVX2())
10263       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
10264                          getV4X86ShuffleImm8ForMask(Mask, DAG));
10265
10266     // Otherwise, fall back.
10267     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
10268                                                    DAG);
10269   }
10270
10271   // X86 has dedicated unpack instructions that can handle specific blend
10272   // operations: UNPCKH and UNPCKL.
10273   if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10274     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2);
10275   if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10276     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2);
10277
10278   // If we have a single input to the zero element, insert that into V1 if we
10279   // can do so cheaply.
10280   int NumV2Elements =
10281       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
10282   if (NumV2Elements == 1 && Mask[0] >= 4)
10283     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10284             MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG))
10285       return Insertion;
10286
10287   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
10288                                                 Subtarget, DAG))
10289     return Blend;
10290
10291   // Check if the blend happens to exactly fit that of SHUFPD.
10292   if ((Mask[0] == -1 || Mask[0] < 2) &&
10293       (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) &&
10294       (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) &&
10295       (Mask[3] == -1 || Mask[3] >= 6)) {
10296     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) |
10297                           ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3);
10298     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2,
10299                        DAG.getConstant(SHUFPDMask, MVT::i8));
10300   }
10301   if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) && // Same, V1/V2 swapped.
10302       (Mask[1] == -1 || Mask[1] < 2) &&
10303       (Mask[2] == -1 || Mask[2] >= 6) &&
10304       (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) {
10305     unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) |
10306                           ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3);
10307     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1,
10308                        DAG.getConstant(SHUFPDMask, MVT::i8));
10309   }
10310
10311   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10312   // shuffle. However, if we have AVX2 and either inputs are already in place,
10313   // we will be able to shuffle even across lanes the other input in a single
10314   // instruction so skip this pattern.
10315   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10316                                  isShuffleMaskInputInPlace(1, Mask))))
10317     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10318             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
10319       return Result;
10320
10321   // If we have AVX2 then we always want to lower with a blend because at v4 we
10322   // can fully permute the elements.
10323   if (Subtarget->hasAVX2())
10324     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
10325                                                       Mask, DAG);
10326
10327   // Otherwise fall back on generic lowering.
10328   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
10329 }
10330
10331 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
10332 ///
10333 /// This routine is only called when we have AVX2 and thus a reasonable
10334 /// instruction set for v4i64 shuffling.
10335 static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10336                                        const X86Subtarget *Subtarget,
10337                                        SelectionDAG &DAG) {
10338   SDLoc DL(Op);
10339   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10340   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
10341   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10342   ArrayRef<int> Mask = SVOp->getMask();
10343   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10344   assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!");
10345
10346   SmallVector<int, 4> WidenedMask; // Only a widenability test; Mask is lowered.
10347   if (canWidenShuffleElements(Mask, WidenedMask))
10348     return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget,
10349                                     DAG);
10350
10351   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
10352                                                 Subtarget, DAG))
10353     return Blend;
10354
10355   // Check for being able to broadcast a single element.
10356   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1,
10357                                                         Mask, Subtarget, DAG))
10358     return Broadcast;
10359
10360   // When the shuffle is mirrored between the 128-bit lanes of the unit, we can
10361   // use lower latency instructions that will operate on both 128-bit lanes.
10362   SmallVector<int, 2> RepeatedMask;
10363   if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
10364     if (isSingleInputShuffleMask(Mask)) {
10365       int PSHUFDMask[] = {-1, -1, -1, -1}; // v8i32 view: 2 entries per i64.
10366       for (int i = 0; i < 2; ++i)
10367         if (RepeatedMask[i] >= 0) {
10368           PSHUFDMask[2 * i] = 2 * RepeatedMask[i];
10369           PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1;
10370         }
10371       return DAG.getNode(
10372           ISD::BITCAST, DL, MVT::v4i64,
10373           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
10374                       DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1),
10375                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
10376     }
10377
10378     // Use dedicated unpack instructions for masks that match their pattern.
10379     if (isShuffleEquivalent(Mask, 0, 4, 2, 6))
10380       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2);
10381     if (isShuffleEquivalent(Mask, 1, 5, 3, 7))
10382       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2);
10383   }
10384
10385   // AVX2 provides a direct instruction for permuting a single input across
10386   // lanes.
10387   if (isSingleInputShuffleMask(Mask))
10388     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
10389                        getV4X86ShuffleImm8ForMask(Mask, DAG));
10390
10391   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10392   // shuffle. However, if we have AVX2 and either inputs are already in place,
10393   // we will be able to shuffle even across lanes the other input in a single
10394   // instruction so skip this pattern. (AVX2 is asserted on entry.)
10395   if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
10396                                  isShuffleMaskInputInPlace(1, Mask))))
10397     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10398             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
10399       return Result;
10400
10401   // Otherwise fall back on generic blend lowering.
10402   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
10403                                                     Mask, DAG);
10404 }
10405
10406 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
10407 ///
10408 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
10409 /// isn't available.
10410 static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10411                                        const X86Subtarget *Subtarget,
10412                                        SelectionDAG &DAG) {
10413   SDLoc DL(Op);
10414   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10415   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
10416   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10417   ArrayRef<int> Mask = SVOp->getMask();
10418   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10419
10420   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
10421                                                 Subtarget, DAG))
10422     return Blend;
10423
10424   // Check for being able to broadcast a single element.
10425   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
10426                                                         Mask, Subtarget, DAG))
10427     return Broadcast;
10428
10429   // If the shuffle mask is repeated in each 128-bit lane, we have many more
10430   // options to efficiently lower the shuffle.
10431   SmallVector<int, 4> RepeatedMask;
10432   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
10433     assert(RepeatedMask.size() == 4 &&
10434            "Repeated masks must be half the mask width!");
10435     if (isSingleInputShuffleMask(Mask))
10436       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
10437                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10438
10439     // Use dedicated unpack instructions for masks that match their pattern.
10440     if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10441       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2);
10442     if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10443       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2);
10444
10445     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
10446     // have already handled any direct blends. We also need to squash the
10447     // repeated mask into a simulated v4f32 mask.
10448     for (int i = 0; i < 4; ++i)
10449       if (RepeatedMask[i] >= 8) // Entry refers to the second input.
10450         RepeatedMask[i] -= 4;   // Shift it down by 4 for the v4f32-style mask.
10451     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
10452   }
10453
10454   // If we have a single input shuffle with different shuffle patterns in the
10455   // two 128-bit lanes use the variable mask to VPERMILPS.
10456   if (isSingleInputShuffleMask(Mask)) {
10457     SDValue VPermMask[8]; // One i32 per element: undef or the mask index.
10458     for (int i = 0; i < 8; ++i)
10459       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10460                                  : DAG.getConstant(Mask[i], MVT::i32);
10461     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
10462       return DAG.getNode(
10463           X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
10464           DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
10465
10466     if (Subtarget->hasAVX2())
10467       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
10468                          DAG.getNode(ISD::BITCAST, DL, MVT::v8f32,
10469                                      DAG.getNode(ISD::BUILD_VECTOR, DL,
10470                                                  MVT::v8i32, VPermMask)),
10471                          V1); // NOTE: here the index vector precedes V1.
10472
10473     // Otherwise, fall back.
10474     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
10475                                                    DAG);
10476   }
10477
10478   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10479   // shuffle.
10480   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10481           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
10482     return Result;
10483
10484   // If we have AVX2 then we always want to lower with a blend because at v8 we
10485   // can fully permute the elements.
10486   if (Subtarget->hasAVX2())
10487     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
10488                                                       Mask, DAG);
10489
10490   // Otherwise fall back on generic lowering.
10491   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
10492 }
10493
10494 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
10495 ///
10496 /// This routine is only called when we have AVX2 and thus a reasonable
10497 /// instruction set for v8i32 shuffling..
10498 static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10499                                        const X86Subtarget *Subtarget,
10500                                        SelectionDAG &DAG) {
10501   SDLoc DL(Op);
10502   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10503   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
10504   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10505   ArrayRef<int> Mask = SVOp->getMask();
10506   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10507   assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!");
10508
10509   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
10510                                                 Subtarget, DAG))
10511     return Blend;
10512
10513   // Check for being able to broadcast a single element.
10514   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1,
10515                                                         Mask, Subtarget, DAG))
10516     return Broadcast;
10517
10518   // If the shuffle mask is repeated in each 128-bit lane we can use more
10519   // efficient instructions that mirror the shuffles across the two 128-bit
10520   // lanes.
10521   SmallVector<int, 4> RepeatedMask;
10522   if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
10523     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
10524     if (isSingleInputShuffleMask(Mask))
10525       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
10526                          getV4X86ShuffleImm8ForMask(RepeatedMask, DAG));
10527
10528     // Use dedicated unpack instructions for masks that match their pattern.
10529     if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13))
10530       return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2);
10531     if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15))
10532       return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2);
10533   }
10534
10535   // If the shuffle patterns aren't repeated but it is a single input, directly
10536   // generate a cross-lane VPERMD instruction.
10537   if (isSingleInputShuffleMask(Mask)) {
10538     SDValue VPermMask[8];
10539     for (int i = 0; i < 8; ++i)
10540       VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
10541                                  : DAG.getConstant(Mask[i], MVT::i32);
10542     return DAG.getNode(
10543         X86ISD::VPERMV, DL, MVT::v8i32,
10544         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1);
10545   }
10546
10547   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10548   // shuffle.
10549   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10550           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
10551     return Result;
10552
10553   // Otherwise fall back on generic blend lowering.
10554   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
10555                                                     Mask, DAG);
10556 }
10557
10558 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
10559 ///
10560 /// This routine is only called when we have AVX2 and thus a reasonable
10561 /// instruction set for v16i16 shuffling..
10562 static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10563                                         const X86Subtarget *Subtarget,
10564                                         SelectionDAG &DAG) {
10565   SDLoc DL(Op);
10566   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10567   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
10568   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10569   ArrayRef<int> Mask = SVOp->getMask();
10570   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10571   assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
10572
10573   // Check for being able to broadcast a single element.
10574   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1,
10575                                                         Mask, Subtarget, DAG))
10576     return Broadcast;
10577
10578   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
10579                                                 Subtarget, DAG))
10580     return Blend;
10581
10582   // Use dedicated unpack instructions for masks that match their pattern.
10583   if (isShuffleEquivalent(Mask,
10584                           // First 128-bit lane:
10585                           0, 16, 1, 17, 2, 18, 3, 19,
10586                           // Second 128-bit lane:
10587                           8, 24, 9, 25, 10, 26, 11, 27))
10588     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
10589   if (isShuffleEquivalent(Mask,
10590                           // First 128-bit lane:
10591                           4, 20, 5, 21, 6, 22, 7, 23,
10592                           // Second 128-bit lane:
10593                           12, 28, 13, 29, 14, 30, 15, 31))
10594     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
10595
10596   if (isSingleInputShuffleMask(Mask)) {
10597     // There are no generalized cross-lane shuffle operations available on i16
10598     // element types.
10599     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
10600       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
10601                                                      Mask, DAG);
10602
10603     SDValue PSHUFBMask[32];
10604     for (int i = 0; i < 16; ++i) {
10605       if (Mask[i] == -1) {
10606         PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
10607         continue;
10608       }
10609
10610       int M = i < 8 ? Mask[i] : Mask[i] - 8;
10611       assert(M >= 0 && M < 8 && "Invalid single-input mask!");
10612       PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
10613       PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
10614     }
10615     return DAG.getNode(
10616         ISD::BITCAST, DL, MVT::v16i16,
10617         DAG.getNode(
10618             X86ISD::PSHUFB, DL, MVT::v32i8,
10619             DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
10620             DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
10621   }
10622
10623   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10624   // shuffle.
10625   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10626           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
10627     return Result;
10628
10629   // Otherwise fall back on generic lowering.
10630   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
10631 }
10632
10633 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
10634 ///
10635 /// This routine is only called when we have AVX2 and thus a reasonable
10636 /// instruction set for v32i8 shuffling..
10637 static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10638                                        const X86Subtarget *Subtarget,
10639                                        SelectionDAG &DAG) {
10640   SDLoc DL(Op);
10641   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10642   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
10643   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10644   ArrayRef<int> Mask = SVOp->getMask();
10645   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
10646   assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!");
10647
10648   // Check for being able to broadcast a single element.
10649   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1,
10650                                                         Mask, Subtarget, DAG))
10651     return Broadcast;
10652
10653   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
10654                                                 Subtarget, DAG))
10655     return Blend;
10656
10657   // Use dedicated unpack instructions for masks that match their pattern.
10658   // Note that these are repeated 128-bit lane unpacks, not unpacks across all
10659   // 256-bit lanes.
10660   if (isShuffleEquivalent(
10661           Mask,
10662           // First 128-bit lane:
10663           0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
10664           // Second 128-bit lane:
10665           16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55))
10666     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2);
10667   if (isShuffleEquivalent(
10668           Mask,
10669           // First 128-bit lane:
10670           8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47,
10671           // Second 128-bit lane:
10672           24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63))
10673     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2);
10674
10675   if (isSingleInputShuffleMask(Mask)) {
10676     // There are no generalized cross-lane shuffle operations available on i8
10677     // element types.
10678     if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
10679       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2,
10680                                                      Mask, DAG);
10681
10682     SDValue PSHUFBMask[32];
10683     for (int i = 0; i < 32; ++i)
10684       PSHUFBMask[i] =
10685           Mask[i] < 0
10686               ? DAG.getUNDEF(MVT::i8)
10687               : DAG.getConstant(Mask[i] < 16 ? Mask[i] : Mask[i] - 16, MVT::i8);
10688
10689     return DAG.getNode(
10690         X86ISD::PSHUFB, DL, MVT::v32i8, V1,
10691         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask));
10692   }
10693
10694   // Try to simplify this by merging 128-bit lanes to enable a lane-based
10695   // shuffle.
10696   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
10697           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
10698     return Result;
10699
10700   // Otherwise fall back on generic lowering.
10701   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
10702 }
10703
10704 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
10705 ///
10706 /// This routine either breaks down the specific type of a 256-bit x86 vector
10707 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
10708 /// together based on the available instructions.
10709 static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10710                                         MVT VT, const X86Subtarget *Subtarget,
10711                                         SelectionDAG &DAG) {
10712   SDLoc DL(Op);
10713   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10714   ArrayRef<int> Mask = SVOp->getMask();
10715
10716   // There is a really nice hard cut-over between AVX1 and AVX2 that means we can
10717   // check for those subtargets here and avoid much of the subtarget querying in
10718   // the per-vector-type lowering routines. With AVX1 we have essentially *zero*
10719   // ability to manipulate a 256-bit vector with integer types. Since we'll use
10720   // floating point types there eventually, just immediately cast everything to
10721   // a float and operate entirely in that domain.
10722   if (VT.isInteger() && !Subtarget->hasAVX2()) {
10723     int ElementBits = VT.getScalarSizeInBits();
10724     if (ElementBits < 32)
10725       // No floating point type available, decompose into 128-bit vectors.
10726       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10727
10728     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
10729                                 VT.getVectorNumElements());
10730     V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1);
10731     V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2);
10732     return DAG.getNode(ISD::BITCAST, DL, VT,
10733                        DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
10734   }
10735
10736   switch (VT.SimpleTy) {
10737   case MVT::v4f64:
10738     return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10739   case MVT::v4i64:
10740     return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10741   case MVT::v8f32:
10742     return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10743   case MVT::v8i32:
10744     return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10745   case MVT::v16i16:
10746     return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10747   case MVT::v32i8:
10748     return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10749
10750   default:
10751     llvm_unreachable("Not a valid 256-bit x86 vector type!");
10752   }
10753 }
10754
10755 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
10756 static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10757                                        const X86Subtarget *Subtarget,
10758                                        SelectionDAG &DAG) {
10759   SDLoc DL(Op);
10760   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10761   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
10762   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10763   ArrayRef<int> Mask = SVOp->getMask();
10764   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10765
10766   // X86 has dedicated unpack instructions that can handle specific blend
10767   // operations: UNPCKH and UNPCKL.
10768   if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
10769     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
10770   if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
10771     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
10772
10773   // FIXME: Implement direct support for this type!
10774   return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
10775 }
10776
10777 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
10778 static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10779                                        const X86Subtarget *Subtarget,
10780                                        SelectionDAG &DAG) {
10781   SDLoc DL(Op);
10782   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
10783   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
10784   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10785   ArrayRef<int> Mask = SVOp->getMask();
10786   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10787
10788   // Use dedicated unpack instructions for masks that match their pattern.
10789   if (isShuffleEquivalent(Mask,
10790                           0, 16, 1, 17, 4, 20, 5, 21,
10791                           8, 24, 9, 25, 12, 28, 13, 29))
10792     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
10793   if (isShuffleEquivalent(Mask,
10794                           2, 18, 3, 19, 6, 22, 7, 23,
10795                           10, 26, 11, 27, 14, 30, 15, 31))
10796     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
10797
10798   // FIXME: Implement direct support for this type!
10799   return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
10800 }
10801
10802 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
10803 static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10804                                        const X86Subtarget *Subtarget,
10805                                        SelectionDAG &DAG) {
10806   SDLoc DL(Op);
10807   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
10808   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
10809   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10810   ArrayRef<int> Mask = SVOp->getMask();
10811   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10812
10813   // X86 has dedicated unpack instructions that can handle specific blend
10814   // operations: UNPCKH and UNPCKL.
10815   if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14))
10816     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
10817   if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15))
10818     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
10819
10820   // FIXME: Implement direct support for this type!
10821   return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
10822 }
10823
10824 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
10825 static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10826                                        const X86Subtarget *Subtarget,
10827                                        SelectionDAG &DAG) {
10828   SDLoc DL(Op);
10829   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
10830   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
10831   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10832   ArrayRef<int> Mask = SVOp->getMask();
10833   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10834
10835   // Use dedicated unpack instructions for masks that match their pattern.
10836   if (isShuffleEquivalent(Mask,
10837                           0, 16, 1, 17, 4, 20, 5, 21,
10838                           8, 24, 9, 25, 12, 28, 13, 29))
10839     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
10840   if (isShuffleEquivalent(Mask,
10841                           2, 18, 3, 19, 6, 22, 7, 23,
10842                           10, 26, 11, 27, 14, 30, 15, 31))
10843     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
10844
10845   // FIXME: Implement direct support for this type!
10846   return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
10847 }
10848
10849 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
10850 static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10851                                         const X86Subtarget *Subtarget,
10852                                         SelectionDAG &DAG) {
10853   SDLoc DL(Op);
10854   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
10855   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
10856   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10857   ArrayRef<int> Mask = SVOp->getMask();
10858   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
10859   assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
10860
10861   // FIXME: Implement direct support for this type!
10862   return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG);
10863 }
10864
10865 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
10866 static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10867                                        const X86Subtarget *Subtarget,
10868                                        SelectionDAG &DAG) {
10869   SDLoc DL(Op);
10870   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
10871   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
10872   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10873   ArrayRef<int> Mask = SVOp->getMask();
10874   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
10875   assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
10876
10877   // FIXME: Implement direct support for this type!
10878   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
10879 }
10880
10881 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
10882 ///
10883 /// This routine either breaks down the specific type of a 512-bit x86 vector
10884 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
10885 /// together based on the available instructions.
10886 static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2,
10887                                         MVT VT, const X86Subtarget *Subtarget,
10888                                         SelectionDAG &DAG) {
10889   SDLoc DL(Op);
10890   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10891   ArrayRef<int> Mask = SVOp->getMask();
10892   assert(Subtarget->hasAVX512() &&
10893          "Cannot lower 512-bit vectors w/ basic ISA!");
10894
10895   // Check for being able to broadcast a single element.
10896   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1,
10897                                                         Mask, Subtarget, DAG))
10898     return Broadcast;
10899
10900   // Dispatch to each element type for lowering. If we don't have supprot for
10901   // specific element type shuffles at 512 bits, immediately split them and
10902   // lower them. Each lowering routine of a given type is allowed to assume that
10903   // the requisite ISA extensions for that element type are available.
10904   switch (VT.SimpleTy) {
10905   case MVT::v8f64:
10906     return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10907   case MVT::v16f32:
10908     return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10909   case MVT::v8i64:
10910     return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
10911   case MVT::v16i32:
10912     return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
10913   case MVT::v32i16:
10914     if (Subtarget->hasBWI())
10915       return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
10916     break;
10917   case MVT::v64i8:
10918     if (Subtarget->hasBWI())
10919       return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG);
10920     break;
10921
10922   default:
10923     llvm_unreachable("Not a valid 512-bit x86 vector type!");
10924   }
10925
10926   // Otherwise fall back on splitting.
10927   return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10928 }
10929
10930 /// \brief Top-level lowering for x86 vector shuffles.
10931 ///
10932 /// This handles decomposition, canonicalization, and lowering of all x86
10933 /// vector shuffles. Most of the specific lowering strategies are encapsulated
10934 /// above in helper routines. The canonicalization attempts to widen shuffles
10935 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
10936 /// s.t. only one of the two inputs needs to be tested, etc.
10937 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
10938                                   SelectionDAG &DAG) {
10939   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
10940   ArrayRef<int> Mask = SVOp->getMask();
10941   SDValue V1 = Op.getOperand(0);
10942   SDValue V2 = Op.getOperand(1);
10943   MVT VT = Op.getSimpleValueType();
10944   int NumElements = VT.getVectorNumElements();
10945   SDLoc dl(Op);
10946
10947   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
10948
10949   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
10950   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
10951   if (V1IsUndef && V2IsUndef)
10952     return DAG.getUNDEF(VT);
10953
10954   // When we create a shuffle node we put the UNDEF node to second operand,
10955   // but in some cases the first operand may be transformed to UNDEF.
10956   // In this case we should just commute the node.
10957   if (V1IsUndef)
10958     return DAG.getCommutedVectorShuffle(*SVOp);
10959
10960   // Check for non-undef masks pointing at an undef vector and make the masks
10961   // undef as well. This makes it easier to match the shuffle based solely on
10962   // the mask.
10963   if (V2IsUndef)
10964     for (int M : Mask)
10965       if (M >= NumElements) {
10966         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
10967         for (int &M : NewMask)
10968           if (M >= NumElements)
10969             M = -1;
10970         return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask);
10971       }
10972
10973   // Try to collapse shuffles into using a vector type with fewer elements but
10974   // wider element types. We cap this to not form integers or floating point
10975   // elements wider than 64 bits, but it might be interesting to form i128
10976   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
10977   SmallVector<int, 16> WidenedMask;
10978   if (VT.getScalarSizeInBits() < 64 &&
10979       canWidenShuffleElements(Mask, WidenedMask)) {
10980     MVT NewEltVT = VT.isFloatingPoint()
10981                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
10982                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
10983     MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
10984     // Make sure that the new vector type is legal. For example, v2f64 isn't
10985     // legal on SSE1.
10986     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
10987       V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
10988       V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
10989       return DAG.getNode(ISD::BITCAST, dl, VT,
10990                          DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask));
10991     }
10992   }
10993
10994   int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
10995   for (int M : SVOp->getMask())
10996     if (M < 0)
10997       ++NumUndefElements;
10998     else if (M < NumElements)
10999       ++NumV1Elements;
11000     else
11001       ++NumV2Elements;
11002
11003   // Commute the shuffle as needed such that more elements come from V1 than
11004   // V2. This allows us to match the shuffle pattern strictly on how many
11005   // elements come from V1 without handling the symmetric cases.
11006   if (NumV2Elements > NumV1Elements)
11007     return DAG.getCommutedVectorShuffle(*SVOp);
11008
11009   // When the number of V1 and V2 elements are the same, try to minimize the
11010   // number of uses of V2 in the low half of the vector. When that is tied,
11011   // ensure that the sum of indices for V1 is equal to or lower than the sum
11012   // indices for V2. When those are equal, try to ensure that the number of odd
11013   // indices for V1 is lower than the number of odd indices for V2.
11014   if (NumV1Elements == NumV2Elements) {
11015     int LowV1Elements = 0, LowV2Elements = 0;
11016     for (int M : SVOp->getMask().slice(0, NumElements / 2))
11017       if (M >= NumElements)
11018         ++LowV2Elements;
11019       else if (M >= 0)
11020         ++LowV1Elements;
11021     if (LowV2Elements > LowV1Elements) {
11022       return DAG.getCommutedVectorShuffle(*SVOp);
11023     } else if (LowV2Elements == LowV1Elements) {
11024       int SumV1Indices = 0, SumV2Indices = 0;
11025       for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11026         if (SVOp->getMask()[i] >= NumElements)
11027           SumV2Indices += i;
11028         else if (SVOp->getMask()[i] >= 0)
11029           SumV1Indices += i;
11030       if (SumV2Indices < SumV1Indices) {
11031         return DAG.getCommutedVectorShuffle(*SVOp);
11032       } else if (SumV2Indices == SumV1Indices) {
11033         int NumV1OddIndices = 0, NumV2OddIndices = 0;
11034         for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i)
11035           if (SVOp->getMask()[i] >= NumElements)
11036             NumV2OddIndices += i % 2;
11037           else if (SVOp->getMask()[i] >= 0)
11038             NumV1OddIndices += i % 2;
11039         if (NumV2OddIndices < NumV1OddIndices)
11040           return DAG.getCommutedVectorShuffle(*SVOp);
11041       }
11042     }
11043   }
11044
11045   // For each vector width, delegate to a specialized lowering routine.
11046   if (VT.getSizeInBits() == 128)
11047     return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11048
11049   if (VT.getSizeInBits() == 256)
11050     return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11051
11052   // Force AVX-512 vectors to be scalarized for now.
11053   // FIXME: Implement AVX-512 support!
11054   if (VT.getSizeInBits() == 512)
11055     return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG);
11056
11057   llvm_unreachable("Unimplemented!");
11058 }
11059
11060
11061 //===----------------------------------------------------------------------===//
11062 // Legacy vector shuffle lowering
11063 //
11064 // This code is the legacy code handling vector shuffles until the above
11065 // replaces its functionality and performance.
11066 //===----------------------------------------------------------------------===//
11067
11068 static bool isBlendMask(ArrayRef<int> MaskVals, MVT VT, bool hasSSE41,
11069                         bool hasInt256, unsigned *MaskOut = nullptr) {
11070   MVT EltVT = VT.getVectorElementType();
11071
11072   // There is no blend with immediate in AVX-512.
11073   if (VT.is512BitVector())
11074     return false;
11075
11076   if (!hasSSE41 || EltVT == MVT::i8)
11077     return false;
11078   if (!hasInt256 && VT == MVT::v16i16)
11079     return false;
11080
11081   unsigned MaskValue = 0;
11082   unsigned NumElems = VT.getVectorNumElements();
11083   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
11084   unsigned NumLanes = (NumElems - 1) / 8 + 1;
11085   unsigned NumElemsInLane = NumElems / NumLanes;
11086
11087   // Blend for v16i16 should be symetric for the both lanes.
11088   for (unsigned i = 0; i < NumElemsInLane; ++i) {
11089
11090     int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1;
11091     int EltIdx = MaskVals[i];
11092
11093     if ((EltIdx < 0 || EltIdx == (int)i) &&
11094         (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
11095       continue;
11096
11097     if (((unsigned)EltIdx == (i + NumElems)) &&
11098         (SndLaneEltIdx < 0 ||
11099          (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
11100       MaskValue |= (1 << i);
11101     else
11102       return false;
11103   }
11104
11105   if (MaskOut)
11106     *MaskOut = MaskValue;
11107   return true;
11108 }
11109
11110 // Try to lower a shuffle node into a simple blend instruction.
11111 // This function assumes isBlendMask returns true for this
11112 // SuffleVectorSDNode
11113 static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
11114                                           unsigned MaskValue,
11115                                           const X86Subtarget *Subtarget,
11116                                           SelectionDAG &DAG) {
11117   MVT VT = SVOp->getSimpleValueType(0);
11118   MVT EltVT = VT.getVectorElementType();
11119   assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(),
11120                      Subtarget->hasInt256() && "Trying to lower a "
11121                                                "VECTOR_SHUFFLE to a Blend but "
11122                                                "with the wrong mask"));
11123   SDValue V1 = SVOp->getOperand(0);
11124   SDValue V2 = SVOp->getOperand(1);
11125   SDLoc dl(SVOp);
11126   unsigned NumElems = VT.getVectorNumElements();
11127
11128   // Convert i32 vectors to floating point if it is not AVX2.
11129   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
11130   MVT BlendVT = VT;
11131   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
11132     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
11133                                NumElems);
11134     V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
11135     V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
11136   }
11137
11138   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
11139                             DAG.getConstant(MaskValue, MVT::i32));
11140   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
11141 }
11142
11143 /// Report whether, for vectors of type \p VT, the elements at \p InputIdx
11144 /// and \p OutputIdx sit in different 128-bit lanes.
11145 static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx,
11146                                      unsigned OutputIdx) {
11147   const unsigned BitsPerElt = VT.getVectorElementType().getSizeInBits();
        // Lane number = bit offset of the element divided by the lane width.
        const unsigned SrcLane = InputIdx * BitsPerElt / 128;
        const unsigned DstLane = OutputIdx * BitsPerElt / 128;
11148   return SrcLane != DstLane;
11149 }
11150
11151 /// Generate a PSHUFB if possible.  Selects elements from \p V1 according to
11152 /// \p MaskVals.  MaskVals[OutputIdx] = InputIdx specifies that we want to
11153 /// shuffle the element at InputIdx in V1 to OutputIdx in the result.  If \p
11154 /// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a
11155 /// zero.
11156 static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
11157                          SelectionDAG &DAG) {
11158   MVT VT = V1.getSimpleValueType();
11159   assert(VT.is128BitVector() || VT.is256BitVector());
11160
11161   MVT EltVT = VT.getVectorElementType();
11162   unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8;
11163   unsigned NumElts = VT.getVectorNumElements();
11164
11165   SmallVector<SDValue, 32> PshufbMask;
11166   for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) {
11167     int InputIdx = MaskVals[OutputIdx];
11168     unsigned InputByteIdx;
11169
11170     if (InputIdx < 0 || NumElts <= (unsigned)InputIdx)
11171       InputByteIdx = 0x80;
11172     else {
11173       // Cross lane is not allowed.
11174       if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx))
11175         return SDValue();
11176       InputByteIdx = InputIdx * EltSizeInBytes;
11177       // Index is an byte offset within the 128-bit lane.
11178       InputByteIdx &= 0xf;
11179     }
11180
11181     for (unsigned j = 0; j < EltSizeInBytes; ++j) {
11182       PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8));
11183       if (InputByteIdx != 0x80)
11184         ++InputByteIdx;
11185     }
11186   }
11187
11188   MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size());
11189   if (ShufVT != VT)
11190     V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
11191   return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
11192                      DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
11193 }
11194
11195 // v8i16 shuffles - Prefer shuffles in the following order:
11196 // 1. [all]   pshuflw, pshufhw, optional move
11197 // 2. [ssse3] 1 x pshufb
11198 // 3. [ssse3] 2 x pshufb + 1 x por
11199 // 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
11200 static SDValue
11201 LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
11202                          SelectionDAG &DAG) {
11203   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
11204   SDValue V1 = SVOp->getOperand(0);
11205   SDValue V2 = SVOp->getOperand(1);
11206   SDLoc dl(SVOp);
11207   SmallVector<int, 8> MaskVals;
11208
11209   // Determine if more than 1 of the words in each of the low and high quadwords
11210   // of the result come from the same quadword of one of the two inputs.  Undef
11211   // mask values count as coming from any quadword, for better codegen.
11212   //
11213   // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input
11214   // feeds this quad.  For i, 0 and 1 refer to V1, 2 and 3 refer to V2.
11215   unsigned LoQuad[] = { 0, 0, 0, 0 };
11216   unsigned HiQuad[] = { 0, 0, 0, 0 };
11217   // Indices of quads used.
11218   std::bitset<4> InputQuads;
11219   for (unsigned i = 0; i < 8; ++i) {
11220     unsigned *Quad = i < 4 ? LoQuad : HiQuad;
11221     int EltIdx = SVOp->getMaskElt(i);
11222     MaskVals.push_back(EltIdx);
11223     if (EltIdx < 0) {
11224       ++Quad[0];
11225       ++Quad[1];
11226       ++Quad[2];
11227       ++Quad[3];
11228       continue;
11229     }
11230     ++Quad[EltIdx / 4];
11231     InputQuads.set(EltIdx / 4);
11232   }
11233
11234   int BestLoQuad = -1;
11235   unsigned MaxQuad = 1;
11236   for (unsigned i = 0; i < 4; ++i) {
11237     if (LoQuad[i] > MaxQuad) {
11238       BestLoQuad = i;
11239       MaxQuad = LoQuad[i];
11240     }
11241   }
11242
11243   int BestHiQuad = -1;
11244   MaxQuad = 1;
11245   for (unsigned i = 0; i < 4; ++i) {
11246     if (HiQuad[i] > MaxQuad) {
11247       BestHiQuad = i;
11248       MaxQuad = HiQuad[i];
11249     }
11250   }
11251
11252   // For SSSE3, If all 8 words of the result come from only 1 quadword of each
11253   // of the two input vectors, shuffle them into one input vector so only a
11254   // single pshufb instruction is necessary. If there are more than 2 input
11255   // quads, disable the next transformation since it does not help SSSE3.
11256   bool V1Used = InputQuads[0] || InputQuads[1];
11257   bool V2Used = InputQuads[2] || InputQuads[3];
11258   if (Subtarget->hasSSSE3()) {
11259     if (InputQuads.count() == 2 && V1Used && V2Used) {
11260       BestLoQuad = InputQuads[0] ? 0 : 1;
11261       BestHiQuad = InputQuads[2] ? 2 : 3;
11262     }
11263     if (InputQuads.count() > 2) {
11264       BestLoQuad = -1;
11265       BestHiQuad = -1;
11266     }
11267   }
11268
11269   // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
11270   // the shuffle mask.  If a quad is scored as -1, that means that it contains
11271   // words from all 4 input quadwords.
11272   SDValue NewV;
11273   if (BestLoQuad >= 0 || BestHiQuad >= 0) {
11274     int MaskV[] = {
11275       BestLoQuad < 0 ? 0 : BestLoQuad,
11276       BestHiQuad < 0 ? 1 : BestHiQuad
11277     };
11278     NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
11279                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
11280                   DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
11281     NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
11282
11283     // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
11284     // source words for the shuffle, to aid later transformations.
11285     bool AllWordsInNewV = true;
11286     bool InOrder[2] = { true, true };
11287     for (unsigned i = 0; i != 8; ++i) {
11288       int idx = MaskVals[i];
11289       if (idx != (int)i)
11290         InOrder[i/4] = false;
11291       if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
11292         continue;
11293       AllWordsInNewV = false;
11294       break;
11295     }
11296
11297     bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
11298     if (AllWordsInNewV) {
11299       for (int i = 0; i != 8; ++i) {
11300         int idx = MaskVals[i];
11301         if (idx < 0)
11302           continue;
11303         idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
11304         if ((idx != i) && idx < 4)
11305           pshufhw = false;
11306         if ((idx != i) && idx > 3)
11307           pshuflw = false;
11308       }
11309       V1 = NewV;
11310       V2Used = false;
11311       BestLoQuad = 0;
11312       BestHiQuad = 1;
11313     }
11314
11315     // If we've eliminated the use of V2, and the new mask is a pshuflw or
11316     // pshufhw, that's as cheap as it gets.  Return the new shuffle.
11317     if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
11318       unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
11319       unsigned TargetMask = 0;
11320       NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
11321                                   DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
11322       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11323       TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp):
11324                              getShufflePSHUFLWImmediate(SVOp);
11325       V1 = NewV.getOperand(0);
11326       return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
11327     }
11328   }
11329
11330   // Promote splats to a larger type which usually leads to more efficient code.
11331   // FIXME: Is this true if pshufb is available?
11332   if (SVOp->isSplat())
11333     return PromoteSplat(SVOp, DAG);
11334
11335   // If we have SSSE3, and all words of the result are from 1 input vector,
11336   // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
11337   // is present, fall back to case 4.
11338   if (Subtarget->hasSSSE3()) {
11339     SmallVector<SDValue,16> pshufbMask;
11340
11341     // If we have elements from both input vectors, set the high bit of the
11342     // shuffle mask element to zero out elements that come from V2 in the V1
11343     // mask, and elements that come from V1 in the V2 mask, so that the two
11344     // results can be OR'd together.
11345     bool TwoInputs = V1Used && V2Used;
11346     V1 = getPSHUFB(MaskVals, V1, dl, DAG);
11347     if (!TwoInputs)
11348       return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11349
11350     // Calculate the shuffle mask for the second input, shuffle it, and
11351     // OR it with the first shuffled input.
11352     CommuteVectorShuffleMask(MaskVals, 8);
11353     V2 = getPSHUFB(MaskVals, V2, dl, DAG);
11354     V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11355     return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11356   }
11357
11358   // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
11359   // and update MaskVals with new element order.
11360   std::bitset<8> InOrder;
11361   if (BestLoQuad >= 0) {
11362     int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 };
11363     for (int i = 0; i != 4; ++i) {
11364       int idx = MaskVals[i];
11365       if (idx < 0) {
11366         InOrder.set(i);
11367       } else if ((idx / 4) == BestLoQuad) {
11368         MaskV[i] = idx & 3;
11369         InOrder.set(i);
11370       }
11371     }
11372     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11373                                 &MaskV[0]);
11374
11375     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11376       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11377       NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
11378                                   NewV.getOperand(0),
11379                                   getShufflePSHUFLWImmediate(SVOp), DAG);
11380     }
11381   }
11382
11383   // If BestHi >= 0, generate a pshufhw to put the high elements in order,
11384   // and update MaskVals with the new element order.
11385   if (BestHiQuad >= 0) {
11386     int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 };
11387     for (unsigned i = 4; i != 8; ++i) {
11388       int idx = MaskVals[i];
11389       if (idx < 0) {
11390         InOrder.set(i);
11391       } else if ((idx / 4) == BestHiQuad) {
11392         MaskV[i] = (idx & 3) + 4;
11393         InOrder.set(i);
11394       }
11395     }
11396     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
11397                                 &MaskV[0]);
11398
11399     if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
11400       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
11401       NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
11402                                   NewV.getOperand(0),
11403                                   getShufflePSHUFHWImmediate(SVOp), DAG);
11404     }
11405   }
11406
11407   // In case BestHi & BestLo were both -1, which means each quadword has a word
11408   // from each of the four input quadwords, calculate the InOrder bitvector now
11409   // before falling through to the insert/extract cleanup.
11410   if (BestLoQuad == -1 && BestHiQuad == -1) {
11411     NewV = V1;
11412     for (int i = 0; i != 8; ++i)
11413       if (MaskVals[i] < 0 || MaskVals[i] == i)
11414         InOrder.set(i);
11415   }
11416
11417   // The other elements are put in the right place using pextrw and pinsrw.
11418   for (unsigned i = 0; i != 8; ++i) {
11419     if (InOrder[i])
11420       continue;
11421     int EltIdx = MaskVals[i];
11422     if (EltIdx < 0)
11423       continue;
11424     SDValue ExtOp = (EltIdx < 8) ?
11425       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
11426                   DAG.getIntPtrConstant(EltIdx)) :
11427       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
11428                   DAG.getIntPtrConstant(EltIdx - 8));
11429     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
11430                        DAG.getIntPtrConstant(i));
11431   }
11432   return NewV;
11433 }
11434
11435 /// \brief Lower a single-input v16i16 shuffle through getPSHUFB.
11436 ///
11437 /// Returns an empty SDValue when the second operand is not UNDEF, or when
11438 /// getPSHUFB cannot encode the mask.
11439 /// FIXME: Only a single pshufb is generated currently.  The other applicable
11440 /// cases from LowerVECTOR_SHUFFLEv8i16 (e.g 2 x pshufb + 1 x por) could be
11441 /// generalized here as well.
11442 static SDValue
11443 LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) {
11444   ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(Op);
11445
11446   // Two-input shuffles are not handled here.
11447   if (Shuffle->getOperand(1).getOpcode() != ISD::UNDEF)
11448     return SDValue();
11449
11450   SDLoc dl(Shuffle);
11451   SmallVector<int, 16> MaskVals(Shuffle->getMask().begin(),
11452                                 Shuffle->getMask().end());
        return getPSHUFB(MaskVals, Shuffle->getOperand(0), dl, DAG);
      }
11453
11454 // v16i8 shuffles - Prefer shuffles in the following order:
11455 // 1. [ssse3] 1 x pshufb
11456 // 2. [ssse3] 2 x pshufb + 1 x por
11457 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
11458 static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
11459                                         const X86Subtarget* Subtarget,
11460                                         SelectionDAG &DAG) {
11461   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11462   SDValue V1 = SVOp->getOperand(0);
11463   SDValue V2 = SVOp->getOperand(1);
11464   SDLoc dl(SVOp);
11465   ArrayRef<int> MaskVals = SVOp->getMask();
11466
11467   // Promote splats to a larger type which usually leads to more efficient code.
11468   // FIXME: Is this true if pshufb is available?
11469   if (SVOp->isSplat())
11470     return PromoteSplat(SVOp, DAG);
11471
11472   // If we have SSSE3, case 1 is generated when all result bytes come from
11473   // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
11474   // present, fall back to case 3.
11475
11476   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
11477   if (Subtarget->hasSSSE3()) {
11478     SmallVector<SDValue,16> pshufbMask;
11479
11480     // If all result elements are from one input vector, then only translate
11481     // undef mask values to 0x80 (zero out result) in the pshufb mask.
11482     //
11483     // Otherwise, we have elements from both input vectors, and must zero out
11484     // elements that come from V2 in the first mask, and V1 in the second mask
11485     // so that we can OR them together.
11486     for (unsigned i = 0; i != 16; ++i) {
11487       int EltIdx = MaskVals[i];
11488       if (EltIdx < 0 || EltIdx >= 16)
11489         EltIdx = 0x80;
11490       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11491     }
11492     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
11493                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11494                                  MVT::v16i8, pshufbMask));
11495
11496     // As PSHUFB will zero elements with negative indices, it's safe to ignore
11497     // the 2nd operand if it's undefined or zero.
11498     if (V2.getOpcode() == ISD::UNDEF ||
11499         ISD::isBuildVectorAllZeros(V2.getNode()))
11500       return V1;
11501
11502     // Calculate the shuffle mask for the second input, shuffle it, and
11503     // OR it with the first shuffled input.
11504     pshufbMask.clear();
11505     for (unsigned i = 0; i != 16; ++i) {
11506       int EltIdx = MaskVals[i];
11507       EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16;
11508       pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
11509     }
11510     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
11511                      DAG.getNode(ISD::BUILD_VECTOR, dl,
11512                                  MVT::v16i8, pshufbMask));
11513     return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
11514   }
11515
11516   // No SSSE3 - Calculate in place words and then fix all out of place words
11517   // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
11518   // the 16 different words that comprise the two doublequadword input vectors.
11519   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
11520   V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
11521   SDValue NewV = V1;
11522   for (int i = 0; i != 8; ++i) {
11523     int Elt0 = MaskVals[i*2];
11524     int Elt1 = MaskVals[i*2+1];
11525
11526     // This word of the result is all undef, skip it.
11527     if (Elt0 < 0 && Elt1 < 0)
11528       continue;
11529
11530     // This word of the result is already in the correct place, skip it.
11531     if ((Elt0 == i*2) && (Elt1 == i*2+1))
11532       continue;
11533
11534     SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
11535     SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
11536     SDValue InsElt;
11537
11538     // If Elt0 and Elt1 are defined, are consecutive, and can be load
11539     // using a single extract together, load it and store it.
11540     if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
11541       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11542                            DAG.getIntPtrConstant(Elt1 / 2));
11543       NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11544                         DAG.getIntPtrConstant(i));
11545       continue;
11546     }
11547
11548     // If Elt1 is defined, extract it from the appropriate source.  If the
11549     // source byte is not also odd, shift the extracted word left 8 bits
11550     // otherwise clear the bottom 8 bits if we need to do an or.
11551     if (Elt1 >= 0) {
11552       InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
11553                            DAG.getIntPtrConstant(Elt1 / 2));
11554       if ((Elt1 & 1) == 0)
11555         InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
11556                              DAG.getConstant(8,
11557                                   TLI.getShiftAmountTy(InsElt.getValueType())));
11558       else if (Elt0 >= 0)
11559         InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
11560                              DAG.getConstant(0xFF00, MVT::i16));
11561     }
11562     // If Elt0 is defined, extract it from the appropriate source.  If the
11563     // source byte is not also even, shift the extracted word right 8 bits. If
11564     // Elt1 was also defined, OR the extracted values together before
11565     // inserting them in the result.
11566     if (Elt0 >= 0) {
11567       SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
11568                                     Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
11569       if ((Elt0 & 1) != 0)
11570         InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
11571                               DAG.getConstant(8,
11572                                  TLI.getShiftAmountTy(InsElt0.getValueType())));
11573       else if (Elt1 >= 0)
11574         InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
11575                              DAG.getConstant(0x00FF, MVT::i16));
11576       InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
11577                          : InsElt0;
11578     }
11579     NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
11580                        DAG.getIntPtrConstant(i));
11581   }
11582   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
11583 }
11584
11585 // v32i8 shuffles - Translate to VPSHUFB if possible.
11586 static
11587 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
11588                                  const X86Subtarget *Subtarget,
11589                                  SelectionDAG &DAG) {
11590   MVT VT = SVOp->getSimpleValueType(0);
11591   SDValue V1 = SVOp->getOperand(0);
11592   SDValue V2 = SVOp->getOperand(1);
11593   SDLoc dl(SVOp);
11594   SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
11595
11596   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
11597   bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
11598   bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
11599
11600   // VPSHUFB may be generated if
11601   // (1) one of input vector is undefined or zeroinitializer.
11602   // The mask value 0x80 puts 0 in the corresponding slot of the vector.
11603   // And (2) the mask indexes don't cross the 128-bit lane.
11604   if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
11605       (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
11606     return SDValue();
11607
11608   if (V1IsAllZero && !V2IsAllZero) {
11609     CommuteVectorShuffleMask(MaskVals, 32);
11610     V1 = V2;
11611   }
11612   return getPSHUFB(MaskVals, V1, dl, DAG);
11613 }
11614
11615 /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
11616 /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
11617 /// done when every pair / quad of shuffle mask elements point to elements in
11618 /// the right sequence. e.g.
11619 /// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
11620 static
11621 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
11622                                  SelectionDAG &DAG) {
11623   MVT VT = SVOp->getSimpleValueType(0);
11624   SDLoc dl(SVOp);
11625   unsigned NumElems = VT.getVectorNumElements();
11626   MVT NewVT;
11627   unsigned Scale;
11628   switch (VT.SimpleTy) {
11629   default: llvm_unreachable("Unexpected!");
11630   case MVT::v2i64:
11631   case MVT::v2f64:
11632            return SDValue(SVOp, 0);
11633   case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
11634   case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
11635   case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
11636   case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
11637   case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
11638   case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
11639   }
11640
11641   SmallVector<int, 8> MaskVec;
11642   for (unsigned i = 0; i != NumElems; i += Scale) {
11643     int StartIdx = -1;
11644     for (unsigned j = 0; j != Scale; ++j) {
11645       int EltIdx = SVOp->getMaskElt(i+j);
11646       if (EltIdx < 0)
11647         continue;
11648       if (StartIdx < 0)
11649         StartIdx = (EltIdx / Scale);
11650       if (EltIdx != (int)(StartIdx*Scale + j))
11651         return SDValue();
11652     }
11653     MaskVec.push_back(StartIdx);
11654   }
11655
11656   SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
11657   SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
11658   return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
11659 }
11660
11661 /// getVZextMovL - Return a zero-extending vector move low node.
11662 ///
11663 static SDValue getVZextMovL(MVT VT, MVT OpVT,
11664                             SDValue SrcOp, SelectionDAG &DAG,
11665                             const X86Subtarget *Subtarget, SDLoc dl) {
11666   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
11667     LoadSDNode *LD = nullptr;
11668     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
11669       LD = dyn_cast<LoadSDNode>(SrcOp);
11670     if (!LD) {
11671       // movssrr and movsdrr do not clear top bits. Try to use movd, movq
11672       // instead.
11673       MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
11674       if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
11675           SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
11676           SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
11677           SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
11678         // PR2108
11679         OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
11680         return DAG.getNode(ISD::BITCAST, dl, VT,
11681                            DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11682                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11683                                                    OpVT,
11684                                                    SrcOp.getOperand(0)
11685                                                           .getOperand(0))));
11686       }
11687     }
11688   }
11689
11690   return DAG.getNode(ISD::BITCAST, dl, VT,
11691                      DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
11692                                  DAG.getNode(ISD::BITCAST, dl,
11693                                              OpVT, SrcOp)));
11694 }
11695
11696 /// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
11697 /// which could not be matched by any known target specific shuffle.
      ///
      /// Compact8x32ShuffleNode is tried first.  Otherwise each half of the result
      /// is built on its own, either as a shuffle of at most two input halves or,
      /// when more than two input halves are referenced, as a BUILD_VECTOR of
      /// individually extracted elements.  The two halves are then joined with
      /// CONCAT_VECTORS.
11698 static SDValue
11699 LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
11700
11701   SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
11702   if (NewOp.getNode())
11703     return NewOp;
11704
11705   MVT VT = SVOp->getSimpleValueType(0);
11706
11707   unsigned NumElems = VT.getVectorNumElements();
11708   unsigned NumLaneElems = NumElems / 2;
11709
11710   SDLoc dl(SVOp);
11711   MVT EltVT = VT.getVectorElementType();
        // NVT is the half-width vector type used for each output lane.
11712   MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
11713   SDValue Output[2];
11714
        // Mask is reused for both lanes; it is cleared at the end of each pass.
11715   SmallVector<int, 16> Mask;
11716   for (unsigned l = 0; l < 2; ++l) {
11717     // Build a shuffle mask for the output, discovering on the fly which
11718     // input vectors to use as shuffle operands (recorded in InputUsed).
11719     // If building a suitable shuffle vector proves too hard, then bail
11720     // out with UseBuildVector set.
11721     bool UseBuildVector = false;
11722     int InputUsed[2] = { -1, -1 }; // Not yet discovered.
11723     unsigned LaneStart = l * NumLaneElems;
11724     for (unsigned i = 0; i != NumLaneElems; ++i) {
11725       // The mask element.  This indexes into the input.
11726       int Idx = SVOp->getMaskElt(i+LaneStart);
11727       if (Idx < 0) {
11728         // the mask element does not index into any input vector.
11729         Mask.push_back(-1);
11730         continue;
11731       }
11732
11733       // The input vector this mask element indexes into.
            // Inputs are counted in half-vectors: Input / 2 is the shuffle
            // operand and Input % 2 selects its low or high half (see the
            // Extract128BitVector calls below).
11734       int Input = Idx / NumLaneElems;
11735
11736       // Turn the index into an offset from the start of the input vector.
11737       Idx -= Input * NumLaneElems;
11738
11739       // Find or create a shuffle vector operand to hold this input.
11740       unsigned OpNo;
11741       for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
11742         if (InputUsed[OpNo] == Input)
11743           // This input vector is already an operand.
11744           break;
11745         if (InputUsed[OpNo] < 0) {
11746           // Create a new operand for this input vector.
11747           InputUsed[OpNo] = Input;
11748           break;
11749         }
11750       }
11751
11752       if (OpNo >= array_lengthof(InputUsed)) {
11753         // More than two input vectors used!  Give up on trying to create a
11754         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
11755         UseBuildVector = true;
11756         break;
11757       }
11758
11759       // Add the mask index for the new shuffle vector.
11760       Mask.push_back(Idx + OpNo * NumLaneElems);
11761     }
11762
11763     if (UseBuildVector) {
11764       SmallVector<SDValue, 16> SVOps;
11765       for (unsigned i = 0; i != NumLaneElems; ++i) {
11766         // The mask element.  This indexes into the input.
11767         int Idx = SVOp->getMaskElt(i+LaneStart);
11768         if (Idx < 0) {
11769           SVOps.push_back(DAG.getUNDEF(EltVT));
11770           continue;
11771         }
11772
11773         // The input vector this mask element indexes into.
              // Here Input is the whole shuffle operand (index divided by
              // NumElems, not NumLaneElems) because the element is extracted
              // straight from the full-width operand.
11774         int Input = Idx / NumElems;
11775
11776         // Turn the index into an offset from the start of the input vector.
11777         Idx -= Input * NumElems;
11778
11779         // Extract the vector element by hand.
11780         SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
11781                                     SVOp->getOperand(Input),
11782                                     DAG.getIntPtrConstant(Idx)));
11783       }
11784
11785       // Construct the output using a BUILD_VECTOR.
11786       Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
11787     } else if (InputUsed[0] < 0) {
11788       // No input vectors were used! The result is undefined.
11789       Output[l] = DAG.getUNDEF(NVT);
11790     } else {
            // InputUsed[k] / 2 picks the operand, InputUsed[k] % 2 its half.
11791       SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
11792                                         (InputUsed[0] % 2) * NumLaneElems,
11793                                         DAG, dl);
11794       // If only one input was used, use an undefined vector for the other.
11795       SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
11796         Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
11797                             (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
11798       // At least one input vector was used. Create a new shuffle vector.
11799       Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
11800     }
11801
11802     Mask.clear();
11803   }
11804
11805   // Concatenate the result back
11806   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
11807 }
11808
11809 /// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
11810 /// 4 elements, and match them with several different shuffle types.
11811 static SDValue
11812 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
11813   SDValue V1 = SVOp->getOperand(0);
11814   SDValue V2 = SVOp->getOperand(1);
11815   SDLoc dl(SVOp);
11816   MVT VT = SVOp->getSimpleValueType(0);
11817
11818   assert(VT.is128BitVector() && "Unsupported vector size");
11819
11820   std::pair<int, int> Locs[4];
11821   int Mask1[] = { -1, -1, -1, -1 };
11822   SmallVector<int, 8> PermMask(SVOp->getMask().begin(), SVOp->getMask().end());
11823
11824   unsigned NumHi = 0;
11825   unsigned NumLo = 0;
11826   for (unsigned i = 0; i != 4; ++i) {
11827     int Idx = PermMask[i];
11828     if (Idx < 0) {
11829       Locs[i] = std::make_pair(-1, -1);
11830     } else {
11831       assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
11832       if (Idx < 4) {
11833         Locs[i] = std::make_pair(0, NumLo);
11834         Mask1[NumLo] = Idx;
11835         NumLo++;
11836       } else {
11837         Locs[i] = std::make_pair(1, NumHi);
11838         if (2+NumHi < 4)
11839           Mask1[2+NumHi] = Idx;
11840         NumHi++;
11841       }
11842     }
11843   }
11844
11845   if (NumLo <= 2 && NumHi <= 2) {
11846     // If no more than two elements come from either vector. This can be
11847     // implemented with two shuffles. First shuffle gather the elements.
11848     // The second shuffle, which takes the first shuffle as both of its
11849     // vector operands, put the elements into the right order.
11850     V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
11851
11852     int Mask2[] = { -1, -1, -1, -1 };
11853
11854     for (unsigned i = 0; i != 4; ++i)
11855       if (Locs[i].first != -1) {
11856         unsigned Idx = (i < 2) ? 0 : 4;
11857         Idx += Locs[i].first * 2 + Locs[i].second;
11858         Mask2[i] = Idx;
11859       }
11860
11861     return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
11862   }
11863
11864   if (NumLo == 3 || NumHi == 3) {
11865     // Otherwise, we must have three elements from one vector, call it X, and
11866     // one element from the other, call it Y.  First, use a shufps to build an
11867     // intermediate vector with the one element from Y and the element from X
11868     // that will be in the same half in the final destination (the indexes don't
11869     // matter). Then, use a shufps to build the final vector, taking the half
11870     // containing the element from Y from the intermediate, and the other half
11871     // from X.
11872     if (NumHi == 3) {
11873       // Normalize it so the 3 elements come from V1.
11874       CommuteVectorShuffleMask(PermMask, 4);
11875       std::swap(V1, V2);
11876     }
11877
11878     // Find the element from V2.
11879     unsigned HiIndex;
11880     for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
11881       int Val = PermMask[HiIndex];
11882       if (Val < 0)
11883         continue;
11884       if (Val >= 4)
11885         break;
11886     }
11887
11888     Mask1[0] = PermMask[HiIndex];
11889     Mask1[1] = -1;
11890     Mask1[2] = PermMask[HiIndex^1];
11891     Mask1[3] = -1;
11892     V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
11893
11894     if (HiIndex >= 2) {
11895       Mask1[0] = PermMask[0];
11896       Mask1[1] = PermMask[1];
11897       Mask1[2] = HiIndex & 1 ? 6 : 4;
11898       Mask1[3] = HiIndex & 1 ? 4 : 6;
11899       return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
11900     }
11901
11902     Mask1[0] = HiIndex & 1 ? 2 : 0;
11903     Mask1[1] = HiIndex & 1 ? 0 : 2;
11904     Mask1[2] = PermMask[2];
11905     Mask1[3] = PermMask[3];
11906     if (Mask1[2] >= 0)
11907       Mask1[2] += 4;
11908     if (Mask1[3] >= 0)
11909       Mask1[3] += 4;
11910     return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
11911   }
11912
11913   // Break it into (shuffle shuffle_hi, shuffle_lo).
11914   int LoMask[] = { -1, -1, -1, -1 };
11915   int HiMask[] = { -1, -1, -1, -1 };
11916
11917   int *MaskPtr = LoMask;
11918   unsigned MaskIdx = 0;
11919   unsigned LoIdx = 0;
11920   unsigned HiIdx = 2;
11921   for (unsigned i = 0; i != 4; ++i) {
11922     if (i == 2) {
11923       MaskPtr = HiMask;
11924       MaskIdx = 1;
11925       LoIdx = 0;
11926       HiIdx = 2;
11927     }
11928     int Idx = PermMask[i];
11929     if (Idx < 0) {
11930       Locs[i] = std::make_pair(-1, -1);
11931     } else if (Idx < 4) {
11932       Locs[i] = std::make_pair(MaskIdx, LoIdx);
11933       MaskPtr[LoIdx] = Idx;
11934       LoIdx++;
11935     } else {
11936       Locs[i] = std::make_pair(MaskIdx, HiIdx);
11937       MaskPtr[HiIdx] = Idx;
11938       HiIdx++;
11939     }
11940   }
11941
11942   SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
11943   SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
11944   int MaskOps[] = { -1, -1, -1, -1 };
11945   for (unsigned i = 0; i != 4; ++i)
11946     if (Locs[i].first != -1)
11947       MaskOps[i] = Locs[i].first * 4 + Locs[i].second;
11948   return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
11949 }
11950
11951 static bool MayFoldVectorLoad(SDValue V) {
11952   while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
11953     V = V.getOperand(0);
11954
11955   if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
11956     V = V.getOperand(0);
11957   if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
11958       V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
11959     // BUILD_VECTOR (load), undef
11960     V = V.getOperand(0);
11961
11962   return MayFoldLoad(V);
11963 }
11964
11965 static
11966 SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
11967   MVT VT = Op.getSimpleValueType();
11968
11969   // Canonizalize to v2f64.
11970   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
11971   return DAG.getNode(ISD::BITCAST, dl, VT,
11972                      getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
11973                                           V1, DAG));
11974 }
11975
11976 static
11977 SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
11978                         bool HasSSE2) {
11979   SDValue V1 = Op.getOperand(0);
11980   SDValue V2 = Op.getOperand(1);
11981   MVT VT = Op.getSimpleValueType();
11982
11983   assert(VT != MVT::v2i64 && "unsupported shuffle type");
11984
11985   if (HasSSE2 && VT == MVT::v2f64)
11986     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
11987
11988   // v4f32 or v4i32: canonizalized to v4f32 (which is legal for SSE1)
11989   return DAG.getNode(ISD::BITCAST, dl, VT,
11990                      getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
11991                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
11992                            DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
11993 }
11994
11995 static
11996 SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
11997   SDValue V1 = Op.getOperand(0);
11998   SDValue V2 = Op.getOperand(1);
11999   MVT VT = Op.getSimpleValueType();
12000
12001   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
12002          "unsupported shuffle type");
12003
12004   if (V2.getOpcode() == ISD::UNDEF)
12005     V2 = V1;
12006
12007   // v4i32 or v4f32
12008   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
12009 }
12010
12011 static
12012 SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
12013   SDValue V1 = Op.getOperand(0);
12014   SDValue V2 = Op.getOperand(1);
12015   MVT VT = Op.getSimpleValueType();
12016   unsigned NumElems = VT.getVectorNumElements();
12017
12018   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
12019   // operand of these instructions is only memory, so check if there's a
12020   // potencial load folding here, otherwise use SHUFPS or MOVSD to match the
12021   // same masks.
12022   bool CanFoldLoad = false;
12023
12024   // Trivial case, when V2 comes from a load.
12025   if (MayFoldVectorLoad(V2))
12026     CanFoldLoad = true;
12027
12028   // When V1 is a load, it can be folded later into a store in isel, example:
12029   //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
12030   //    turns into:
12031   //  (MOVLPSmr addr:$src1, VR128:$src2)
12032   // So, recognize this potential and also use MOVLPS or MOVLPD
12033   else if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
12034     CanFoldLoad = true;
12035
12036   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12037   if (CanFoldLoad) {
12038     if (HasSSE2 && NumElems == 2)
12039       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
12040
12041     if (NumElems == 4)
12042       // If we don't care about the second element, proceed to use movss.
12043       if (SVOp->getMaskElt(1) != -1)
12044         return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
12045   }
12046
12047   // movl and movlp will both match v2i64, but v2i64 is never matched by
12048   // movl earlier because we make it strict to avoid messing with the movlp load
12049   // folding logic (see the code above getMOVLP call). Match it here then,
12050   // this is horrible, but will stay like this until we move all shuffle
12051   // matching to x86 specific nodes. Note that for the 1st condition all
12052   // types are matched with movsd.
12053   if (HasSSE2) {
12054     // FIXME: isMOVLMask should be checked and matched before getMOVLP,
12055     // as to remove this logic from here, as much as possible
12056     if (NumElems == 2 || !isMOVLMask(SVOp->getMask(), VT))
12057       return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12058     return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12059   }
12060
12061   assert(VT != MVT::v4i32 && "unsupported shuffle type");
12062
12063   // Invert the operand order and use SHUFPS to match it.
12064   return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V2, V1,
12065                               getShuffleSHUFImmediate(SVOp), DAG);
12066 }
12067
12068 static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
12069                                          SelectionDAG &DAG) {
12070   SDLoc dl(Load);
12071   MVT VT = Load->getSimpleValueType(0);
12072   MVT EVT = VT.getVectorElementType();
12073   SDValue Addr = Load->getOperand(1);
12074   SDValue NewAddr = DAG.getNode(
12075       ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
12076       DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
12077
12078   SDValue NewLoad =
12079       DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
12080                   DAG.getMachineFunction().getMachineMemOperand(
12081                       Load->getMemOperand(), 0, EVT.getStoreSize()));
12082   return NewLoad;
12083 }
12084
12085 // It is only safe to call this function if isINSERTPSMask is true for
12086 // this shufflevector mask.
12087 static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
12088                            SelectionDAG &DAG) {
12089   // Generate an insertps instruction when inserting an f32 from memory onto a
12090   // v4f32 or when copying a member from one v4f32 to another.
12091   // We also use it for transferring i32 from one register to another,
12092   // since it simply copies the same bits.
12093   // If we're transferring an i32 from memory to a specific element in a
12094   // register, we output a generic DAG that will match the PINSRD
12095   // instruction.
12096   MVT VT = SVOp->getSimpleValueType(0);
12097   MVT EVT = VT.getVectorElementType();
12098   SDValue V1 = SVOp->getOperand(0);
12099   SDValue V2 = SVOp->getOperand(1);
12100   auto Mask = SVOp->getMask();
12101   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
12102          "unsupported vector type for insertps/pinsrd");
12103
12104   auto FromV1Predicate = [](const int &i) { return i < 4 && i > -1; };
12105   auto FromV2Predicate = [](const int &i) { return i >= 4; };
12106   int FromV1 = std::count_if(Mask.begin(), Mask.end(), FromV1Predicate);
12107
12108   SDValue From;
12109   SDValue To;
12110   unsigned DestIndex;
12111   if (FromV1 == 1) {
12112     From = V1;
12113     To = V2;
12114     DestIndex = std::find_if(Mask.begin(), Mask.end(), FromV1Predicate) -
12115                 Mask.begin();
12116
12117     // If we have 1 element from each vector, we have to check if we're
12118     // changing V1's element's place. If so, we're done. Otherwise, we
12119     // should assume we're changing V2's element's place and behave
12120     // accordingly.
12121     int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate);
12122     assert(DestIndex <= INT32_MAX && "truncated destination index");
12123     if (FromV1 == FromV2 &&
12124         static_cast<int>(DestIndex) == Mask[DestIndex] % 4) {
12125       From = V2;
12126       To = V1;
12127       DestIndex =
12128           std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12129     }
12130   } else {
12131     assert(std::count_if(Mask.begin(), Mask.end(), FromV2Predicate) == 1 &&
12132            "More than one element from V1 and from V2, or no elements from one "
12133            "of the vectors. This case should not have returned true from "
12134            "isINSERTPSMask");
12135     From = V2;
12136     To = V1;
12137     DestIndex =
12138         std::find_if(Mask.begin(), Mask.end(), FromV2Predicate) - Mask.begin();
12139   }
12140
12141   // Get an index into the source vector in the range [0,4) (the mask is
12142   // in the range [0,8) because it can address V1 and V2)
12143   unsigned SrcIndex = Mask[DestIndex] % 4;
12144   if (MayFoldLoad(From)) {
12145     // Trivial case, when From comes from a load and is only used by the
12146     // shuffle. Make it use insertps from the vector that we need from that
12147     // load.
12148     SDValue NewLoad =
12149         NarrowVectorLoadToElement(cast<LoadSDNode>(From), SrcIndex, DAG);
12150     if (!NewLoad.getNode())
12151       return SDValue();
12152
12153     if (EVT == MVT::f32) {
12154       // Create this as a scalar to vector to match the instruction pattern.
12155       SDValue LoadScalarToVector =
12156           DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
12157       SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
12158       return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
12159                          InsertpsMask);
12160     } else { // EVT == MVT::i32
12161       // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
12162       // instruction, to match the PINSRD instruction, which loads an i32 to a
12163       // certain vector element.
12164       return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
12165                          DAG.getConstant(DestIndex, MVT::i32));
12166     }
12167   }
12168
12169   // Vector-element-to-vector
12170   SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
12171   return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
12172 }
12173
12174 // Reduce a vector shuffle to zext.
12175 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
12176                                     SelectionDAG &DAG) {
12177   // PMOVZX is only available from SSE41.
12178   if (!Subtarget->hasSSE41())
12179     return SDValue();
12180
12181   MVT VT = Op.getSimpleValueType();
12182
12183   // Only AVX2 support 256-bit vector integer extending.
12184   if (!Subtarget->hasInt256() && VT.is256BitVector())
12185     return SDValue();
12186
12187   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12188   SDLoc DL(Op);
12189   SDValue V1 = Op.getOperand(0);
12190   SDValue V2 = Op.getOperand(1);
12191   unsigned NumElems = VT.getVectorNumElements();
12192
12193   // Extending is an unary operation and the element type of the source vector
12194   // won't be equal to or larger than i64.
12195   if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
12196       VT.getVectorElementType() == MVT::i64)
12197     return SDValue();
12198
12199   // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
12200   unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
12201   while ((1U << Shift) < NumElems) {
12202     if (SVOp->getMaskElt(1U << Shift) == 1)
12203       break;
12204     Shift += 1;
12205     // The maximal ratio is 8, i.e. from i8 to i64.
12206     if (Shift > 3)
12207       return SDValue();
12208   }
12209
12210   // Check the shuffle mask.
12211   unsigned Mask = (1U << Shift) - 1;
12212   for (unsigned i = 0; i != NumElems; ++i) {
12213     int EltIdx = SVOp->getMaskElt(i);
12214     if ((i & Mask) != 0 && EltIdx != -1)
12215       return SDValue();
12216     if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
12217       return SDValue();
12218   }
12219
12220   unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
12221   MVT NeVT = MVT::getIntegerVT(NBits);
12222   MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
12223
12224   if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
12225     return SDValue();
12226
12227   return DAG.getNode(ISD::BITCAST, DL, VT,
12228                      DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
12229 }
12230
12231 static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
12232                                       SelectionDAG &DAG) {
12233   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12234   MVT VT = Op.getSimpleValueType();
12235   SDLoc dl(Op);
12236   SDValue V1 = Op.getOperand(0);
12237   SDValue V2 = Op.getOperand(1);
12238
12239   if (isZeroShuffle(SVOp))
12240     return getZeroVector(VT, Subtarget, DAG, dl);
12241
12242   // Handle splat operations
12243   if (SVOp->isSplat()) {
12244     // Use vbroadcast whenever the splat comes from a foldable load
12245     SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
12246     if (Broadcast.getNode())
12247       return Broadcast;
12248   }
12249
12250   // Check integer expanding shuffles.
12251   SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
12252   if (NewOp.getNode())
12253     return NewOp;
12254
12255   // If the shuffle can be profitably rewritten as a narrower shuffle, then
12256   // do it!
12257   if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
12258       VT == MVT::v32i8) {
12259     SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12260     if (NewOp.getNode())
12261       return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
12262   } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
12263     // FIXME: Figure out a cleaner way to do this.
12264     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
12265       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12266       if (NewOp.getNode()) {
12267         MVT NewVT = NewOp.getSimpleValueType();
12268         if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
12269                                NewVT, true, false))
12270           return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
12271                               dl);
12272       }
12273     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
12274       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
12275       if (NewOp.getNode()) {
12276         MVT NewVT = NewOp.getSimpleValueType();
12277         if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
12278           return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
12279                               dl);
12280       }
12281     }
12282   }
12283   return SDValue();
12284 }
12285
12286 SDValue
12287 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
12288   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12289   SDValue V1 = Op.getOperand(0);
12290   SDValue V2 = Op.getOperand(1);
12291   MVT VT = Op.getSimpleValueType();
12292   SDLoc dl(Op);
12293   unsigned NumElems = VT.getVectorNumElements();
12294   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
12295   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
12296   bool V1IsSplat = false;
12297   bool V2IsSplat = false;
12298   bool HasSSE2 = Subtarget->hasSSE2();
12299   bool HasFp256    = Subtarget->hasFp256();
12300   bool HasInt256   = Subtarget->hasInt256();
12301   MachineFunction &MF = DAG.getMachineFunction();
12302   bool OptForSize = MF.getFunction()->getAttributes().
12303     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
12304
12305   // Check if we should use the experimental vector shuffle lowering. If so,
12306   // delegate completely to that code path.
12307   if (ExperimentalVectorShuffleLowering)
12308     return lowerVectorShuffle(Op, Subtarget, DAG);
12309
12310   assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
12311
12312   if (V1IsUndef && V2IsUndef)
12313     return DAG.getUNDEF(VT);
12314
12315   // When we create a shuffle node we put the UNDEF node to second operand,
12316   // but in some cases the first operand may be transformed to UNDEF.
12317   // In this case we should just commute the node.
12318   if (V1IsUndef)
12319     return DAG.getCommutedVectorShuffle(*SVOp);
12320
12321   // Vector shuffle lowering takes 3 steps:
12322   //
12323   // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
12324   //    narrowing and commutation of operands should be handled.
12325   // 2) Matching of shuffles with known shuffle masks to x86 target specific
12326   //    shuffle nodes.
12327   // 3) Rewriting of unmatched masks into new generic shuffle operations,
12328   //    so the shuffle can be broken into other shuffles and the legalizer can
12329   //    try the lowering again.
12330   //
12331   // The general idea is that no vector_shuffle operation should be left to
12332   // be matched during isel, all of them must be converted to a target specific
12333   // node here.
12334
12335   // Normalize the input vectors. Here splats, zeroed vectors, profitable
12336   // narrowing and commutation of operands should be handled. The actual code
12337   // doesn't include all of those, work in progress...
12338   SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
12339   if (NewOp.getNode())
12340     return NewOp;
12341
12342   SmallVector<int, 8> M(SVOp->getMask().begin(), SVOp->getMask().end());
12343
12344   // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
12345   // unpckh_undef). Only use pshufd if speed is more important than size.
12346   if (OptForSize && isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12347     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12348   if (OptForSize && isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12349     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12350
12351   if (isMOVDDUPMask(M, VT) && Subtarget->hasSSE3() &&
12352       V2IsUndef && MayFoldVectorLoad(V1))
12353     return getMOVDDup(Op, dl, V1, DAG);
12354
12355   if (isMOVHLPS_v_undef_Mask(M, VT))
12356     return getMOVHighToLow(Op, dl, DAG);
12357
12358   // Use to match splats
12359   if (HasSSE2 && isUNPCKHMask(M, VT, HasInt256) && V2IsUndef &&
12360       (VT == MVT::v2f64 || VT == MVT::v2i64))
12361     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12362
12363   if (isPSHUFDMask(M, VT)) {
12364     // The actual implementation will match the mask in the if above and then
12365     // during isel it can match several different instructions, not only pshufd
12366     // as its name says, sad but true, emulate the behavior for now...
12367     if (isMOVDDUPMask(M, VT) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
12368       return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);
12369
12370     unsigned TargetMask = getShuffleSHUFImmediate(SVOp);
12371
12372     if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
12373       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
12374
12375     if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64))
12376       return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask,
12377                                   DAG);
12378
12379     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1,
12380                                 TargetMask, DAG);
12381   }
12382
12383   if (isPALIGNRMask(M, VT, Subtarget))
12384     return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
12385                                 getShufflePALIGNRImmediate(SVOp),
12386                                 DAG);
12387
12388   if (isVALIGNMask(M, VT, Subtarget))
12389     return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2,
12390                                 getShuffleVALIGNImmediate(SVOp),
12391                                 DAG);
12392
12393   // Check if this can be converted into a logical shift.
12394   bool isLeft = false;
12395   unsigned ShAmt = 0;
12396   SDValue ShVal;
12397   bool isShift = HasSSE2 && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
12398   if (isShift && ShVal.hasOneUse()) {
12399     // If the shifted value has multiple uses, it may be cheaper to use
12400     // v_set0 + movlhps or movhlps, etc.
12401     MVT EltVT = VT.getVectorElementType();
12402     ShAmt *= EltVT.getSizeInBits();
12403     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12404   }
12405
12406   if (isMOVLMask(M, VT)) {
12407     if (ISD::isBuildVectorAllZeros(V1.getNode()))
12408       return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
12409     if (!isMOVLPMask(M, VT)) {
12410       if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
12411         return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
12412
12413       if (VT == MVT::v4i32 || VT == MVT::v4f32)
12414         return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
12415     }
12416   }
12417
12418   // FIXME: fold these into legal mask.
12419   if (isMOVLHPSMask(M, VT) && !isUNPCKLMask(M, VT, HasInt256))
12420     return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
12421
12422   if (isMOVHLPSMask(M, VT))
12423     return getMOVHighToLow(Op, dl, DAG);
12424
12425   if (V2IsUndef && isMOVSHDUPMask(M, VT, Subtarget))
12426     return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
12427
12428   if (V2IsUndef && isMOVSLDUPMask(M, VT, Subtarget))
12429     return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
12430
12431   if (isMOVLPMask(M, VT))
12432     return getMOVLP(Op, dl, DAG, HasSSE2);
12433
12434   if (ShouldXformToMOVHLPS(M, VT) ||
12435       ShouldXformToMOVLP(V1.getNode(), V2.getNode(), M, VT))
12436     return DAG.getCommutedVectorShuffle(*SVOp);
12437
12438   if (isShift) {
12439     // No better options. Use a vshldq / vsrldq.
12440     MVT EltVT = VT.getVectorElementType();
12441     ShAmt *= EltVT.getSizeInBits();
12442     return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
12443   }
12444
12445   bool Commuted = false;
12446   // FIXME: This should also accept a bitcast of a splat?  Be careful, not
12447   // 1,1,1,1 -> v8i16 though.
12448   BitVector UndefElements;
12449   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V1.getNode()))
12450     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12451       V1IsSplat = true;
12452   if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V2.getNode()))
12453     if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none())
12454       V2IsSplat = true;
12455
12456   // Canonicalize the splat or undef, if present, to be on the RHS.
12457   if (!V2IsUndef && V1IsSplat && !V2IsSplat) {
12458     CommuteVectorShuffleMask(M, NumElems);
12459     std::swap(V1, V2);
12460     std::swap(V1IsSplat, V2IsSplat);
12461     Commuted = true;
12462   }
12463
12464   if (isCommutedMOVLMask(M, VT, V2IsSplat, V2IsUndef)) {
12465     // Shuffling low element of v1 into undef, just return v1.
12466     if (V2IsUndef)
12467       return V1;
12468     // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
12469     // the instruction selector will not match, so get a canonical MOVL with
12470     // swapped operands to undo the commute.
12471     return getMOVL(DAG, dl, VT, V2, V1);
12472   }
12473
12474   if (isUNPCKLMask(M, VT, HasInt256))
12475     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12476
12477   if (isUNPCKHMask(M, VT, HasInt256))
12478     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12479
12480   if (V2IsSplat) {
12481     // Normalize mask so all entries that point to V2 points to its first
12482     // element then try to match unpck{h|l} again. If match, return a
12483     // new vector_shuffle with the corrected mask.p
12484     SmallVector<int, 8> NewMask(M.begin(), M.end());
12485     NormalizeMask(NewMask, NumElems);
12486     if (isUNPCKLMask(NewMask, VT, HasInt256, true))
12487       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12488     if (isUNPCKHMask(NewMask, VT, HasInt256, true))
12489       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12490   }
12491
12492   if (Commuted) {
12493     // Commute is back and try unpck* again.
12494     // FIXME: this seems wrong.
12495     CommuteVectorShuffleMask(M, NumElems);
12496     std::swap(V1, V2);
12497     std::swap(V1IsSplat, V2IsSplat);
12498
12499     if (isUNPCKLMask(M, VT, HasInt256))
12500       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V2, DAG);
12501
12502     if (isUNPCKHMask(M, VT, HasInt256))
12503       return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V2, DAG);
12504   }
12505
12506   // Normalize the node to match x86 shuffle ops if needed
12507   if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
12508     return DAG.getCommutedVectorShuffle(*SVOp);
12509
12510   // The checks below are all present in isShuffleMaskLegal, but they are
12511   // inlined here right now to enable us to directly emit target specific
12512   // nodes, and remove one by one until they don't return Op anymore.
12513
12514   if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
12515       SVOp->getSplatIndex() == 0 && V2IsUndef) {
12516     if (VT == MVT::v2f64 || VT == MVT::v2i64)
12517       return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12518   }
12519
12520   if (isPSHUFHWMask(M, VT, HasInt256))
12521     return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
12522                                 getShufflePSHUFHWImmediate(SVOp),
12523                                 DAG);
12524
12525   if (isPSHUFLWMask(M, VT, HasInt256))
12526     return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
12527                                 getShufflePSHUFLWImmediate(SVOp),
12528                                 DAG);
12529
12530   unsigned MaskValue;
12531   if (isBlendMask(M, VT, Subtarget->hasSSE41(), Subtarget->hasInt256(),
12532                   &MaskValue))
12533     return LowerVECTOR_SHUFFLEtoBlend(SVOp, MaskValue, Subtarget, DAG);
12534
12535   if (isSHUFPMask(M, VT))
12536     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
12537                                 getShuffleSHUFImmediate(SVOp), DAG);
12538
12539   if (isUNPCKL_v_undef_Mask(M, VT, HasInt256))
12540     return getTargetShuffleNode(X86ISD::UNPCKL, dl, VT, V1, V1, DAG);
12541   if (isUNPCKH_v_undef_Mask(M, VT, HasInt256))
12542     return getTargetShuffleNode(X86ISD::UNPCKH, dl, VT, V1, V1, DAG);
12543
12544   //===--------------------------------------------------------------------===//
12545   // Generate target specific nodes for 128 or 256-bit shuffles only
12546   // supported in the AVX instruction set.
12547   //
12548
12549   // Handle VMOVDDUPY permutations
12550   if (V2IsUndef && isMOVDDUPYMask(M, VT, HasFp256))
12551     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
12552
12553   // Handle VPERMILPS/D* permutations
12554   if (isVPERMILPMask(M, VT)) {
12555     if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
12556       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
12557                                   getShuffleSHUFImmediate(SVOp), DAG);
12558     return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1,
12559                                 getShuffleSHUFImmediate(SVOp), DAG);
12560   }
12561
12562   unsigned Idx;
12563   if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
12564     return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
12565                               Idx*(NumElems/2), DAG, dl);
12566
12567   // Handle VPERM2F128/VPERM2I128 permutations
12568   if (isVPERM2X128Mask(M, VT, HasFp256))
12569     return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
12570                                 V2, getShuffleVPERM2X128Immediate(SVOp), DAG);
12571
12572   if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
12573     return getINSERTPS(SVOp, dl, DAG);
12574
12575   unsigned Imm8;
12576   if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
12577     return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
12578
12579   if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
12580       VT.is512BitVector()) {
12581     MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
12582     MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
12583     SmallVector<SDValue, 16> permclMask;
12584     for (unsigned i = 0; i != NumElems; ++i) {
12585       permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
12586     }
12587
12588     SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
12589     if (V2IsUndef)
12590       // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
12591       return DAG.getNode(X86ISD::VPERMV, dl, VT,
12592                           DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
12593     return DAG.getNode(X86ISD::VPERMV3, dl, VT, V1,
12594                        DAG.getNode(ISD::BITCAST, dl, VT, Mask), V2);
12595   }
12596
12597   //===--------------------------------------------------------------------===//
12598   // Since no target specific shuffle was selected for this generic one,
12599   // lower it into other known shuffles. FIXME: this isn't true yet, but
12600   // this is the plan.
12601   //
12602
12603   // Handle v8i16 specifically since SSE can do byte extraction and insertion.
12604   if (VT == MVT::v8i16) {
12605     SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
12606     if (NewOp.getNode())
12607       return NewOp;
12608   }
12609
12610   if (VT == MVT::v16i16 && Subtarget->hasInt256()) {
12611     SDValue NewOp = LowerVECTOR_SHUFFLEv16i16(Op, DAG);
12612     if (NewOp.getNode())
12613       return NewOp;
12614   }
12615
12616   if (VT == MVT::v16i8) {
12617     SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
12618     if (NewOp.getNode())
12619       return NewOp;
12620   }
12621
12622   if (VT == MVT::v32i8) {
12623     SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
12624     if (NewOp.getNode())
12625       return NewOp;
12626   }
12627
12628   // Handle all 128-bit wide vectors with 4 elements, and match them with
12629   // several different shuffle types.
12630   if (NumElems == 4 && VT.is128BitVector())
12631     return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
12632
12633   // Handle general 256-bit shuffles
12634   if (VT.is256BitVector())
12635     return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
12636
12637   return SDValue();
12638 }
12639
12640 // This function assumes its argument is a BUILD_VECTOR of constants or
12641 // undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
12642 // true.
12643 static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
12644                                     unsigned &MaskValue) {
12645   MaskValue = 0;
12646   unsigned NumElems = BuildVector->getNumOperands();
12647   // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
12648   unsigned NumLanes = (NumElems - 1) / 8 + 1;
12649   unsigned NumElemsInLane = NumElems / NumLanes;
12650
12651   // Blend for v16i16 should be symetric for the both lanes.
12652   for (unsigned i = 0; i < NumElemsInLane; ++i) {
12653     SDValue EltCond = BuildVector->getOperand(i);
12654     SDValue SndLaneEltCond =
12655         (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
12656
12657     int Lane1Cond = -1, Lane2Cond = -1;
12658     if (isa<ConstantSDNode>(EltCond))
12659       Lane1Cond = !isZero(EltCond);
12660     if (isa<ConstantSDNode>(SndLaneEltCond))
12661       Lane2Cond = !isZero(SndLaneEltCond);
12662
12663     if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
12664       // Lane1Cond != 0, means we want the first argument.
12665       // Lane1Cond == 0, means we want the second argument.
12666       // The encoding of this argument is 0 for the first argument, 1
12667       // for the second. Therefore, invert the condition.
12668       MaskValue |= !Lane1Cond << i;
12669     else if (Lane1Cond < 0)
12670       MaskValue |= !Lane2Cond << i;
12671     else
12672       return false;
12673   }
12674   return true;
12675 }
12676
12677 /// \brief Try to lower a VSELECT instruction to an immediate-controlled blend
12678 /// instruction.
12679 static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget,
12680                                     SelectionDAG &DAG) {
12681   SDValue Cond = Op.getOperand(0);
12682   SDValue LHS = Op.getOperand(1);
12683   SDValue RHS = Op.getOperand(2);
12684   SDLoc dl(Op);
12685   MVT VT = Op.getSimpleValueType();
12686   MVT EltVT = VT.getVectorElementType();
12687   unsigned NumElems = VT.getVectorNumElements();
12688
12689   // There is no blend with immediate in AVX-512.
12690   if (VT.is512BitVector())
12691     return SDValue();
12692
12693   if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
12694     return SDValue();
12695   if (!Subtarget->hasInt256() && VT == MVT::v16i16)
12696     return SDValue();
12697
12698   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
12699     return SDValue();
12700
12701   // Check the mask for BLEND and build the value.
12702   unsigned MaskValue = 0;
12703   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
12704     return SDValue();
12705
12706   // Convert i32 vectors to floating point if it is not AVX2.
12707   // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
12708   MVT BlendVT = VT;
12709   if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
12710     BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
12711                                NumElems);
12712     LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
12713     RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
12714   }
12715
12716   SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
12717                             DAG.getConstant(MaskValue, MVT::i32));
12718   return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
12719 }
12720
12721 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
12722   // A vselect where all conditions and data are constants can be optimized into
12723   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
12724   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
12725       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
12726       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
12727     return SDValue();
12728
12729   SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG);
12730   if (BlendOp.getNode())
12731     return BlendOp;
12732
12733   // Some types for vselect were previously set to Expand, not Legal or
12734   // Custom. Return an empty SDValue so we fall-through to Expand, after
12735   // the Custom lowering phase.
12736   MVT VT = Op.getSimpleValueType();
12737   switch (VT.SimpleTy) {
12738   default:
12739     break;
12740   case MVT::v8i16:
12741   case MVT::v16i16:
12742     if (Subtarget->hasBWI() && Subtarget->hasVLX())
12743       break;
12744     return SDValue();
12745   }
12746
12747   // We couldn't create a "Blend with immediate" node.
12748   // This node should still be legal, but we'll have to emit a blendv*
12749   // instruction.
12750   return Op;
12751 }
12752
12753 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
12754   MVT VT = Op.getSimpleValueType();
12755   SDLoc dl(Op);
12756
12757   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
12758     return SDValue();
12759
12760   if (VT.getSizeInBits() == 8) {
12761     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
12762                                   Op.getOperand(0), Op.getOperand(1));
12763     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12764                                   DAG.getValueType(VT));
12765     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12766   }
12767
12768   if (VT.getSizeInBits() == 16) {
12769     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12770     // If Idx is 0, it's cheaper to do a move instead of a pextrw.
12771     if (Idx == 0)
12772       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
12773                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12774                                      DAG.getNode(ISD::BITCAST, dl,
12775                                                  MVT::v4i32,
12776                                                  Op.getOperand(0)),
12777                                      Op.getOperand(1)));
12778     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
12779                                   Op.getOperand(0), Op.getOperand(1));
12780     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12781                                   DAG.getValueType(VT));
12782     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12783   }
12784
12785   if (VT == MVT::f32) {
12786     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
12787     // the result back to FR32 register. It's only worth matching if the
12788     // result has a single use which is a store or a bitcast to i32.  And in
12789     // the case of a store, it's not worth it if the index is a constant 0,
12790     // because a MOVSSmr can be used instead, which is smaller and faster.
12791     if (!Op.hasOneUse())
12792       return SDValue();
12793     SDNode *User = *Op.getNode()->use_begin();
12794     if ((User->getOpcode() != ISD::STORE ||
12795          (isa<ConstantSDNode>(Op.getOperand(1)) &&
12796           cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
12797         (User->getOpcode() != ISD::BITCAST ||
12798          User->getValueType(0) != MVT::i32))
12799       return SDValue();
12800     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12801                                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
12802                                               Op.getOperand(0)),
12803                                               Op.getOperand(1));
12804     return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
12805   }
12806
12807   if (VT == MVT::i32 || VT == MVT::i64) {
12808     // ExtractPS/pextrq works with constant index.
12809     if (isa<ConstantSDNode>(Op.getOperand(1)))
12810       return Op;
12811   }
12812   return SDValue();
12813 }
12814
12815 /// Extract one bit from mask vector, like v16i1 or v8i1.
12816 /// AVX-512 feature.
12817 SDValue
12818 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
12819   SDValue Vec = Op.getOperand(0);
12820   SDLoc dl(Vec);
12821   MVT VecVT = Vec.getSimpleValueType();
12822   SDValue Idx = Op.getOperand(1);
12823   MVT EltVT = Op.getSimpleValueType();
12824
12825   assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
12826
12827   // variable index can't be handled in mask registers,
12828   // extend vector to VR512
12829   if (!isa<ConstantSDNode>(Idx)) {
12830     MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
12831     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
12832     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12833                               ExtVT.getVectorElementType(), Ext, Idx);
12834     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
12835   }
12836
12837   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12838   const TargetRegisterClass* rc = getRegClassFor(VecVT);
12839   unsigned MaxSift = rc->getSize()*8 - 1;
12840   Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
12841                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
12842   Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
12843                     DAG.getConstant(MaxSift, MVT::i8));
12844   return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
12845                        DAG.getIntPtrConstant(0));
12846 }
12847
12848 SDValue
12849 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
12850                                            SelectionDAG &DAG) const {
12851   SDLoc dl(Op);
12852   SDValue Vec = Op.getOperand(0);
12853   MVT VecVT = Vec.getSimpleValueType();
12854   SDValue Idx = Op.getOperand(1);
12855
12856   if (Op.getSimpleValueType() == MVT::i1)
12857     return ExtractBitFromMaskVector(Op, DAG);
12858
12859   if (!isa<ConstantSDNode>(Idx)) {
12860     if (VecVT.is512BitVector() ||
12861         (VecVT.is256BitVector() && Subtarget->hasInt256() &&
12862          VecVT.getVectorElementType().getSizeInBits() == 32)) {
12863
12864       MVT MaskEltVT =
12865         MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
12866       MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
12867                                     MaskEltVT.getSizeInBits());
12868
12869       Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
12870       SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
12871                                 getZeroVector(MaskVT, Subtarget, DAG, dl),
12872                                 Idx, DAG.getConstant(0, getPointerTy()));
12873       SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
12874       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
12875                         Perm, DAG.getConstant(0, getPointerTy()));
12876     }
12877     return SDValue();
12878   }
12879
12880   // If this is a 256-bit vector result, first extract the 128-bit vector and
12881   // then extract the element from the 128-bit vector.
12882   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
12883
12884     unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12885     // Get the 128-bit vector.
12886     Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
12887     MVT EltVT = VecVT.getVectorElementType();
12888
12889     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
12890
12891     //if (IdxVal >= NumElems/2)
12892     //  IdxVal -= NumElems/2;
12893     IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
12894     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
12895                        DAG.getConstant(IdxVal, MVT::i32));
12896   }
12897
12898   assert(VecVT.is128BitVector() && "Unexpected vector length");
12899
12900   if (Subtarget->hasSSE41()) {
12901     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
12902     if (Res.getNode())
12903       return Res;
12904   }
12905
12906   MVT VT = Op.getSimpleValueType();
12907   // TODO: handle v16i8.
12908   if (VT.getSizeInBits() == 16) {
12909     SDValue Vec = Op.getOperand(0);
12910     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12911     if (Idx == 0)
12912       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
12913                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12914                                      DAG.getNode(ISD::BITCAST, dl,
12915                                                  MVT::v4i32, Vec),
12916                                      Op.getOperand(1)));
12917     // Transform it so it match pextrw which produces a 32-bit result.
12918     MVT EltVT = MVT::i32;
12919     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
12920                                   Op.getOperand(0), Op.getOperand(1));
12921     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
12922                                   DAG.getValueType(VT));
12923     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12924   }
12925
12926   if (VT.getSizeInBits() == 32) {
12927     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12928     if (Idx == 0)
12929       return Op;
12930
12931     // SHUFPS the element to the lowest double word, then movss.
12932     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
12933     MVT VVT = Op.getOperand(0).getSimpleValueType();
12934     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
12935                                        DAG.getUNDEF(VVT), Mask);
12936     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12937                        DAG.getIntPtrConstant(0));
12938   }
12939
12940   if (VT.getSizeInBits() == 64) {
12941     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
12942     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
12943     //        to match extract_elt for f64.
12944     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
12945     if (Idx == 0)
12946       return Op;
12947
12948     // UNPCKHPD the element to the lowest double word, then movsd.
12949     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
12950     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
12951     int Mask[2] = { 1, -1 };
12952     MVT VVT = Op.getOperand(0).getSimpleValueType();
12953     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
12954                                        DAG.getUNDEF(VVT), Mask);
12955     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12956                        DAG.getIntPtrConstant(0));
12957   }
12958
12959   return SDValue();
12960 }
12961
12962 /// Insert one bit to mask vector, like v16i1 or v8i1.
12963 /// AVX-512 feature.
12964 SDValue
12965 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
12966   SDLoc dl(Op);
12967   SDValue Vec = Op.getOperand(0);
12968   SDValue Elt = Op.getOperand(1);
12969   SDValue Idx = Op.getOperand(2);
12970   MVT VecVT = Vec.getSimpleValueType();
12971
12972   if (!isa<ConstantSDNode>(Idx)) {
12973     // Non constant index. Extend source and destination,
12974     // insert element and then truncate the result.
12975     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
12976     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
12977     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
12978       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
12979       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
12980     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
12981   }
12982
12983   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12984   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
12985   if (Vec.getOpcode() == ISD::UNDEF)
12986     return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
12987                        DAG.getConstant(IdxVal, MVT::i8));
12988   const TargetRegisterClass* rc = getRegClassFor(VecVT);
12989   unsigned MaxSift = rc->getSize()*8 - 1;
12990   EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
12991                     DAG.getConstant(MaxSift, MVT::i8));
12992   EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
12993                     DAG.getConstant(MaxSift - IdxVal, MVT::i8));
12994   return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
12995 }
12996
12997 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
12998                                                   SelectionDAG &DAG) const {
12999   MVT VT = Op.getSimpleValueType();
13000   MVT EltVT = VT.getVectorElementType();
13001
13002   if (EltVT == MVT::i1)
13003     return InsertBitToMaskVector(Op, DAG);
13004
13005   SDLoc dl(Op);
13006   SDValue N0 = Op.getOperand(0);
13007   SDValue N1 = Op.getOperand(1);
13008   SDValue N2 = Op.getOperand(2);
13009   if (!isa<ConstantSDNode>(N2))
13010     return SDValue();
13011   auto *N2C = cast<ConstantSDNode>(N2);
13012   unsigned IdxVal = N2C->getZExtValue();
13013
13014   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
13015   // into that, and then insert the subvector back into the result.
13016   if (VT.is256BitVector() || VT.is512BitVector()) {
13017     // Get the desired 128-bit vector half.
13018     SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
13019
13020     // Insert the element into the desired half.
13021     unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
13022     unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128;
13023
13024     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
13025                     DAG.getConstant(IdxIn128, MVT::i32));
13026
13027     // Insert the changed part back to the 256-bit vector
13028     return Insert128BitVector(N0, V, IdxVal, DAG, dl);
13029   }
13030   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
13031
13032   if (Subtarget->hasSSE41()) {
13033     if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
13034       unsigned Opc;
13035       if (VT == MVT::v8i16) {
13036         Opc = X86ISD::PINSRW;
13037       } else {
13038         assert(VT == MVT::v16i8);
13039         Opc = X86ISD::PINSRB;
13040       }
13041
13042       // Transform it so it match pinsr{b,w} which expects a GR32 as its second
13043       // argument.
13044       if (N1.getValueType() != MVT::i32)
13045         N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13046       if (N2.getValueType() != MVT::i32)
13047         N2 = DAG.getIntPtrConstant(IdxVal);
13048       return DAG.getNode(Opc, dl, VT, N0, N1, N2);
13049     }
13050
13051     if (EltVT == MVT::f32) {
13052       // Bits [7:6] of the constant are the source select.  This will always be
13053       //  zero here.  The DAG Combiner may combine an extract_elt index into
13054       //  these
13055       //  bits.  For example (insert (extract, 3), 2) could be matched by
13056       //  putting
13057       //  the '3' into bits [7:6] of X86ISD::INSERTPS.
13058       // Bits [5:4] of the constant are the destination select.  This is the
13059       //  value of the incoming immediate.
13060       // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
13061       //   combine either bitwise AND or insert of float 0.0 to set these bits.
13062       N2 = DAG.getIntPtrConstant(IdxVal << 4);
13063       // Create this as a scalar to vector..
13064       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13065       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
13066     }
13067
13068     if (EltVT == MVT::i32 || EltVT == MVT::i64) {
13069       // PINSR* works with constant index.
13070       return Op;
13071     }
13072   }
13073
13074   if (EltVT == MVT::i8)
13075     return SDValue();
13076
13077   if (EltVT.getSizeInBits() == 16) {
13078     // Transform it so it match pinsrw which expects a 16-bit value in a GR32
13079     // as its second argument.
13080     if (N1.getValueType() != MVT::i32)
13081       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13082     if (N2.getValueType() != MVT::i32)
13083       N2 = DAG.getIntPtrConstant(IdxVal);
13084     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
13085   }
13086   return SDValue();
13087 }
13088
13089 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
13090   SDLoc dl(Op);
13091   MVT OpVT = Op.getSimpleValueType();
13092
13093   // If this is a 256-bit vector result, first insert into a 128-bit
13094   // vector and then insert into the 256-bit vector.
13095   if (!OpVT.is128BitVector()) {
13096     // Insert into a 128-bit vector.
13097     unsigned SizeFactor = OpVT.getSizeInBits()/128;
13098     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
13099                                  OpVT.getVectorNumElements() / SizeFactor);
13100
13101     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
13102
13103     // Insert the 128-bit vector.
13104     return Insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
13105   }
13106
13107   if (OpVT == MVT::v1i64 &&
13108       Op.getOperand(0).getValueType() == MVT::i64)
13109     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
13110
13111   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
13112   assert(OpVT.is128BitVector() && "Expected an SSE type!");
13113   return DAG.getNode(ISD::BITCAST, dl, OpVT,
13114                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
13115 }
13116
13117 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
13118 // a simple subregister reference or explicit instructions to grab
13119 // upper bits of a vector.
13120 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13121                                       SelectionDAG &DAG) {
13122   SDLoc dl(Op);
13123   SDValue In =  Op.getOperand(0);
13124   SDValue Idx = Op.getOperand(1);
13125   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13126   MVT ResVT   = Op.getSimpleValueType();
13127   MVT InVT    = In.getSimpleValueType();
13128
13129   if (Subtarget->hasFp256()) {
13130     if (ResVT.is128BitVector() &&
13131         (InVT.is256BitVector() || InVT.is512BitVector()) &&
13132         isa<ConstantSDNode>(Idx)) {
13133       return Extract128BitVector(In, IdxVal, DAG, dl);
13134     }
13135     if (ResVT.is256BitVector() && InVT.is512BitVector() &&
13136         isa<ConstantSDNode>(Idx)) {
13137       return Extract256BitVector(In, IdxVal, DAG, dl);
13138     }
13139   }
13140   return SDValue();
13141 }
13142
13143 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
13144 // simple superregister reference or explicit instructions to insert
13145 // the upper bits of a vector.
13146 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
13147                                      SelectionDAG &DAG) {
13148   if (Subtarget->hasFp256()) {
13149     SDLoc dl(Op.getNode());
13150     SDValue Vec = Op.getNode()->getOperand(0);
13151     SDValue SubVec = Op.getNode()->getOperand(1);
13152     SDValue Idx = Op.getNode()->getOperand(2);
13153
13154     if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
13155          Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
13156         SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
13157         isa<ConstantSDNode>(Idx)) {
13158       unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13159       return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
13160     }
13161
13162     if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
13163         SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
13164         isa<ConstantSDNode>(Idx)) {
13165       unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13166       return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
13167     }
13168   }
13169   return SDValue();
13170 }
13171
13172 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
13173 // their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
13174 // one of the above mentioned nodes. It has to be wrapped because otherwise
13175 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
13176 // be used to form addressing mode. These wrapped nodes will be selected
13177 // into MOV32ri.
13178 SDValue
13179 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
13180   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
13181
13182   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13183   // global base reg.
13184   unsigned char OpFlag = 0;
13185   unsigned WrapperKind = X86ISD::Wrapper;
13186   CodeModel::Model M = DAG.getTarget().getCodeModel();
13187
13188   if (Subtarget->isPICStyleRIPRel() &&
13189       (M == CodeModel::Small || M == CodeModel::Kernel))
13190     WrapperKind = X86ISD::WrapperRIP;
13191   else if (Subtarget->isPICStyleGOT())
13192     OpFlag = X86II::MO_GOTOFF;
13193   else if (Subtarget->isPICStyleStubPIC())
13194     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13195
13196   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
13197                                              CP->getAlignment(),
13198                                              CP->getOffset(), OpFlag);
13199   SDLoc DL(CP);
13200   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13201   // With PIC, the address is actually $g + Offset.
13202   if (OpFlag) {
13203     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13204                          DAG.getNode(X86ISD::GlobalBaseReg,
13205                                      SDLoc(), getPointerTy()),
13206                          Result);
13207   }
13208
13209   return Result;
13210 }
13211
13212 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
13213   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
13214
13215   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13216   // global base reg.
13217   unsigned char OpFlag = 0;
13218   unsigned WrapperKind = X86ISD::Wrapper;
13219   CodeModel::Model M = DAG.getTarget().getCodeModel();
13220
13221   if (Subtarget->isPICStyleRIPRel() &&
13222       (M == CodeModel::Small || M == CodeModel::Kernel))
13223     WrapperKind = X86ISD::WrapperRIP;
13224   else if (Subtarget->isPICStyleGOT())
13225     OpFlag = X86II::MO_GOTOFF;
13226   else if (Subtarget->isPICStyleStubPIC())
13227     OpFlag = X86II::MO_PIC_BASE_OFFSET;
13228
13229   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
13230                                           OpFlag);
13231   SDLoc DL(JT);
13232   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13233
13234   // With PIC, the address is actually $g + Offset.
13235   if (OpFlag)
13236     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13237                          DAG.getNode(X86ISD::GlobalBaseReg,
13238                                      SDLoc(), getPointerTy()),
13239                          Result);
13240
13241   return Result;
13242 }
13243
13244 SDValue
13245 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
13246   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
13247
13248   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13249   // global base reg.
13250   unsigned char OpFlag = 0;
13251   unsigned WrapperKind = X86ISD::Wrapper;
13252   CodeModel::Model M = DAG.getTarget().getCodeModel();
13253
13254   if (Subtarget->isPICStyleRIPRel() &&
13255       (M == CodeModel::Small || M == CodeModel::Kernel)) {
13256     if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
13257       OpFlag = X86II::MO_GOTPCREL;
13258     WrapperKind = X86ISD::WrapperRIP;
13259   } else if (Subtarget->isPICStyleGOT()) {
13260     OpFlag = X86II::MO_GOT;
13261   } else if (Subtarget->isPICStyleStubPIC()) {
13262     OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
13263   } else if (Subtarget->isPICStyleStubNoDynamic()) {
13264     OpFlag = X86II::MO_DARWIN_NONLAZY;
13265   }
13266
13267   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
13268
13269   SDLoc DL(Op);
13270   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13271
13272   // With PIC, the address is actually $g + Offset.
13273   if (DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
13274       !Subtarget->is64Bit()) {
13275     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13276                          DAG.getNode(X86ISD::GlobalBaseReg,
13277                                      SDLoc(), getPointerTy()),
13278                          Result);
13279   }
13280
13281   // For symbols that require a load from a stub to get the address, emit the
13282   // load.
13283   if (isGlobalStubReference(OpFlag))
13284     Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
13285                          MachinePointerInfo::getGOT(), false, false, false, 0);
13286
13287   return Result;
13288 }
13289
13290 SDValue
13291 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
13292   // Create the TargetBlockAddressAddress node.
13293   unsigned char OpFlags =
13294     Subtarget->ClassifyBlockAddressReference();
13295   CodeModel::Model M = DAG.getTarget().getCodeModel();
13296   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
13297   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
13298   SDLoc dl(Op);
13299   SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
13300                                              OpFlags);
13301
13302   if (Subtarget->isPICStyleRIPRel() &&
13303       (M == CodeModel::Small || M == CodeModel::Kernel))
13304     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13305   else
13306     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13307
13308   // With PIC, the address is actually $g + Offset.
13309   if (isGlobalRelativeToPICBase(OpFlags)) {
13310     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13311                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13312                          Result);
13313   }
13314
13315   return Result;
13316 }
13317
13318 SDValue
13319 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
13320                                       int64_t Offset, SelectionDAG &DAG) const {
13321   // Create the TargetGlobalAddress node, folding in the constant
13322   // offset if it is legal.
13323   unsigned char OpFlags =
13324       Subtarget->ClassifyGlobalReference(GV, DAG.getTarget());
13325   CodeModel::Model M = DAG.getTarget().getCodeModel();
13326   SDValue Result;
13327   if (OpFlags == X86II::MO_NO_FLAG &&
13328       X86::isOffsetSuitableForCodeModel(Offset, M)) {
13329     // A direct static reference to a global.
13330     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
13331     Offset = 0;
13332   } else {
13333     Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
13334   }
13335
13336   if (Subtarget->isPICStyleRIPRel() &&
13337       (M == CodeModel::Small || M == CodeModel::Kernel))
13338     Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
13339   else
13340     Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
13341
13342   // With PIC, the address is actually $g + Offset.
13343   if (isGlobalRelativeToPICBase(OpFlags)) {
13344     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
13345                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
13346                          Result);
13347   }
13348
13349   // For globals that require a load from a stub to get the address, emit the
13350   // load.
13351   if (isGlobalStubReference(OpFlags))
13352     Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
13353                          MachinePointerInfo::getGOT(), false, false, false, 0);
13354
13355   // If there was a non-zero offset that we didn't fold, create an explicit
13356   // addition for it.
13357   if (Offset != 0)
13358     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
13359                          DAG.getConstant(Offset, getPointerTy()));
13360
13361   return Result;
13362 }
13363
13364 SDValue
13365 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
13366   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
13367   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
13368   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
13369 }
13370
13371 static SDValue
13372 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
13373            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
13374            unsigned char OperandFlags, bool LocalDynamic = false) {
13375   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13376   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13377   SDLoc dl(GA);
13378   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13379                                            GA->getValueType(0),
13380                                            GA->getOffset(),
13381                                            OperandFlags);
13382
13383   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
13384                                            : X86ISD::TLSADDR;
13385
13386   if (InFlag) {
13387     SDValue Ops[] = { Chain,  TGA, *InFlag };
13388     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13389   } else {
13390     SDValue Ops[]  = { Chain, TGA };
13391     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13392   }
13393
13394   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
13395   MFI->setAdjustsStack(true);
13396   MFI->setHasCalls(true);
13397
13398   SDValue Flag = Chain.getValue(1);
13399   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
13400 }
13401
13402 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
13403 static SDValue
13404 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13405                                 const EVT PtrVT) {
13406   SDValue InFlag;
13407   SDLoc dl(GA);  // ? function entry point might be better
13408   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13409                                    DAG.getNode(X86ISD::GlobalBaseReg,
13410                                                SDLoc(), PtrVT), InFlag);
13411   InFlag = Chain.getValue(1);
13412
13413   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
13414 }
13415
13416 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
13417 static SDValue
13418 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13419                                 const EVT PtrVT) {
13420   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
13421                     X86::RAX, X86II::MO_TLSGD);
13422 }
13423
13424 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
13425                                            SelectionDAG &DAG,
13426                                            const EVT PtrVT,
13427                                            bool is64Bit) {
13428   SDLoc dl(GA);
13429
13430   // Get the start address of the TLS block for this module.
13431   X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
13432       .getInfo<X86MachineFunctionInfo>();
13433   MFI->incNumLocalDynamicTLSAccesses();
13434
13435   SDValue Base;
13436   if (is64Bit) {
13437     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
13438                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
13439   } else {
13440     SDValue InFlag;
13441     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13442         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
13443     InFlag = Chain.getValue(1);
13444     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
13445                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
13446   }
13447
13448   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
13449   // of Base.
13450
13451   // Build x@dtpoff.
13452   unsigned char OperandFlags = X86II::MO_DTPOFF;
13453   unsigned WrapperKind = X86ISD::Wrapper;
13454   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13455                                            GA->getValueType(0),
13456                                            GA->getOffset(), OperandFlags);
13457   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13458
13459   // Add x@dtpoff with the base.
13460   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
13461 }
13462
13463 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
13464 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13465                                    const EVT PtrVT, TLSModel::Model model,
13466                                    bool is64Bit, bool isPIC) {
13467   SDLoc dl(GA);
13468
13469   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
13470   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
13471                                                          is64Bit ? 257 : 256));
13472
13473   SDValue ThreadPointer =
13474       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
13475                   MachinePointerInfo(Ptr), false, false, false, 0);
13476
13477   unsigned char OperandFlags = 0;
13478   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
13479   // initialexec.
13480   unsigned WrapperKind = X86ISD::Wrapper;
13481   if (model == TLSModel::LocalExec) {
13482     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
13483   } else if (model == TLSModel::InitialExec) {
13484     if (is64Bit) {
13485       OperandFlags = X86II::MO_GOTTPOFF;
13486       WrapperKind = X86ISD::WrapperRIP;
13487     } else {
13488       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
13489     }
13490   } else {
13491     llvm_unreachable("Unexpected model");
13492   }
13493
13494   // emit "addl x@ntpoff,%eax" (local exec)
13495   // or "addl x@indntpoff,%eax" (initial exec)
13496   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
13497   SDValue TGA =
13498       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
13499                                  GA->getOffset(), OperandFlags);
13500   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13501
13502   if (model == TLSModel::InitialExec) {
13503     if (isPIC && !is64Bit) {
13504       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
13505                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13506                            Offset);
13507     }
13508
13509     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
13510                          MachinePointerInfo::getGOT(), false, false, false, 0);
13511   }
13512
13513   // The address of the thread local variable is the add of the thread
13514   // pointer with the offset of the variable.
13515   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
13516 }
13517
13518 SDValue
13519 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
13520
13521   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
13522   const GlobalValue *GV = GA->getGlobal();
13523
13524   if (Subtarget->isTargetELF()) {
13525     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
13526
13527     switch (model) {
13528       case TLSModel::GeneralDynamic:
13529         if (Subtarget->is64Bit())
13530           return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
13531         return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
13532       case TLSModel::LocalDynamic:
13533         return LowerToTLSLocalDynamicModel(GA, DAG, getPointerTy(),
13534                                            Subtarget->is64Bit());
13535       case TLSModel::InitialExec:
13536       case TLSModel::LocalExec:
13537         return LowerToTLSExecModel(
13538             GA, DAG, getPointerTy(), model, Subtarget->is64Bit(),
13539             DAG.getTarget().getRelocationModel() == Reloc::PIC_);
13540     }
13541     llvm_unreachable("Unknown TLS model.");
13542   }
13543
13544   if (Subtarget->isTargetDarwin()) {
13545     // Darwin only has one model of TLS.  Lower to that.
13546     unsigned char OpFlag = 0;
13547     unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
13548                            X86ISD::WrapperRIP : X86ISD::Wrapper;
13549
13550     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13551     // global base reg.
13552     bool PIC32 = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) &&
13553                  !Subtarget->is64Bit();
13554     if (PIC32)
13555       OpFlag = X86II::MO_TLVP_PIC_BASE;
13556     else
13557       OpFlag = X86II::MO_TLVP;
13558     SDLoc DL(Op);
13559     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
13560                                                 GA->getValueType(0),
13561                                                 GA->getOffset(), OpFlag);
13562     SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
13563
13564     // With PIC32, the address is actually $g + Offset.
13565     if (PIC32)
13566       Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
13567                            DAG.getNode(X86ISD::GlobalBaseReg,
13568                                        SDLoc(), getPointerTy()),
13569                            Offset);
13570
13571     // Lowering the machine isd will make sure everything is in the right
13572     // location.
13573     SDValue Chain = DAG.getEntryNode();
13574     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13575     SDValue Args[] = { Chain, Offset };
13576     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
13577
13578     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
13579     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13580     MFI->setAdjustsStack(true);
13581
13582     // And our return value (tls address) is in the standard call return value
13583     // location.
13584     unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
13585     return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
13586                               Chain.getValue(1));
13587   }
13588
13589   if (Subtarget->isTargetKnownWindowsMSVC() ||
13590       Subtarget->isTargetWindowsGNU()) {
13591     // Just use the implicit TLS architecture
13592     // Need to generate someting similar to:
13593     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
13594     //                                  ; from TEB
13595     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
13596     //   mov     rcx, qword [rdx+rcx*8]
13597     //   mov     eax, .tls$:tlsvar
13598     //   [rax+rcx] contains the address
13599     // Windows 64bit: gs:0x58
13600     // Windows 32bit: fs:__tls_array
13601
13602     SDLoc dl(GA);
13603     SDValue Chain = DAG.getEntryNode();
13604
13605     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
13606     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
13607     // use its literal value of 0x2C.
13608     Value *Ptr = Constant::getNullValue(Subtarget->is64Bit()
13609                                         ? Type::getInt8PtrTy(*DAG.getContext(),
13610                                                              256)
13611                                         : Type::getInt32PtrTy(*DAG.getContext(),
13612                                                               257));
13613
13614     SDValue TlsArray =
13615         Subtarget->is64Bit()
13616             ? DAG.getIntPtrConstant(0x58)
13617             : (Subtarget->isTargetWindowsGNU()
13618                    ? DAG.getIntPtrConstant(0x2C)
13619                    : DAG.getExternalSymbol("_tls_array", getPointerTy()));
13620
13621     SDValue ThreadPointer =
13622         DAG.getLoad(getPointerTy(), dl, Chain, TlsArray,
13623                     MachinePointerInfo(Ptr), false, false, false, 0);
13624
13625     // Load the _tls_index variable
13626     SDValue IDX = DAG.getExternalSymbol("_tls_index", getPointerTy());
13627     if (Subtarget->is64Bit())
13628       IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain,
13629                            IDX, MachinePointerInfo(), MVT::i32,
13630                            false, false, false, 0);
13631     else
13632       IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(),
13633                         false, false, false, 0);
13634
13635     SDValue Scale = DAG.getConstant(Log2_64_Ceil(TD->getPointerSize()),
13636                                     getPointerTy());
13637     IDX = DAG.getNode(ISD::SHL, dl, getPointerTy(), IDX, Scale);
13638
13639     SDValue res = DAG.getNode(ISD::ADD, dl, getPointerTy(), ThreadPointer, IDX);
13640     res = DAG.getLoad(getPointerTy(), dl, Chain, res, MachinePointerInfo(),
13641                       false, false, false, 0);
13642
13643     // Get the offset of start of .tls section
13644     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13645                                              GA->getValueType(0),
13646                                              GA->getOffset(), X86II::MO_SECREL);
13647     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), TGA);
13648
13649     // The address of the thread local variable is the add of the thread
13650     // pointer with the offset of the variable.
13651     return DAG.getNode(ISD::ADD, dl, getPointerTy(), res, Offset);
13652   }
13653
13654   llvm_unreachable("TLS not implemented for this target.");
13655 }
13656
13657 /// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
13658 /// and take a 2 x i32 value to shift plus a shift amount.
13659 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
13660   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
13661   MVT VT = Op.getSimpleValueType();
13662   unsigned VTBits = VT.getSizeInBits();
13663   SDLoc dl(Op);
13664   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
13665   SDValue ShOpLo = Op.getOperand(0);
13666   SDValue ShOpHi = Op.getOperand(1);
13667   SDValue ShAmt  = Op.getOperand(2);
13668   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
13669   // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
13670   // during isel.
13671   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13672                                   DAG.getConstant(VTBits - 1, MVT::i8));
13673   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
13674                                      DAG.getConstant(VTBits - 1, MVT::i8))
13675                        : DAG.getConstant(0, VT);
13676
13677   SDValue Tmp2, Tmp3;
13678   if (Op.getOpcode() == ISD::SHL_PARTS) {
13679     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
13680     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
13681   } else {
13682     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
13683     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
13684   }
13685
13686   // If the shift amount is larger or equal than the width of a part we can't
13687   // rely on the results of shld/shrd. Insert a test and select the appropriate
13688   // values for large shift amounts.
13689   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13690                                 DAG.getConstant(VTBits, MVT::i8));
13691   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
13692                              AndNode, DAG.getConstant(0, MVT::i8));
13693
13694   SDValue Hi, Lo;
13695   SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
13696   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
13697   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
13698
13699   if (Op.getOpcode() == ISD::SHL_PARTS) {
13700     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13701     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13702   } else {
13703     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13704     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13705   }
13706
13707   SDValue Ops[2] = { Lo, Hi };
13708   return DAG.getMergeValues(Ops, dl);
13709 }
13710
13711 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
13712                                            SelectionDAG &DAG) const {
13713   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
13714   SDLoc dl(Op);
13715
13716   if (SrcVT.isVector()) {
13717     if (SrcVT.getVectorElementType() == MVT::i1) {
13718       MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
13719       return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13720                          DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT,
13721                                      Op.getOperand(0)));
13722     }
13723     return SDValue();
13724   }
13725
13726   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
13727          "Unknown SINT_TO_FP to lower!");
13728
13729   // These are really Legal; return the operand so the caller accepts it as
13730   // Legal.
13731   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
13732     return Op;
13733   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13734       Subtarget->is64Bit()) {
13735     return Op;
13736   }
13737
13738   unsigned Size = SrcVT.getSizeInBits()/8;
13739   MachineFunction &MF = DAG.getMachineFunction();
13740   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
13741   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
13742   SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
13743                                StackSlot,
13744                                MachinePointerInfo::getFixedStack(SSFI),
13745                                false, false, 0);
13746   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
13747 }
13748
13749 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
13750                                      SDValue StackSlot,
13751                                      SelectionDAG &DAG) const {
13752   // Build the FILD
13753   SDLoc DL(Op);
13754   SDVTList Tys;
13755   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
13756   if (useSSE)
13757     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
13758   else
13759     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
13760
13761   unsigned ByteSize = SrcVT.getSizeInBits()/8;
13762
13763   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
13764   MachineMemOperand *MMO;
13765   if (FI) {
13766     int SSFI = FI->getIndex();
13767     MMO =
13768       DAG.getMachineFunction()
13769       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
13770                             MachineMemOperand::MOLoad, ByteSize, ByteSize);
13771   } else {
13772     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
13773     StackSlot = StackSlot.getOperand(1);
13774   }
13775   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
13776   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
13777                                            X86ISD::FILD, DL,
13778                                            Tys, Ops, SrcVT, MMO);
13779
13780   if (useSSE) {
13781     Chain = Result.getValue(1);
13782     SDValue InFlag = Result.getValue(2);
13783
13784     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
13785     // shouldn't be necessary except that RFP cannot be live across
13786     // multiple blocks. When stackifier is fixed, they can be uncoupled.
13787     MachineFunction &MF = DAG.getMachineFunction();
13788     unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
13789     int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
13790     SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
13791     Tys = DAG.getVTList(MVT::Other);
13792     SDValue Ops[] = {
13793       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
13794     };
13795     MachineMemOperand *MMO =
13796       DAG.getMachineFunction()
13797       .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
13798                             MachineMemOperand::MOStore, SSFISize, SSFISize);
13799
13800     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
13801                                     Ops, Op.getValueType(), MMO);
13802     Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
13803                          MachinePointerInfo::getFixedStack(SSFI),
13804                          false, false, false, 0);
13805   }
13806
13807   return Result;
13808 }
13809
13810 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
13811 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
13812                                                SelectionDAG &DAG) const {
13813   // This algorithm is not obvious. Here it is what we're trying to output:
13814   /*
13815      movq       %rax,  %xmm0
13816      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
13817      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
13818      #ifdef __SSE3__
13819        haddpd   %xmm0, %xmm0
13820      #else
13821        pshufd   $0x4e, %xmm0, %xmm1
13822        addpd    %xmm1, %xmm0
13823      #endif
13824   */
13825
13826   SDLoc dl(Op);
13827   LLVMContext *Context = DAG.getContext();
13828
13829   // Build some magic constants.
13830   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
13831   Constant *C0 = ConstantDataVector::get(*Context, CV0);
13832   SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
13833
13834   SmallVector<Constant*,2> CV1;
13835   CV1.push_back(
13836     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13837                                       APInt(64, 0x4330000000000000ULL))));
13838   CV1.push_back(
13839     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13840                                       APInt(64, 0x4530000000000000ULL))));
13841   Constant *C1 = ConstantVector::get(CV1);
13842   SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
13843
13844   // Load the 64-bit value into an XMM register.
13845   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
13846                             Op.getOperand(0));
13847   SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
13848                               MachinePointerInfo::getConstantPool(),
13849                               false, false, false, 16);
13850   SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
13851                               DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
13852                               CLod0);
13853
13854   SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
13855                               MachinePointerInfo::getConstantPool(),
13856                               false, false, false, 16);
13857   SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
13858   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
13859   SDValue Result;
13860
13861   if (Subtarget->hasSSE3()) {
13862     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
13863     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
13864   } else {
13865     SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
13866     SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
13867                                            S2F, 0x4E, DAG);
13868     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
13869                          DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
13870                          Sub);
13871   }
13872
13873   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
13874                      DAG.getIntPtrConstant(0));
13875 }
13876
13877 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
13878 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
13879                                                SelectionDAG &DAG) const {
13880   SDLoc dl(Op);
13881   // FP constant to bias correct the final result.
13882   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
13883                                    MVT::f64);
13884
13885   // Load the 32-bit value into an XMM register.
13886   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
13887                              Op.getOperand(0));
13888
13889   // Zero out the upper parts of the register.
13890   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
13891
13892   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13893                      DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
13894                      DAG.getIntPtrConstant(0));
13895
13896   // Or the load with the bias.
13897   SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
13898                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
13899                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
13900                                                    MVT::v2f64, Load)),
13901                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
13902                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
13903                                                    MVT::v2f64, Bias)));
13904   Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13905                    DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
13906                    DAG.getIntPtrConstant(0));
13907
13908   // Subtract the bias.
13909   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
13910
13911   // Handle final rounding.
13912   EVT DestVT = Op.getValueType();
13913
13914   if (DestVT.bitsLT(MVT::f64))
13915     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
13916                        DAG.getIntPtrConstant(0));
13917   if (DestVT.bitsGT(MVT::f64))
13918     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
13919
13920   // Handle final rounding.
13921   return Sub;
13922 }
13923
13924 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
13925                                      const X86Subtarget &Subtarget) {
13926   // The algorithm is the following:
13927   // #ifdef __SSE4_1__
13928   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
13929   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
13930   //                                 (uint4) 0x53000000, 0xaa);
13931   // #else
13932   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
13933   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
13934   // #endif
13935   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
13936   //     return (float4) lo + fhi;
13937
13938   SDLoc DL(Op);
13939   SDValue V = Op->getOperand(0);
13940   EVT VecIntVT = V.getValueType();
13941   bool Is128 = VecIntVT == MVT::v4i32;
13942   EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
13943   // If we convert to something else than the supported type, e.g., to v4f64,
13944   // abort early.
13945   if (VecFloatVT != Op->getValueType(0))
13946     return SDValue();
13947
13948   unsigned NumElts = VecIntVT.getVectorNumElements();
13949   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
13950          "Unsupported custom type");
13951   assert(NumElts <= 8 && "The size of the constant array must be fixed");
13952
13953   // In the #idef/#else code, we have in common:
13954   // - The vector of constants:
13955   // -- 0x4b000000
13956   // -- 0x53000000
13957   // - A shift:
13958   // -- v >> 16
13959
13960   // Create the splat vector for 0x4b000000.
13961   SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32);
13962   SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow,
13963                            CstLow, CstLow, CstLow, CstLow};
13964   SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
13965                                   makeArrayRef(&CstLowArray[0], NumElts));
13966   // Create the splat vector for 0x53000000.
13967   SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32);
13968   SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh,
13969                             CstHigh, CstHigh, CstHigh, CstHigh};
13970   SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
13971                                    makeArrayRef(&CstHighArray[0], NumElts));
13972
13973   // Create the right shift.
13974   SDValue CstShift = DAG.getConstant(16, MVT::i32);
13975   SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift,
13976                              CstShift, CstShift, CstShift, CstShift};
13977   SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT,
13978                                     makeArrayRef(&CstShiftArray[0], NumElts));
13979   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
13980
13981   SDValue Low, High;
13982   if (Subtarget.hasSSE41()) {
13983     EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
13984     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
13985     SDValue VecCstLowBitcast =
13986         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow);
13987     SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V);
13988     // Low will be bitcasted right away, so do not bother bitcasting back to its
13989     // original type.
13990     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
13991                       VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32));
13992     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
13993     //                                 (uint4) 0x53000000, 0xaa);
13994     SDValue VecCstHighBitcast =
13995         DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh);
13996     SDValue VecShiftBitcast =
13997         DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift);
13998     // High will be bitcasted right away, so do not bother bitcasting back to
13999     // its original type.
14000     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
14001                        VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32));
14002   } else {
14003     SDValue CstMask = DAG.getConstant(0xffff, MVT::i32);
14004     SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask,
14005                                      CstMask, CstMask, CstMask);
14006     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14007     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
14008     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
14009
14010     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
14011     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
14012   }
14013
14014   // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
14015   SDValue CstFAdd = DAG.getConstantFP(
14016       APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32);
14017   SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd,
14018                             CstFAdd, CstFAdd, CstFAdd, CstFAdd};
14019   SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT,
14020                                    makeArrayRef(&CstFAddArray[0], NumElts));
14021
14022   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14023   SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High);
14024   SDValue FHigh =
14025       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
14026   //     return (float4) lo + fhi;
14027   SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low);
14028   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
14029 }
14030
14031 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
14032                                                SelectionDAG &DAG) const {
14033   SDValue N0 = Op.getOperand(0);
14034   MVT SVT = N0.getSimpleValueType();
14035   SDLoc dl(Op);
14036
14037   switch (SVT.SimpleTy) {
14038   default:
14039     llvm_unreachable("Custom UINT_TO_FP is not supported!");
14040   case MVT::v4i8:
14041   case MVT::v4i16:
14042   case MVT::v8i8:
14043   case MVT::v8i16: {
14044     MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
14045     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14046                        DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
14047   }
14048   case MVT::v4i32:
14049   case MVT::v8i32:
14050     return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget);
14051   }
14052   llvm_unreachable(nullptr);
14053 }
14054
14055 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
14056                                            SelectionDAG &DAG) const {
14057   SDValue N0 = Op.getOperand(0);
14058   SDLoc dl(Op);
14059
14060   if (Op.getValueType().isVector())
14061     return lowerUINT_TO_FP_vec(Op, DAG);
14062
14063   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
14064   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
14065   // the optimization here.
14066   if (DAG.SignBitIsZero(N0))
14067     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
14068
14069   MVT SrcVT = N0.getSimpleValueType();
14070   MVT DstVT = Op.getSimpleValueType();
14071   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
14072     return LowerUINT_TO_FP_i64(Op, DAG);
14073   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
14074     return LowerUINT_TO_FP_i32(Op, DAG);
14075   if (Subtarget->is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
14076     return SDValue();
14077
14078   // Make a 64-bit buffer, and use it to build an FILD.
14079   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
14080   if (SrcVT == MVT::i32) {
14081     SDValue WordOff = DAG.getConstant(4, getPointerTy());
14082     SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
14083                                      getPointerTy(), StackSlot, WordOff);
14084     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14085                                   StackSlot, MachinePointerInfo(),
14086                                   false, false, 0);
14087     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
14088                                   OffsetSlot, MachinePointerInfo(),
14089                                   false, false, 0);
14090     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
14091     return Fild;
14092   }
14093
14094   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
14095   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14096                                StackSlot, MachinePointerInfo(),
14097                                false, false, 0);
14098   // For i64 source, we need to add the appropriate power of 2 if the input
14099   // was negative.  This is the same as the optimization in
14100   // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
14101   // we must be careful to do the computation in x87 extended precision, not
14102   // in SSE. (The generic code can't know it's OK to do this, or how to.)
14103   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
14104   MachineMemOperand *MMO =
14105     DAG.getMachineFunction()
14106     .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14107                           MachineMemOperand::MOLoad, 8, 8);
14108
14109   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
14110   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
14111   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
14112                                          MVT::i64, MMO);
14113
14114   APInt FF(32, 0x5F800000ULL);
14115
14116   // Check whether the sign bit is set.
14117   SDValue SignSet = DAG.getSetCC(dl,
14118                                  getSetCCResultType(*DAG.getContext(), MVT::i64),
14119                                  Op.getOperand(0), DAG.getConstant(0, MVT::i64),
14120                                  ISD::SETLT);
14121
14122   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
14123   SDValue FudgePtr = DAG.getConstantPool(
14124                              ConstantInt::get(*DAG.getContext(), FF.zext(64)),
14125                                          getPointerTy());
14126
14127   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
14128   SDValue Zero = DAG.getIntPtrConstant(0);
14129   SDValue Four = DAG.getIntPtrConstant(4);
14130   SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
14131                                Zero, Four);
14132   FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
14133
14134   // Load the value out, extending it from f32 to f80.
14135   // FIXME: Avoid the extend by constructing the right constant pool?
14136   SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
14137                                  FudgePtr, MachinePointerInfo::getConstantPool(),
14138                                  MVT::f32, false, false, false, 4);
14139   // Extend everything to 80 bits to force it to be done on x87.
14140   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
14141   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
14142 }
14143
14144 std::pair<SDValue,SDValue>
14145 X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
14146                                     bool IsSigned, bool IsReplace) const {
14147   SDLoc DL(Op);
14148
14149   EVT DstTy = Op.getValueType();
14150
14151   if (!IsSigned && !isIntegerTypeFTOL(DstTy)) {
14152     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
14153     DstTy = MVT::i64;
14154   }
14155
14156   assert(DstTy.getSimpleVT() <= MVT::i64 &&
14157          DstTy.getSimpleVT() >= MVT::i16 &&
14158          "Unknown FP_TO_INT to lower!");
14159
14160   // These are really Legal.
14161   if (DstTy == MVT::i32 &&
14162       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14163     return std::make_pair(SDValue(), SDValue());
14164   if (Subtarget->is64Bit() &&
14165       DstTy == MVT::i64 &&
14166       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14167     return std::make_pair(SDValue(), SDValue());
14168
14169   // We lower FP->int64 either into FISTP64 followed by a load from a temporary
14170   // stack slot, or into the FTOL runtime function.
14171   MachineFunction &MF = DAG.getMachineFunction();
14172   unsigned MemSize = DstTy.getSizeInBits()/8;
14173   int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14174   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14175
14176   unsigned Opc;
14177   if (!IsSigned && isIntegerTypeFTOL(DstTy))
14178     Opc = X86ISD::WIN_FTOL;
14179   else
14180     switch (DstTy.getSimpleVT().SimpleTy) {
14181     default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
14182     case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
14183     case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
14184     case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
14185     }
14186
14187   SDValue Chain = DAG.getEntryNode();
14188   SDValue Value = Op.getOperand(0);
14189   EVT TheVT = Op.getOperand(0).getValueType();
14190   // FIXME This causes a redundant load/store if the SSE-class value is already
14191   // in memory, such as if it is on the callstack.
14192   if (isScalarFPTypeInSSEReg(TheVT)) {
14193     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
14194     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
14195                          MachinePointerInfo::getFixedStack(SSFI),
14196                          false, false, 0);
14197     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
14198     SDValue Ops[] = {
14199       Chain, StackSlot, DAG.getValueType(TheVT)
14200     };
14201
14202     MachineMemOperand *MMO =
14203       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14204                               MachineMemOperand::MOLoad, MemSize, MemSize);
14205     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
14206     Chain = Value.getValue(1);
14207     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
14208     StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
14209   }
14210
14211   MachineMemOperand *MMO =
14212     MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
14213                             MachineMemOperand::MOStore, MemSize, MemSize);
14214
14215   if (Opc != X86ISD::WIN_FTOL) {
14216     // Build the FP_TO_INT*_IN_MEM
14217     SDValue Ops[] = { Chain, Value, StackSlot };
14218     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
14219                                            Ops, DstTy, MMO);
14220     return std::make_pair(FIST, StackSlot);
14221   } else {
14222     SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
14223       DAG.getVTList(MVT::Other, MVT::Glue),
14224       Chain, Value);
14225     SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX,
14226       MVT::i32, ftol.getValue(1));
14227     SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX,
14228       MVT::i32, eax.getValue(2));
14229     SDValue Ops[] = { eax, edx };
14230     SDValue pair = IsReplace
14231       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
14232       : DAG.getMergeValues(Ops, DL);
14233     return std::make_pair(pair, SDValue());
14234   }
14235 }
14236
14237 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
14238                               const X86Subtarget *Subtarget) {
14239   MVT VT = Op->getSimpleValueType(0);
14240   SDValue In = Op->getOperand(0);
14241   MVT InVT = In.getSimpleValueType();
14242   SDLoc dl(Op);
14243
14244   // Optimize vectors in AVX mode:
14245   //
14246   //   v8i16 -> v8i32
14247   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
14248   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
14249   //   Concat upper and lower parts.
14250   //
14251   //   v4i32 -> v4i64
14252   //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
14253   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
14254   //   Concat upper and lower parts.
14255   //
14256
14257   if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
14258       ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
14259       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
14260     return SDValue();
14261
14262   if (Subtarget->hasInt256())
14263     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
14264
14265   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
14266   SDValue Undef = DAG.getUNDEF(InVT);
14267   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
14268   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14269   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14270
14271   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
14272                              VT.getVectorNumElements()/2);
14273
14274   OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
14275   OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
14276
14277   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14278 }
14279
14280 static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
14281                                         SelectionDAG &DAG) {
14282   MVT VT = Op->getSimpleValueType(0);
14283   SDValue In = Op->getOperand(0);
14284   MVT InVT = In.getSimpleValueType();
14285   SDLoc DL(Op);
14286   unsigned int NumElts = VT.getVectorNumElements();
14287   if (NumElts != 8 && NumElts != 16)
14288     return SDValue();
14289
14290   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
14291     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
14292
14293   EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
14294   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14295   // Now we have only mask extension
14296   assert(InVT.getVectorElementType() == MVT::i1);
14297   SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
14298   const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14299   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
14300   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14301   SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14302                            MachinePointerInfo::getConstantPool(),
14303                            false, false, false, Alignment);
14304
14305   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
14306   if (VT.is512BitVector())
14307     return Brcst;
14308   return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
14309 }
14310
14311 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14312                                SelectionDAG &DAG) {
14313   if (Subtarget->hasFp256()) {
14314     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14315     if (Res.getNode())
14316       return Res;
14317   }
14318
14319   return SDValue();
14320 }
14321
14322 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
14323                                 SelectionDAG &DAG) {
14324   SDLoc DL(Op);
14325   MVT VT = Op.getSimpleValueType();
14326   SDValue In = Op.getOperand(0);
14327   MVT SVT = In.getSimpleValueType();
14328
14329   if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
14330     return LowerZERO_EXTEND_AVX512(Op, DAG);
14331
14332   if (Subtarget->hasFp256()) {
14333     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
14334     if (Res.getNode())
14335       return Res;
14336   }
14337
14338   assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
14339          VT.getVectorNumElements() != SVT.getVectorNumElements());
14340   return SDValue();
14341 }
14342
14343 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
14344   SDLoc DL(Op);
14345   MVT VT = Op.getSimpleValueType();
14346   SDValue In = Op.getOperand(0);
14347   MVT InVT = In.getSimpleValueType();
14348
14349   if (VT == MVT::i1) {
14350     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
14351            "Invalid scalar TRUNCATE operation");
14352     if (InVT.getSizeInBits() >= 32)
14353       return SDValue();
14354     In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
14355     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
14356   }
14357   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
14358          "Invalid TRUNCATE operation");
14359
14360   if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
14361     if (VT.getVectorElementType().getSizeInBits() >=8)
14362       return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
14363
14364     assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
14365     unsigned NumElts = InVT.getVectorNumElements();
14366     assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
14367     if (InVT.getSizeInBits() < 512) {
14368       MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
14369       In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
14370       InVT = ExtVT;
14371     }
14372
14373     SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
14374     const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
14375     SDValue CP = DAG.getConstantPool(C, getPointerTy());
14376     unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
14377     SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
14378                            MachinePointerInfo::getConstantPool(),
14379                            false, false, false, Alignment);
14380     SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
14381     SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
14382     return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
14383   }
14384
14385   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
14386     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
14387     if (Subtarget->hasInt256()) {
14388       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
14389       In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
14390       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
14391                                 ShufMask);
14392       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
14393                          DAG.getIntPtrConstant(0));
14394     }
14395
14396     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14397                                DAG.getIntPtrConstant(0));
14398     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14399                                DAG.getIntPtrConstant(2));
14400     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14401     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14402     static const int ShufMask[] = {0, 2, 4, 6};
14403     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
14404   }
14405
14406   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
14407     // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
14408     if (Subtarget->hasInt256()) {
14409       In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
14410
14411       SmallVector<SDValue,32> pshufbMask;
14412       for (unsigned i = 0; i < 2; ++i) {
14413         pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
14414         pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
14415         pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
14416         pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
14417         pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
14418         pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
14419         pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
14420         pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
14421         for (unsigned j = 0; j < 8; ++j)
14422           pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
14423       }
14424       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
14425       In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
14426       In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
14427
14428       static const int ShufMask[] = {0,  2,  -1,  -1};
14429       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
14430                                 &ShufMask[0]);
14431       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14432                        DAG.getIntPtrConstant(0));
14433       return DAG.getNode(ISD::BITCAST, DL, VT, In);
14434     }
14435
14436     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14437                                DAG.getIntPtrConstant(0));
14438
14439     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14440                                DAG.getIntPtrConstant(4));
14441
14442     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
14443     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
14444
14445     // The PSHUFB mask:
14446     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
14447                                    -1, -1, -1, -1, -1, -1, -1, -1};
14448
14449     SDValue Undef = DAG.getUNDEF(MVT::v16i8);
14450     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
14451     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
14452
14453     OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
14454     OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
14455
14456     // The MOVLHPS Mask:
14457     static const int ShufMask2[] = {0, 1, 4, 5};
14458     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
14459     return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
14460   }
14461
14462   // Handle truncation of V256 to V128 using shuffles.
14463   if (!VT.is128BitVector() || !InVT.is256BitVector())
14464     return SDValue();
14465
14466   assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
14467
14468   unsigned NumElems = VT.getVectorNumElements();
14469   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
14470
14471   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
14472   // Prepare truncation shuffle mask
14473   for (unsigned i = 0; i != NumElems; ++i)
14474     MaskVec[i] = i * 2;
14475   SDValue V = DAG.getVectorShuffle(NVT, DL,
14476                                    DAG.getNode(ISD::BITCAST, DL, NVT, In),
14477                                    DAG.getUNDEF(NVT), &MaskVec[0]);
14478   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
14479                      DAG.getIntPtrConstant(0));
14480 }
14481
14482 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
14483                                            SelectionDAG &DAG) const {
14484   assert(!Op.getSimpleValueType().isVector());
14485
14486   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14487     /*IsSigned=*/ true, /*IsReplace=*/ false);
14488   SDValue FIST = Vals.first, StackSlot = Vals.second;
14489   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14490   if (!FIST.getNode()) return Op;
14491
14492   if (StackSlot.getNode())
14493     // Load the result.
14494     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14495                        FIST, StackSlot, MachinePointerInfo(),
14496                        false, false, false, 0);
14497
14498   // The node is the result.
14499   return FIST;
14500 }
14501
14502 SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
14503                                            SelectionDAG &DAG) const {
14504   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14505     /*IsSigned=*/ false, /*IsReplace=*/ false);
14506   SDValue FIST = Vals.first, StackSlot = Vals.second;
14507   assert(FIST.getNode() && "Unexpected failure");
14508
14509   if (StackSlot.getNode())
14510     // Load the result.
14511     return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14512                        FIST, StackSlot, MachinePointerInfo(),
14513                        false, false, false, 0);
14514
14515   // The node is the result.
14516   return FIST;
14517 }
14518
14519 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
14520   SDLoc DL(Op);
14521   MVT VT = Op.getSimpleValueType();
14522   SDValue In = Op.getOperand(0);
14523   MVT SVT = In.getSimpleValueType();
14524
14525   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
14526
14527   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
14528                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
14529                                  In, DAG.getUNDEF(SVT)));
14530 }
14531
14532 /// The only differences between FABS and FNEG are the mask and the logic op.
14533 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
14534 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
14535   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
14536          "Wrong opcode for lowering FABS or FNEG.");
14537
14538   bool IsFABS = (Op.getOpcode() == ISD::FABS);
14539
14540   // If this is a FABS and it has an FNEG user, bail out to fold the combination
14541   // into an FNABS. We'll lower the FABS after that if it is still in use.
14542   if (IsFABS)
14543     for (SDNode *User : Op->uses())
14544       if (User->getOpcode() == ISD::FNEG)
14545         return Op;
14546
14547   SDValue Op0 = Op.getOperand(0);
14548   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
14549
14550   SDLoc dl(Op);
14551   MVT VT = Op.getSimpleValueType();
14552   // Assume scalar op for initialization; update for vector if needed.
14553   // Note that there are no scalar bitwise logical SSE/AVX instructions, so we
14554   // generate a 16-byte vector constant and logic op even for the scalar case.
14555   // Using a 16-byte mask allows folding the load of the mask with
14556   // the logic op, so it can save (~4 bytes) on code size.
14557   MVT EltVT = VT;
14558   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
14559   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
14560   // decide if we should generate a 16-byte constant mask when we only need 4 or
14561   // 8 bytes for the scalar case.
14562   if (VT.isVector()) {
14563     EltVT = VT.getVectorElementType();
14564     NumElts = VT.getVectorNumElements();
14565   }
14566
14567   unsigned EltBits = EltVT.getSizeInBits();
14568   LLVMContext *Context = DAG.getContext();
14569   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
14570   APInt MaskElt =
14571     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
14572   Constant *C = ConstantInt::get(*Context, MaskElt);
14573   C = ConstantVector::getSplat(NumElts, C);
14574   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14575   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy());
14576   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
14577   SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14578                              MachinePointerInfo::getConstantPool(),
14579                              false, false, false, Alignment);
14580
14581   if (VT.isVector()) {
14582     // For a vector, cast operands to a vector type, perform the logic op,
14583     // and cast the result back to the original value type.
14584     MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
14585     SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask);
14586     SDValue Operand = IsFNABS ?
14587       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) :
14588       DAG.getNode(ISD::BITCAST, dl, VecVT, Op0);
14589     unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR;
14590     return DAG.getNode(ISD::BITCAST, dl, VT,
14591                        DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted));
14592   }
14593
14594   // If not vector, then scalar.
14595   unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
14596   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
14597   return DAG.getNode(BitOp, dl, VT, Operand, Mask);
14598 }
14599
14600 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
14601   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14602   LLVMContext *Context = DAG.getContext();
14603   SDValue Op0 = Op.getOperand(0);
14604   SDValue Op1 = Op.getOperand(1);
14605   SDLoc dl(Op);
14606   MVT VT = Op.getSimpleValueType();
14607   MVT SrcVT = Op1.getSimpleValueType();
14608
14609   // If second operand is smaller, extend it first.
14610   if (SrcVT.bitsLT(VT)) {
14611     Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
14612     SrcVT = VT;
14613   }
14614   // And if it is bigger, shrink it first.
14615   if (SrcVT.bitsGT(VT)) {
14616     Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
14617     SrcVT = VT;
14618   }
14619
14620   // At this point the operands and the result should have the same
14621   // type, and that won't be f80 since that is not custom lowered.
14622
14623   const fltSemantics &Sem =
14624       VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
14625   const unsigned SizeInBits = VT.getSizeInBits();
14626
14627   SmallVector<Constant *, 4> CV(
14628       VT == MVT::f64 ? 2 : 4,
14629       ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
14630
14631   // First, clear all bits but the sign bit from the second operand (sign).
14632   CV[0] = ConstantFP::get(*Context,
14633                           APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
14634   Constant *C = ConstantVector::get(CV);
14635   SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14636   SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
14637                               MachinePointerInfo::getConstantPool(),
14638                               false, false, false, 16);
14639   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
14640
14641   // Next, clear the sign bit from the first operand (magnitude).
14642   // If it's a constant, we can clear it here.
14643   if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
14644     APFloat APF = Op0CN->getValueAPF();
14645     // If the magnitude is a positive zero, the sign bit alone is enough.
14646     if (APF.isPosZero())
14647       return SignBit;
14648     APF.clearSign();
14649     CV[0] = ConstantFP::get(*Context, APF);
14650   } else {
14651     CV[0] = ConstantFP::get(
14652         *Context,
14653         APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
14654   }
14655   C = ConstantVector::get(CV);
14656   CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16);
14657   SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
14658                             MachinePointerInfo::getConstantPool(),
14659                             false, false, false, 16);
14660   // If the magnitude operand wasn't a constant, we need to AND out the sign.
14661   if (!isa<ConstantFPSDNode>(Op0))
14662     Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val);
14663
14664   // OR the magnitude value with the sign bit.
14665   return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
14666 }
14667
14668 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
14669   SDValue N0 = Op.getOperand(0);
14670   SDLoc dl(Op);
14671   MVT VT = Op.getSimpleValueType();
14672
14673   // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
14674   SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
14675                                   DAG.getConstant(1, VT));
14676   return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
14677 }
14678
14679 // Check whether an OR'd tree is PTEST-able.
14680 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
14681                                       SelectionDAG &DAG) {
14682   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
14683
14684   if (!Subtarget->hasSSE41())
14685     return SDValue();
14686
14687   if (!Op->hasOneUse())
14688     return SDValue();
14689
14690   SDNode *N = Op.getNode();
14691   SDLoc DL(N);
14692
14693   SmallVector<SDValue, 8> Opnds;
14694   DenseMap<SDValue, unsigned> VecInMap;
14695   SmallVector<SDValue, 8> VecIns;
14696   EVT VT = MVT::Other;
14697
14698   // Recognize a special case where a vector is casted into wide integer to
14699   // test all 0s.
14700   Opnds.push_back(N->getOperand(0));
14701   Opnds.push_back(N->getOperand(1));
14702
14703   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
14704     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
14705     // BFS traverse all OR'd operands.
14706     if (I->getOpcode() == ISD::OR) {
14707       Opnds.push_back(I->getOperand(0));
14708       Opnds.push_back(I->getOperand(1));
14709       // Re-evaluate the number of nodes to be traversed.
14710       e += 2; // 2 more nodes (LHS and RHS) are pushed.
14711       continue;
14712     }
14713
14714     // Quit if a non-EXTRACT_VECTOR_ELT
14715     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14716       return SDValue();
14717
14718     // Quit if without a constant index.
14719     SDValue Idx = I->getOperand(1);
14720     if (!isa<ConstantSDNode>(Idx))
14721       return SDValue();
14722
14723     SDValue ExtractedFromVec = I->getOperand(0);
14724     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
14725     if (M == VecInMap.end()) {
14726       VT = ExtractedFromVec.getValueType();
14727       // Quit if not 128/256-bit vector.
14728       if (!VT.is128BitVector() && !VT.is256BitVector())
14729         return SDValue();
14730       // Quit if not the same type.
14731       if (VecInMap.begin() != VecInMap.end() &&
14732           VT != VecInMap.begin()->first.getValueType())
14733         return SDValue();
14734       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
14735       VecIns.push_back(ExtractedFromVec);
14736     }
14737     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
14738   }
14739
14740   assert((VT.is128BitVector() || VT.is256BitVector()) &&
14741          "Not extracted from 128-/256-bit vector.");
14742
14743   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
14744
14745   for (DenseMap<SDValue, unsigned>::const_iterator
14746         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
14747     // Quit if not all elements are used.
14748     if (I->second != FullMask)
14749       return SDValue();
14750   }
14751
14752   EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
14753
14754   // Cast all vectors into TestVT for PTEST.
14755   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
14756     VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
14757
14758   // If more than one full vectors are evaluated, OR them first before PTEST.
14759   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
14760     // Each iteration will OR 2 nodes and append the result until there is only
14761     // 1 node left, i.e. the final OR'd value of all vectors.
14762     SDValue LHS = VecIns[Slot];
14763     SDValue RHS = VecIns[Slot + 1];
14764     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
14765   }
14766
14767   return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
14768                      VecIns.back(), VecIns.back());
14769 }
14770
14771 /// \brief return true if \c Op has a use that doesn't just read flags.
14772 static bool hasNonFlagsUse(SDValue Op) {
14773   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
14774        ++UI) {
14775     SDNode *User = *UI;
14776     unsigned UOpNo = UI.getOperandNo();
14777     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
14778       // Look pass truncate.
14779       UOpNo = User->use_begin().getOperandNo();
14780       User = *User->use_begin();
14781     }
14782
14783     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
14784         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
14785       return true;
14786   }
14787   return false;
14788 }
14789
14790 /// Emit nodes that will be selected as "test Op0,Op0", or something
14791 /// equivalent.
14792 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
14793                                     SelectionDAG &DAG) const {
14794   if (Op.getValueType() == MVT::i1)
14795     // KORTEST instruction should be selected
14796     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
14797                        DAG.getConstant(0, Op.getValueType()));
14798
14799   // CF and OF aren't always set the way we want. Determine which
14800   // of these we need.
14801   bool NeedCF = false;
14802   bool NeedOF = false;
14803   switch (X86CC) {
14804   default: break;
14805   case X86::COND_A: case X86::COND_AE:
14806   case X86::COND_B: case X86::COND_BE:
14807     NeedCF = true;
14808     break;
14809   case X86::COND_G: case X86::COND_GE:
14810   case X86::COND_L: case X86::COND_LE:
14811   case X86::COND_O: case X86::COND_NO: {
14812     // Check if we really need to set the
14813     // Overflow flag. If NoSignedWrap is present
14814     // that is not actually needed.
14815     switch (Op->getOpcode()) {
14816     case ISD::ADD:
14817     case ISD::SUB:
14818     case ISD::MUL:
14819     case ISD::SHL: {
14820       const BinaryWithFlagsSDNode *BinNode =
14821           cast<BinaryWithFlagsSDNode>(Op.getNode());
14822       if (BinNode->hasNoSignedWrap())
14823         break;
14824     }
14825     default:
14826       NeedOF = true;
14827       break;
14828     }
14829     break;
14830   }
14831   }
14832   // See if we can use the EFLAGS value from the operand instead of
14833   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
14834   // we prove that the arithmetic won't overflow, we can't use OF or CF.
14835   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
14836     // Emit a CMP with 0, which is the TEST pattern.
14837     //if (Op.getValueType() == MVT::i1)
14838     //  return DAG.getNode(X86ISD::CMP, dl, MVT::i1, Op,
14839     //                     DAG.getConstant(0, MVT::i1));
14840     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
14841                        DAG.getConstant(0, Op.getValueType()));
14842   }
14843   unsigned Opcode = 0;
14844   unsigned NumOperands = 0;
14845
14846   // Truncate operations may prevent the merge of the SETCC instruction
14847   // and the arithmetic instruction before it. Attempt to truncate the operands
14848   // of the arithmetic instruction and use a reduced bit-width instruction.
14849   bool NeedTruncation = false;
14850   SDValue ArithOp = Op;
14851   if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
14852     SDValue Arith = Op->getOperand(0);
14853     // Both the trunc and the arithmetic op need to have one user each.
14854     if (Arith->hasOneUse())
14855       switch (Arith.getOpcode()) {
14856         default: break;
14857         case ISD::ADD:
14858         case ISD::SUB:
14859         case ISD::AND:
14860         case ISD::OR:
14861         case ISD::XOR: {
14862           NeedTruncation = true;
14863           ArithOp = Arith;
14864         }
14865       }
14866   }
14867
14868   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
14869   // which may be the result of a CAST.  We use the variable 'Op', which is the
14870   // non-casted variable when we check for possible users.
14871   switch (ArithOp.getOpcode()) {
14872   case ISD::ADD:
14873     // Due to an isel shortcoming, be conservative if this add is likely to be
14874     // selected as part of a load-modify-store instruction. When the root node
14875     // in a match is a store, isel doesn't know how to remap non-chain non-flag
14876     // uses of other nodes in the match, such as the ADD in this case. This
14877     // leads to the ADD being left around and reselected, with the result being
14878     // two adds in the output.  Alas, even if none our users are stores, that
14879     // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
14880     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
14881     // climbing the DAG back to the root, and it doesn't seem to be worth the
14882     // effort.
14883     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
14884          UE = Op.getNode()->use_end(); UI != UE; ++UI)
14885       if (UI->getOpcode() != ISD::CopyToReg &&
14886           UI->getOpcode() != ISD::SETCC &&
14887           UI->getOpcode() != ISD::STORE)
14888         goto default_case;
14889
14890     if (ConstantSDNode *C =
14891         dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
14892       // An add of one will be selected as an INC.
14893       if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) {
14894         Opcode = X86ISD::INC;
14895         NumOperands = 1;
14896         break;
14897       }
14898
14899       // An add of negative one (subtract of one) will be selected as a DEC.
14900       if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) {
14901         Opcode = X86ISD::DEC;
14902         NumOperands = 1;
14903         break;
14904       }
14905     }
14906
14907     // Otherwise use a regular EFLAGS-setting add.
14908     Opcode = X86ISD::ADD;
14909     NumOperands = 2;
14910     break;
14911   case ISD::SHL:
14912   case ISD::SRL:
14913     // If we have a constant logical shift that's only used in a comparison
14914     // against zero turn it into an equivalent AND. This allows turning it into
14915     // a TEST instruction later.
14916     if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
14917         isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
14918       EVT VT = Op.getValueType();
14919       unsigned BitWidth = VT.getSizeInBits();
14920       unsigned ShAmt = Op->getConstantOperandVal(1);
14921       if (ShAmt >= BitWidth) // Avoid undefined shifts.
14922         break;
14923       APInt Mask = ArithOp.getOpcode() == ISD::SRL
14924                        ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
14925                        : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
14926       if (!Mask.isSignedIntN(32)) // Avoid large immediates.
14927         break;
14928       SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
14929                                 DAG.getConstant(Mask, VT));
14930       DAG.ReplaceAllUsesWith(Op, New);
14931       Op = New;
14932     }
14933     break;
14934
14935   case ISD::AND:
14936     // If the primary and result isn't used, don't bother using X86ISD::AND,
14937     // because a TEST instruction will be better.
14938     if (!hasNonFlagsUse(Op))
14939       break;
14940     // FALL THROUGH
14941   case ISD::SUB:
14942   case ISD::OR:
14943   case ISD::XOR:
14944     // Due to the ISEL shortcoming noted above, be conservative if this op is
14945     // likely to be selected as part of a load-modify-store instruction.
14946     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
14947            UE = Op.getNode()->use_end(); UI != UE; ++UI)
14948       if (UI->getOpcode() == ISD::STORE)
14949         goto default_case;
14950
14951     // Otherwise use a regular EFLAGS-setting instruction.
14952     switch (ArithOp.getOpcode()) {
14953     default: llvm_unreachable("unexpected operator!");
14954     case ISD::SUB: Opcode = X86ISD::SUB; break;
14955     case ISD::XOR: Opcode = X86ISD::XOR; break;
14956     case ISD::AND: Opcode = X86ISD::AND; break;
14957     case ISD::OR: {
14958       if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
14959         SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
14960         if (EFLAGS.getNode())
14961           return EFLAGS;
14962       }
14963       Opcode = X86ISD::OR;
14964       break;
14965     }
14966     }
14967
14968     NumOperands = 2;
14969     break;
14970   case X86ISD::ADD:
14971   case X86ISD::SUB:
14972   case X86ISD::INC:
14973   case X86ISD::DEC:
14974   case X86ISD::OR:
14975   case X86ISD::XOR:
14976   case X86ISD::AND:
14977     return SDValue(Op.getNode(), 1);
14978   default:
14979   default_case:
14980     break;
14981   }
14982
14983   // If we found that truncation is beneficial, perform the truncation and
14984   // update 'Op'.
14985   if (NeedTruncation) {
14986     EVT VT = Op.getValueType();
14987     SDValue WideVal = Op->getOperand(0);
14988     EVT WideVT = WideVal.getValueType();
14989     unsigned ConvertedOp = 0;
14990     // Use a target machine opcode to prevent further DAGCombine
14991     // optimizations that may separate the arithmetic operations
14992     // from the setcc node.
14993     switch (WideVal.getOpcode()) {
14994       default: break;
14995       case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
14996       case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
14997       case ISD::AND: ConvertedOp = X86ISD::AND; break;
14998       case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
14999       case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
15000     }
15001
15002     if (ConvertedOp) {
15003       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15004       if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
15005         SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
15006         SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
15007         Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
15008       }
15009     }
15010   }
15011
15012   if (Opcode == 0)
15013     // Emit a CMP with 0, which is the TEST pattern.
15014     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15015                        DAG.getConstant(0, Op.getValueType()));
15016
15017   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
15018   SmallVector<SDValue, 4> Ops;
15019   for (unsigned i = 0; i != NumOperands; ++i)
15020     Ops.push_back(Op.getOperand(i));
15021
15022   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
15023   DAG.ReplaceAllUsesWith(Op, New);
15024   return SDValue(New.getNode(), 1);
15025 }
15026
15027 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
15028 /// equivalent.
15029 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
15030                                    SDLoc dl, SelectionDAG &DAG) const {
15031   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
15032     if (C->getAPIntValue() == 0)
15033       return EmitTest(Op0, X86CC, dl, DAG);
15034
15035      if (Op0.getValueType() == MVT::i1)
15036        llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
15037   }
15038
15039   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
15040        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
15041     // Do the comparison at i32 if it's smaller, besides the Atom case.
15042     // This avoids subregister aliasing issues. Keep the smaller reference
15043     // if we're optimizing for size, however, as that'll allow better folding
15044     // of memory operations.
15045     if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 &&
15046         !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
15047              AttributeSet::FunctionIndex, Attribute::MinSize) &&
15048         !Subtarget->isAtom()) {
15049       unsigned ExtendOp =
15050           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
15051       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
15052       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
15053     }
15054     // Use SUB instead of CMP to enable CSE between SUB and CMP.
15055     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
15056     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
15057                               Op0, Op1);
15058     return SDValue(Sub.getNode(), 1);
15059   }
15060   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
15061 }
15062
15063 /// Convert a comparison if required by the subtarget.
15064 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
15065                                                  SelectionDAG &DAG) const {
15066   // If the subtarget does not support the FUCOMI instruction, floating-point
15067   // comparisons have to be converted.
15068   if (Subtarget->hasCMov() ||
15069       Cmp.getOpcode() != X86ISD::CMP ||
15070       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
15071       !Cmp.getOperand(1).getValueType().isFloatingPoint())
15072     return Cmp;
15073
15074   // The instruction selector will select an FUCOM instruction instead of
15075   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
15076   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
15077   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
15078   SDLoc dl(Cmp);
15079   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
15080   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
15081   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
15082                             DAG.getConstant(8, MVT::i8));
15083   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
15084   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
15085 }
15086
15087 /// The minimum architected relative accuracy is 2^-12. We need one
15088 /// Newton-Raphson step to have a good float result (24 bits of precision).
15089 SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
15090                                             DAGCombinerInfo &DCI,
15091                                             unsigned &RefinementSteps,
15092                                             bool &UseOneConstNR) const {
15093   // FIXME: We should use instruction latency models to calculate the cost of
15094   // each potential sequence, but this is very hard to do reliably because
15095   // at least Intel's Core* chips have variable timing based on the number of
15096   // significant digits in the divisor and/or sqrt operand.
15097   if (!Subtarget->useSqrtEst())
15098     return SDValue();
15099
15100   EVT VT = Op.getValueType();
15101
15102   // SSE1 has rsqrtss and rsqrtps.
15103   // TODO: Add support for AVX512 (v16f32).
15104   // It is likely not profitable to do this for f64 because a double-precision
15105   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
15106   // instructions: convert to single, rsqrtss, convert back to double, refine
15107   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
15108   // along with FMA, this could be a throughput win.
15109   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15110       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15111     RefinementSteps = 1;
15112     UseOneConstNR = false;
15113     return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
15114   }
15115   return SDValue();
15116 }
15117
15118 /// The minimum architected relative accuracy is 2^-12. We need one
15119 /// Newton-Raphson step to have a good float result (24 bits of precision).
15120 SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
15121                                             DAGCombinerInfo &DCI,
15122                                             unsigned &RefinementSteps) const {
15123   // FIXME: We should use instruction latency models to calculate the cost of
15124   // each potential sequence, but this is very hard to do reliably because
15125   // at least Intel's Core* chips have variable timing based on the number of
15126   // significant digits in the divisor.
15127   if (!Subtarget->useReciprocalEst())
15128     return SDValue();
15129
15130   EVT VT = Op.getValueType();
15131
15132   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
15133   // TODO: Add support for AVX512 (v16f32).
15134   // It is likely not profitable to do this for f64 because a double-precision
15135   // reciprocal estimate with refinement on x86 prior to FMA requires
15136   // 15 instructions: convert to single, rcpss, convert back to double, refine
15137   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
15138   // along with FMA, this could be a throughput win.
15139   if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
15140       (Subtarget->hasAVX() && VT == MVT::v8f32)) {
15141     RefinementSteps = ReciprocalEstimateRefinementSteps;
15142     return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
15143   }
15144   return SDValue();
15145 }
15146
15147 static bool isAllOnes(SDValue V) {
15148   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
15149   return C && C->isAllOnesValue();
15150 }
15151
15152 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
15153 /// if it's possible.
15154 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
15155                                      SDLoc dl, SelectionDAG &DAG) const {
15156   SDValue Op0 = And.getOperand(0);
15157   SDValue Op1 = And.getOperand(1);
15158   if (Op0.getOpcode() == ISD::TRUNCATE)
15159     Op0 = Op0.getOperand(0);
15160   if (Op1.getOpcode() == ISD::TRUNCATE)
15161     Op1 = Op1.getOperand(0);
15162
15163   SDValue LHS, RHS;
15164   if (Op1.getOpcode() == ISD::SHL)
15165     std::swap(Op0, Op1);
15166   if (Op0.getOpcode() == ISD::SHL) {
15167     if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
15168       if (And00C->getZExtValue() == 1) {
15169         // If we looked past a truncate, check that it's only truncating away
15170         // known zeros.
15171         unsigned BitWidth = Op0.getValueSizeInBits();
15172         unsigned AndBitWidth = And.getValueSizeInBits();
15173         if (BitWidth > AndBitWidth) {
15174           APInt Zeros, Ones;
15175           DAG.computeKnownBits(Op0, Zeros, Ones);
15176           if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
15177             return SDValue();
15178         }
15179         LHS = Op1;
15180         RHS = Op0.getOperand(1);
15181       }
15182   } else if (Op1.getOpcode() == ISD::Constant) {
15183     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
15184     uint64_t AndRHSVal = AndRHS->getZExtValue();
15185     SDValue AndLHS = Op0;
15186
15187     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
15188       LHS = AndLHS.getOperand(0);
15189       RHS = AndLHS.getOperand(1);
15190     }
15191
15192     // Use BT if the immediate can't be encoded in a TEST instruction.
15193     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
15194       LHS = AndLHS;
15195       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType());
15196     }
15197   }
15198
15199   if (LHS.getNode()) {
15200     // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
15201     // instruction.  Since the shift amount is in-range-or-undefined, we know
15202     // that doing a bittest on the i32 value is ok.  We extend to i32 because
15203     // the encoding for the i16 version is larger than the i32 version.
15204     // Also promote i16 to i32 for performance / code size reason.
15205     if (LHS.getValueType() == MVT::i8 ||
15206         LHS.getValueType() == MVT::i16)
15207       LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
15208
15209     // If the operand types disagree, extend the shift amount to match.  Since
15210     // BT ignores high bits (like shifts) we can use anyextend.
15211     if (LHS.getValueType() != RHS.getValueType())
15212       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
15213
15214     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
15215     X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
15216     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15217                        DAG.getConstant(Cond, MVT::i8), BT);
15218   }
15219
15220   return SDValue();
15221 }
15222
15223 /// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
15224 /// mask CMPs.
15225 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
15226                               SDValue &Op1) {
15227   unsigned SSECC;
15228   bool Swap = false;
15229
15230   // SSE Condition code mapping:
15231   //  0 - EQ
15232   //  1 - LT
15233   //  2 - LE
15234   //  3 - UNORD
15235   //  4 - NEQ
15236   //  5 - NLT
15237   //  6 - NLE
15238   //  7 - ORD
15239   switch (SetCCOpcode) {
15240   default: llvm_unreachable("Unexpected SETCC condition");
15241   case ISD::SETOEQ:
15242   case ISD::SETEQ:  SSECC = 0; break;
15243   case ISD::SETOGT:
15244   case ISD::SETGT:  Swap = true; // Fallthrough
15245   case ISD::SETLT:
15246   case ISD::SETOLT: SSECC = 1; break;
15247   case ISD::SETOGE:
15248   case ISD::SETGE:  Swap = true; // Fallthrough
15249   case ISD::SETLE:
15250   case ISD::SETOLE: SSECC = 2; break;
15251   case ISD::SETUO:  SSECC = 3; break;
15252   case ISD::SETUNE:
15253   case ISD::SETNE:  SSECC = 4; break;
15254   case ISD::SETULE: Swap = true; // Fallthrough
15255   case ISD::SETUGE: SSECC = 5; break;
15256   case ISD::SETULT: Swap = true; // Fallthrough
15257   case ISD::SETUGT: SSECC = 6; break;
15258   case ISD::SETO:   SSECC = 7; break;
15259   case ISD::SETUEQ:
15260   case ISD::SETONE: SSECC = 8; break;
15261   }
15262   if (Swap)
15263     std::swap(Op0, Op1);
15264
15265   return SSECC;
15266 }
15267
15268 // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
15269 // ones, and then concatenate the result back.
15270 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
15271   MVT VT = Op.getSimpleValueType();
15272
15273   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
15274          "Unsupported value type for operation");
15275
15276   unsigned NumElems = VT.getVectorNumElements();
15277   SDLoc dl(Op);
15278   SDValue CC = Op.getOperand(2);
15279
15280   // Extract the LHS vectors
15281   SDValue LHS = Op.getOperand(0);
15282   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
15283   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
15284
15285   // Extract the RHS vectors
15286   SDValue RHS = Op.getOperand(1);
15287   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
15288   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
15289
15290   // Issue the operation on the smaller types and concatenate the result back
15291   MVT EltVT = VT.getVectorElementType();
15292   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
15293   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
15294                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
15295                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
15296 }
15297
15298 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
15299                                      const X86Subtarget *Subtarget) {
15300   SDValue Op0 = Op.getOperand(0);
15301   SDValue Op1 = Op.getOperand(1);
15302   SDValue CC = Op.getOperand(2);
15303   MVT VT = Op.getSimpleValueType();
15304   SDLoc dl(Op);
15305
15306   assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 &&
15307          Op.getValueType().getScalarType() == MVT::i1 &&
15308          "Cannot set masked compare for this operation");
15309
15310   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15311   unsigned  Opc = 0;
15312   bool Unsigned = false;
15313   bool Swap = false;
15314   unsigned SSECC;
15315   switch (SetCCOpcode) {
15316   default: llvm_unreachable("Unexpected SETCC condition");
15317   case ISD::SETNE:  SSECC = 4; break;
15318   case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
15319   case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
15320   case ISD::SETLT:  Swap = true; //fall-through
15321   case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
15322   case ISD::SETULT: SSECC = 1; Unsigned = true; break;
15323   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
15324   case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
15325   case ISD::SETULE: Unsigned = true; //fall-through
15326   case ISD::SETLE:  SSECC = 2; break;
15327   }
15328
15329   if (Swap)
15330     std::swap(Op0, Op1);
15331   if (Opc)
15332     return DAG.getNode(Opc, dl, VT, Op0, Op1);
15333   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
15334   return DAG.getNode(Opc, dl, VT, Op0, Op1,
15335                      DAG.getConstant(SSECC, MVT::i8));
15336 }
15337
15338 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
15339 /// operand \p Op1.  If non-trivial (for example because it's not constant)
15340 /// return an empty value.
15341 static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
15342 {
15343   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
15344   if (!BV)
15345     return SDValue();
15346
15347   MVT VT = Op1.getSimpleValueType();
15348   MVT EVT = VT.getVectorElementType();
15349   unsigned n = VT.getVectorNumElements();
15350   SmallVector<SDValue, 8> ULTOp1;
15351
15352   for (unsigned i = 0; i < n; ++i) {
15353     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
15354     if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT)
15355       return SDValue();
15356
15357     // Avoid underflow.
15358     APInt Val = Elt->getAPIntValue();
15359     if (Val == 0)
15360       return SDValue();
15361
15362     ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
15363   }
15364
15365   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
15366 }
15367
/// Lower a vector ISD::SETCC node \p Op to X86-specific compare nodes.
///
/// Operands 0 and 1 of \p Op are the values being compared and operand 2 is
/// the condition code.  The lowering proceeds as follows:
///  - Floating-point compares become X86ISD::CMPP, or X86ISD::CMPM when
///    AVX-512 is available and the result element type is i1.  SETUEQ and
///    SETONE (translated predicate 8) are emitted as two compares combined
///    with OR / AND respectively.
///  - 256-bit integer compares without Int256 go to Lower256IntVSETCC.
///  - With AVX-512, compares on 512-bit operands, any compare when BWI and
///    VLX are both present, and mask-producing compares on elements of at
///    least 32 bits go to LowerIntVSETCC_AVX512.  Mask-producing compares on
///    8..31-bit elements are done in the operand type and then truncated.
///  - The remaining integer compares are built from PCMPEQ / PCMPGT, with an
///    optional operand swap, sign-bit flip (for unsigned predicates) and a
///    final NOT, or from UMIN/UMAX or SUBUS followed by a PCMPEQ.  For v2i64,
///    PCMPGT without SSE4.2 and PCMPEQ without SSE4.1 are emulated on v4i32.
///
/// \returns the value that replaces \p Op.
15368 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
15369                            SelectionDAG &DAG) {
15370   SDValue Op0 = Op.getOperand(0);
15371   SDValue Op1 = Op.getOperand(1);
15372   SDValue CC = Op.getOperand(2);
15373   MVT VT = Op.getSimpleValueType();
15374   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15375   bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
15376   SDLoc dl(Op);
15377
15378   if (isFP) {
15379 #ifndef NDEBUG
15380     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
15381     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
15382 #endif
15383
15384     unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
15385     unsigned Opc = X86ISD::CMPP;
15386     if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
15387       assert(VT.getVectorNumElements() <= 16);
15388       Opc = X86ISD::CMPM;
15389     }
15390     // In the two special cases we can't handle, emit two comparisons.
15391     if (SSECC == 8) {
15392       unsigned CC0, CC1;
15393       unsigned CombineOpc;
15394       if (SetCCOpcode == ISD::SETUEQ) {
15395         CC0 = 3; CC1 = 0; CombineOpc = ISD::OR;
15396       } else {
15397         assert(SetCCOpcode == ISD::SETONE);
15398         CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
15399       }
15400
15401       SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15402                                  DAG.getConstant(CC0, MVT::i8));
15403       SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15404                                  DAG.getConstant(CC1, MVT::i8));
15405       return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
15406     }
15407     // Handle all other FP comparisons here.
15408     return DAG.getNode(Opc, dl, VT, Op0, Op1,
15409                        DAG.getConstant(SSECC, MVT::i8));
15410   }
15411
15412   // Break 256-bit integer vector compare into smaller ones.
15413   if (VT.is256BitVector() && !Subtarget->hasInt256())
15414     return Lower256IntVSETCC(Op, DAG);
15415
15416   bool MaskResult = (VT.getVectorElementType() == MVT::i1);
15417   EVT OpVT = Op1.getValueType();
15418   if (Subtarget->hasAVX512()) {
15419     if (Op1.getValueType().is512BitVector() ||
15420         (Subtarget->hasBWI() && Subtarget->hasVLX()) ||
15421         (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
15422       return LowerIntVSETCC_AVX512(Op, DAG, Subtarget);
15423
15424     // In AVX-512 architecture setcc returns mask with i1 elements,
15425     // But there is no compare instruction for i8 and i16 elements in KNL.
15426     // We are not talking about 512-bit operands in this case, these
15427     // types are illegal.
15428     if (MaskResult &&
15429         (OpVT.getVectorElementType().getSizeInBits() < 32 &&
15430          OpVT.getVectorElementType().getSizeInBits() >= 8))
15431       return DAG.getNode(ISD::TRUNCATE, dl, VT,
15432                          DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
15433   }
15434
15435   // We are handling one of the integer comparisons here.  Since SSE only has
15436   // GT and EQ comparisons for integer, swapping operands and multiple
15437   // operations may be required for some comparisons.
15438   unsigned Opc;
15439   bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
15440   bool Subus = false;
15441
15442   switch (SetCCOpcode) {
15443   default: llvm_unreachable("Unexpected SETCC condition");
15444   case ISD::SETNE:  Invert = true; // fall through
15445   case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
15446   case ISD::SETLT:  Swap = true; // fall through
15447   case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
15448   case ISD::SETGE:  Swap = true; // fall through
15449   case ISD::SETLE:  Opc = X86ISD::PCMPGT;
15450                     Invert = true; break;
15451   case ISD::SETULT: Swap = true; // fall through
15452   case ISD::SETUGT: Opc = X86ISD::PCMPGT;
15453                     FlipSigns = true; break;
15454   case ISD::SETUGE: Swap = true; // fall through
15455   case ISD::SETULE: Opc = X86ISD::PCMPGT;
15456                     FlipSigns = true; Invert = true; break;
15457   }
15458
15459   // Special case: Use min/max operations for SETULE/SETUGE
        // (a u<= b  <=>  umin(a, b) == a, and a u>= b  <=>  umax(a, b) == a; the
        // equality test is the PCMPEQ against Op0 emitted at the end).
15460   MVT VET = VT.getVectorElementType();
15461   bool hasMinMax =
15462        (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
15463     || (Subtarget->hasSSE2()  && (VET == MVT::i8));
15464
15465   if (hasMinMax) {
15466     switch (SetCCOpcode) {
15467     default: break;
15468     case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
15469     case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
15470     }
15471
15472     if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
15473   }
15474
15475   bool hasSubus = Subtarget->hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
15476   if (!MinMax && hasSubus) {
15477     // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
15478     // Op0 u<= Op1:
15479     //   t = psubus Op0, Op1
15480     //   pcmpeq t, <0..0>
15481     switch (SetCCOpcode) {
15482     default: break;
15483     case ISD::SETULT: {
15484       // If the comparison is against a constant we can turn this into a
15485       // setule.  With psubus, setule does not require a swap.  This is
15486       // beneficial because the constant in the register is no longer
15487       // destructed as the destination so it can be hoisted out of a loop.
15488       // Only do this pre-AVX since vpcmp* is no longer destructive.
15489       if (Subtarget->hasAVX())
15490         break;
15491       SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
15492       if (ULEOp1.getNode()) {
15493         Op1 = ULEOp1;
15494         Subus = true; Invert = false; Swap = false;
15495       }
15496       break;
15497     }
15498     // Psubus is better than flip-sign because it requires no inversion.
15499     case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
15500     case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
15501     }
15502
15503     if (Subus) {
15504       Opc = X86ISD::SUBUS;
15505       FlipSigns = false;
15506     }
15507   }
15508
15509   if (Swap)
15510     std::swap(Op0, Op1);
15511
15512   // Check that the operation in question is available (most are plain SSE2,
15513   // but PCMPGTQ and PCMPEQQ have different requirements).
15514   if (VT == MVT::v2i64) {
15515     if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
15516       assert(Subtarget->hasSSE2() && "Don't know how to lower!");
15517
15518       // First cast everything to the right type.
15519       Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
15520       Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
15521
15522       // Since SSE has no unsigned integer comparisons, we need to flip the sign
15523       // bits of the inputs before performing those operations. The lower
15524       // compare is always unsigned.
15525       SDValue SB;
15526       if (FlipSigns) {
15527         SB = DAG.getConstant(0x80000000U, MVT::v4i32);
15528       } else {
15529         SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
15530         SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
15531         SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
15532                          Sign, Zero, Sign, Zero);
15533       }
15534       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
15535       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
15536
15537       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
15538       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
15539       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
15540
15541       // Create masks for only the low parts/high parts of the 64 bit integers.
15542       static const int MaskHi[] = { 1, 1, 3, 3 };
15543       static const int MaskLo[] = { 0, 0, 2, 2 };
15544       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
15545       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
15546       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
15547
15548       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
15549       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
15550
15551       if (Invert)
15552         Result = DAG.getNOT(dl, Result, MVT::v4i32);
15553
15554       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
15555     }
15556
15557     if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
15558       // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
15559       // pcmpeqd + pshufd + pand.
15560       assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
15561
15562       // First cast everything to the right type.
15563       Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
15564       Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
15565
15566       // Do the compare.
15567       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
15568
15569       // Make sure the lower and upper halves are both all-ones.
15570       static const int Mask[] = { 1, 0, 3, 2 };
15571       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
15572       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
15573
15574       if (Invert)
15575         Result = DAG.getNOT(dl, Result, MVT::v4i32);
15576
15577       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
15578     }
15579   }
15580
15581   // Since SSE has no unsigned integer comparisons, we need to flip the sign
15582   // bits of the inputs before performing those operations.
15583   if (FlipSigns) {
15584     EVT EltVT = VT.getVectorElementType();
15585     SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
15586     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
15587     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
15588   }
15589
15590   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
15591
15592   // If the logical-not of the result is required, perform that now.
15593   if (Invert)
15594     Result = DAG.getNOT(dl, Result, VT);
15595
15596   if (MinMax)
15597     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
15598
15599   if (Subus)
15600     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
15601                          getZeroVector(VT, Subtarget, DAG, dl));
15602
15603   return Result;
15604 }
15605
15606 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
15607
15608   MVT VT = Op.getSimpleValueType();
15609
15610   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
15611
15612   assert(((!Subtarget->hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
15613          && "SetCC type must be 8-bit or 1-bit integer");
15614   SDValue Op0 = Op.getOperand(0);
15615   SDValue Op1 = Op.getOperand(1);
15616   SDLoc dl(Op);
15617   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15618
15619   // Optimize to BT if possible.
15620   // Lower (X & (1 << N)) == 0 to BT(X, N).
15621   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
15622   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
15623   if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
15624       Op1.getOpcode() == ISD::Constant &&
15625       cast<ConstantSDNode>(Op1)->isNullValue() &&
15626       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15627     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
15628     if (NewSetCC.getNode()) {
15629       if (VT == MVT::i1)
15630         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
15631       return NewSetCC;
15632     }
15633   }
15634
15635   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
15636   // these.
15637   if (Op1.getOpcode() == ISD::Constant &&
15638       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
15639        cast<ConstantSDNode>(Op1)->isNullValue()) &&
15640       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15641
15642     // If the input is a setcc, then reuse the input setcc or use a new one with
15643     // the inverted condition.
15644     if (Op0.getOpcode() == X86ISD::SETCC) {
15645       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
15646       bool Invert = (CC == ISD::SETNE) ^
15647         cast<ConstantSDNode>(Op1)->isNullValue();
15648       if (!Invert)
15649         return Op0;
15650
15651       CCode = X86::GetOppositeBranchCondition(CCode);
15652       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15653                                   DAG.getConstant(CCode, MVT::i8),
15654                                   Op0.getOperand(1));
15655       if (VT == MVT::i1)
15656         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15657       return SetCC;
15658     }
15659   }
15660   if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) &&
15661       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) &&
15662       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15663
15664     ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
15665     return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, MVT::i1), NewCC);
15666   }
15667
15668   bool isFP = Op1.getSimpleValueType().isFloatingPoint();
15669   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
15670   if (X86CC == X86::COND_INVALID)
15671     return SDValue();
15672
15673   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
15674   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
15675   SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15676                               DAG.getConstant(X86CC, MVT::i8), EFLAGS);
15677   if (VT == MVT::i1)
15678     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15679   return SetCC;
15680 }
15681
15682 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
15683 static bool isX86LogicalCmp(SDValue Op) {
15684   unsigned Opc = Op.getNode()->getOpcode();
15685   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
15686       Opc == X86ISD::SAHF)
15687     return true;
15688   if (Op.getResNo() == 1 &&
15689       (Opc == X86ISD::ADD ||
15690        Opc == X86ISD::SUB ||
15691        Opc == X86ISD::ADC ||
15692        Opc == X86ISD::SBB ||
15693        Opc == X86ISD::SMUL ||
15694        Opc == X86ISD::UMUL ||
15695        Opc == X86ISD::INC ||
15696        Opc == X86ISD::DEC ||
15697        Opc == X86ISD::OR ||
15698        Opc == X86ISD::XOR ||
15699        Opc == X86ISD::AND))
15700     return true;
15701
15702   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
15703     return true;
15704
15705   return false;
15706 }
15707
15708 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
15709   if (V.getOpcode() != ISD::TRUNCATE)
15710     return false;
15711
15712   SDValue VOp0 = V.getOperand(0);
15713   unsigned InBits = VOp0.getValueSizeInBits();
15714   unsigned Bits = V.getValueSizeInBits();
15715   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
15716 }
15717
/// Custom lowering for ISD::SELECT (operand 0 is the condition, operands 1 and
/// 2 are the values selected when it is true / false).
///
/// The cases handled, in order:
///  - f32/f64 selects on a single-use SETCC of the same type, when the needed
///    SSE level is present and translateX86FSETCC does not return 8, become an
///    X86ISD::FSETCC mask combined with FAND/FANDN/FOR (or an X86ISD::SELECT on
///    an i1 FSETCC when AVX-512 is available).
///  - A SETCC condition is first lowered through LowerSETCC.
///  - Selects between all-ones and another value on an (x == 0) / (x != 0)
///    compare become X86ISD::SETCC_CARRY based sequences.
///  - The flags of an existing X86ISD::SETCC / SETCC_CARRY compare, or of an
///    overflow-arithmetic node (UADDO, SADDO, USUBO, SSUBO, and UMULO/SMULO on
///    non-i8 operands), are used directly where possible.
///  - Otherwise a bit test (LowerToBT) or an EmitTest with COND_NE is emitted.
///  - All-ones/zero selects on a COND_B / COND_AE result of an X86ISD::SUB
///    become X86ISD::SETCC_CARRY, optionally inverted.
///  - i8 selects of two truncates are widened to a CMOV on the wider type,
///    then truncated; everything else becomes an X86ISD::CMOV.
15718 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
15719   bool addTest = true;
15720   SDValue Cond  = Op.getOperand(0);
15721   SDValue Op1 = Op.getOperand(1);
15722   SDValue Op2 = Op.getOperand(2);
15723   SDLoc DL(Op);
15724   EVT VT = Op1.getValueType();
15725   SDValue CC;
15726
15727   // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
15728   // are available. Otherwise fp cmovs get lowered into a less efficient branch
15729   // sequence later on.
15730   if (Cond.getOpcode() == ISD::SETCC &&
15731       ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
15732        (Subtarget->hasSSE1() && VT == MVT::f32)) &&
15733       VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
15734     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
15735     int SSECC = translateX86FSETCC(
15736         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
15737
15738     if (SSECC != 8) {
15739       if (Subtarget->hasAVX512()) {
15740         SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
15741                                   DAG.getConstant(SSECC, MVT::i8));
15742         return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
15743       }
15744       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
15745                                 DAG.getConstant(SSECC, MVT::i8));
15746       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
15747       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
15748       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
15749     }
15750   }
15751
15752   if (Cond.getOpcode() == ISD::SETCC) {
15753     SDValue NewCond = LowerSETCC(Cond, DAG);
15754     if (NewCond.getNode())
15755       Cond = NewCond;
15756   }
15757
15758   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
15759   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
15760   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
15761   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
15762   if (Cond.getOpcode() == X86ISD::SETCC &&
15763       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
15764       isZero(Cond.getOperand(1).getOperand(1))) {
15765     SDValue Cmp = Cond.getOperand(1);
15766
15767     unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
15768
15769     if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
15770         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
15771       SDValue Y = isAllOnes(Op2) ? Op1 : Op2;
15772
15773       SDValue CmpOp0 = Cmp.getOperand(0);
15774       // Apply further optimizations for special cases
15775       // (select (x != 0), -1, 0) -> neg & sbb
15776       // (select (x == 0), 0, -1) -> neg & sbb
15777       if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y))
15778         if (YC->isNullValue() &&
15779             (isAllOnes(Op1) == (CondCode == X86::COND_NE))) {
15780           SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
15781           SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
15782                                     DAG.getConstant(0, CmpOp0.getValueType()),
15783                                     CmpOp0);
15784           SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15785                                     DAG.getConstant(X86::COND_B, MVT::i8),
15786                                     SDValue(Neg.getNode(), 1));
15787           return Res;
15788         }
15789
15790       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
15791                         CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
15792       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
15793
15794       SDValue Res =   // Res = 0 or -1.
15795         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15796                     DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
15797
15798       if (isAllOnes(Op1) != (CondCode == X86::COND_E))
15799         Res = DAG.getNOT(DL, Res, Res.getValueType());
15800
15801       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
15802       if (!N2C || !N2C->isNullValue())
15803         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
15804       return Res;
15805     }
15806   }
15807
15808   // Look past (and (setcc_carry (cmp ...)), 1).
15809   if (Cond.getOpcode() == ISD::AND &&
15810       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
15811     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
15812     if (C && C->getAPIntValue() == 1)
15813       Cond = Cond.getOperand(0);
15814   }
15815
15816   // If condition flag is set by a X86ISD::CMP, then use it as the condition
15817   // setting operand in place of the X86ISD::SETCC.
15818   unsigned CondOpcode = Cond.getOpcode();
15819   if (CondOpcode == X86ISD::SETCC ||
15820       CondOpcode == X86ISD::SETCC_CARRY) {
15821     CC = Cond.getOperand(0);
15822
15823     SDValue Cmp = Cond.getOperand(1);
15824     unsigned Opc = Cmp.getOpcode();
15825     MVT VT = Op.getSimpleValueType();
15826
15827     bool IllegalFPCMov = false;
15828     if (VT.isFloatingPoint() && !VT.isVector() &&
15829         !isScalarFPTypeInSSEReg(VT))  // FPStack?
15830       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
15831
15832     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
15833         Opc == X86ISD::BT) { // FIXME
15834       Cond = Cmp;
15835       addTest = false;
15836     }
15837   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
15838              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
15839              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
15840               Cond.getOperand(0).getValueType() != MVT::i8)) {
15841     SDValue LHS = Cond.getOperand(0);
15842     SDValue RHS = Cond.getOperand(1);
15843     unsigned X86Opcode;
15844     unsigned X86Cond;
15845     SDVTList VTs;
15846     switch (CondOpcode) {
15847     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
15848     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
15849     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
15850     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
15851     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
15852     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
15853     default: llvm_unreachable("unexpected overflowing operator");
15854     }
15855     if (CondOpcode == ISD::UMULO)
15856       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
15857                           MVT::i32);
15858     else
15859       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
15860
15861     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
15862
15863     if (CondOpcode == ISD::UMULO)
15864       Cond = X86Op.getValue(2);
15865     else
15866       Cond = X86Op.getValue(1);
15867
15868     CC = DAG.getConstant(X86Cond, MVT::i8);
15869     addTest = false;
15870   }
15871
15872   if (addTest) {
15873     // Look past the truncate if the high bits are known zero.
15874     if (isTruncWithZeroHighBitsInput(Cond, DAG))
15875         Cond = Cond.getOperand(0);
15876
15877     // We know the result of AND is compared against zero. Try to match
15878     // it to BT.
15879     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
15880       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG);
15881       if (NewSetCC.getNode()) {
15882         CC = NewSetCC.getOperand(0);
15883         Cond = NewSetCC.getOperand(1);
15884         addTest = false;
15885       }
15886     }
15887   }
15888
15889   if (addTest) {
15890     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
15891     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
15892   }
15893
15894   // a <  b ? -1 :  0 -> RES = setcc_carry
15895   // a <  b ?  0 : -1 -> RES = ~setcc_carry
15896   // a >= b ? -1 :  0 -> RES = ~setcc_carry
15897   // a >= b ?  0 : -1 -> RES = setcc_carry
15898   if (Cond.getOpcode() == X86ISD::SUB) {
15899     Cond = ConvertCmpIfNecessary(Cond, DAG);
15900     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
15901
15902     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
15903         (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) {
15904       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15905                                 DAG.getConstant(X86::COND_B, MVT::i8), Cond);
15906       if (isAllOnes(Op1) != (CondCode == X86::COND_B))
15907         return DAG.getNOT(DL, Res, Res.getValueType());
15908       return Res;
15909     }
15910   }
15911
15912   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
15913   // widen the cmov and push the truncate through. This avoids introducing a new
15914   // branch during isel and doesn't add any extensions.
15915   if (Op.getValueType() == MVT::i8 &&
15916       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
15917     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
15918     if (T1.getValueType() == T2.getValueType() &&
15919         // Blacklist CopyFromReg to avoid partial register stalls.
15920         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
15921       SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
15922       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
15923       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
15924     }
15925   }
15926
15927   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
15928   // condition is true.
15929   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
15930   SDValue Ops[] = { Op2, Op1, CC, Cond };
15931   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
15932 }
15933
15934 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget,
15935                                        SelectionDAG &DAG) {
15936   MVT VT = Op->getSimpleValueType(0);
15937   SDValue In = Op->getOperand(0);
15938   MVT InVT = In.getSimpleValueType();
15939   MVT VTElt = VT.getVectorElementType();
15940   MVT InVTElt = InVT.getVectorElementType();
15941   SDLoc dl(Op);
15942
15943   // SKX processor
15944   if ((InVTElt == MVT::i1) &&
15945       (((Subtarget->hasBWI() && Subtarget->hasVLX() &&
15946         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
15947
15948        ((Subtarget->hasBWI() && VT.is512BitVector() &&
15949         VTElt.getSizeInBits() <= 16)) ||
15950
15951        ((Subtarget->hasDQI() && Subtarget->hasVLX() &&
15952         VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
15953
15954        ((Subtarget->hasDQI() && VT.is512BitVector() &&
15955         VTElt.getSizeInBits() >= 32))))
15956     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
15957
15958   unsigned int NumElts = VT.getVectorNumElements();
15959
15960   if (NumElts != 8 && NumElts != 16)
15961     return SDValue();
15962
15963   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
15964     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
15965       return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
15966     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
15967   }
15968
15969   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15970   assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
15971
15972   MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
15973   Constant *C = ConstantInt::get(*DAG.getContext(),
15974     APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
15975
15976   SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
15977   unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
15978   SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
15979                           MachinePointerInfo::getConstantPool(),
15980                           false, false, false, Alignment);
15981   SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
15982   if (VT.is512BitVector())
15983     return Brcst;
15984   return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
15985 }
15986
15987 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
15988                                 SelectionDAG &DAG) {
15989   MVT VT = Op->getSimpleValueType(0);
15990   SDValue In = Op->getOperand(0);
15991   MVT InVT = In.getSimpleValueType();
15992   SDLoc dl(Op);
15993
15994   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
15995     return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
15996
15997   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
15998       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
15999       (VT != MVT::v16i16 || InVT != MVT::v16i8))
16000     return SDValue();
16001
16002   if (Subtarget->hasInt256())
16003     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16004
16005   // Optimize vectors in AVX mode
16006   // Sign extend  v8i16 to v8i32 and
16007   //              v4i32 to v4i64
16008   //
16009   // Divide input vector into two parts
16010   // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
16011   // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
16012   // concat the vectors to original VT
16013
16014   unsigned NumElems = InVT.getVectorNumElements();
16015   SDValue Undef = DAG.getUNDEF(InVT);
16016
16017   SmallVector<int,8> ShufMask1(NumElems, -1);
16018   for (unsigned i = 0; i != NumElems/2; ++i)
16019     ShufMask1[i] = i;
16020
16021   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
16022
16023   SmallVector<int,8> ShufMask2(NumElems, -1);
16024   for (unsigned i = 0; i != NumElems/2; ++i)
16025     ShufMask2[i] = i + NumElems/2;
16026
16027   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
16028
16029   MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
16030                                 VT.getVectorNumElements()/2);
16031
16032   OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
16033   OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
16034
16035   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16036 }
16037
16038 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
16039 // may emit an illegal shuffle but the expansion is still better than scalar
16040 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
16041 // we'll emit a shuffle and an arithmetic shift.
16042 // TODO: It is possible to support ZExt by zeroing the undef values during
16043 // the shuffle phase or after the shuffle.
16044 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
16045                                  SelectionDAG &DAG) {
16046   MVT RegVT = Op.getSimpleValueType();
16047   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
16048   assert(RegVT.isInteger() &&
16049          "We only custom lower integer vector sext loads.");
16050
16051   // Nothing useful we can do without SSE2 shuffles.
16052   assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2.");
16053
16054   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16055   SDLoc dl(Ld);
16056   EVT MemVT = Ld->getMemoryVT();
16057   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16058   unsigned RegSz = RegVT.getSizeInBits();
16059
16060   ISD::LoadExtType Ext = Ld->getExtensionType();
16061
16062   assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
16063          && "Only anyext and sext are currently implemented.");
16064   assert(MemVT != RegVT && "Cannot extend to the same type");
16065   assert(MemVT.isVector() && "Must load a vector from memory");
16066
16067   unsigned NumElems = RegVT.getVectorNumElements();
16068   unsigned MemSz = MemVT.getSizeInBits();
16069   assert(RegSz > MemSz && "Register size must be greater than the mem size");
16070
16071   if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) {
16072     // The only way in which we have a legal 256-bit vector result but not the
16073     // integer 256-bit operations needed to directly lower a sextload is if we
16074     // have AVX1 but not AVX2. In that case, we can always emit a sextload to
16075     // a 128-bit vector and a normal sign_extend to 256-bits that should get
16076     // correctly legalized. We do this late to allow the canonical form of
16077     // sextload to persist throughout the rest of the DAG combiner -- it wants
16078     // to fold together any extensions it can, and so will fuse a sign_extend
16079     // of an sextload into a sextload targeting a wider value.
16080     SDValue Load;
16081     if (MemSz == 128) {
16082       // Just switch this to a normal load.
16083       assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
16084                                        "it must be a legal 128-bit vector "
16085                                        "type!");
16086       Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
16087                   Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
16088                   Ld->isInvariant(), Ld->getAlignment());
16089     } else {
16090       assert(MemSz < 128 &&
16091              "Can't extend a type wider than 128 bits to a 256 bit vector!");
16092       // Do an sext load to a 128-bit vector type. We want to use the same
16093       // number of elements, but elements half as wide. This will end up being
16094       // recursively lowered by this routine, but will succeed as we definitely
16095       // have all the necessary features if we're using AVX1.
16096       EVT HalfEltVT =
16097           EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
16098       EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
16099       Load =
16100           DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
16101                          Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
16102                          Ld->isNonTemporal(), Ld->isInvariant(),
16103                          Ld->getAlignment());
16104     }
16105
16106     // Replace chain users with the new chain.
16107     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16108     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16109
16110     // Finally, do a normal sign-extend to the desired register.
16111     return DAG.getSExtOrTrunc(Load, dl, RegVT);
16112   }
16113
16114   // All sizes must be a power of two.
16115   assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
16116          "Non-power-of-two elements are not custom lowered!");
16117
16118   // Attempt to load the original value using scalar loads.
16119   // Find the largest scalar type that divides the total loaded size.
16120   MVT SclrLoadTy = MVT::i8;
16121   for (MVT Tp : MVT::integer_valuetypes()) {
16122     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16123       SclrLoadTy = Tp;
16124     }
16125   }
16126
16127   // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
16128   if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16129       (64 <= MemSz))
16130     SclrLoadTy = MVT::f64;
16131
16132   // Calculate the number of scalar loads that we need to perform
16133   // in order to load our vector from memory.
16134   unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
16135
16136   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
16137          "Can only lower sext loads with a single scalar load!");
16138
16139   unsigned loadRegZize = RegSz;
16140   if (Ext == ISD::SEXTLOAD && RegSz == 256)
16141     loadRegZize /= 2;
16142
16143   // Represent our vector as a sequence of elements which are the
16144   // largest scalar that we can load.
16145   EVT LoadUnitVecVT = EVT::getVectorVT(
16146       *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
16147
16148   // Represent the data using the same element type that is stored in
16149   // memory. In practice, we ''widen'' MemVT.
16150   EVT WideVecVT =
16151       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
16152                        loadRegZize / MemVT.getScalarType().getSizeInBits());
16153
16154   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16155          "Invalid vector type");
16156
16157   // We can't shuffle using an illegal type.
16158   assert(TLI.isTypeLegal(WideVecVT) &&
16159          "We only lower types that form legal widened vector types");
16160
16161   SmallVector<SDValue, 8> Chains;
16162   SDValue Ptr = Ld->getBasePtr();
16163   SDValue Increment =
16164       DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy());
16165   SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16166
16167   for (unsigned i = 0; i < NumLoads; ++i) {
16168     // Perform a single load.
16169     SDValue ScalarLoad =
16170         DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
16171                     Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
16172                     Ld->getAlignment());
16173     Chains.push_back(ScalarLoad.getValue(1));
16174     // Create the first element type using SCALAR_TO_VECTOR in order to avoid
16175     // another round of DAGCombining.
16176     if (i == 0)
16177       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
16178     else
16179       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
16180                         ScalarLoad, DAG.getIntPtrConstant(i));
16181
16182     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16183   }
16184
16185   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16186
16187   // Bitcast the loaded value to a vector of the original element type, in
16188   // the size of the target vector type.
16189   SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
16190   unsigned SizeRatio = RegSz / MemSz;
16191
16192   if (Ext == ISD::SEXTLOAD) {
16193     // If we have SSE4.1, we can directly emit a VSEXT node.
16194     if (Subtarget->hasSSE41()) {
16195       SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
16196       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16197       return Sext;
16198     }
16199
16200     // Otherwise we'll shuffle the small elements in the high bits of the
16201     // larger type and perform an arithmetic shift. If the shift is not legal
16202     // it's better to scalarize.
16203     assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) &&
16204            "We can't implement a sext load without an arithmetic right shift!");
16205
16206     // Redistribute the loaded elements into the different locations.
16207     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16208     for (unsigned i = 0; i != NumElems; ++i)
16209       ShuffleVec[i * SizeRatio + SizeRatio - 1] = i;
16210
16211     SDValue Shuff = DAG.getVectorShuffle(
16212         WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16213
16214     Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16215
16216     // Build the arithmetic shift.
16217     unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
16218                    MemVT.getVectorElementType().getSizeInBits();
16219     Shuff =
16220         DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT));
16221
16222     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16223     return Shuff;
16224   }
16225
16226   // Redistribute the loaded elements into the different locations.
16227   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16228   for (unsigned i = 0; i != NumElems; ++i)
16229     ShuffleVec[i * SizeRatio] = i;
16230
16231   SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16232                                        DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
16233
16234   // Bitcast to the requested type.
16235   Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
16236   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16237   return Shuff;
16238 }
16239
16240 // isAndOrOfSetCCs - Store Op's opcode in Opc, then return true only when Op
16241 // is an ISD::AND or ISD::OR whose two operands are both X86ISD::SETCC nodes
16242 // that have exactly one use each.
16243 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
16244   Opc = Op.getOpcode();
16245   if (!(Opc == ISD::AND || Opc == ISD::OR))
16246     return false;
16247   SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
16248   if (LHS.getOpcode() != X86ISD::SETCC || !LHS.hasOneUse())
16249     return false;
16250   return RHS.getOpcode() == X86ISD::SETCC && RHS.hasOneUse();
16251 }
16252
16253 // isXor1OfSetCC - Match (xor (X86ISD::SETCC ...), 1): Op must be an ISD::XOR
16254 // whose second operand is the constant 1 and whose first is a one-use SETCC.
16255 static bool isXor1OfSetCC(SDValue Op) {
16256   if (Op.getOpcode() != ISD::XOR)
16257     return false;
16258   // The xor's right-hand side has to be a constant node holding exactly 1.
16259   ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16260   if (!RHSC || !(RHSC->getAPIntValue() == 1))
16261     return false;
16262   SDValue SetCC = Op.getOperand(0);
16263   return SetCC.getOpcode() == X86ISD::SETCC && SetCC.hasOneUse();
16264 }
16265
// LowerBRCOND - Custom-lower a conditional branch. Op's operands are read as
// (chain, condition, destination); the result is an X86ISD::BRCOND built from
// (chain, destination, condition code, flag-producing node).
//
// The routine tries, in order, to reuse something that already produces the
// flags for the condition: an X86ISD::SETCC / SETCC_CARRY, an
// overflow-reporting arithmetic node ([SU]ADDO, [SU]SUBO, [SU]MULO), an
// AND/OR of two single-use SETCCs, an xor-by-1 of a SETCC, or an
// ISD::SETCC with SETOEQ / SETUNE. Each successful match clears addTest.
// If addTest is still set afterwards, LowerToBT is tried on an AND and
// finally EmitTest emits an explicit test of the condition.
//
// Some of the two-branch patterns emit an extra X86ISD::BRCOND onto Chain and
// rewrite the operands of the ISD::BR node that uses this branch.
16266 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
        // addTest stays true until a pattern below supplies both CC and a
        // flag-producing Cond; if still true at the end, a test is emitted.
16267   bool addTest = true;
16268   SDValue Chain = Op.getOperand(0);
16269   SDValue Cond  = Op.getOperand(1);
16270   SDValue Dest  = Op.getOperand(2);
16271   SDLoc dl(Op);
16272   SDValue CC;
        // Inverted is set when the branch tests "overflow result == 0"; the
        // condition code selected later is then flipped.
16273   bool Inverted = false;
16274
16275   if (Cond.getOpcode() == ISD::SETCC) {
16276     // Check for setcc([su]{add,sub,mul}o == 0).
16277     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
16278         isa<ConstantSDNode>(Cond.getOperand(1)) &&
16279         cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() &&
16280         Cond.getOperand(0).getResNo() == 1 &&
16281         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
16282          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
16283          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
16284          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
16285          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
16286          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
            // Branch directly on the overflow result (value #1 of the
            // arithmetic node) and remember that the sense is reversed.
16287       Inverted = true;
16288       Cond = Cond.getOperand(0);
16289     } else {
            // Any other setcc goes through LowerSETCC; its result is used only
            // if it produced a node.
16290       SDValue NewCond = LowerSETCC(Cond, DAG);
16291       if (NewCond.getNode())
16292         Cond = NewCond;
16293     }
16294   }
16295 #if 0
16296   // FIXME: LowerXALUO doesn't handle these!!
16297   else if (Cond.getOpcode() == X86ISD::ADD  ||
16298            Cond.getOpcode() == X86ISD::SUB  ||
16299            Cond.getOpcode() == X86ISD::SMUL ||
16300            Cond.getOpcode() == X86ISD::UMUL)
16301     Cond = LowerXALUO(Cond, DAG);
16302 #endif
16303
16304   // Look past (and (setcc_carry (cmp ...)), 1).
16305   if (Cond.getOpcode() == ISD::AND &&
16306       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
16307     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
16308     if (C && C->getAPIntValue() == 1)
16309       Cond = Cond.getOperand(0);
16310   }
16311
16312   // If condition flag is set by a X86ISD::CMP, then use it as the condition
16313   // setting operand in place of the X86ISD::SETCC.
16314   unsigned CondOpcode = Cond.getOpcode();
16315   if (CondOpcode == X86ISD::SETCC ||
16316       CondOpcode == X86ISD::SETCC_CARRY) {
          // Operand 0 of the SETCC is the condition code, operand 1 the node
          // that produces the flags.
16317     CC = Cond.getOperand(0);
16318
16319     SDValue Cmp = Cond.getOperand(1);
16320     unsigned Opc = Cmp.getOpcode();
16321     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
16322     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
16323       Cond = Cmp;
16324       addTest = false;
16325     } else {
16326       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
16327       default: break;
16328       case X86::COND_O:
16329       case X86::COND_B:
16330         // These can only come from an arithmetic instruction with overflow,
16331         // e.g. SADDO, UADDO.
16332         Cond = Cond.getNode()->getOperand(1);
16333         addTest = false;
16334         break;
16335       }
16336     }
16337   }
        // Cond may have been replaced above, so re-read its opcode.
16338   CondOpcode = Cond.getOpcode();
        // Overflow-reporting arithmetic: build the matching X86 node and branch
        // on its flags result. i8 multiplies are excluded from this path.
16339   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16340       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16341       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16342        Cond.getOperand(0).getValueType() != MVT::i8)) {
16343     SDValue LHS = Cond.getOperand(0);
16344     SDValue RHS = Cond.getOperand(1);
16345     unsigned X86Opcode;
16346     unsigned X86Cond;
16347     SDVTList VTs;
16348     // Keep this in sync with LowerXALUO, otherwise we might create redundant
16349     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
16350     // X86ISD::INC).
16351     switch (CondOpcode) {
16352     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
16353     case ISD::SADDO:
            // Signed add of the constant 1 becomes INC; otherwise ADD.
16354       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16355         if (C->isOne()) {
16356           X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
16357           break;
16358         }
16359       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16360     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
16361     case ISD::SSUBO:
            // Signed subtract of the constant 1 becomes DEC; otherwise SUB.
16362       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
16363         if (C->isOne()) {
16364           X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
16365           break;
16366         }
16367       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16368     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16369     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16370     default: llvm_unreachable("unexpected overflowing operator");
16371     }
16372     if (Inverted)
16373       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
          // The UMUL node is given three result types (two of the operand type
          // plus i32), so its flags are result #2; every other node has two
          // results and its flags are result #1.
16374     if (CondOpcode == ISD::UMULO)
16375       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16376                           MVT::i32);
16377     else
16378       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16379
16380     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
16381
16382     if (CondOpcode == ISD::UMULO)
16383       Cond = X86Op.getValue(2);
16384     else
16385       Cond = X86Op.getValue(1);
16386
16387     CC = DAG.getConstant(X86Cond, MVT::i8);
16388     addTest = false;
16389   } else {
16390     unsigned CondOpc;
16391     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
            // Flag producer feeding the first SETCC; the two-branch forms below
            // require the second SETCC to use the very same node.
16392       SDValue Cmp = Cond.getOperand(0).getOperand(1);
16393       if (CondOpc == ISD::OR) {
16394         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
16395         // two branches instead of an explicit OR instruction with a
16396         // separate test.
16397         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16398             isX86LogicalCmp(Cmp)) {
                // First branch (on the first SETCC's condition code) is added to
                // the chain here; the second is the node returned at the end.
16399           CC = Cond.getOperand(0).getOperand(0);
16400           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16401                               Chain, Dest, CC, Cmp);
16402           CC = Cond.getOperand(1).getOperand(0);
16403           Cond = Cmp;
16404           addTest = false;
16405         }
16406       } else { // ISD::AND
16407         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
16408         // two branches instead of an explicit AND instruction with a
16409         // separate test. However, we only do this if this block doesn't
16410         // have a fall-through edge, because this requires an explicit
16411         // jmp when the condition is false.
16412         if (Cmp == Cond.getOperand(1).getOperand(1) &&
16413             isX86LogicalCmp(Cmp) &&
16414             Op.getNode()->hasOneUse()) {
16415           X86::CondCode CCode =
16416             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16417           CCode = X86::GetOppositeBranchCondition(CCode);
16418           CC = DAG.getConstant(CCode, MVT::i8);
16419           SDNode *User = *Op.getNode()->use_begin();
16420           // Look for an unconditional branch following this conditional branch.
16421           // We need this because we need to reverse the successors in order
16422           // to implement FCMP_OEQ.
16423           if (User->getOpcode() == ISD::BR) {
                  // Swap targets: the BR now jumps to the original Dest, and both
                  // conditional branches (with opposite condition codes) jump to
                  // what was the BR's target.
16424             SDValue FalseBB = User->getOperand(1);
16425             SDNode *NewBR =
16426               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16427             assert(NewBR == User);
16428             (void)NewBR;
16429             Dest = FalseBB;
16430
16431             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16432                                 Chain, Dest, CC, Cmp);
                  // NOTE: this declaration shadows the CCode declared in the
                  // enclosing scope.
16433             X86::CondCode CCode =
16434               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
16435             CCode = X86::GetOppositeBranchCondition(CCode);
16436             CC = DAG.getConstant(CCode, MVT::i8);
16437             Cond = Cmp;
16438             addTest = false;
16439           }
16440         }
16441       }
16442     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
16443       // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
16444       // It should be transformed during dag combiner except when the condition
16445       // is set by an arithmetic-with-overflow node.
16446       X86::CondCode CCode =
16447         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16448       CCode = X86::GetOppositeBranchCondition(CCode);
16449       CC = DAG.getConstant(CCode, MVT::i8);
16450       Cond = Cond.getOperand(0).getOperand(1);
16451       addTest = false;
16452     } else if (Cond.getOpcode() == ISD::SETCC &&
16453                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
16454       // For FCMP_OEQ, we can emit
16455       // two branches instead of an explicit AND instruction with a
16456       // separate test. However, we only do this if this block doesn't
16457       // have a fall-through edge, because this requires an explicit
16458       // jmp when the condition is false.
16459       if (Op.getNode()->hasOneUse()) {
16460         SDNode *User = *Op.getNode()->use_begin();
16461         // Look for an unconditional branch following this conditional branch.
16462         // We need this because we need to reverse the successors in order
16463         // to implement FCMP_OEQ.
16464         if (User->getOpcode() == ISD::BR) {
16465           SDValue FalseBB = User->getOperand(1);
16466           SDNode *NewBR =
16467             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16468           assert(NewBR == User);
16469           (void)NewBR;
16470           Dest = FalseBB;
16471
                // Both conditional branches (COND_NE here, COND_P in the node
                // returned at the end) target the BR's former destination.
16472           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16473                                     Cond.getOperand(0), Cond.getOperand(1));
16474           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16475           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16476           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16477                               Chain, Dest, CC, Cmp);
16478           CC = DAG.getConstant(X86::COND_P, MVT::i8);
16479           Cond = Cmp;
16480           addTest = false;
16481         }
16482       }
16483     } else if (Cond.getOpcode() == ISD::SETCC &&
16484                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
16485       // For FCMP_UNE, we can emit
16486       // two branches instead of an explicit OR instruction with a
16487       // separate test. However, we only do this if this block doesn't
16488       // have a fall-through edge, because this requires an explicit
16489       // jmp when the condition is false.
16490       if (Op.getNode()->hasOneUse()) {
16491         SDNode *User = *Op.getNode()->use_begin();
16492         // Look for an unconditional branch following this conditional branch.
16493         // We need this because we need to reverse the successors in order
16494         // to implement FCMP_UNE.
16495         if (User->getOpcode() == ISD::BR) {
16496           SDValue FalseBB = User->getOperand(1);
16497           SDNode *NewBR =
16498             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16499           assert(NewBR == User);
16500           (void)NewBR;
16501
                // Unlike the SETOEQ case, Dest is only swapped after the first
                // branch is built: the COND_NE branch keeps the original Dest
                // and the returned COND_NP branch goes to FalseBB.
16502           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16503                                     Cond.getOperand(0), Cond.getOperand(1));
16504           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16505           CC = DAG.getConstant(X86::COND_NE, MVT::i8);
16506           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16507                               Chain, Dest, CC, Cmp);
16508           CC = DAG.getConstant(X86::COND_NP, MVT::i8);
16509           Cond = Cmp;
16510           addTest = false;
16511           Dest = FalseBB;
16512         }
16513       }
16514     }
16515   }
16516
16517   if (addTest) {
16518     // Look past the truncate if the high bits are known zero.
16519     if (isTruncWithZeroHighBitsInput(Cond, DAG))
16520         Cond = Cond.getOperand(0);
16521
16522     // We know the result of AND is compared against zero. Try to match
16523     // it to BT.
16524     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16525       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
16526       if (NewSetCC.getNode()) {
16527         CC = NewSetCC.getOperand(0);
16528         Cond = NewSetCC.getOperand(1);
16529         addTest = false;
16530       }
16531     }
16532   }
16533
        // Nothing matched: emit an explicit test of Cond via EmitTest, branching
        // on COND_NE (or COND_E when the sense was inverted above).
16534   if (addTest) {
16535     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
16536     CC = DAG.getConstant(X86Cond, MVT::i8);
16537     Cond = EmitTest(Cond, X86Cond, dl, DAG);
16538   }
16539   Cond = ConvertCmpIfNecessary(Cond, DAG);
16540   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16541                      Chain, Dest, CC, Cond);
16542 }
16543
16544 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
16545 // Calls to _alloca are needed to probe the stack when allocating more than 4k
16546 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
16547 // that the guard pages used by the OS virtual memory manager are allocated in
16548 // correct sequence.
      //
      // Three lowerings are implemented here:
      //  * Neither (Windows and not MachO) nor split-stack: adjust the stack
      //    pointer register inline (SP - Size, optionally masked for alignment)
      //    between CALLSEQ_START / CALLSEQ_END.
      //  * Split-stack function: copy Size to a virtual register and emit
      //    X86ISD::SEG_ALLOCA.
      //  * Otherwise: copy Size to RAX/EAX, emit X86ISD::WIN_ALLOCA, then read
      //    (and optionally realign) the stack pointer.
      // Every path returns the allocated address and the output chain merged
      // with getMergeValues.
16549 SDValue
16550 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16551                                            SelectionDAG &DAG) const {
16552   MachineFunction &MF = DAG.getMachineFunction();
16553   bool SplitStack = MF.shouldSplitStack();
        // Lower selects the SEG_ALLOCA / WIN_ALLOCA paths further down.
16554   bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) ||
16555                SplitStack;
16556   SDLoc dl(Op);
16557
16558   if (!Lower) {
16559     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16560     SDNode* Node = Op.getNode();
16561
16562     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
16563     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
16564         " not tell us which reg is the stack pointer!");
16565     EVT VT = Node->getValueType(0);
          // Tmp1/Tmp2 start out as the node's two results and are only used to
          // reach its operands (chain, size); Tmp3 is the alignment operand.
          // Tmp1 and Tmp2 are later reused for the new value and output chain.
16566     SDValue Tmp1 = SDValue(Node, 0);
16567     SDValue Tmp2 = SDValue(Node, 1);
16568     SDValue Tmp3 = Node->getOperand(2);
16569     SDValue Chain = Tmp1.getOperand(0);
16570
16571     // Chain the dynamic stack allocation so that it doesn't modify the stack
16572     // pointer when other instructions are using the stack.
16573     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
16574         SDLoc(Node));
16575
16576     SDValue Size = Tmp2.getOperand(1);
16577     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
16578     Chain = SP.getValue(1);
16579     unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
16580     const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering();
16581     unsigned StackAlign = TFI.getStackAlignment();
16582     Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
          // Mask the new pointer with -Align only when the requested alignment
          // is larger than the stack alignment reported by the frame lowering.
16583     if (Align > StackAlign)
16584       Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
16585           DAG.getConstant(-(uint64_t)Align, VT));
16586     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
16587
16588     Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
16589         DAG.getIntPtrConstant(0, true), SDValue(),
16590         SDLoc(Node));
16591
16592     SDValue Ops[2] = { Tmp1, Tmp2 };
16593     return DAG.getMergeValues(Ops, dl);
16594   }
16595
16596   // Get the inputs.
16597   SDValue Chain = Op.getOperand(0);
16598   SDValue Size  = Op.getOperand(1);
16599   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
16600   EVT VT = Op.getNode()->getValueType(0);
16601
16602   bool Is64Bit = Subtarget->is64Bit();
16603   EVT SPTy = getPointerTy();
16604
16605   if (SplitStack) {
16606     MachineRegisterInfo &MRI = MF.getRegInfo();
16607
16608     if (Is64Bit) {
16609       // The 64 bit implementation of segmented stacks needs to clobber both r10
16610       // r11. This makes it impossible to use it along with nested parameters.
16611       const Function *F = MF.getFunction();
16612
16613       for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
16614            I != E; ++I)
16615         if (I->hasNestAttr())
16616           report_fatal_error("Cannot use segmented stacks with functions that "
16617                              "have nested arguments.");
16618     }
16619
          // Pass the size to SEG_ALLOCA through a fresh pointer-class virtual
          // register.
16620     const TargetRegisterClass *AddrRegClass =
16621       getRegClassFor(getPointerTy());
16622     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
16623     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
16624     SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
16625                                 DAG.getRegister(Vreg, SPTy));
16626     SDValue Ops1[2] = { Value, Chain };
16627     return DAG.getMergeValues(Ops1, dl);
16628   } else {
16629     SDValue Flag;
          // The size is handed to WIN_ALLOCA in RAX (LP64 targets) or EAX,
          // glued to the node through Flag.
16630     const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX);
16631
16632     Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
16633     Flag = Chain.getValue(1);
16634     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16635
16636     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
16637
          // The result is whatever the stack register holds after WIN_ALLOCA.
16638     const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
16639         DAG.getSubtarget().getRegisterInfo());
16640     unsigned SPReg = RegInfo->getStackRegister();
16641     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
16642     Chain = SP.getValue(1);
16643
          // Any non-zero alignment is applied by masking SP with -Align and
          // writing the result back to the stack register.
16644     if (Align) {
16645       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16646                        DAG.getConstant(-(uint64_t)Align, VT));
16647       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
16648     }
16649
16650     SDValue Ops1[2] = { SP, Chain };
16651     return DAG.getMergeValues(Ops1, dl);
16652   }
16653 }
16654
// LowerVASTART - Lower a va_start node. Operand 0 is the chain, operand 1 the
// address of the va_list object, operand 2 the SrcValue used for the pointer
// info of the emitted stores.
//
// On non-64-bit targets and on Win64 a single store writes the address of the
// VarArgsFrameIndex slot into the va_list. Otherwise four stores initialise
// the __va_list_tag fields at byte offsets 0, 4, 8 and 16, and the result is a
// TokenFactor of those stores. Every store uses operand 0 as its input chain.
16655 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
16656   MachineFunction &MF = DAG.getMachineFunction();
16657   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
16658
16659   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16660   SDLoc DL(Op);
16661
16662   if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
16663     // vastart just stores the address of the VarArgsFrameIndex slot into the
16664     // memory location argument.
16665     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16666                                    getPointerTy());
16667     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
16668                         MachinePointerInfo(SV), false, false, 0);
16669   }
16670
16671   // __va_list_tag:
16672   //   gp_offset         (0 - 6 * 8)
16673   //   fp_offset         (48 - 48 + 8 * 16)
16674   //   overflow_arg_area (point to parameters coming in memory).
16675   //   reg_save_area
16676   SmallVector<SDValue, 8> MemOps;
        // FIN walks through the va_list object; it starts at the object's
        // address and is advanced by 4, 4 and 8 bytes between the stores.
16677   SDValue FIN = Op.getOperand(1);
16678   // Store gp_offset
16679   SDValue Store = DAG.getStore(Op.getOperand(0), DL,
16680                                DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
16681                                                MVT::i32),
16682                                FIN, MachinePointerInfo(SV), false, false, 0);
16683   MemOps.push_back(Store);
16684
16685   // Store fp_offset
16686   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16687                     FIN, DAG.getIntPtrConstant(4));
16688   Store = DAG.getStore(Op.getOperand(0), DL,
16689                        DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
16690                                        MVT::i32),
16691                        FIN, MachinePointerInfo(SV, 4), false, false, 0);
16692   MemOps.push_back(Store);
16693
16694   // Store ptr to overflow_arg_area
16695   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16696                     FIN, DAG.getIntPtrConstant(4));
16697   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
16698                                     getPointerTy());
16699   Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
16700                        MachinePointerInfo(SV, 8),
16701                        false, false, 0);
16702   MemOps.push_back(Store);
16703
16704   // Store ptr to reg_save_area.
16705   FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(),
16706                     FIN, DAG.getIntPtrConstant(8));
16707   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
16708                                     getPointerTy());
16709   Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
16710                        MachinePointerInfo(SV, 16), false, false, 0);
16711   MemOps.push_back(Store);
        // The four stores are independent of each other; join their chains.
16712   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
16713 }
16714
// LowerVAARG - Lower a va_arg node. The asserts restrict this to 64-bit Linux
// or Darwin subtargets and to nodes with exactly four operands, read as
// (chain, va_list pointer, SrcValue, alignment).
//
// An X86ISD::VAARG_64 memory-intrinsic node is built from (chain, pointer,
// argument size, argument mode, alignment); its first result is used as the
// address of the argument, from which a value of the node's result type is
// then loaded. The returned value is that load.
16715 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
16716   assert(Subtarget->is64Bit() &&
16717          "LowerVAARG only handles 64-bit va_arg!");
16718   assert((Subtarget->isTargetLinux() ||
16719           Subtarget->isTargetDarwin()) &&
16720           "Unhandled target in LowerVAARG");
16721   assert(Op.getNode()->getNumOperands() == 4);
16722   SDValue Chain = Op.getOperand(0);
16723   SDValue SrcPtr = Op.getOperand(1);
16724   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16725   unsigned Align = Op.getConstantOperandVal(3);
16726   SDLoc dl(Op);
16727
        // ArgSize is the allocation size (in bytes, per the checks below) that
        // the data layout reports for the IR type matching the result type.
16728   EVT ArgVT = Op.getNode()->getValueType(0);
16729   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
16730   uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
        // ArgMode: 1 = read via gp_offset, 2 = read via fp_offset.
16731   uint8_t ArgMode;
16732
16733   // Decide which area this value should be read from.
16734   // TODO: Implement the AMD64 ABI in its entirety. This simple
16735   // selection mechanism works only for the basic types.
16736   if (ArgVT == MVT::f80) {
16737     llvm_unreachable("va_arg for f80 not yet implemented");
16738   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
16739     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
16740   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
16741     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
16742   } else {
16743     llvm_unreachable("Unhandled argument type in LowerVAARG");
16744   }
16745
16746   if (ArgMode == 2) {
16747     // Sanity Check: Make sure using fp_offset makes sense.
16748     assert(!DAG.getTarget().Options.UseSoftFloat &&
16749            !(DAG.getMachineFunction()
16750                 .getFunction()->getAttributes()
16751                 .hasAttribute(AttributeSet::FunctionIndex,
16752                               Attribute::NoImplicitFloat)) &&
16753            Subtarget->hasSSE1());
16754   }
16755
16756   // Insert VAARG_64 node into the DAG
16757   // VAARG_64 returns two values: Variable Argument Address, Chain
16758   SmallVector<SDValue, 11> InstOps;
16759   InstOps.push_back(Chain);
16760   InstOps.push_back(SrcPtr);
16761   InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32));
16762   InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8));
16763   InstOps.push_back(DAG.getConstant(Align, MVT::i32));
16764   SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
        // The node is flagged as both reading and writing memory at the
        // location described by SV.
16765   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
16766                                           VTs, InstOps, MVT::i64,
16767                                           MachinePointerInfo(SV),
16768                                           /*Align=*/0,
16769                                           /*Volatile=*/false,
16770                                           /*ReadMem=*/true,
16771                                           /*WriteMem=*/true);
16772   Chain = VAARG.getValue(1);
16773
16774   // Load the next argument and return it
16775   return DAG.getLoad(ArgVT, dl,
16776                      Chain,
16777                      VAARG,
16778                      MachinePointerInfo(),
16779                      false, false, false, 0);
16780 }
16781
16782 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
16783                            SelectionDAG &DAG) {
16784   // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
16785   assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
16786   SDValue Chain = Op.getOperand(0);
16787   SDValue DstPtr = Op.getOperand(1);
16788   SDValue SrcPtr = Op.getOperand(2);
16789   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
16790   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
16791   SDLoc DL(Op);
16792
16793   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
16794                        DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
16795                        false,
16796                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
16797 }
16798
16799 // getTargetVShiftByConstNode - Handle vector element shifts where the shift
16800 // amount is a constant. Takes immediate version of shift as input.
16801 static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
16802                                           SDValue SrcOp, uint64_t ShiftAmt,
16803                                           SelectionDAG &DAG) {
16804   MVT ElementType = VT.getVectorElementType();
16805
16806   // Fold this packed shift into its first operand if ShiftAmt is 0.
16807   if (ShiftAmt == 0)
16808     return SrcOp;
16809
16810   // Check for ShiftAmt >= element width
16811   if (ShiftAmt >= ElementType.getSizeInBits()) {
16812     if (Opc == X86ISD::VSRAI)
16813       ShiftAmt = ElementType.getSizeInBits() - 1;
16814     else
16815       return DAG.getConstant(0, VT);
16816   }
16817
16818   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
16819          && "Unknown target vector shift-by-constant node");
16820
16821   // Fold this packed vector shift into a build vector if SrcOp is a
16822   // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
16823   if (VT == SrcOp.getSimpleValueType() &&
16824       ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
16825     SmallVector<SDValue, 8> Elts;
16826     unsigned NumElts = SrcOp->getNumOperands();
16827     ConstantSDNode *ND;
16828
16829     switch(Opc) {
16830     default: llvm_unreachable(nullptr);
16831     case X86ISD::VSHLI:
16832       for (unsigned i=0; i!=NumElts; ++i) {
16833         SDValue CurrentOp = SrcOp->getOperand(i);
16834         if (CurrentOp->getOpcode() == ISD::UNDEF) {
16835           Elts.push_back(CurrentOp);
16836           continue;
16837         }
16838         ND = cast<ConstantSDNode>(CurrentOp);
16839         const APInt &C = ND->getAPIntValue();
16840         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), ElementType));
16841       }
16842       break;
16843     case X86ISD::VSRLI:
16844       for (unsigned i=0; i!=NumElts; ++i) {
16845         SDValue CurrentOp = SrcOp->getOperand(i);
16846         if (CurrentOp->getOpcode() == ISD::UNDEF) {
16847           Elts.push_back(CurrentOp);
16848           continue;
16849         }
16850         ND = cast<ConstantSDNode>(CurrentOp);
16851         const APInt &C = ND->getAPIntValue();
16852         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), ElementType));
16853       }
16854       break;
16855     case X86ISD::VSRAI:
16856       for (unsigned i=0; i!=NumElts; ++i) {
16857         SDValue CurrentOp = SrcOp->getOperand(i);
16858         if (CurrentOp->getOpcode() == ISD::UNDEF) {
16859           Elts.push_back(CurrentOp);
16860           continue;
16861         }
16862         ND = cast<ConstantSDNode>(CurrentOp);
16863         const APInt &C = ND->getAPIntValue();
16864         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), ElementType));
16865       }
16866       break;
16867     }
16868
16869     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
16870   }
16871
16872   return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
16873 }
16874
16875 // getTargetVShiftNode - Handle vector element shifts where the shift amount
16876 // may or may not be a constant. Takes immediate version of shift as input.
16877 static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
16878                                    SDValue SrcOp, SDValue ShAmt,
16879                                    SelectionDAG &DAG) {
16880   MVT SVT = ShAmt.getSimpleValueType();
16881   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
16882
16883   // Catch shift-by-constant.
16884   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
16885     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
16886                                       CShAmt->getZExtValue(), DAG);
16887
16888   // Change opcode to non-immediate version
16889   switch (Opc) {
16890     default: llvm_unreachable("Unknown target vector shift node");
16891     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
16892     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
16893     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
16894   }
16895
16896   const X86Subtarget &Subtarget =
16897       DAG.getTarget().getSubtarget<X86Subtarget>();
16898   if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
16899       ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
16900     // Let the shuffle legalizer expand this shift amount node.
16901     SDValue Op0 = ShAmt.getOperand(0);
16902     Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
16903     ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
16904   } else {
16905     // Need to build a vector containing shift amount.
16906     // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
16907     SmallVector<SDValue, 4> ShOps;
16908     ShOps.push_back(ShAmt);
16909     if (SVT == MVT::i32) {
16910       ShOps.push_back(DAG.getConstant(0, SVT));
16911       ShOps.push_back(DAG.getUNDEF(SVT));
16912     }
16913     ShOps.push_back(DAG.getUNDEF(SVT));
16914
16915     MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
16916     ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
16917   }
16918
16919   // The return type has to be a 128-bit type with the same element
16920   // type as the input type.
16921   MVT EltVT = VT.getVectorElementType();
16922   EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
16923
16924   ShAmt = DAG.getNode(ISD::BITCAST, dl, ShVT, ShAmt);
16925   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
16926 }
16927
16928 /// \brief Return (and \p Op, \p Mask) for compare instructions or
16929 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
16930 /// necessary casting for \p Mask when lowering masking intrinsics.
16931 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
16932                                     SDValue PreservedSrc,
16933                                     const X86Subtarget *Subtarget,
16934                                     SelectionDAG &DAG) {
16935     EVT VT = Op.getValueType();
16936     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
16937                                   MVT::i1, VT.getVectorNumElements());
16938     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
16939                                      Mask.getValueType().getSizeInBits());
16940     SDLoc dl(Op);
16941
16942     assert(MaskVT.isSimple() && "invalid mask type");
16943
16944     if (isAllOnes(Mask))
16945       return Op;
16946
16947     // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
16948     // are extracted by EXTRACT_SUBVECTOR.
16949     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
16950                               DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
16951                               DAG.getIntPtrConstant(0));
16952
16953     switch (Op.getOpcode()) {
16954       default: break;
16955       case X86ISD::PCMPEQM:
16956       case X86ISD::PCMPGTM:
16957       case X86ISD::CMPM:
16958       case X86ISD::CMPMU:
16959         return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
16960     }
16961     if (PreservedSrc.getOpcode() == ISD::UNDEF)
16962       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
16963     return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
16964 }
16965
16966 /// \brief Creates an SDNode for a predicated scalar operation.
16967 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
16968 /// The mask is comming as MVT::i8 and it should be truncated
16969 /// to MVT::i1 while lowering masking intrinsics.
16970 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
16971 /// "X86select" instead of "vselect". We just can't create the "vselect" node for
16972 /// a scalar instruction.
16973 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
16974                                     SDValue PreservedSrc,
16975                                     const X86Subtarget *Subtarget,
16976                                     SelectionDAG &DAG) {
16977     if (isAllOnes(Mask))
16978       return Op;
16979
16980     EVT VT = Op.getValueType();
16981     SDLoc dl(Op);
16982     // The mask should be of type MVT::i1
16983     SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
16984
16985     if (PreservedSrc.getOpcode() == ISD::UNDEF)
16986       PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
16987     return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
16988 }
16989
16990 static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {
16991     switch (IntNo) {
16992     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
16993     case Intrinsic::x86_fma_vfmadd_ps:
16994     case Intrinsic::x86_fma_vfmadd_pd:
16995     case Intrinsic::x86_fma_vfmadd_ps_256:
16996     case Intrinsic::x86_fma_vfmadd_pd_256:
16997     case Intrinsic::x86_fma_mask_vfmadd_ps_512:
16998     case Intrinsic::x86_fma_mask_vfmadd_pd_512:
16999       return X86ISD::FMADD;
17000     case Intrinsic::x86_fma_vfmsub_ps:
17001     case Intrinsic::x86_fma_vfmsub_pd:
17002     case Intrinsic::x86_fma_vfmsub_ps_256:
17003     case Intrinsic::x86_fma_vfmsub_pd_256:
17004     case Intrinsic::x86_fma_mask_vfmsub_ps_512:
17005     case Intrinsic::x86_fma_mask_vfmsub_pd_512:
17006       return X86ISD::FMSUB;
17007     case Intrinsic::x86_fma_vfnmadd_ps:
17008     case Intrinsic::x86_fma_vfnmadd_pd:
17009     case Intrinsic::x86_fma_vfnmadd_ps_256:
17010     case Intrinsic::x86_fma_vfnmadd_pd_256:
17011     case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
17012     case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
17013       return X86ISD::FNMADD;
17014     case Intrinsic::x86_fma_vfnmsub_ps:
17015     case Intrinsic::x86_fma_vfnmsub_pd:
17016     case Intrinsic::x86_fma_vfnmsub_ps_256:
17017     case Intrinsic::x86_fma_vfnmsub_pd_256:
17018     case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
17019     case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
17020       return X86ISD::FNMSUB;
17021     case Intrinsic::x86_fma_vfmaddsub_ps:
17022     case Intrinsic::x86_fma_vfmaddsub_pd:
17023     case Intrinsic::x86_fma_vfmaddsub_ps_256:
17024     case Intrinsic::x86_fma_vfmaddsub_pd_256:
17025     case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
17026     case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
17027       return X86ISD::FMADDSUB;
17028     case Intrinsic::x86_fma_vfmsubadd_ps:
17029     case Intrinsic::x86_fma_vfmsubadd_pd:
17030     case Intrinsic::x86_fma_vfmsubadd_ps_256:
17031     case Intrinsic::x86_fma_vfmsubadd_pd_256:
17032     case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
17033     case Intrinsic::x86_fma_mask_vfmsubadd_pd_512:
17034       return X86ISD::FMSUBADD;
17035     }
17036 }
17037
17038 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17039                                        SelectionDAG &DAG) {
17040   SDLoc dl(Op);
17041   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17042   EVT VT = Op.getValueType();
17043   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
17044   if (IntrData) {
17045     switch(IntrData->Type) {
17046     case INTR_TYPE_1OP:
17047       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17048     case INTR_TYPE_2OP:
17049       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17050         Op.getOperand(2));
17051     case INTR_TYPE_3OP:
17052       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17053         Op.getOperand(2), Op.getOperand(3));
17054     case INTR_TYPE_1OP_MASK_RM: {
17055       SDValue Src = Op.getOperand(1);
17056       SDValue Src0 = Op.getOperand(2);
17057       SDValue Mask = Op.getOperand(3);
17058       SDValue RoundingMode = Op.getOperand(4);
17059       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
17060                                               RoundingMode),
17061                                   Mask, Src0, Subtarget, DAG);
17062     }
17063     case INTR_TYPE_SCALAR_MASK_RM: {
17064       SDValue Src1 = Op.getOperand(1);
17065       SDValue Src2 = Op.getOperand(2);
17066       SDValue Src0 = Op.getOperand(3);
17067       SDValue Mask = Op.getOperand(4);
17068       SDValue RoundingMode = Op.getOperand(5);
17069       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
17070                                               RoundingMode),
17071                                   Mask, Src0, Subtarget, DAG);
17072     }
17073     case INTR_TYPE_2OP_MASK: {
17074       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
17075                                               Op.getOperand(2)),
17076                                   Op.getOperand(4), Op.getOperand(3), Subtarget, DAG);
17077     }
17078     case CMP_MASK:
17079     case CMP_MASK_CC: {
17080       // Comparison intrinsics with masks.
17081       // Example of transformation:
17082       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
17083       //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
17084       // (i8 (bitcast
17085       //   (v8i1 (insert_subvector undef,
17086       //           (v2i1 (and (PCMPEQM %a, %b),
17087       //                      (extract_subvector
17088       //                         (v8i1 (bitcast %mask)), 0))), 0))))
17089       EVT VT = Op.getOperand(1).getValueType();
17090       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17091                                     VT.getVectorNumElements());
17092       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
17093       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17094                                        Mask.getValueType().getSizeInBits());
17095       SDValue Cmp;
17096       if (IntrData->Type == CMP_MASK_CC) {
17097         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17098                     Op.getOperand(2), Op.getOperand(3));
17099       } else {
17100         assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
17101         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17102                     Op.getOperand(2));
17103       }
17104       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
17105                                              DAG.getTargetConstant(0, MaskVT),
17106                                              Subtarget, DAG);
17107       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17108                                 DAG.getUNDEF(BitcastVT), CmpMask,
17109                                 DAG.getIntPtrConstant(0));
17110       return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
17111     }
17112     case COMI: { // Comparison intrinsics
17113       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
17114       SDValue LHS = Op.getOperand(1);
17115       SDValue RHS = Op.getOperand(2);
17116       unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
17117       assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
17118       SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
17119       SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17120                                   DAG.getConstant(X86CC, MVT::i8), Cond);
17121       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17122     }
17123     case VSHIFT:
17124       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
17125                                  Op.getOperand(1), Op.getOperand(2), DAG);
17126     case VSHIFT_MASK:
17127       return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl,
17128                                                       Op.getSimpleValueType(),
17129                                                       Op.getOperand(1),
17130                                                       Op.getOperand(2), DAG),
17131                                   Op.getOperand(4), Op.getOperand(3), Subtarget,
17132                                   DAG);
17133     case COMPRESS_EXPAND_IN_REG: {
17134       SDValue Mask = Op.getOperand(3);
17135       SDValue DataToCompress = Op.getOperand(1);
17136       SDValue PassThru = Op.getOperand(2);
17137       if (isAllOnes(Mask)) // return data as is
17138         return Op.getOperand(1);
17139       EVT VT = Op.getValueType();
17140       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17141                                     VT.getVectorNumElements());
17142       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17143                                        Mask.getValueType().getSizeInBits());
17144       SDLoc dl(Op);
17145       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17146                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17147                                   DAG.getIntPtrConstant(0));
17148
17149       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress,
17150                          PassThru);
17151     }
17152     case BLEND: {
17153       SDValue Mask = Op.getOperand(3);
17154       EVT VT = Op.getValueType();
17155       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17156                                     VT.getVectorNumElements());
17157       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17158                                        Mask.getValueType().getSizeInBits());
17159       SDLoc dl(Op);
17160       SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17161                                   DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17162                                   DAG.getIntPtrConstant(0));
17163       return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1),
17164                          Op.getOperand(2));
17165     }
17166     case FMA_OP_MASK:
17167     {
17168         return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17169             dl, Op.getValueType(),
17170             Op.getOperand(1),
17171             Op.getOperand(2),
17172             Op.getOperand(3)),
17173             Op.getOperand(4), Op.getOperand(1),
17174             Subtarget, DAG);
17175     }
17176     default:
17177       break;
17178     }
17179   }
17180
17181   switch (IntNo) {
17182   default: return SDValue();    // Don't custom lower most intrinsics.
17183
17184   case Intrinsic::x86_avx2_permd:
17185   case Intrinsic::x86_avx2_permps:
17186     // Operands intentionally swapped. Mask is last operand to intrinsic,
17187     // but second operand for node/instruction.
17188     return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
17189                        Op.getOperand(2), Op.getOperand(1));
17190
17191   case Intrinsic::x86_avx512_mask_valign_q_512:
17192   case Intrinsic::x86_avx512_mask_valign_d_512:
17193     // Vector source operands are swapped.
17194     return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
17195                                             Op.getValueType(), Op.getOperand(2),
17196                                             Op.getOperand(1),
17197                                             Op.getOperand(3)),
17198                                 Op.getOperand(5), Op.getOperand(4),
17199                                 Subtarget, DAG);
17200
17201   // ptest and testp intrinsics. The intrinsic these come from are designed to
17202   // return an integer value, not just an instruction so lower it to the ptest
17203   // or testp pattern and a setcc for the result.
17204   case Intrinsic::x86_sse41_ptestz:
17205   case Intrinsic::x86_sse41_ptestc:
17206   case Intrinsic::x86_sse41_ptestnzc:
17207   case Intrinsic::x86_avx_ptestz_256:
17208   case Intrinsic::x86_avx_ptestc_256:
17209   case Intrinsic::x86_avx_ptestnzc_256:
17210   case Intrinsic::x86_avx_vtestz_ps:
17211   case Intrinsic::x86_avx_vtestc_ps:
17212   case Intrinsic::x86_avx_vtestnzc_ps:
17213   case Intrinsic::x86_avx_vtestz_pd:
17214   case Intrinsic::x86_avx_vtestc_pd:
17215   case Intrinsic::x86_avx_vtestnzc_pd:
17216   case Intrinsic::x86_avx_vtestz_ps_256:
17217   case Intrinsic::x86_avx_vtestc_ps_256:
17218   case Intrinsic::x86_avx_vtestnzc_ps_256:
17219   case Intrinsic::x86_avx_vtestz_pd_256:
17220   case Intrinsic::x86_avx_vtestc_pd_256:
17221   case Intrinsic::x86_avx_vtestnzc_pd_256: {
17222     bool IsTestPacked = false;
17223     unsigned X86CC;
17224     switch (IntNo) {
17225     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
17226     case Intrinsic::x86_avx_vtestz_ps:
17227     case Intrinsic::x86_avx_vtestz_pd:
17228     case Intrinsic::x86_avx_vtestz_ps_256:
17229     case Intrinsic::x86_avx_vtestz_pd_256:
17230       IsTestPacked = true; // Fallthrough
17231     case Intrinsic::x86_sse41_ptestz:
17232     case Intrinsic::x86_avx_ptestz_256:
17233       // ZF = 1
17234       X86CC = X86::COND_E;
17235       break;
17236     case Intrinsic::x86_avx_vtestc_ps:
17237     case Intrinsic::x86_avx_vtestc_pd:
17238     case Intrinsic::x86_avx_vtestc_ps_256:
17239     case Intrinsic::x86_avx_vtestc_pd_256:
17240       IsTestPacked = true; // Fallthrough
17241     case Intrinsic::x86_sse41_ptestc:
17242     case Intrinsic::x86_avx_ptestc_256:
17243       // CF = 1
17244       X86CC = X86::COND_B;
17245       break;
17246     case Intrinsic::x86_avx_vtestnzc_ps:
17247     case Intrinsic::x86_avx_vtestnzc_pd:
17248     case Intrinsic::x86_avx_vtestnzc_ps_256:
17249     case Intrinsic::x86_avx_vtestnzc_pd_256:
17250       IsTestPacked = true; // Fallthrough
17251     case Intrinsic::x86_sse41_ptestnzc:
17252     case Intrinsic::x86_avx_ptestnzc_256:
17253       // ZF and CF = 0
17254       X86CC = X86::COND_A;
17255       break;
17256     }
17257
17258     SDValue LHS = Op.getOperand(1);
17259     SDValue RHS = Op.getOperand(2);
17260     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
17261     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
17262     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17263     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
17264     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17265   }
17266   case Intrinsic::x86_avx512_kortestz_w:
17267   case Intrinsic::x86_avx512_kortestc_w: {
17268     unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)? X86::COND_E: X86::COND_B;
17269     SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
17270     SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
17271     SDValue CC = DAG.getConstant(X86CC, MVT::i8);
17272     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
17273     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i1, CC, Test);
17274     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17275   }
17276
17277   case Intrinsic::x86_sse42_pcmpistria128:
17278   case Intrinsic::x86_sse42_pcmpestria128:
17279   case Intrinsic::x86_sse42_pcmpistric128:
17280   case Intrinsic::x86_sse42_pcmpestric128:
17281   case Intrinsic::x86_sse42_pcmpistrio128:
17282   case Intrinsic::x86_sse42_pcmpestrio128:
17283   case Intrinsic::x86_sse42_pcmpistris128:
17284   case Intrinsic::x86_sse42_pcmpestris128:
17285   case Intrinsic::x86_sse42_pcmpistriz128:
17286   case Intrinsic::x86_sse42_pcmpestriz128: {
17287     unsigned Opcode;
17288     unsigned X86CC;
17289     switch (IntNo) {
17290     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
17291     case Intrinsic::x86_sse42_pcmpistria128:
17292       Opcode = X86ISD::PCMPISTRI;
17293       X86CC = X86::COND_A;
17294       break;
17295     case Intrinsic::x86_sse42_pcmpestria128:
17296       Opcode = X86ISD::PCMPESTRI;
17297       X86CC = X86::COND_A;
17298       break;
17299     case Intrinsic::x86_sse42_pcmpistric128:
17300       Opcode = X86ISD::PCMPISTRI;
17301       X86CC = X86::COND_B;
17302       break;
17303     case Intrinsic::x86_sse42_pcmpestric128:
17304       Opcode = X86ISD::PCMPESTRI;
17305       X86CC = X86::COND_B;
17306       break;
17307     case Intrinsic::x86_sse42_pcmpistrio128:
17308       Opcode = X86ISD::PCMPISTRI;
17309       X86CC = X86::COND_O;
17310       break;
17311     case Intrinsic::x86_sse42_pcmpestrio128:
17312       Opcode = X86ISD::PCMPESTRI;
17313       X86CC = X86::COND_O;
17314       break;
17315     case Intrinsic::x86_sse42_pcmpistris128:
17316       Opcode = X86ISD::PCMPISTRI;
17317       X86CC = X86::COND_S;
17318       break;
17319     case Intrinsic::x86_sse42_pcmpestris128:
17320       Opcode = X86ISD::PCMPESTRI;
17321       X86CC = X86::COND_S;
17322       break;
17323     case Intrinsic::x86_sse42_pcmpistriz128:
17324       Opcode = X86ISD::PCMPISTRI;
17325       X86CC = X86::COND_E;
17326       break;
17327     case Intrinsic::x86_sse42_pcmpestriz128:
17328       Opcode = X86ISD::PCMPESTRI;
17329       X86CC = X86::COND_E;
17330       break;
17331     }
17332     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17333     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17334     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
17335     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17336                                 DAG.getConstant(X86CC, MVT::i8),
17337                                 SDValue(PCMP.getNode(), 1));
17338     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17339   }
17340
17341   case Intrinsic::x86_sse42_pcmpistri128:
17342   case Intrinsic::x86_sse42_pcmpestri128: {
17343     unsigned Opcode;
17344     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
17345       Opcode = X86ISD::PCMPISTRI;
17346     else
17347       Opcode = X86ISD::PCMPESTRI;
17348
17349     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
17350     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17351     return DAG.getNode(Opcode, dl, VTs, NewOps);
17352   }
17353
17354   case Intrinsic::x86_fma_mask_vfmadd_ps_512:
17355   case Intrinsic::x86_fma_mask_vfmadd_pd_512:
17356   case Intrinsic::x86_fma_mask_vfmsub_ps_512:
17357   case Intrinsic::x86_fma_mask_vfmsub_pd_512:
17358   case Intrinsic::x86_fma_mask_vfnmadd_ps_512:
17359   case Intrinsic::x86_fma_mask_vfnmadd_pd_512:
17360   case Intrinsic::x86_fma_mask_vfnmsub_ps_512:
17361   case Intrinsic::x86_fma_mask_vfnmsub_pd_512:
17362   case Intrinsic::x86_fma_mask_vfmaddsub_ps_512:
17363   case Intrinsic::x86_fma_mask_vfmaddsub_pd_512:
17364   case Intrinsic::x86_fma_mask_vfmsubadd_ps_512:
17365   case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: {
17366     auto *SAE = cast<ConstantSDNode>(Op.getOperand(5));
17367     if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION)
17368       return getVectorMaskingNode(DAG.getNode(getOpcodeForFMAIntrinsic(IntNo),
17369                                               dl, Op.getValueType(),
17370                                               Op.getOperand(1),
17371                                               Op.getOperand(2),
17372                                               Op.getOperand(3)),
17373                                   Op.getOperand(4), Op.getOperand(1),
17374                                   Subtarget, DAG);
17375     else
17376       return SDValue();
17377   }
17378
17379   case Intrinsic::x86_fma_vfmadd_ps:
17380   case Intrinsic::x86_fma_vfmadd_pd:
17381   case Intrinsic::x86_fma_vfmsub_ps:
17382   case Intrinsic::x86_fma_vfmsub_pd:
17383   case Intrinsic::x86_fma_vfnmadd_ps:
17384   case Intrinsic::x86_fma_vfnmadd_pd:
17385   case Intrinsic::x86_fma_vfnmsub_ps:
17386   case Intrinsic::x86_fma_vfnmsub_pd:
17387   case Intrinsic::x86_fma_vfmaddsub_ps:
17388   case Intrinsic::x86_fma_vfmaddsub_pd:
17389   case Intrinsic::x86_fma_vfmsubadd_ps:
17390   case Intrinsic::x86_fma_vfmsubadd_pd:
17391   case Intrinsic::x86_fma_vfmadd_ps_256:
17392   case Intrinsic::x86_fma_vfmadd_pd_256:
17393   case Intrinsic::x86_fma_vfmsub_ps_256:
17394   case Intrinsic::x86_fma_vfmsub_pd_256:
17395   case Intrinsic::x86_fma_vfnmadd_ps_256:
17396   case Intrinsic::x86_fma_vfnmadd_pd_256:
17397   case Intrinsic::x86_fma_vfnmsub_ps_256:
17398   case Intrinsic::x86_fma_vfnmsub_pd_256:
17399   case Intrinsic::x86_fma_vfmaddsub_ps_256:
17400   case Intrinsic::x86_fma_vfmaddsub_pd_256:
17401   case Intrinsic::x86_fma_vfmsubadd_ps_256:
17402   case Intrinsic::x86_fma_vfmsubadd_pd_256:
17403     return DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), dl, Op.getValueType(),
17404                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
17405   }
17406 }
17407
17408 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17409                               SDValue Src, SDValue Mask, SDValue Base,
17410                               SDValue Index, SDValue ScaleOp, SDValue Chain,
17411                               const X86Subtarget * Subtarget) {
17412   SDLoc dl(Op);
17413   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17414   assert(C && "Invalid scale type");
17415   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17416   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17417                              Index.getSimpleValueType().getVectorNumElements());
17418   SDValue MaskInReg;
17419   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17420   if (MaskC)
17421     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17422   else
17423     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17424   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
17425   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17426   SDValue Segment = DAG.getRegister(0, MVT::i32);
17427   if (Src.getOpcode() == ISD::UNDEF)
17428     Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
17429   SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17430   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17431   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
17432   return DAG.getMergeValues(RetOps, dl);
17433 }
17434
17435 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17436                                SDValue Src, SDValue Mask, SDValue Base,
17437                                SDValue Index, SDValue ScaleOp, SDValue Chain) {
17438   SDLoc dl(Op);
17439   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17440   assert(C && "Invalid scale type");
17441   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17442   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17443   SDValue Segment = DAG.getRegister(0, MVT::i32);
17444   EVT MaskVT = MVT::getVectorVT(MVT::i1,
17445                              Index.getSimpleValueType().getVectorNumElements());
17446   SDValue MaskInReg;
17447   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17448   if (MaskC)
17449     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17450   else
17451     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17452   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
17453   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
17454   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
17455   return SDValue(Res, 1);
17456 }
17457
17458 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
17459                                SDValue Mask, SDValue Base, SDValue Index,
17460                                SDValue ScaleOp, SDValue Chain) {
17461   SDLoc dl(Op);
17462   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
17463   assert(C && "Invalid scale type");
17464   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
17465   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
17466   SDValue Segment = DAG.getRegister(0, MVT::i32);
17467   EVT MaskVT =
17468     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
17469   SDValue MaskInReg;
17470   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
17471   if (MaskC)
17472     MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
17473   else
17474     MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
17475   //SDVTList VTs = DAG.getVTList(MVT::Other);
17476   SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
17477   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
17478   return SDValue(Res, 0);
17479 }
17480
17481 // getReadPerformanceCounter - Handles the lowering of builtin intrinsics that
17482 // read performance monitor counters (x86_rdpmc).
17483 static void getReadPerformanceCounter(SDNode *N, SDLoc DL,
17484                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17485                               SmallVectorImpl<SDValue> &Results) {
17486   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17487   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17488   SDValue LO, HI;
17489
17490   // The ECX register is used to select the index of the performance counter
17491   // to read.
17492   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
17493                                    N->getOperand(2));
17494   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
17495
17496   // Reads the content of a 64-bit performance counter and returns it in the
17497   // registers EDX:EAX.
17498   if (Subtarget->is64Bit()) {
17499     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17500     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17501                             LO.getValue(2));
17502   } else {
17503     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17504     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17505                             LO.getValue(2));
17506   }
17507   Chain = HI.getValue(1);
17508
17509   if (Subtarget->is64Bit()) {
17510     // The EAX register is loaded with the low-order 32 bits. The EDX register
17511     // is loaded with the supported high-order bits of the counter.
17512     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17513                               DAG.getConstant(32, MVT::i8));
17514     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17515     Results.push_back(Chain);
17516     return;
17517   }
17518
17519   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17520   SDValue Ops[] = { LO, HI };
17521   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17522   Results.push_back(Pair);
17523   Results.push_back(Chain);
17524 }
17525
17526 // getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
17527 // read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
17528 // also used to custom lower READCYCLECOUNTER nodes.
17529 static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
17530                               SelectionDAG &DAG, const X86Subtarget *Subtarget,
17531                               SmallVectorImpl<SDValue> &Results) {
17532   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
17533   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
17534   SDValue LO, HI;
17535
17536   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
17537   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
17538   // and the EAX register is loaded with the low-order 32 bits.
17539   if (Subtarget->is64Bit()) {
17540     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
17541     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
17542                             LO.getValue(2));
17543   } else {
17544     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
17545     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
17546                             LO.getValue(2));
17547   }
17548   SDValue Chain = HI.getValue(1);
17549
17550   if (Opcode == X86ISD::RDTSCP_DAG) {
17551     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
17552
17553     // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
17554     // the ECX register. Add 'ecx' explicitly to the chain.
17555     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
17556                                      HI.getValue(2));
17557     // Explicitly store the content of ECX at the location passed in input
17558     // to the 'rdtscp' intrinsic.
17559     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
17560                          MachinePointerInfo(), false, false, 0);
17561   }
17562
17563   if (Subtarget->is64Bit()) {
17564     // The EDX register is loaded with the high-order 32 bits of the MSR, and
17565     // the EAX register is loaded with the low-order 32 bits.
17566     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
17567                               DAG.getConstant(32, MVT::i8));
17568     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
17569     Results.push_back(Chain);
17570     return;
17571   }
17572
17573   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
17574   SDValue Ops[] = { LO, HI };
17575   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
17576   Results.push_back(Pair);
17577   Results.push_back(Chain);
17578 }
17579
17580 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
17581                                      SelectionDAG &DAG) {
17582   SmallVector<SDValue, 2> Results;
17583   SDLoc DL(Op);
17584   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
17585                           Results);
17586   return DAG.getMergeValues(Results, DL);
17587 }
17588
17589
17590 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
17591                                       SelectionDAG &DAG) {
17592   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
17593
17594   const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
17595   if (!IntrData)
17596     return SDValue();
17597
17598   SDLoc dl(Op);
17599   switch(IntrData->Type) {
17600   default:
17601     llvm_unreachable("Unknown Intrinsic Type");
17602     break;
17603   case RDSEED:
17604   case RDRAND: {
17605     // Emit the node with the right value type.
17606     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
17607     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17608
17609     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
17610     // Otherwise return the value from Rand, which is always 0, casted to i32.
17611     SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
17612                       DAG.getConstant(1, Op->getValueType(1)),
17613                       DAG.getConstant(X86::COND_B, MVT::i32),
17614                       SDValue(Result.getNode(), 1) };
17615     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
17616                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
17617                                   Ops);
17618
17619     // Return { result, isValid, chain }.
17620     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
17621                        SDValue(Result.getNode(), 2));
17622   }
17623   case GATHER: {
17624   //gather(v1, mask, index, base, scale);
17625     SDValue Chain = Op.getOperand(0);
17626     SDValue Src   = Op.getOperand(2);
17627     SDValue Base  = Op.getOperand(3);
17628     SDValue Index = Op.getOperand(4);
17629     SDValue Mask  = Op.getOperand(5);
17630     SDValue Scale = Op.getOperand(6);
17631     return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
17632                           Subtarget);
17633   }
17634   case SCATTER: {
17635   //scatter(base, mask, index, v1, scale);
17636     SDValue Chain = Op.getOperand(0);
17637     SDValue Base  = Op.getOperand(2);
17638     SDValue Mask  = Op.getOperand(3);
17639     SDValue Index = Op.getOperand(4);
17640     SDValue Src   = Op.getOperand(5);
17641     SDValue Scale = Op.getOperand(6);
17642     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
17643   }
17644   case PREFETCH: {
17645     SDValue Hint = Op.getOperand(6);
17646     unsigned HintVal;
17647     if (dyn_cast<ConstantSDNode> (Hint) == nullptr ||
17648         (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
17649       llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
17650     unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
17651     SDValue Chain = Op.getOperand(0);
17652     SDValue Mask  = Op.getOperand(2);
17653     SDValue Index = Op.getOperand(3);
17654     SDValue Base  = Op.getOperand(4);
17655     SDValue Scale = Op.getOperand(5);
17656     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
17657   }
17658   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
17659   case RDTSC: {
17660     SmallVector<SDValue, 2> Results;
17661     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results);
17662     return DAG.getMergeValues(Results, dl);
17663   }
17664   // Read Performance Monitoring Counters.
17665   case RDPMC: {
17666     SmallVector<SDValue, 2> Results;
17667     getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
17668     return DAG.getMergeValues(Results, dl);
17669   }
17670   // XTEST intrinsics.
17671   case XTEST: {
17672     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17673     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
17674     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17675                                 DAG.getConstant(X86::COND_NE, MVT::i8),
17676                                 InTrans);
17677     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
17678     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
17679                        Ret, SDValue(InTrans.getNode(), 1));
17680   }
17681   // ADC/ADCX/SBB
17682   case ADX: {
17683     SmallVector<SDValue, 2> Results;
17684     SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
17685     SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
17686     SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
17687                                 DAG.getConstant(-1, MVT::i8));
17688     SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
17689                               Op.getOperand(4), GenCF.getValue(1));
17690     SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
17691                                  Op.getOperand(5), MachinePointerInfo(),
17692                                  false, false, 0);
17693     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17694                                 DAG.getConstant(X86::COND_B, MVT::i8),
17695                                 Res.getValue(1));
17696     Results.push_back(SetCC);
17697     Results.push_back(Store);
17698     return DAG.getMergeValues(Results, dl);
17699   }
17700   case COMPRESS_TO_MEM: {
17701     SDLoc dl(Op);
17702     SDValue Mask = Op.getOperand(4);
17703     SDValue DataToCompress = Op.getOperand(3);
17704     SDValue Addr = Op.getOperand(2);
17705     SDValue Chain = Op.getOperand(0);
17706
17707     if (isAllOnes(Mask)) // return just a store
17708       return DAG.getStore(Chain, dl, DataToCompress, Addr,
17709                           MachinePointerInfo(), false, false, 0);
17710
17711     EVT VT = DataToCompress.getValueType();
17712     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17713                                   VT.getVectorNumElements());
17714     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17715                                      Mask.getValueType().getSizeInBits());
17716     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17717                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17718                                 DAG.getIntPtrConstant(0));
17719
17720     SDValue Compressed =  DAG.getNode(IntrData->Opc0, dl, VT, VMask,
17721                                       DataToCompress, DAG.getUNDEF(VT));
17722     return DAG.getStore(Chain, dl, Compressed, Addr,
17723                         MachinePointerInfo(), false, false, 0);
17724   }
17725   case EXPAND_FROM_MEM: {
17726     SDLoc dl(Op);
17727     SDValue Mask = Op.getOperand(4);
17728     SDValue PathThru = Op.getOperand(3);
17729     SDValue Addr = Op.getOperand(2);
17730     SDValue Chain = Op.getOperand(0);
17731     EVT VT = Op.getValueType();
17732
17733     if (isAllOnes(Mask)) // return just a load
17734       return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false,
17735                          false, 0);
17736     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17737                                   VT.getVectorNumElements());
17738     EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
17739                                      Mask.getValueType().getSizeInBits());
17740     SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17741                                 DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask),
17742                                 DAG.getIntPtrConstant(0));
17743
17744     SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(),
17745                                    false, false, false, 0);
17746
17747     SmallVector<SDValue, 2> Results;
17748     Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand,
17749                                   PathThru));
17750     Results.push_back(Chain);
17751     return DAG.getMergeValues(Results, dl);
17752   }
17753   }
17754 }
17755
17756 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
17757                                            SelectionDAG &DAG) const {
17758   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17759   MFI->setReturnAddressIsTaken(true);
17760
17761   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
17762     return SDValue();
17763
17764   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17765   SDLoc dl(Op);
17766   EVT PtrVT = getPointerTy();
17767
17768   if (Depth > 0) {
17769     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
17770     const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17771         DAG.getSubtarget().getRegisterInfo());
17772     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
17773     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17774                        DAG.getNode(ISD::ADD, dl, PtrVT,
17775                                    FrameAddr, Offset),
17776                        MachinePointerInfo(), false, false, false, 0);
17777   }
17778
17779   // Just load the return address.
17780   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
17781   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
17782                      RetAddrFI, MachinePointerInfo(), false, false, false, 0);
17783 }
17784
17785 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
17786   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
17787   MFI->setFrameAddressIsTaken(true);
17788
17789   EVT VT = Op.getValueType();
17790   SDLoc dl(Op);  // FIXME probably not meaningful
17791   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17792   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17793       DAG.getSubtarget().getRegisterInfo());
17794   unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(
17795       DAG.getMachineFunction());
17796   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
17797           (FrameReg == X86::EBP && VT == MVT::i32)) &&
17798          "Invalid Frame Register!");
17799   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
17800   while (Depth--)
17801     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
17802                             MachinePointerInfo(),
17803                             false, false, false, 0);
17804   return FrameAddr;
17805 }
17806
17807 // FIXME? Maybe this could be a TableGen attribute on some registers and
17808 // this table could be generated automatically from RegInfo.
17809 unsigned X86TargetLowering::getRegisterByName(const char* RegName,
17810                                               EVT VT) const {
17811   unsigned Reg = StringSwitch<unsigned>(RegName)
17812                        .Case("esp", X86::ESP)
17813                        .Case("rsp", X86::RSP)
17814                        .Default(0);
17815   if (Reg)
17816     return Reg;
17817   report_fatal_error("Invalid register name global variable");
17818 }
17819
17820 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
17821                                                      SelectionDAG &DAG) const {
17822   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17823       DAG.getSubtarget().getRegisterInfo());
17824   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
17825 }
17826
17827 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
17828   SDValue Chain     = Op.getOperand(0);
17829   SDValue Offset    = Op.getOperand(1);
17830   SDValue Handler   = Op.getOperand(2);
17831   SDLoc dl      (Op);
17832
17833   EVT PtrVT = getPointerTy();
17834   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
17835       DAG.getSubtarget().getRegisterInfo());
17836   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
17837   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
17838           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
17839          "Invalid Frame Register!");
17840   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
17841   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
17842
17843   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
17844                                  DAG.getIntPtrConstant(RegInfo->getSlotSize()));
17845   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
17846   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
17847                        false, false, 0);
17848   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
17849
17850   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
17851                      DAG.getRegister(StoreAddrReg, PtrVT));
17852 }
17853
17854 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
17855                                                SelectionDAG &DAG) const {
17856   SDLoc DL(Op);
17857   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
17858                      DAG.getVTList(MVT::i32, MVT::Other),
17859                      Op.getOperand(0), Op.getOperand(1));
17860 }
17861
17862 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
17863                                                 SelectionDAG &DAG) const {
17864   SDLoc DL(Op);
17865   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
17866                      Op.getOperand(0), Op.getOperand(1));
17867 }
17868
17869 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
17870   return Op.getOperand(0);
17871 }
17872
// Lower INIT_TRAMPOLINE by storing a short machine-code stub into the
// trampoline memory. The stub loads the 'nest' value into the nest register
// and jumps to the nested function.
//
// Operands: 0 = chain, 1 = trampoline address, 2 = nested function pointer,
// 3 = 'nest' value, 4 = source value for the trampoline address and, on
// 32-bit targets only, 5 = source value naming the nested Function.
//
// Every store is chained directly on the incoming chain; the stores are
// joined by the TokenFactor that is returned.
//
// 64-bit stub layout (byte offsets from the trampoline address):
//    0: 2-byte opcode, movabsq into r11     2: nested function pointer
//   10: 2-byte opcode, movabsq into r10    12: 'nest' value
//   20: 2-byte opcode, jmpq                22: ModRM byte selecting r11
// 32-bit stub layout:
//    0: 1-byte opcode, mov imm32 into the nest register
//    1: 'nest' value
//    5: 1-byte opcode, jmp rel32
//    6: displacement, FPtr - (Trmp + 10)
17873 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
17874                                                 SelectionDAG &DAG) const {
17875   SDValue Root = Op.getOperand(0);
17876   SDValue Trmp = Op.getOperand(1); // trampoline
17877   SDValue FPtr = Op.getOperand(2); // nested function
17878   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
17879   SDLoc dl (Op);
17880
17881   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
17882   const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
17883
17884   if (Subtarget->is64Bit()) {
17885     SDValue OutChains[6];
17886
17887     // Large code-model.
17888     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
17889     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
17890
    // Only the low three bits of each register encoding are merged into the
    // opcode / ModRM bytes below.
17891     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
17892     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
17893
17894     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
17895
    // Each 16-bit constant below packs the REX prefix into its low byte and
    // the opcode into its high byte.
17896     // Load the pointer to the nested function into R11.
17897     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
17898     SDValue Addr = Trmp;
17899     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
17900                                 Addr, MachinePointerInfo(TrmpAddr),
17901                                 false, false, 0);
17902
17903     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17904                        DAG.getConstant(2, MVT::i64));
17905     OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
17906                                 MachinePointerInfo(TrmpAddr, 2),
17907                                 false, false, 2);
17908
17909     // Load the 'nest' parameter value into R10.
17910     // R10 is specified in X86CallingConv.td
17911     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
17912     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17913                        DAG.getConstant(10, MVT::i64));
17914     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
17915                                 Addr, MachinePointerInfo(TrmpAddr, 10),
17916                                 false, false, 0);
17917
17918     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17919                        DAG.getConstant(12, MVT::i64));
17920     OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
17921                                 MachinePointerInfo(TrmpAddr, 12),
17922                                 false, false, 2);
17923
17924     // Jump to the nested function.
17925     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
17926     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17927                        DAG.getConstant(20, MVT::i64));
17928     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
17929                                 Addr, MachinePointerInfo(TrmpAddr, 20),
17930                                 false, false, 0);
17931
17932     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
17933     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
17934                        DAG.getConstant(22, MVT::i64));
17935     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
17936                                 MachinePointerInfo(TrmpAddr, 22),
17937                                 false, false, 0);
17938
17939     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
17940   } else {
    // 32-bit: the nest register depends on the nested function's calling
    // convention.
17941     const Function *Func =
17942       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
17943     CallingConv::ID CC = Func->getCallingConv();
17944     unsigned NestReg;
17945
17946     switch (CC) {
17947     default:
17948       llvm_unreachable("Unsupported calling convention");
17949     case CallingConv::C:
17950     case CallingConv::X86_StdCall: {
17951       // Pass 'nest' parameter in ECX.
17952       // Must be kept in sync with X86CallingConv.td
17953       NestReg = X86::ECX;
17954
17955       // Check that ECX wasn't needed by an 'inreg' parameter.
17956       FunctionType *FTy = Func->getFunctionType();
17957       const AttributeSet &Attrs = Func->getAttributes();
17958
17959       if (!Attrs.isEmpty() && !Func->isVarArg()) {
17960         unsigned InRegCount = 0;
17961         unsigned Idx = 1;
17962
        // Count the 32-bit units taken by 'inreg' parameters; attribute
        // indices for parameters start at 1.
17963         for (FunctionType::param_iterator I = FTy->param_begin(),
17964              E = FTy->param_end(); I != E; ++I, ++Idx)
17965           if (Attrs.hasAttribute(Idx, Attribute::InReg))
17966             // FIXME: should only count parameters that are lowered to integers.
17967             InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
17968
17969         if (InRegCount > 2) {
17970           report_fatal_error("Nest register in use - reduce number of inreg"
17971                              " parameters!");
17972         }
17973       }
17974       break;
17975     }
17976     case CallingConv::X86_FastCall:
17977     case CallingConv::X86_ThisCall:
17978     case CallingConv::Fast:
17979       // Pass 'nest' parameter in EAX.
17980       // Must be kept in sync with X86CallingConv.td
17981       NestReg = X86::EAX;
17982       break;
17983     }
17984
17985     SDValue OutChains[4];
17986     SDValue Addr, Disp;
17987
    // The jump displacement is computed against Trmp + 10, i.e. the first
    // byte after the 4-byte displacement stored at offset 6.
17988     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
17989                        DAG.getConstant(10, MVT::i32));
17990     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
17991
17992     // This is storing the opcode for MOV32ri.
17993     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
17994     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
17995     OutChains[0] = DAG.getStore(Root, dl,
17996                                 DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
17997                                 Trmp, MachinePointerInfo(TrmpAddr),
17998                                 false, false, 0);
17999
18000     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18001                        DAG.getConstant(1, MVT::i32));
18002     OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
18003                                 MachinePointerInfo(TrmpAddr, 1),
18004                                 false, false, 1);
18005
18006     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
18007     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18008                        DAG.getConstant(5, MVT::i32));
18009     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
18010                                 MachinePointerInfo(TrmpAddr, 5),
18011                                 false, false, 1);
18012
18013     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18014                        DAG.getConstant(6, MVT::i32));
18015     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
18016                                 MachinePointerInfo(TrmpAddr, 6),
18017                                 false, false, 1);
18018
18019     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18020   }
18021 }
18022
18023 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
18024                                             SelectionDAG &DAG) const {
18025   /*
18026    The rounding mode is in bits 11:10 of FPSR, and has the following
18027    settings:
18028      00 Round to nearest
18029      01 Round to -inf
18030      10 Round to +inf
18031      11 Round to 0
18032
18033   FLT_ROUNDS, on the other hand, expects the following:
18034     -1 Undefined
18035      0 Round to 0
18036      1 Round to nearest
18037      2 Round to +inf
18038      3 Round to -inf
18039
18040   To perform the conversion, we do:
18041     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
18042   */
18043
18044   MachineFunction &MF = DAG.getMachineFunction();
18045   const TargetMachine &TM = MF.getTarget();
18046   const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
18047   unsigned StackAlignment = TFI.getStackAlignment();
18048   MVT VT = Op.getSimpleValueType();
18049   SDLoc DL(Op);
18050
18051   // Save FP Control Word to stack slot
18052   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
18053   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
18054
18055   MachineMemOperand *MMO =
18056    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
18057                            MachineMemOperand::MOStore, 2, 2);
18058
18059   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
18060   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
18061                                           DAG.getVTList(MVT::Other),
18062                                           Ops, MVT::i16, MMO);
18063
18064   // Load FP Control Word from stack slot
18065   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
18066                             MachinePointerInfo(), false, false, false, 0);
18067
18068   // Transform as necessary
18069   SDValue CWD1 =
18070     DAG.getNode(ISD::SRL, DL, MVT::i16,
18071                 DAG.getNode(ISD::AND, DL, MVT::i16,
18072                             CWD, DAG.getConstant(0x800, MVT::i16)),
18073                 DAG.getConstant(11, MVT::i8));
18074   SDValue CWD2 =
18075     DAG.getNode(ISD::SRL, DL, MVT::i16,
18076                 DAG.getNode(ISD::AND, DL, MVT::i16,
18077                             CWD, DAG.getConstant(0x400, MVT::i16)),
18078                 DAG.getConstant(9, MVT::i8));
18079
18080   SDValue RetVal =
18081     DAG.getNode(ISD::AND, DL, MVT::i16,
18082                 DAG.getNode(ISD::ADD, DL, MVT::i16,
18083                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
18084                             DAG.getConstant(1, MVT::i16)),
18085                 DAG.getConstant(3, MVT::i16));
18086
18087   return DAG.getNode((VT.getSizeInBits() < 16 ?
18088                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
18089 }
18090
18091 static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
18092   MVT VT = Op.getSimpleValueType();
18093   EVT OpVT = VT;
18094   unsigned NumBits = VT.getSizeInBits();
18095   SDLoc dl(Op);
18096
18097   Op = Op.getOperand(0);
18098   if (VT == MVT::i8) {
18099     // Zero extend to i32 since there is not an i8 bsr.
18100     OpVT = MVT::i32;
18101     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18102   }
18103
18104   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
18105   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18106   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18107
18108   // If src is zero (i.e. bsr sets ZF), returns NumBits.
18109   SDValue Ops[] = {
18110     Op,
18111     DAG.getConstant(NumBits+NumBits-1, OpVT),
18112     DAG.getConstant(X86::COND_E, MVT::i8),
18113     Op.getValue(1)
18114   };
18115   Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
18116
18117   // Finally xor with NumBits-1.
18118   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18119
18120   if (VT == MVT::i8)
18121     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18122   return Op;
18123 }
18124
18125 static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
18126   MVT VT = Op.getSimpleValueType();
18127   EVT OpVT = VT;
18128   unsigned NumBits = VT.getSizeInBits();
18129   SDLoc dl(Op);
18130
18131   Op = Op.getOperand(0);
18132   if (VT == MVT::i8) {
18133     // Zero extend to i32 since there is not an i8 bsr.
18134     OpVT = MVT::i32;
18135     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
18136   }
18137
18138   // Issue a bsr (scan bits in reverse).
18139   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
18140   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
18141
18142   // And xor with NumBits-1.
18143   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
18144
18145   if (VT == MVT::i8)
18146     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
18147   return Op;
18148 }
18149
18150 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
18151   MVT VT = Op.getSimpleValueType();
18152   unsigned NumBits = VT.getSizeInBits();
18153   SDLoc dl(Op);
18154   Op = Op.getOperand(0);
18155
18156   // Issue a bsf (scan bits forward) which also sets EFLAGS.
18157   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18158   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
18159
18160   // If src is zero (i.e. bsf sets ZF), returns NumBits.
18161   SDValue Ops[] = {
18162     Op,
18163     DAG.getConstant(NumBits, VT),
18164     DAG.getConstant(X86::COND_E, MVT::i8),
18165     Op.getValue(1)
18166   };
18167   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
18168 }
18169
18170 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
18171 // ones, and then concatenate the result back.
18172 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
18173   MVT VT = Op.getSimpleValueType();
18174
18175   assert(VT.is256BitVector() && VT.isInteger() &&
18176          "Unsupported value type for operation");
18177
18178   unsigned NumElems = VT.getVectorNumElements();
18179   SDLoc dl(Op);
18180
18181   // Extract the LHS vectors
18182   SDValue LHS = Op.getOperand(0);
18183   SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
18184   SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
18185
18186   // Extract the RHS vectors
18187   SDValue RHS = Op.getOperand(1);
18188   SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, dl);
18189   SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
18190
18191   MVT EltVT = VT.getVectorElementType();
18192   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18193
18194   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18195                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
18196                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
18197 }
18198
18199 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
18200   assert(Op.getSimpleValueType().is256BitVector() &&
18201          Op.getSimpleValueType().isInteger() &&
18202          "Only handle AVX 256-bit vector integer operation");
18203   return Lower256IntArith(Op, DAG);
18204 }
18205
18206 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
18207   assert(Op.getSimpleValueType().is256BitVector() &&
18208          Op.getSimpleValueType().isInteger() &&
18209          "Only handle AVX 256-bit vector integer operation");
18210   return Lower256IntArith(Op, DAG);
18211 }
18212
18213 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
18214                         SelectionDAG &DAG) {
18215   SDLoc dl(Op);
18216   MVT VT = Op.getSimpleValueType();
18217
18218   // Decompose 256-bit ops into smaller 128-bit ops.
18219   if (VT.is256BitVector() && !Subtarget->hasInt256())
18220     return Lower256IntArith(Op, DAG);
18221
18222   SDValue A = Op.getOperand(0);
18223   SDValue B = Op.getOperand(1);
18224
18225   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
18226   if (VT == MVT::v4i32) {
18227     assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
18228            "Should not custom lower when pmuldq is available!");
18229
18230     // Extract the odd parts.
18231     static const int UnpackMask[] = { 1, -1, 3, -1 };
18232     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
18233     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
18234
18235     // Multiply the even parts.
18236     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
18237     // Now multiply odd parts.
18238     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
18239
18240     Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
18241     Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
18242
18243     // Merge the two vectors back together with a shuffle. This expands into 2
18244     // shuffles.
18245     static const int ShufMask[] = { 0, 4, 2, 6 };
18246     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
18247   }
18248
18249   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
18250          "Only know how to lower V2I64/V4I64/V8I64 multiply");
18251
18252   //  Ahi = psrlqi(a, 32);
18253   //  Bhi = psrlqi(b, 32);
18254   //
18255   //  AloBlo = pmuludq(a, b);
18256   //  AloBhi = pmuludq(a, Bhi);
18257   //  AhiBlo = pmuludq(Ahi, b);
18258
18259   //  AloBhi = psllqi(AloBhi, 32);
18260   //  AhiBlo = psllqi(AhiBlo, 32);
18261   //  return AloBlo + AloBhi + AhiBlo;
18262
18263   SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
18264   SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
18265
18266   // Bit cast to 32-bit vectors for MULUDQ
18267   EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
18268                                   (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
18269   A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
18270   B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
18271   Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
18272   Bhi = DAG.getNode(ISD::BITCAST, dl, MulVT, Bhi);
18273
18274   SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
18275   SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
18276   SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
18277
18278   AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
18279   AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
18280
18281   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
18282   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
18283 }
18284
18285 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
18286   assert(Subtarget->isTargetWin64() && "Unexpected target");
18287   EVT VT = Op.getValueType();
18288   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
18289          "Unexpected return type for lowering");
18290
18291   RTLIB::Libcall LC;
18292   bool isSigned;
18293   switch (Op->getOpcode()) {
18294   default: llvm_unreachable("Unexpected request for libcall!");
18295   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
18296   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
18297   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
18298   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
18299   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
18300   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
18301   }
18302
18303   SDLoc dl(Op);
18304   SDValue InChain = DAG.getEntryNode();
18305
18306   TargetLowering::ArgListTy Args;
18307   TargetLowering::ArgListEntry Entry;
18308   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
18309     EVT ArgVT = Op->getOperand(i).getValueType();
18310     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
18311            "Unexpected argument type for lowering");
18312     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
18313     Entry.Node = StackPtr;
18314     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
18315                            false, false, 16);
18316     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18317     Entry.Ty = PointerType::get(ArgTy,0);
18318     Entry.isSExt = false;
18319     Entry.isZExt = false;
18320     Args.push_back(Entry);
18321   }
18322
18323   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
18324                                          getPointerTy());
18325
18326   TargetLowering::CallLoweringInfo CLI(DAG);
18327   CLI.setDebugLoc(dl).setChain(InChain)
18328     .setCallee(getLibcallCallingConv(LC),
18329                static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
18330                Callee, std::move(Args), 0)
18331     .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
18332
18333   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
18334   return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
18335 }
18336
18337 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
18338                              SelectionDAG &DAG) {
18339   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
18340   EVT VT = Op0.getValueType();
18341   SDLoc dl(Op);
18342
18343   assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
18344          (VT == MVT::v8i32 && Subtarget->hasInt256()));
18345
18346   // PMULxD operations multiply each even value (starting at 0) of LHS with
18347   // the related value of RHS and produce a widen result.
18348   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18349   // => <2 x i64> <ae|cg>
18350   //
18351   // In other word, to have all the results, we need to perform two PMULxD:
18352   // 1. one with the even values.
18353   // 2. one with the odd values.
18354   // To achieve #2, with need to place the odd values at an even position.
18355   //
18356   // Place the odd value at an even position (basically, shift all values 1
18357   // step to the left):
18358   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
18359   // <a|b|c|d> => <b|undef|d|undef>
18360   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
18361   // <e|f|g|h> => <f|undef|h|undef>
18362   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
18363
18364   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
18365   // ints.
18366   MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
18367   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
18368   unsigned Opcode =
18369       (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
18370   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
18371   // => <2 x i64> <ae|cg>
18372   SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
18373                              DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
18374   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
18375   // => <2 x i64> <bf|dh>
18376   SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
18377                              DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
18378
18379   // Shuffle it back into the right order.
18380   SDValue Highs, Lows;
18381   if (VT == MVT::v8i32) {
18382     const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
18383     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18384     const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
18385     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18386   } else {
18387     const int HighMask[] = {1, 5, 3, 7};
18388     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
18389     const int LowMask[] = {0, 4, 2, 6};
18390     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
18391   }
18392
18393   // If we have a signed multiply but no PMULDQ fix up the high parts of a
18394   // unsigned multiply.
18395   if (IsSigned && !Subtarget->hasSSE41()) {
18396     SDValue ShAmt =
18397         DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
18398     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
18399                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
18400     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
18401                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
18402
18403     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
18404     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
18405   }
18406
18407   // The first result of MUL_LOHI is actually the low value, followed by the
18408   // high value.
18409   SDValue Ops[] = {Lows, Highs};
18410   return DAG.getMergeValues(Ops, dl);
18411 }
18412
18413 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
18414                                          const X86Subtarget *Subtarget) {
18415   MVT VT = Op.getSimpleValueType();
18416   SDLoc dl(Op);
18417   SDValue R = Op.getOperand(0);
18418   SDValue Amt = Op.getOperand(1);
18419
18420   // Optimize shl/srl/sra with constant shift amount.
18421   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
18422     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
18423       uint64_t ShiftAmt = ShiftConst->getZExtValue();
18424
18425       if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
18426           (Subtarget->hasInt256() &&
18427            (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18428           (Subtarget->hasAVX512() &&
18429            (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18430         if (Op.getOpcode() == ISD::SHL)
18431           return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18432                                             DAG);
18433         if (Op.getOpcode() == ISD::SRL)
18434           return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18435                                             DAG);
18436         if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
18437           return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18438                                             DAG);
18439       }
18440
18441       if (VT == MVT::v16i8) {
18442         if (Op.getOpcode() == ISD::SHL) {
18443           // Make a large shift.
18444           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18445                                                    MVT::v8i16, R, ShiftAmt,
18446                                                    DAG);
18447           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18448           // Zero out the rightmost bits.
18449           SmallVector<SDValue, 16> V(16,
18450                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18451                                                      MVT::i8));
18452           return DAG.getNode(ISD::AND, dl, VT, SHL,
18453                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18454         }
18455         if (Op.getOpcode() == ISD::SRL) {
18456           // Make a large shift.
18457           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18458                                                    MVT::v8i16, R, ShiftAmt,
18459                                                    DAG);
18460           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18461           // Zero out the leftmost bits.
18462           SmallVector<SDValue, 16> V(16,
18463                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18464                                                      MVT::i8));
18465           return DAG.getNode(ISD::AND, dl, VT, SRL,
18466                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18467         }
18468         if (Op.getOpcode() == ISD::SRA) {
18469           if (ShiftAmt == 7) {
18470             // R s>> 7  ===  R s< 0
18471             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18472             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18473           }
18474
18475           // R s>> a === ((R u>> a) ^ m) - m
18476           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18477           SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
18478                                                          MVT::i8));
18479           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18480           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18481           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18482           return Res;
18483         }
18484         llvm_unreachable("Unknown shift opcode.");
18485       }
18486
18487       if (Subtarget->hasInt256() && VT == MVT::v32i8) {
18488         if (Op.getOpcode() == ISD::SHL) {
18489           // Make a large shift.
18490           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
18491                                                    MVT::v16i16, R, ShiftAmt,
18492                                                    DAG);
18493           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
18494           // Zero out the rightmost bits.
18495           SmallVector<SDValue, 32> V(32,
18496                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
18497                                                      MVT::i8));
18498           return DAG.getNode(ISD::AND, dl, VT, SHL,
18499                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18500         }
18501         if (Op.getOpcode() == ISD::SRL) {
18502           // Make a large shift.
18503           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
18504                                                    MVT::v16i16, R, ShiftAmt,
18505                                                    DAG);
18506           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
18507           // Zero out the leftmost bits.
18508           SmallVector<SDValue, 32> V(32,
18509                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
18510                                                      MVT::i8));
18511           return DAG.getNode(ISD::AND, dl, VT, SRL,
18512                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
18513         }
18514         if (Op.getOpcode() == ISD::SRA) {
18515           if (ShiftAmt == 7) {
18516             // R s>> 7  ===  R s< 0
18517             SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
18518             return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
18519           }
18520
18521           // R s>> a === ((R u>> a) ^ m) - m
18522           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
18523           SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
18524                                                          MVT::i8));
18525           SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
18526           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
18527           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
18528           return Res;
18529         }
18530         llvm_unreachable("Unknown shift opcode.");
18531       }
18532     }
18533   }
18534
18535   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18536   if (!Subtarget->is64Bit() &&
18537       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
18538       Amt.getOpcode() == ISD::BITCAST &&
18539       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18540     Amt = Amt.getOperand(0);
18541     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18542                      VT.getVectorNumElements();
18543     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
18544     uint64_t ShiftAmt = 0;
18545     for (unsigned i = 0; i != Ratio; ++i) {
18546       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
18547       if (!C)
18548         return SDValue();
18549       // 6 == Log2(64)
18550       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
18551     }
18552     // Check remaining shift amounts.
18553     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18554       uint64_t ShAmt = 0;
18555       for (unsigned j = 0; j != Ratio; ++j) {
18556         ConstantSDNode *C =
18557           dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
18558         if (!C)
18559           return SDValue();
18560         // 6 == Log2(64)
18561         ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
18562       }
18563       if (ShAmt != ShiftAmt)
18564         return SDValue();
18565     }
18566     switch (Op.getOpcode()) {
18567     default:
18568       llvm_unreachable("Unknown shift opcode!");
18569     case ISD::SHL:
18570       return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
18571                                         DAG);
18572     case ISD::SRL:
18573       return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
18574                                         DAG);
18575     case ISD::SRA:
18576       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
18577                                         DAG);
18578     }
18579   }
18580
18581   return SDValue();
18582 }
18583
18584 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
18585                                         const X86Subtarget* Subtarget) {
18586   MVT VT = Op.getSimpleValueType();
18587   SDLoc dl(Op);
18588   SDValue R = Op.getOperand(0);
18589   SDValue Amt = Op.getOperand(1);
18590
18591   if ((VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) ||
18592       VT == MVT::v4i32 || VT == MVT::v8i16 ||
18593       (Subtarget->hasInt256() &&
18594        ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
18595         VT == MVT::v8i32 || VT == MVT::v16i16)) ||
18596        (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
18597     SDValue BaseShAmt;
18598     EVT EltVT = VT.getVectorElementType();
18599
18600     if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
18601       // Check if this build_vector node is doing a splat.
18602       // If so, then set BaseShAmt equal to the splat value.
18603       BaseShAmt = BV->getSplatValue();
18604       if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF)
18605         BaseShAmt = SDValue();
18606     } else {
18607       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
18608         Amt = Amt.getOperand(0);
18609
18610       ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
18611       if (SVN && SVN->isSplat()) {
18612         unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
18613         SDValue InVec = Amt.getOperand(0);
18614         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
18615           assert((SplatIdx < InVec.getValueType().getVectorNumElements()) &&
18616                  "Unexpected shuffle index found!");
18617           BaseShAmt = InVec.getOperand(SplatIdx);
18618         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
18619            if (ConstantSDNode *C =
18620                dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
18621              if (C->getZExtValue() == SplatIdx)
18622                BaseShAmt = InVec.getOperand(1);
18623            }
18624         }
18625
18626         if (!BaseShAmt)
18627           // Avoid introducing an extract element from a shuffle.
18628           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
18629                                     DAG.getIntPtrConstant(SplatIdx));
18630       }
18631     }
18632
18633     if (BaseShAmt.getNode()) {
18634       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
18635       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
18636         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
18637       else if (EltVT.bitsLT(MVT::i32))
18638         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
18639
18640       switch (Op.getOpcode()) {
18641       default:
18642         llvm_unreachable("Unknown shift opcode!");
18643       case ISD::SHL:
18644         switch (VT.SimpleTy) {
18645         default: return SDValue();
18646         case MVT::v2i64:
18647         case MVT::v4i32:
18648         case MVT::v8i16:
18649         case MVT::v4i64:
18650         case MVT::v8i32:
18651         case MVT::v16i16:
18652         case MVT::v16i32:
18653         case MVT::v8i64:
18654           return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
18655         }
18656       case ISD::SRA:
18657         switch (VT.SimpleTy) {
18658         default: return SDValue();
18659         case MVT::v4i32:
18660         case MVT::v8i16:
18661         case MVT::v8i32:
18662         case MVT::v16i16:
18663         case MVT::v16i32:
18664         case MVT::v8i64:
18665           return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
18666         }
18667       case ISD::SRL:
18668         switch (VT.SimpleTy) {
18669         default: return SDValue();
18670         case MVT::v2i64:
18671         case MVT::v4i32:
18672         case MVT::v8i16:
18673         case MVT::v4i64:
18674         case MVT::v8i32:
18675         case MVT::v16i16:
18676         case MVT::v16i32:
18677         case MVT::v8i64:
18678           return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
18679         }
18680       }
18681     }
18682   }
18683
18684   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
18685   if (!Subtarget->is64Bit() &&
18686       (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
18687       (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
18688       Amt.getOpcode() == ISD::BITCAST &&
18689       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
18690     Amt = Amt.getOperand(0);
18691     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
18692                      VT.getVectorNumElements();
18693     std::vector<SDValue> Vals(Ratio);
18694     for (unsigned i = 0; i != Ratio; ++i)
18695       Vals[i] = Amt.getOperand(i);
18696     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
18697       for (unsigned j = 0; j != Ratio; ++j)
18698         if (Vals[j] != Amt.getOperand(i + j))
18699           return SDValue();
18700     }
18701     switch (Op.getOpcode()) {
18702     default:
18703       llvm_unreachable("Unknown shift opcode!");
18704     case ISD::SHL:
18705       return DAG.getNode(X86ISD::VSHL, dl, VT, R, Op.getOperand(1));
18706     case ISD::SRL:
18707       return DAG.getNode(X86ISD::VSRL, dl, VT, R, Op.getOperand(1));
18708     case ISD::SRA:
18709       return DAG.getNode(X86ISD::VSRA, dl, VT, R, Op.getOperand(1));
18710     }
18711   }
18712
18713   return SDValue();
18714 }
18715
18716 static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
18717                           SelectionDAG &DAG) {
18718   MVT VT = Op.getSimpleValueType();
18719   SDLoc dl(Op);
18720   SDValue R = Op.getOperand(0);
18721   SDValue Amt = Op.getOperand(1);
18722   SDValue V;
18723
18724   assert(VT.isVector() && "Custom lowering only for vector shifts!");
18725   assert(Subtarget->hasSSE2() && "Only custom lower when we have SSE2!");
18726
18727   V = LowerScalarImmediateShift(Op, DAG, Subtarget);
18728   if (V.getNode())
18729     return V;
18730
18731   V = LowerScalarVariableShift(Op, DAG, Subtarget);
18732   if (V.getNode())
18733       return V;
18734
18735   if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
18736     return Op;
18737   // AVX2 has VPSLLV/VPSRAV/VPSRLV.
18738   if (Subtarget->hasInt256()) {
18739     if (Op.getOpcode() == ISD::SRL &&
18740         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18741          VT == MVT::v4i64 || VT == MVT::v8i32))
18742       return Op;
18743     if (Op.getOpcode() == ISD::SHL &&
18744         (VT == MVT::v2i64 || VT == MVT::v4i32 ||
18745          VT == MVT::v4i64 || VT == MVT::v8i32))
18746       return Op;
18747     if (Op.getOpcode() == ISD::SRA && (VT == MVT::v4i32 || VT == MVT::v8i32))
18748       return Op;
18749   }
18750
18751   // If possible, lower this packed shift into a vector multiply instead of
18752   // expanding it into a sequence of scalar shifts.
18753   // Do this only if the vector shift count is a constant build_vector.
18754   if (Op.getOpcode() == ISD::SHL &&
18755       (VT == MVT::v8i16 || VT == MVT::v4i32 ||
18756        (Subtarget->hasInt256() && VT == MVT::v16i16)) &&
18757       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18758     SmallVector<SDValue, 8> Elts;
18759     EVT SVT = VT.getScalarType();
18760     unsigned SVTBits = SVT.getSizeInBits();
18761     const APInt &One = APInt(SVTBits, 1);
18762     unsigned NumElems = VT.getVectorNumElements();
18763
18764     for (unsigned i=0; i !=NumElems; ++i) {
18765       SDValue Op = Amt->getOperand(i);
18766       if (Op->getOpcode() == ISD::UNDEF) {
18767         Elts.push_back(Op);
18768         continue;
18769       }
18770
18771       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
18772       const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue());
18773       uint64_t ShAmt = C.getZExtValue();
18774       if (ShAmt >= SVTBits) {
18775         Elts.push_back(DAG.getUNDEF(SVT));
18776         continue;
18777       }
18778       Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
18779     }
18780     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
18781     return DAG.getNode(ISD::MUL, dl, VT, R, BV);
18782   }
18783
18784   // Lower SHL with variable shift amount.
18785   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
18786     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
18787
18788     Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
18789     Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
18790     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
18791     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
18792   }
18793
18794   // If possible, lower this shift as a sequence of two shifts by
18795   // constant plus a MOVSS/MOVSD instead of scalarizing it.
18796   // Example:
18797   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
18798   //
18799   // Could be rewritten as:
18800   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
18801   //
18802   // The advantage is that the two shifts from the example would be
18803   // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
18804   // the vector shift into four scalar shifts plus four pairs of vector
18805   // insert/extract.
18806   if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
18807       ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
18808     unsigned TargetOpcode = X86ISD::MOVSS;
18809     bool CanBeSimplified;
18810     // The splat value for the first packed shift (the 'X' from the example).
18811     SDValue Amt1 = Amt->getOperand(0);
18812     // The splat value for the second packed shift (the 'Y' from the example).
18813     SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
18814                                         Amt->getOperand(2);
18815
18816     // See if it is possible to replace this node with a sequence of
18817     // two shifts followed by a MOVSS/MOVSD
18818     if (VT == MVT::v4i32) {
18819       // Check if it is legal to use a MOVSS.
18820       CanBeSimplified = Amt2 == Amt->getOperand(2) &&
18821                         Amt2 == Amt->getOperand(3);
18822       if (!CanBeSimplified) {
18823         // Otherwise, check if we can still simplify this node using a MOVSD.
18824         CanBeSimplified = Amt1 == Amt->getOperand(1) &&
18825                           Amt->getOperand(2) == Amt->getOperand(3);
18826         TargetOpcode = X86ISD::MOVSD;
18827         Amt2 = Amt->getOperand(2);
18828       }
18829     } else {
18830       // Do similar checks for the case where the machine value type
18831       // is MVT::v8i16.
18832       CanBeSimplified = Amt1 == Amt->getOperand(1);
18833       for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
18834         CanBeSimplified = Amt2 == Amt->getOperand(i);
18835
18836       if (!CanBeSimplified) {
18837         TargetOpcode = X86ISD::MOVSD;
18838         CanBeSimplified = true;
18839         Amt2 = Amt->getOperand(4);
18840         for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
18841           CanBeSimplified = Amt1 == Amt->getOperand(i);
18842         for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
18843           CanBeSimplified = Amt2 == Amt->getOperand(j);
18844       }
18845     }
18846
18847     if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
18848         isa<ConstantSDNode>(Amt2)) {
18849       // Replace this node with two shifts followed by a MOVSS/MOVSD.
18850       EVT CastVT = MVT::v4i32;
18851       SDValue Splat1 =
18852         DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
18853       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
18854       SDValue Splat2 =
18855         DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
18856       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
18857       if (TargetOpcode == X86ISD::MOVSD)
18858         CastVT = MVT::v2i64;
18859       SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
18860       SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
18861       SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
18862                                             BitCast1, DAG);
18863       return DAG.getNode(ISD::BITCAST, dl, VT, Result);
18864     }
18865   }
18866
18867   if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
18868     assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
18869
18870     // a = a << 5;
18871     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
18872     Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
18873
18874     // Turn 'a' into a mask suitable for VSELECT
18875     SDValue VSelM = DAG.getConstant(0x80, VT);
18876     SDValue OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
18877     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
18878
18879     SDValue CM1 = DAG.getConstant(0x0f, VT);
18880     SDValue CM2 = DAG.getConstant(0x3f, VT);
18881
18882     // r = VSELECT(r, psllw(r & (char16)15, 4), a);
18883     SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
18884     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
18885     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
18886     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
18887
18888     // a += a
18889     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
18890     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
18891     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
18892
18893     // r = VSELECT(r, psllw(r & (char16)63, 2), a);
18894     M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
18895     M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
18896     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
18897     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
18898
18899     // a += a
18900     Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
18901     OpVSel = DAG.getNode(ISD::AND, dl, VT, VSelM, Op);
18902     OpVSel = DAG.getNode(X86ISD::PCMPEQ, dl, VT, OpVSel, VSelM);
18903
18904     // return VSELECT(r, r+r, a);
18905     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel,
18906                     DAG.getNode(ISD::ADD, dl, VT, R, R), R);
18907     return R;
18908   }
18909
18910   // It's worth extending once and using the v8i32 shifts for 16-bit types, but
18911   // the extra overheads to get from v16i8 to v8i32 make the existing SSE
18912   // solution better.
18913   if (Subtarget->hasInt256() && VT == MVT::v8i16) {
18914     MVT NewVT = VT == MVT::v8i16 ? MVT::v8i32 : MVT::v16i16;
18915     unsigned ExtOpc =
18916         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
18917     R = DAG.getNode(ExtOpc, dl, NewVT, R);
18918     Amt = DAG.getNode(ISD::ANY_EXTEND, dl, NewVT, Amt);
18919     return DAG.getNode(ISD::TRUNCATE, dl, VT,
18920                        DAG.getNode(Op.getOpcode(), dl, NewVT, R, Amt));
18921     }
18922
18923   // Decompose 256-bit shifts into smaller 128-bit shifts.
18924   if (VT.is256BitVector()) {
18925     unsigned NumElems = VT.getVectorNumElements();
18926     MVT EltVT = VT.getVectorElementType();
18927     EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18928
18929     // Extract the two vectors
18930     SDValue V1 = Extract128BitVector(R, 0, DAG, dl);
18931     SDValue V2 = Extract128BitVector(R, NumElems/2, DAG, dl);
18932
18933     // Recreate the shift amount vectors
18934     SDValue Amt1, Amt2;
18935     if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
18936       // Constant shift amount
18937       SmallVector<SDValue, 4> Amt1Csts;
18938       SmallVector<SDValue, 4> Amt2Csts;
18939       for (unsigned i = 0; i != NumElems/2; ++i)
18940         Amt1Csts.push_back(Amt->getOperand(i));
18941       for (unsigned i = NumElems/2; i != NumElems; ++i)
18942         Amt2Csts.push_back(Amt->getOperand(i));
18943
18944       Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
18945       Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
18946     } else {
18947       // Variable shift amount
18948       Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
18949       Amt2 = Extract128BitVector(Amt, NumElems/2, DAG, dl);
18950     }
18951
18952     // Issue new vector shifts for the smaller types
18953     V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
18954     V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
18955
18956     // Concatenate the result back
18957     return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
18958   }
18959
18960   return SDValue();
18961 }
18962
18963 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
18964   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
18965   // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
18966   // looks for this combo and may remove the "setcc" instruction if the "setcc"
18967   // has only one use.
18968   SDNode *N = Op.getNode();
18969   SDValue LHS = N->getOperand(0);
18970   SDValue RHS = N->getOperand(1);
18971   unsigned BaseOp = 0;
18972   unsigned Cond = 0;
18973   SDLoc DL(Op);
18974   switch (Op.getOpcode()) {
18975   default: llvm_unreachable("Unknown ovf instruction!");
18976   case ISD::SADDO:
18977     // A subtract of one will be selected as a INC. Note that INC doesn't
18978     // set CF, so we can't do this for UADDO.
18979     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
18980       if (C->isOne()) {
18981         BaseOp = X86ISD::INC;
18982         Cond = X86::COND_O;
18983         break;
18984       }
18985     BaseOp = X86ISD::ADD;
18986     Cond = X86::COND_O;
18987     break;
18988   case ISD::UADDO:
18989     BaseOp = X86ISD::ADD;
18990     Cond = X86::COND_B;
18991     break;
18992   case ISD::SSUBO:
18993     // A subtract of one will be selected as a DEC. Note that DEC doesn't
18994     // set CF, so we can't do this for USUBO.
18995     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
18996       if (C->isOne()) {
18997         BaseOp = X86ISD::DEC;
18998         Cond = X86::COND_O;
18999         break;
19000       }
19001     BaseOp = X86ISD::SUB;
19002     Cond = X86::COND_O;
19003     break;
19004   case ISD::USUBO:
19005     BaseOp = X86ISD::SUB;
19006     Cond = X86::COND_B;
19007     break;
19008   case ISD::SMULO:
19009     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
19010     Cond = X86::COND_O;
19011     break;
19012   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
19013     if (N->getValueType(0) == MVT::i8) {
19014       BaseOp = X86ISD::UMUL8;
19015       Cond = X86::COND_O;
19016       break;
19017     }
19018     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
19019                                  MVT::i32);
19020     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
19021
19022     SDValue SetCC =
19023       DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
19024                   DAG.getConstant(X86::COND_O, MVT::i32),
19025                   SDValue(Sum.getNode(), 2));
19026
19027     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19028   }
19029   }
19030
19031   // Also sets EFLAGS.
19032   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
19033   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
19034
19035   SDValue SetCC =
19036     DAG.getNode(X86ISD::SETCC, DL, N->getValueType(1),
19037                 DAG.getConstant(Cond, MVT::i32),
19038                 SDValue(Sum.getNode(), 1));
19039
19040   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
19041 }
19042
19043 // Sign extension of the low part of vector elements. This may be used either
19044 // when sign extend instructions are not available or if the vector element
19045 // sizes already match the sign-extended size. If the vector elements are in
19046 // their pre-extended size and sign extend instructions are available, that will
19047 // be handled by LowerSIGN_EXTEND.
19048 SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
19049                                                   SelectionDAG &DAG) const {
19050   SDLoc dl(Op);
19051   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
19052   MVT VT = Op.getSimpleValueType();
19053
19054   if (!Subtarget->hasSSE2() || !VT.isVector())
19055     return SDValue();
19056
19057   unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
19058                       ExtraVT.getScalarType().getSizeInBits();
19059
19060   switch (VT.SimpleTy) {
19061     default: return SDValue();
19062     case MVT::v8i32:
19063     case MVT::v16i16:
19064       if (!Subtarget->hasFp256())
19065         return SDValue();
19066       if (!Subtarget->hasInt256()) {
19067         // needs to be split
19068         unsigned NumElems = VT.getVectorNumElements();
19069
19070         // Extract the LHS vectors
19071         SDValue LHS = Op.getOperand(0);
19072         SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, dl);
19073         SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, dl);
19074
19075         MVT EltVT = VT.getVectorElementType();
19076         EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19077
19078         EVT ExtraEltVT = ExtraVT.getVectorElementType();
19079         unsigned ExtraNumElems = ExtraVT.getVectorNumElements();
19080         ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
19081                                    ExtraNumElems/2);
19082         SDValue Extra = DAG.getValueType(ExtraVT);
19083
19084         LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
19085         LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
19086
19087         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
19088       }
19089       // fall through
19090     case MVT::v4i32:
19091     case MVT::v8i16: {
19092       SDValue Op0 = Op.getOperand(0);
19093
19094       // This is a sign extension of some low part of vector elements without
19095       // changing the size of the vector elements themselves:
19096       // Shift-Left + Shift-Right-Algebraic.
19097       SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0,
19098                                                BitsDiff, DAG);
19099       return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff,
19100                                         DAG);
19101     }
19102   }
19103 }
19104
19105 /// Returns true if the operand type is exactly twice the native width, and
19106 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
19107 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
19108 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
19109 bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const {
19110   const X86Subtarget &Subtarget =
19111       getTargetMachine().getSubtarget<X86Subtarget>();
19112   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
19113
19114   if (OpWidth == 64)
19115     return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
19116   else if (OpWidth == 128)
19117     return Subtarget.hasCmpxchg16b();
19118   else
19119     return false;
19120 }
19121
19122 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
19123   return needsCmpXchgNb(SI->getValueOperand()->getType());
19124 }
19125
19126 // Note: this turns large loads into lock cmpxchg8b/16b.
19127 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
19128 bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
19129   auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
19130   return needsCmpXchgNb(PTy->getElementType());
19131 }
19132
19133 bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
19134   const X86Subtarget &Subtarget =
19135       getTargetMachine().getSubtarget<X86Subtarget>();
19136   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
19137   const Type *MemType = AI->getType();
19138
19139   // If the operand is too big, we must see if cmpxchg8/16b is available
19140   // and default to library calls otherwise.
19141   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19142     return needsCmpXchgNb(MemType);
19143
19144   AtomicRMWInst::BinOp Op = AI->getOperation();
19145   switch (Op) {
19146   default:
19147     llvm_unreachable("Unknown atomic operation");
19148   case AtomicRMWInst::Xchg:
19149   case AtomicRMWInst::Add:
19150   case AtomicRMWInst::Sub:
19151     // It's better to use xadd, xsub or xchg for these in all cases.
19152     return false;
19153   case AtomicRMWInst::Or:
19154   case AtomicRMWInst::And:
19155   case AtomicRMWInst::Xor:
19156     // If the atomicrmw's result isn't actually used, we can just add a "lock"
19157     // prefix to a normal instruction for these operations.
19158     return !AI->use_empty();
19159   case AtomicRMWInst::Nand:
19160   case AtomicRMWInst::Max:
19161   case AtomicRMWInst::Min:
19162   case AtomicRMWInst::UMax:
19163   case AtomicRMWInst::UMin:
19164     // These always require a non-trivial set of data operations on x86. We must
19165     // use a cmpxchg loop.
19166     return true;
19167   }
19168 }
19169
19170 static bool hasMFENCE(const X86Subtarget& Subtarget) {
19171   // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
19172   // no-sse2). There isn't any reason to disable it if the target processor
19173   // supports it.
19174   return Subtarget.hasSSE2() || Subtarget.is64Bit();
19175 }
19176
19177 LoadInst *
19178 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19179   const X86Subtarget &Subtarget =
19180       getTargetMachine().getSubtarget<X86Subtarget>();
19181   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
19182   const Type *MemType = AI->getType();
19183   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
19184   // there is no benefit in turning such RMWs into loads, and it is actually
19185   // harmful as it introduces a mfence.
19186   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
19187     return nullptr;
19188
19189   auto Builder = IRBuilder<>(AI);
19190   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
19191   auto SynchScope = AI->getSynchScope();
19192   // We must restrict the ordering to avoid generating loads with Release or
19193   // ReleaseAcquire orderings.
19194   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
19195   auto Ptr = AI->getPointerOperand();
19196
19197   // Before the load we need a fence. Here is an example lifted from
19198   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
19199   // is required:
19200   // Thread 0:
19201   //   x.store(1, relaxed);
19202   //   r1 = y.fetch_add(0, release);
19203   // Thread 1:
19204   //   y.fetch_add(42, acquire);
19205   //   r2 = x.load(relaxed);
19206   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
19207   // lowered to just a load without a fence. A mfence flushes the store buffer,
19208   // making the optimization clearly correct.
19209   // FIXME: it is required if isAtLeastRelease(Order) but it is not clear
19210   // otherwise, we might be able to be more agressive on relaxed idempotent
19211   // rmw. In practice, they do not look useful, so we don't try to be
19212   // especially clever.
19213   if (SynchScope == SingleThread) {
19214     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
19215     // the IR level, so we must wrap it in an intrinsic.
19216     return nullptr;
19217   } else if (hasMFENCE(Subtarget)) {
19218     Function *MFence = llvm::Intrinsic::getDeclaration(M,
19219             Intrinsic::x86_sse2_mfence);
19220     Builder.CreateCall(MFence);
19221   } else {
19222     // FIXME: it might make sense to use a locked operation here but on a
19223     // different cache-line to prevent cache-line bouncing. In practice it
19224     // is probably a small win, and x86 processors without mfence are rare
19225     // enough that we do not bother.
19226     return nullptr;
19227   }
19228
19229   // Finally we can emit the atomic load.
19230   LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
19231           AI->getType()->getPrimitiveSizeInBits());
19232   Loaded->setAtomic(Order, SynchScope);
19233   AI->replaceAllUsesWith(Loaded);
19234   AI->eraseFromParent();
19235   return Loaded;
19236 }
19237
19238 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
19239                                  SelectionDAG &DAG) {
19240   SDLoc dl(Op);
19241   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
19242     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
19243   SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
19244     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
19245
19246   // The only fence that needs an instruction is a sequentially-consistent
19247   // cross-thread fence.
19248   if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
19249     if (hasMFENCE(*Subtarget))
19250       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
19251
19252     SDValue Chain = Op.getOperand(0);
19253     SDValue Zero = DAG.getConstant(0, MVT::i32);
19254     SDValue Ops[] = {
19255       DAG.getRegister(X86::ESP, MVT::i32), // Base
19256       DAG.getTargetConstant(1, MVT::i8),   // Scale
19257       DAG.getRegister(0, MVT::i32),        // Index
19258       DAG.getTargetConstant(0, MVT::i32),  // Disp
19259       DAG.getRegister(0, MVT::i32),        // Segment.
19260       Zero,
19261       Chain
19262     };
19263     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
19264     return SDValue(Res, 0);
19265   }
19266
19267   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
19268   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
19269 }
19270
19271 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
19272                              SelectionDAG &DAG) {
19273   MVT T = Op.getSimpleValueType();
19274   SDLoc DL(Op);
19275   unsigned Reg = 0;
19276   unsigned size = 0;
19277   switch(T.SimpleTy) {
19278   default: llvm_unreachable("Invalid value type!");
19279   case MVT::i8:  Reg = X86::AL;  size = 1; break;
19280   case MVT::i16: Reg = X86::AX;  size = 2; break;
19281   case MVT::i32: Reg = X86::EAX; size = 4; break;
19282   case MVT::i64:
19283     assert(Subtarget->is64Bit() && "Node not type legal!");
19284     Reg = X86::RAX; size = 8;
19285     break;
19286   }
19287   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
19288                                   Op.getOperand(2), SDValue());
19289   SDValue Ops[] = { cpIn.getValue(0),
19290                     Op.getOperand(1),
19291                     Op.getOperand(3),
19292                     DAG.getTargetConstant(size, MVT::i8),
19293                     cpIn.getValue(1) };
19294   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19295   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
19296   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
19297                                            Ops, T, MMO);
19298
19299   SDValue cpOut =
19300     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
19301   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
19302                                       MVT::i32, cpOut.getValue(2));
19303   SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
19304                                 DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
19305
19306   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
19307   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
19308   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
19309   return SDValue();
19310 }
19311
19312 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
19313                             SelectionDAG &DAG) {
19314   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
19315   MVT DstVT = Op.getSimpleValueType();
19316
19317   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
19318     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19319     if (DstVT != MVT::f64)
19320       // This conversion needs to be expanded.
19321       return SDValue();
19322
19323     SDValue InVec = Op->getOperand(0);
19324     SDLoc dl(Op);
19325     unsigned NumElts = SrcVT.getVectorNumElements();
19326     EVT SVT = SrcVT.getVectorElementType();
19327
19328     // Widen the vector in input in the case of MVT::v2i32.
19329     // Example: from MVT::v2i32 to MVT::v4i32.
19330     SmallVector<SDValue, 16> Elts;
19331     for (unsigned i = 0, e = NumElts; i != e; ++i)
19332       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
19333                                  DAG.getIntPtrConstant(i)));
19334
19335     // Explicitly mark the extra elements as Undef.
19336     SDValue Undef = DAG.getUNDEF(SVT);
19337     for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
19338       Elts.push_back(Undef);
19339
19340     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19341     SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
19342     SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
19343     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
19344                        DAG.getIntPtrConstant(0));
19345   }
19346
19347   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
19348          Subtarget->hasMMX() && "Unexpected custom BITCAST");
19349   assert((DstVT == MVT::i64 ||
19350           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
19351          "Unexpected custom BITCAST");
19352   // i64 <=> MMX conversions are Legal.
19353   if (SrcVT==MVT::i64 && DstVT.isVector())
19354     return Op;
19355   if (DstVT==MVT::i64 && SrcVT.isVector())
19356     return Op;
19357   // MMX <=> MMX conversions are Legal.
19358   if (SrcVT.isVector() && DstVT.isVector())
19359     return Op;
19360   // All other conversions need to be expanded.
19361   return SDValue();
19362 }
19363
19364 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
19365                           SelectionDAG &DAG) {
19366   SDNode *Node = Op.getNode();
19367   SDLoc dl(Node);
19368
19369   Op = Op.getOperand(0);
19370   EVT VT = Op.getValueType();
19371   assert((VT.is128BitVector() || VT.is256BitVector()) &&
19372          "CTPOP lowering only implemented for 128/256-bit wide vector types");
19373
19374   unsigned NumElts = VT.getVectorNumElements();
19375   EVT EltVT = VT.getVectorElementType();
19376   unsigned Len = EltVT.getSizeInBits();
19377
19378   // This is the vectorized version of the "best" algorithm from
19379   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
19380   // with a minor tweak to use a series of adds + shifts instead of vector
19381   // multiplications. Implemented for the v2i64, v4i64, v4i32, v8i32 types:
19382   //
19383   //  v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled
19384   //  v8i32 => Always profitable
19385   //
19386   // FIXME: There a couple of possible improvements:
19387   //
19388   // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled).
19389   // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html
19390   //
19391   assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 &&
19392          "CTPOP not implemented for this vector element type.");
19393
19394   // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid
19395   // extra legalization.
19396   bool NeedsBitcast = EltVT == MVT::i32;
19397   MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
19398
19399   SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT);
19400   SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT);
19401   SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT);
19402
19403   // v = v - ((v >> 1) & 0x55555555...)
19404   SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT));
19405   SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones);
19406   SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV);
19407   if (NeedsBitcast)
19408     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19409
19410   SmallVector<SDValue, 8> Mask55(NumElts, Cst55);
19411   SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55);
19412   if (NeedsBitcast)
19413     M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55);
19414
19415   SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55);
19416   if (VT != And.getValueType())
19417     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19418   SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And);
19419
19420   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
19421   SmallVector<SDValue, 8> Mask33(NumElts, Cst33);
19422   SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33);
19423   SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT));
19424   SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos);
19425
19426   Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV);
19427   if (NeedsBitcast) {
19428     Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl);
19429     M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33);
19430     Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub);
19431   }
19432
19433   SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33);
19434   SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33);
19435   if (VT != AndRHS.getValueType()) {
19436     AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS);
19437     AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS);
19438   }
19439   SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS);
19440
19441   // v = (v + (v >> 4)) & 0x0F0F0F0F...
19442   SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT));
19443   SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours);
19444   Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV);
19445   Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19446
19447   SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F);
19448   SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F);
19449   if (NeedsBitcast) {
19450     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19451     M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F);
19452   }
19453   And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F);
19454   if (VT != And.getValueType())
19455     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19456
19457   // The algorithm mentioned above uses:
19458   //    v = (v * 0x01010101...) >> (Len - 8)
19459   //
19460   // Change it to use vector adds + vector shifts which yield faster results on
19461   // Haswell than using vector integer multiplication.
19462   //
19463   // For i32 elements:
19464   //    v = v + (v >> 8)
19465   //    v = v + (v >> 16)
19466   //
19467   // For i64 elements:
19468   //    v = v + (v >> 8)
19469   //    v = v + (v >> 16)
19470   //    v = v + (v >> 32)
19471   //
19472   Add = And;
19473   SmallVector<SDValue, 8> Csts;
19474   for (unsigned i = 8; i <= Len/2; i *= 2) {
19475     Csts.assign(NumElts, DAG.getConstant(i, EltVT));
19476     SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts);
19477     Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV);
19478     Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl);
19479     Csts.clear();
19480   }
19481
19482   // The result is on the least significant 6-bits on i32 and 7-bits on i64.
19483   SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT);
19484   SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F);
19485   SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV);
19486   if (NeedsBitcast) {
19487     Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add);
19488     M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F);
19489   }
19490   And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F);
19491   if (VT != And.getValueType())
19492     And = DAG.getNode(ISD::BITCAST, dl, VT, And);
19493
19494   return And;
19495 }
19496
19497 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
19498   SDNode *Node = Op.getNode();
19499   SDLoc dl(Node);
19500   EVT T = Node->getValueType(0);
19501   SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
19502                               DAG.getConstant(0, T), Node->getOperand(2));
19503   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
19504                        cast<AtomicSDNode>(Node)->getMemoryVT(),
19505                        Node->getOperand(0),
19506                        Node->getOperand(1), negOp,
19507                        cast<AtomicSDNode>(Node)->getMemOperand(),
19508                        cast<AtomicSDNode>(Node)->getOrdering(),
19509                        cast<AtomicSDNode>(Node)->getSynchScope());
19510 }
19511
19512 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
19513   SDNode *Node = Op.getNode();
19514   SDLoc dl(Node);
19515   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
19516
19517   // Convert seq_cst store -> xchg
19518   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
19519   // FIXME: On 32-bit, store -> fist or movq would be more efficient
19520   //        (The only way to get a 16-byte store is cmpxchg16b)
19521   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
19522   if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
19523       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
19524     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
19525                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
19526                                  Node->getOperand(0),
19527                                  Node->getOperand(1), Node->getOperand(2),
19528                                  cast<AtomicSDNode>(Node)->getMemOperand(),
19529                                  cast<AtomicSDNode>(Node)->getOrdering(),
19530                                  cast<AtomicSDNode>(Node)->getSynchScope());
19531     return Swap.getValue(1);
19532   }
19533   // Other atomic stores have a simple pattern.
19534   return Op;
19535 }
19536
19537 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
19538   EVT VT = Op.getNode()->getSimpleValueType(0);
19539
19540   // Let legalize expand this if it isn't a legal type yet.
19541   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19542     return SDValue();
19543
19544   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
19545
19546   unsigned Opc;
19547   bool ExtraOp = false;
19548   switch (Op.getOpcode()) {
19549   default: llvm_unreachable("Invalid code");
19550   case ISD::ADDC: Opc = X86ISD::ADD; break;
19551   case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
19552   case ISD::SUBC: Opc = X86ISD::SUB; break;
19553   case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
19554   }
19555
19556   if (!ExtraOp)
19557     return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19558                        Op.getOperand(1));
19559   return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
19560                      Op.getOperand(1), Op.getOperand(2));
19561 }
19562
19563 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
19564                             SelectionDAG &DAG) {
19565   assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
19566
19567   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
19568   // which returns the values as { float, float } (in XMM0) or
19569   // { double, double } (which is returned in XMM0, XMM1).
19570   SDLoc dl(Op);
19571   SDValue Arg = Op.getOperand(0);
19572   EVT ArgVT = Arg.getValueType();
19573   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19574
19575   TargetLowering::ArgListTy Args;
19576   TargetLowering::ArgListEntry Entry;
19577
19578   Entry.Node = Arg;
19579   Entry.Ty = ArgTy;
19580   Entry.isSExt = false;
19581   Entry.isZExt = false;
19582   Args.push_back(Entry);
19583
19584   bool isF64 = ArgVT == MVT::f64;
19585   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
19586   // the small struct {f32, f32} is returned in (eax, edx). For f64,
19587   // the results are returned via SRet in memory.
19588   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
19589   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19590   SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
19591
19592   Type *RetTy = isF64
19593     ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
19594     : (Type*)VectorType::get(ArgTy, 4);
19595
19596   TargetLowering::CallLoweringInfo CLI(DAG);
19597   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
19598     .setCallee(CallingConv::C, RetTy, Callee, std::move(Args), 0);
19599
19600   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
19601
19602   if (isF64)
19603     // Returned in xmm0 and xmm1.
19604     return CallResult.first;
19605
19606   // Returned in bits 0:31 and 32:64 xmm0.
19607   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19608                                CallResult.first, DAG.getIntPtrConstant(0));
19609   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
19610                                CallResult.first, DAG.getIntPtrConstant(1));
19611   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
19612   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
19613 }
19614
19615 /// LowerOperation - Provide custom lowering hooks for some operations.
19616 ///
19617 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
19618   switch (Op.getOpcode()) {
19619   default: llvm_unreachable("Should not custom lower this!");
19620   case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
19621   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
19622   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
19623     return LowerCMP_SWAP(Op, Subtarget, DAG);
19624   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
19625   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
19626   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
19627   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
19628   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
19629   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
19630   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
19631   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
19632   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
19633   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
19634   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
19635   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
19636   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
19637   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
19638   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
19639   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
19640   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
19641   case ISD::SHL_PARTS:
19642   case ISD::SRA_PARTS:
19643   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
19644   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
19645   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
19646   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
19647   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
19648   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
19649   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
19650   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
19651   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
19652   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
19653   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
19654   case ISD::FABS:
19655   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
19656   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
19657   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
19658   case ISD::SETCC:              return LowerSETCC(Op, DAG);
19659   case ISD::SELECT:             return LowerSELECT(Op, DAG);
19660   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
19661   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
19662   case ISD::VASTART:            return LowerVASTART(Op, DAG);
19663   case ISD::VAARG:              return LowerVAARG(Op, DAG);
19664   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
19665   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
19666   case ISD::INTRINSIC_VOID:
19667   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
19668   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
19669   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
19670   case ISD::FRAME_TO_ARGS_OFFSET:
19671                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
19672   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
19673   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
19674   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
19675   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
19676   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
19677   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
19678   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
19679   case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
19680   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
19681   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
19682   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
19683   case ISD::UMUL_LOHI:
19684   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
19685   case ISD::SRA:
19686   case ISD::SRL:
19687   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
19688   case ISD::SADDO:
19689   case ISD::UADDO:
19690   case ISD::SSUBO:
19691   case ISD::USUBO:
19692   case ISD::SMULO:
19693   case ISD::UMULO:              return LowerXALUO(Op, DAG);
19694   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
19695   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
19696   case ISD::ADDC:
19697   case ISD::ADDE:
19698   case ISD::SUBC:
19699   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
19700   case ISD::ADD:                return LowerADD(Op, DAG);
19701   case ISD::SUB:                return LowerSUB(Op, DAG);
19702   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
19703   }
19704 }
19705
19706 /// ReplaceNodeResults - Replace a node with an illegal result type
19707 /// with a new node built out of custom code.
19708 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
19709                                            SmallVectorImpl<SDValue>&Results,
19710                                            SelectionDAG &DAG) const {
19711   SDLoc dl(N);
19712   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19713   switch (N->getOpcode()) {
19714   default:
19715     llvm_unreachable("Do not know how to custom type legalize this operation!");
19716   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
19717   case X86ISD::FMINC:
19718   case X86ISD::FMIN:
19719   case X86ISD::FMAXC:
19720   case X86ISD::FMAX: {
19721     EVT VT = N->getValueType(0);
19722     if (VT != MVT::v2f32)
19723       llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX.");
19724     SDValue UNDEF = DAG.getUNDEF(VT);
19725     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19726                               N->getOperand(0), UNDEF);
19727     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
19728                               N->getOperand(1), UNDEF);
19729     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
19730     return;
19731   }
19732   case ISD::SIGN_EXTEND_INREG:
19733   case ISD::ADDC:
19734   case ISD::ADDE:
19735   case ISD::SUBC:
19736   case ISD::SUBE:
19737     // We don't want to expand or promote these.
19738     return;
19739   case ISD::SDIV:
19740   case ISD::UDIV:
19741   case ISD::SREM:
19742   case ISD::UREM:
19743   case ISD::SDIVREM:
19744   case ISD::UDIVREM: {
19745     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
19746     Results.push_back(V);
19747     return;
19748   }
19749   case ISD::FP_TO_SINT:
19750   case ISD::FP_TO_UINT: {
19751     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
19752
19753     if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType()))
19754       return;
19755
19756     std::pair<SDValue,SDValue> Vals =
19757         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
19758     SDValue FIST = Vals.first, StackSlot = Vals.second;
19759     if (FIST.getNode()) {
19760       EVT VT = N->getValueType(0);
19761       // Return a load from the stack slot.
19762       if (StackSlot.getNode())
19763         Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
19764                                       MachinePointerInfo(),
19765                                       false, false, false, 0));
19766       else
19767         Results.push_back(FIST);
19768     }
19769     return;
19770   }
19771   case ISD::UINT_TO_FP: {
19772     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19773     if (N->getOperand(0).getValueType() != MVT::v2i32 ||
19774         N->getValueType(0) != MVT::v2f32)
19775       return;
19776     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
19777                                  N->getOperand(0));
19778     SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
19779                                      MVT::f64);
19780     SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
19781     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
19782                              DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
19783     Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
19784     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
19785     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
19786     return;
19787   }
19788   case ISD::FP_ROUND: {
19789     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
19790         return;
19791     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
19792     Results.push_back(V);
19793     return;
19794   }
19795   case ISD::INTRINSIC_W_CHAIN: {
19796     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
19797     switch (IntNo) {
19798     default : llvm_unreachable("Do not know how to custom type "
19799                                "legalize this intrinsic operation!");
19800     case Intrinsic::x86_rdtsc:
19801       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19802                                      Results);
19803     case Intrinsic::x86_rdtscp:
19804       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
19805                                      Results);
19806     case Intrinsic::x86_rdpmc:
19807       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
19808     }
19809   }
19810   case ISD::READCYCLECOUNTER: {
19811     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
19812                                    Results);
19813   }
19814   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
19815     EVT T = N->getValueType(0);
19816     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
19817     bool Regs64bit = T == MVT::i128;
19818     EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
19819     SDValue cpInL, cpInH;
19820     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19821                         DAG.getConstant(0, HalfT));
19822     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
19823                         DAG.getConstant(1, HalfT));
19824     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
19825                              Regs64bit ? X86::RAX : X86::EAX,
19826                              cpInL, SDValue());
19827     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
19828                              Regs64bit ? X86::RDX : X86::EDX,
19829                              cpInH, cpInL.getValue(1));
19830     SDValue swapInL, swapInH;
19831     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19832                           DAG.getConstant(0, HalfT));
19833     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
19834                           DAG.getConstant(1, HalfT));
19835     swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
19836                                Regs64bit ? X86::RBX : X86::EBX,
19837                                swapInL, cpInH.getValue(1));
19838     swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
19839                                Regs64bit ? X86::RCX : X86::ECX,
19840                                swapInH, swapInL.getValue(1));
19841     SDValue Ops[] = { swapInH.getValue(0),
19842                       N->getOperand(1),
19843                       swapInH.getValue(1) };
19844     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19845     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
19846     unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
19847                                   X86ISD::LCMPXCHG8_DAG;
19848     SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
19849     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
19850                                         Regs64bit ? X86::RAX : X86::EAX,
19851                                         HalfT, Result.getValue(1));
19852     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
19853                                         Regs64bit ? X86::RDX : X86::EDX,
19854                                         HalfT, cpOutL.getValue(2));
19855     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
19856
19857     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
19858                                         MVT::i32, cpOutH.getValue(2));
19859     SDValue Success =
19860         DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
19861                     DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
19862     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
19863
19864     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
19865     Results.push_back(Success);
19866     Results.push_back(EFLAGS.getValue(1));
19867     return;
19868   }
19869   case ISD::ATOMIC_SWAP:
19870   case ISD::ATOMIC_LOAD_ADD:
19871   case ISD::ATOMIC_LOAD_SUB:
19872   case ISD::ATOMIC_LOAD_AND:
19873   case ISD::ATOMIC_LOAD_OR:
19874   case ISD::ATOMIC_LOAD_XOR:
19875   case ISD::ATOMIC_LOAD_NAND:
19876   case ISD::ATOMIC_LOAD_MIN:
19877   case ISD::ATOMIC_LOAD_MAX:
19878   case ISD::ATOMIC_LOAD_UMIN:
19879   case ISD::ATOMIC_LOAD_UMAX:
19880   case ISD::ATOMIC_LOAD: {
19881     // Delegate to generic TypeLegalization. Situations we can really handle
19882     // should have already been dealt with by AtomicExpandPass.cpp.
19883     break;
19884   }
19885   case ISD::BITCAST: {
19886     assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
19887     EVT DstVT = N->getValueType(0);
19888     EVT SrcVT = N->getOperand(0)->getValueType(0);
19889
19890     if (SrcVT != MVT::f64 ||
19891         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
19892       return;
19893
19894     unsigned NumElts = DstVT.getVectorNumElements();
19895     EVT SVT = DstVT.getVectorElementType();
19896     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
19897     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
19898                                    MVT::v2f64, N->getOperand(0));
19899     SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
19900
19901     if (ExperimentalVectorWideningLegalization) {
19902       // If we are legalizing vectors by widening, we already have the desired
19903       // legal vector type, just return it.
19904       Results.push_back(ToVecInt);
19905       return;
19906     }
19907
19908     SmallVector<SDValue, 8> Elts;
19909     for (unsigned i = 0, e = NumElts; i != e; ++i)
19910       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
19911                                    ToVecInt, DAG.getIntPtrConstant(i)));
19912
19913     Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
19914   }
19915   }
19916 }
19917
19918 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
19919   switch (Opcode) {
19920   default: return nullptr;
19921   case X86ISD::BSF:                return "X86ISD::BSF";
19922   case X86ISD::BSR:                return "X86ISD::BSR";
19923   case X86ISD::SHLD:               return "X86ISD::SHLD";
19924   case X86ISD::SHRD:               return "X86ISD::SHRD";
19925   case X86ISD::FAND:               return "X86ISD::FAND";
19926   case X86ISD::FANDN:              return "X86ISD::FANDN";
19927   case X86ISD::FOR:                return "X86ISD::FOR";
19928   case X86ISD::FXOR:               return "X86ISD::FXOR";
19929   case X86ISD::FSRL:               return "X86ISD::FSRL";
19930   case X86ISD::FILD:               return "X86ISD::FILD";
19931   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
19932   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
19933   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
19934   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
19935   case X86ISD::FLD:                return "X86ISD::FLD";
19936   case X86ISD::FST:                return "X86ISD::FST";
19937   case X86ISD::CALL:               return "X86ISD::CALL";
19938   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
19939   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
19940   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
19941   case X86ISD::BT:                 return "X86ISD::BT";
19942   case X86ISD::CMP:                return "X86ISD::CMP";
19943   case X86ISD::COMI:               return "X86ISD::COMI";
19944   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
19945   case X86ISD::CMPM:               return "X86ISD::CMPM";
19946   case X86ISD::CMPMU:              return "X86ISD::CMPMU";
19947   case X86ISD::SETCC:              return "X86ISD::SETCC";
19948   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
19949   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
19950   case X86ISD::CMOV:               return "X86ISD::CMOV";
19951   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
19952   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
19953   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
19954   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
19955   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
19956   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
19957   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
19958   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
19959   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
19960   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
19961   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
19962   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
19963   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
19964   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
19965   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
19966   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
19967   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
19968   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
19969   case X86ISD::HADD:               return "X86ISD::HADD";
19970   case X86ISD::HSUB:               return "X86ISD::HSUB";
19971   case X86ISD::FHADD:              return "X86ISD::FHADD";
19972   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
19973   case X86ISD::UMAX:               return "X86ISD::UMAX";
19974   case X86ISD::UMIN:               return "X86ISD::UMIN";
19975   case X86ISD::SMAX:               return "X86ISD::SMAX";
19976   case X86ISD::SMIN:               return "X86ISD::SMIN";
19977   case X86ISD::FMAX:               return "X86ISD::FMAX";
19978   case X86ISD::FMIN:               return "X86ISD::FMIN";
19979   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
19980   case X86ISD::FMINC:              return "X86ISD::FMINC";
19981   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
19982   case X86ISD::FRCP:               return "X86ISD::FRCP";
19983   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
19984   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
19985   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
19986   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
19987   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
19988   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
19989   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
19990   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
19991   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
19992   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
19993   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
19994   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
19995   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
19996   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
19997   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
19998   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
19999   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
20000   case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
20001   case X86ISD::VINSERT:            return "X86ISD::VINSERT";
20002   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
20003   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
20004   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
20005   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
20006   case X86ISD::VSHL:               return "X86ISD::VSHL";
20007   case X86ISD::VSRL:               return "X86ISD::VSRL";
20008   case X86ISD::VSRA:               return "X86ISD::VSRA";
20009   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
20010   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
20011   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
20012   case X86ISD::CMPP:               return "X86ISD::CMPP";
20013   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
20014   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
20015   case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
20016   case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
20017   case X86ISD::ADD:                return "X86ISD::ADD";
20018   case X86ISD::SUB:                return "X86ISD::SUB";
20019   case X86ISD::ADC:                return "X86ISD::ADC";
20020   case X86ISD::SBB:                return "X86ISD::SBB";
20021   case X86ISD::SMUL:               return "X86ISD::SMUL";
20022   case X86ISD::UMUL:               return "X86ISD::UMUL";
20023   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
20024   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
20025   case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
20026   case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
20027   case X86ISD::INC:                return "X86ISD::INC";
20028   case X86ISD::DEC:                return "X86ISD::DEC";
20029   case X86ISD::OR:                 return "X86ISD::OR";
20030   case X86ISD::XOR:                return "X86ISD::XOR";
20031   case X86ISD::AND:                return "X86ISD::AND";
20032   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
20033   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
20034   case X86ISD::PTEST:              return "X86ISD::PTEST";
20035   case X86ISD::TESTP:              return "X86ISD::TESTP";
20036   case X86ISD::TESTM:              return "X86ISD::TESTM";
20037   case X86ISD::TESTNM:             return "X86ISD::TESTNM";
20038   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
20039   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
20040   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
20041   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
20042   case X86ISD::VALIGN:             return "X86ISD::VALIGN";
20043   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
20044   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
20045   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
20046   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
20047   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
20048   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
20049   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
20050   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
20051   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
20052   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
20053   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
20054   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
20055   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
20056   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
20057   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
20058   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
20059   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
20060   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
20061   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
20062   case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
20063   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
20064   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
20065   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
20066   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
20067   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
20068   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
20069   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
20070   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
20071   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
20072   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
20073   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
20074   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
20075   case X86ISD::WIN_FTOL:           return "X86ISD::WIN_FTOL";
20076   case X86ISD::SAHF:               return "X86ISD::SAHF";
20077   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
20078   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
20079   case X86ISD::FMADD:              return "X86ISD::FMADD";
20080   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
20081   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
20082   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
20083   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
20084   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
20085   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
20086   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
20087   case X86ISD::XTEST:              return "X86ISD::XTEST";
20088   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
20089   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
20090   case X86ISD::SELECT:             return "X86ISD::SELECT";
20091   }
20092 }
20093
20094 // isLegalAddressingMode - Return true if the addressing mode represented
20095 // by AM is legal for this target, for a load/store of the specified type.
20096 bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
20097                                               Type *Ty) const {
20098   // X86 supports extremely general addressing modes.
20099   CodeModel::Model M = getTargetMachine().getCodeModel();
20100   Reloc::Model R = getTargetMachine().getRelocationModel();
20101
20102   // X86 allows a sign-extended 32-bit immediate field as a displacement.
20103   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
20104     return false;
20105
20106   if (AM.BaseGV) {
20107     unsigned GVFlags =
20108       Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
20109
20110     // If a reference to this global requires an extra load, we can't fold it.
20111     if (isGlobalStubReference(GVFlags))
20112       return false;
20113
20114     // If BaseGV requires a register for the PIC base, we cannot also have a
20115     // BaseReg specified.
20116     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
20117       return false;
20118
20119     // If lower 4G is not available, then we must use rip-relative addressing.
20120     if ((M != CodeModel::Small || R != Reloc::Static) &&
20121         Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
20122       return false;
20123   }
20124
20125   switch (AM.Scale) {
20126   case 0:
20127   case 1:
20128   case 2:
20129   case 4:
20130   case 8:
20131     // These scales always work.
20132     break;
20133   case 3:
20134   case 5:
20135   case 9:
20136     // These scales are formed with basereg+scalereg.  Only accept if there is
20137     // no basereg yet.
20138     if (AM.HasBaseReg)
20139       return false;
20140     break;
20141   default:  // Other stuff never works.
20142     return false;
20143   }
20144
20145   return true;
20146 }
20147
20148 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
20149   unsigned Bits = Ty->getScalarSizeInBits();
20150
20151   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
20152   // particularly cheaper than those without.
20153   if (Bits == 8)
20154     return false;
20155
20156   // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
20157   // variable shifts just as cheap as scalar ones.
20158   if (Subtarget->hasInt256() && (Bits == 32 || Bits == 64))
20159     return false;
20160
20161   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
20162   // fully general vector.
20163   return true;
20164 }
20165
20166 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
20167   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20168     return false;
20169   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
20170   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
20171   return NumBits1 > NumBits2;
20172 }
20173
20174 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
20175   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
20176     return false;
20177
20178   if (!isTypeLegal(EVT::getEVT(Ty1)))
20179     return false;
20180
20181   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
20182
20183   // Assuming the caller doesn't have a zeroext or signext return parameter,
20184   // truncation all the way down to i1 is valid.
20185   return true;
20186 }
20187
20188 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
20189   return isInt<32>(Imm);
20190 }
20191
20192 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
20193   // Can also use sub to handle negated immediates.
20194   return isInt<32>(Imm);
20195 }
20196
20197 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
20198   if (!VT1.isInteger() || !VT2.isInteger())
20199     return false;
20200   unsigned NumBits1 = VT1.getSizeInBits();
20201   unsigned NumBits2 = VT2.getSizeInBits();
20202   return NumBits1 > NumBits2;
20203 }
20204
20205 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
20206   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20207   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
20208 }
20209
20210 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
20211   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
20212   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
20213 }
20214
20215 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
20216   EVT VT1 = Val.getValueType();
20217   if (isZExtFree(VT1, VT2))
20218     return true;
20219
20220   if (Val.getOpcode() != ISD::LOAD)
20221     return false;
20222
20223   if (!VT1.isSimple() || !VT1.isInteger() ||
20224       !VT2.isSimple() || !VT2.isInteger())
20225     return false;
20226
20227   switch (VT1.getSimpleVT().SimpleTy) {
20228   default: break;
20229   case MVT::i8:
20230   case MVT::i16:
20231   case MVT::i32:
20232     // X86 has 8, 16, and 32-bit zero-extending loads.
20233     return true;
20234   }
20235
20236   return false;
20237 }
20238
20239 bool
20240 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
20241   if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
20242     return false;
20243
20244   VT = VT.getScalarType();
20245
20246   if (!VT.isSimple())
20247     return false;
20248
20249   switch (VT.getSimpleVT().SimpleTy) {
20250   case MVT::f32:
20251   case MVT::f64:
20252     return true;
20253   default:
20254     break;
20255   }
20256
20257   return false;
20258 }
20259
20260 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
20261   // i16 instructions are longer (0x66 prefix) and potentially slower.
20262   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
20263 }
20264
20265 /// isShuffleMaskLegal - Targets can use this to indicate that they only
20266 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
20267 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
20268 /// are assumed to be legal.
20269 bool
20270 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
20271                                       EVT VT) const {
20272   if (!VT.isSimple())
20273     return false;
20274
20275   MVT SVT = VT.getSimpleVT();
20276
20277   // Very little shuffling can be done for 64-bit vectors right now.
20278   if (VT.getSizeInBits() == 64)
20279     return false;
20280
20281   // This is an experimental legality test that is tailored to match the
20282   // legality test of the experimental lowering more closely. They are gated
20283   // separately to ease testing of performance differences.
20284   if (ExperimentalVectorShuffleLegality)
20285     // We only care that the types being shuffled are legal. The lowering can
20286     // handle any possible shuffle mask that results.
20287     return isTypeLegal(SVT);
20288
20289   // If this is a single-input shuffle with no 128 bit lane crossings we can
20290   // lower it into pshufb.
20291   if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
20292       (SVT.is256BitVector() && Subtarget->hasInt256())) {
20293     bool isLegal = true;
20294     for (unsigned I = 0, E = M.size(); I != E; ++I) {
20295       if (M[I] >= (int)SVT.getVectorNumElements() ||
20296           ShuffleCrosses128bitLane(SVT, I, M[I])) {
20297         isLegal = false;
20298         break;
20299       }
20300     }
20301     if (isLegal)
20302       return true;
20303   }
20304
20305   // FIXME: blends, shifts.
20306   return (SVT.getVectorNumElements() == 2 ||
20307           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
20308           isMOVLMask(M, SVT) ||
20309           isCommutedMOVLMask(M, SVT) ||
20310           isMOVHLPSMask(M, SVT) ||
20311           isSHUFPMask(M, SVT) ||
20312           isSHUFPMask(M, SVT, /* Commuted */ true) ||
20313           isPSHUFDMask(M, SVT) ||
20314           isPSHUFDMask(M, SVT, /* SecondOperand */ true) ||
20315           isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
20316           isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
20317           isPALIGNRMask(M, SVT, Subtarget) ||
20318           isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
20319           isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
20320           isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20321           isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
20322           isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) ||
20323           (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT)));
20324 }
20325
20326 bool
20327 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
20328                                           EVT VT) const {
20329   if (!VT.isSimple())
20330     return false;
20331
20332   MVT SVT = VT.getSimpleVT();
20333
20334   // This is an experimental legality test that is tailored to match the
20335   // legality test of the experimental lowering more closely. They are gated
20336   // separately to ease testing of performance differences.
20337   if (ExperimentalVectorShuffleLegality)
20338     // The new vector shuffle lowering is very good at managing zero-inputs.
20339     return isShuffleMaskLegal(Mask, VT);
20340
20341   unsigned NumElts = SVT.getVectorNumElements();
20342   // FIXME: This collection of masks seems suspect.
20343   if (NumElts == 2)
20344     return true;
20345   if (NumElts == 4 && SVT.is128BitVector()) {
20346     return (isMOVLMask(Mask, SVT)  ||
20347             isCommutedMOVLMask(Mask, SVT, true) ||
20348             isSHUFPMask(Mask, SVT) ||
20349             isSHUFPMask(Mask, SVT, /* Commuted */ true) ||
20350             isBlendMask(Mask, SVT, Subtarget->hasSSE41(),
20351                         Subtarget->hasInt256()));
20352   }
20353   return false;
20354 }
20355
20356 //===----------------------------------------------------------------------===//
20357 //                           X86 Scheduler Hooks
20358 //===----------------------------------------------------------------------===//
20359
20360 /// Utility function to emit xbegin specifying the start of an RTM region.
20361 static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
20362                                      const TargetInstrInfo *TII) {
20363   DebugLoc DL = MI->getDebugLoc();
20364
20365   const BasicBlock *BB = MBB->getBasicBlock();
20366   MachineFunction::iterator I = MBB;
20367   ++I;
20368
20369   // For the v = xbegin(), we generate
20370   //
20371   // thisMBB:
20372   //  xbegin sinkMBB
20373   //
20374   // mainMBB:
20375   //  eax = -1
20376   //
20377   // sinkMBB:
20378   //  v = eax
20379
20380   MachineBasicBlock *thisMBB = MBB;
20381   MachineFunction *MF = MBB->getParent();
20382   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
20383   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
20384   MF->insert(I, mainMBB);
20385   MF->insert(I, sinkMBB);
20386
20387   // Transfer the remainder of BB and its successor edges to sinkMBB.
20388   sinkMBB->splice(sinkMBB->begin(), MBB,
20389                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20390   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
20391
20392   // thisMBB:
20393   //  xbegin sinkMBB
20394   //  # fallthrough to mainMBB
20395   //  # abortion to sinkMBB
20396   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
20397   thisMBB->addSuccessor(mainMBB);
20398   thisMBB->addSuccessor(sinkMBB);
20399
20400   // mainMBB:
20401   //  EAX = -1
20402   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
20403   mainMBB->addSuccessor(sinkMBB);
20404
20405   // sinkMBB:
20406   // EAX is live into the sinkMBB
20407   sinkMBB->addLiveIn(X86::EAX);
20408   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
20409           TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20410     .addReg(X86::EAX);
20411
20412   MI->eraseFromParent();
20413   return sinkMBB;
20414 }
20415
20416 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
20417 // or XMM0_V32I8 in AVX all of this code can be replaced with that
20418 // in the .td file.
20419 static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
20420                                        const TargetInstrInfo *TII) {
20421   unsigned Opc;
20422   switch (MI->getOpcode()) {
20423   default: llvm_unreachable("illegal opcode!");
20424   case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
20425   case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
20426   case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
20427   case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
20428   case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
20429   case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
20430   case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
20431   case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
20432   }
20433
20434   DebugLoc dl = MI->getDebugLoc();
20435   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20436
20437   unsigned NumArgs = MI->getNumOperands();
20438   for (unsigned i = 1; i < NumArgs; ++i) {
20439     MachineOperand &Op = MI->getOperand(i);
20440     if (!(Op.isReg() && Op.isImplicit()))
20441       MIB.addOperand(Op);
20442   }
20443   if (MI->hasOneMemOperand())
20444     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20445
20446   BuildMI(*BB, MI, dl,
20447     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20448     .addReg(X86::XMM0);
20449
20450   MI->eraseFromParent();
20451   return BB;
20452 }
20453
20454 // FIXME: Custom handling because TableGen doesn't support multiple implicit
20455 // defs in an instruction pattern
20456 static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
20457                                        const TargetInstrInfo *TII) {
20458   unsigned Opc;
20459   switch (MI->getOpcode()) {
20460   default: llvm_unreachable("illegal opcode!");
20461   case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
20462   case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
20463   case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
20464   case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
20465   case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
20466   case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
20467   case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
20468   case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
20469   }
20470
20471   DebugLoc dl = MI->getDebugLoc();
20472   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
20473
20474   unsigned NumArgs = MI->getNumOperands(); // remove the results
20475   for (unsigned i = 1; i < NumArgs; ++i) {
20476     MachineOperand &Op = MI->getOperand(i);
20477     if (!(Op.isReg() && Op.isImplicit()))
20478       MIB.addOperand(Op);
20479   }
20480   if (MI->hasOneMemOperand())
20481     MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
20482
20483   BuildMI(*BB, MI, dl,
20484     TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
20485     .addReg(X86::ECX);
20486
20487   MI->eraseFromParent();
20488   return BB;
20489 }
20490
20491 static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
20492                                        const TargetInstrInfo *TII,
20493                                        const X86Subtarget* Subtarget) {
20494   DebugLoc dl = MI->getDebugLoc();
20495
20496   // Address into RAX/EAX, other two args into ECX, EDX.
20497   unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
20498   unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
20499   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
20500   for (int i = 0; i < X86::AddrNumOperands; ++i)
20501     MIB.addOperand(MI->getOperand(i));
20502
20503   unsigned ValOps = X86::AddrNumOperands;
20504   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
20505     .addReg(MI->getOperand(ValOps).getReg());
20506   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
20507     .addReg(MI->getOperand(ValOps+1).getReg());
20508
20509   // The instruction doesn't actually take any operands though.
20510   BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));
20511
20512   MI->eraseFromParent(); // The pseudo is gone now.
20513   return BB;
20514 }
20515
20516 MachineBasicBlock *
20517 X86TargetLowering::EmitVAARG64WithCustomInserter(
20518                    MachineInstr *MI,
20519                    MachineBasicBlock *MBB) const {
20520   // Emit va_arg instruction on X86-64.
20521
20522   // Operands to this pseudo-instruction:
20523   // 0  ) Output        : destination address (reg)
20524   // 1-5) Input         : va_list address (addr, i64mem)
20525   // 6  ) ArgSize       : Size (in bytes) of vararg type
20526   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
20527   // 8  ) Align         : Alignment of type
20528   // 9  ) EFLAGS (implicit-def)
20529
20530   assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
20531   assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");
20532
20533   unsigned DestReg = MI->getOperand(0).getReg();
20534   MachineOperand &Base = MI->getOperand(1);
20535   MachineOperand &Scale = MI->getOperand(2);
20536   MachineOperand &Index = MI->getOperand(3);
20537   MachineOperand &Disp = MI->getOperand(4);
20538   MachineOperand &Segment = MI->getOperand(5);
20539   unsigned ArgSize = MI->getOperand(6).getImm();
20540   unsigned ArgMode = MI->getOperand(7).getImm();
20541   unsigned Align = MI->getOperand(8).getImm();
20542
20543   // Memory Reference
20544   assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
20545   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
20546   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
20547
20548   // Machine Information
20549   const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
20550   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
20551   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
20552   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
20553   DebugLoc DL = MI->getDebugLoc();
20554
20555   // struct va_list {
20556   //   i32   gp_offset
20557   //   i32   fp_offset
20558   //   i64   overflow_area (address)
20559   //   i64   reg_save_area (address)
20560   // }
20561   // sizeof(va_list) = 24
20562   // alignment(va_list) = 8
20563
20564   unsigned TotalNumIntRegs = 6;
20565   unsigned TotalNumXMMRegs = 8;
20566   bool UseGPOffset = (ArgMode == 1);
20567   bool UseFPOffset = (ArgMode == 2);
20568   unsigned MaxOffset = TotalNumIntRegs * 8 +
20569                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
20570
20571   /* Align ArgSize to a multiple of 8 */
20572   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
20573   bool NeedsAlign = (Align > 8);
20574
20575   MachineBasicBlock *thisMBB = MBB;
20576   MachineBasicBlock *overflowMBB;
20577   MachineBasicBlock *offsetMBB;
20578   MachineBasicBlock *endMBB;
20579
20580   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
20581   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
20582   unsigned OffsetReg = 0;
20583
20584   if (!UseGPOffset && !UseFPOffset) {
20585     // If we only pull from the overflow region, we don't create a branch.
20586     // We don't need to alter control flow.
20587     OffsetDestReg = 0; // unused
20588     OverflowDestReg = DestReg;
20589
20590     offsetMBB = nullptr;
20591     overflowMBB = thisMBB;
20592     endMBB = thisMBB;
20593   } else {
20594     // First emit code to check if gp_offset (or fp_offset) is below the bound.
20595     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
20596     // If not, pull from overflow_area. (branch to overflowMBB)
20597     //
20598     //       thisMBB
20599     //         |     .
20600     //         |        .
20601     //     offsetMBB   overflowMBB
20602     //         |        .
20603     //         |     .
20604     //        endMBB
20605
20606     // Registers for the PHI in endMBB
20607     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
20608     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
20609
20610     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20611     MachineFunction *MF = MBB->getParent();
20612     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20613     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20614     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20615
20616     MachineFunction::iterator MBBIter = MBB;
20617     ++MBBIter;
20618
20619     // Insert the new basic blocks
20620     MF->insert(MBBIter, offsetMBB);
20621     MF->insert(MBBIter, overflowMBB);
20622     MF->insert(MBBIter, endMBB);
20623
20624     // Transfer the remainder of MBB and its successor edges to endMBB.
20625     endMBB->splice(endMBB->begin(), thisMBB,
20626                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
20627     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
20628
20629     // Make offsetMBB and overflowMBB successors of thisMBB
20630     thisMBB->addSuccessor(offsetMBB);
20631     thisMBB->addSuccessor(overflowMBB);
20632
20633     // endMBB is a successor of both offsetMBB and overflowMBB
20634     offsetMBB->addSuccessor(endMBB);
20635     overflowMBB->addSuccessor(endMBB);
20636
20637     // Load the offset value into a register
20638     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20639     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
20640       .addOperand(Base)
20641       .addOperand(Scale)
20642       .addOperand(Index)
20643       .addDisp(Disp, UseFPOffset ? 4 : 0)
20644       .addOperand(Segment)
20645       .setMemRefs(MMOBegin, MMOEnd);
20646
20647     // Check if there is enough room left to pull this argument.
20648     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
20649       .addReg(OffsetReg)
20650       .addImm(MaxOffset + 8 - ArgSizeA8);
20651
20652     // Branch to "overflowMBB" if offset >= max
20653     // Fall through to "offsetMBB" otherwise
20654     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
20655       .addMBB(overflowMBB);
20656   }
20657
20658   // In offsetMBB, emit code to use the reg_save_area.
20659   if (offsetMBB) {
20660     assert(OffsetReg != 0);
20661
20662     // Read the reg_save_area address.
20663     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
20664     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
20665       .addOperand(Base)
20666       .addOperand(Scale)
20667       .addOperand(Index)
20668       .addDisp(Disp, 16)
20669       .addOperand(Segment)
20670       .setMemRefs(MMOBegin, MMOEnd);
20671
20672     // Zero-extend the offset
20673     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
20674       BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
20675         .addImm(0)
20676         .addReg(OffsetReg)
20677         .addImm(X86::sub_32bit);
20678
20679     // Add the offset to the reg_save_area to get the final address.
20680     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
20681       .addReg(OffsetReg64)
20682       .addReg(RegSaveReg);
20683
20684     // Compute the offset for the next argument
20685     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
20686     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
20687       .addReg(OffsetReg)
20688       .addImm(UseFPOffset ? 16 : 8);
20689
20690     // Store it back into the va_list.
20691     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
20692       .addOperand(Base)
20693       .addOperand(Scale)
20694       .addOperand(Index)
20695       .addDisp(Disp, UseFPOffset ? 4 : 0)
20696       .addOperand(Segment)
20697       .addReg(NextOffsetReg)
20698       .setMemRefs(MMOBegin, MMOEnd);
20699
20700     // Jump to endMBB
20701     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
20702       .addMBB(endMBB);
20703   }
20704
20705   //
20706   // Emit code to use overflow area
20707   //
20708
20709   // Load the overflow_area address into a register.
20710   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
20711   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
20712     .addOperand(Base)
20713     .addOperand(Scale)
20714     .addOperand(Index)
20715     .addDisp(Disp, 8)
20716     .addOperand(Segment)
20717     .setMemRefs(MMOBegin, MMOEnd);
20718
20719   // If we need to align it, do so. Otherwise, just copy the address
20720   // to OverflowDestReg.
20721   if (NeedsAlign) {
20722     // Align the overflow address
20723     assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
20724     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
20725
20726     // aligned_addr = (addr + (align-1)) & ~(align-1)
20727     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
20728       .addReg(OverflowAddrReg)
20729       .addImm(Align-1);
20730
20731     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
20732       .addReg(TmpReg)
20733       .addImm(~(uint64_t)(Align-1));
20734   } else {
20735     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
20736       .addReg(OverflowAddrReg);
20737   }
20738
20739   // Compute the next overflow address after this argument.
20740   // (the overflow address should be kept 8-byte aligned)
20741   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
20742   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
20743     .addReg(OverflowDestReg)
20744     .addImm(ArgSizeA8);
20745
20746   // Store the new overflow address.
20747   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
20748     .addOperand(Base)
20749     .addOperand(Scale)
20750     .addOperand(Index)
20751     .addDisp(Disp, 8)
20752     .addOperand(Segment)
20753     .addReg(NextAddrReg)
20754     .setMemRefs(MMOBegin, MMOEnd);
20755
20756   // If we branched, emit the PHI to the front of endMBB.
20757   if (offsetMBB) {
20758     BuildMI(*endMBB, endMBB->begin(), DL,
20759             TII->get(X86::PHI), DestReg)
20760       .addReg(OffsetDestReg).addMBB(offsetMBB)
20761       .addReg(OverflowDestReg).addMBB(overflowMBB);
20762   }
20763
20764   // Erase the pseudo instruction
20765   MI->eraseFromParent();
20766
20767   return endMBB;
20768 }
20769
20770 MachineBasicBlock *
20771 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
20772                                                  MachineInstr *MI,
20773                                                  MachineBasicBlock *MBB) const {
20774   // Emit code to save XMM registers to the stack. The ABI says that the
20775   // number of registers to save is given in %al, so it's theoretically
20776   // possible to do an indirect jump trick to avoid saving all of them,
20777   // however this code takes a simpler approach and just executes all
20778   // of the stores if %al is non-zero. It's less code, and it's probably
20779   // easier on the hardware branch predictor, and stores aren't all that
20780   // expensive anyway.
20781
20782   // Create the new basic blocks. One block contains all the XMM stores,
20783   // and one block is the final destination regardless of whether any
20784   // stores were performed.
20785   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
20786   MachineFunction *F = MBB->getParent();
20787   MachineFunction::iterator MBBIter = MBB;
20788   ++MBBIter;
20789   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
20790   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
20791   F->insert(MBBIter, XMMSaveMBB);
20792   F->insert(MBBIter, EndMBB);
20793
20794   // Transfer the remainder of MBB and its successor edges to EndMBB.
20795   EndMBB->splice(EndMBB->begin(), MBB,
20796                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
20797   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
20798
20799   // The original block will now fall through to the XMM save block.
20800   MBB->addSuccessor(XMMSaveMBB);
20801   // The XMMSaveMBB will fall through to the end block.
20802   XMMSaveMBB->addSuccessor(EndMBB);
20803
20804   // Now add the instructions.
20805   const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo();
20806   DebugLoc DL = MI->getDebugLoc();
20807
20808   unsigned CountReg = MI->getOperand(0).getReg();
20809   int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
20810   int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
20811
20812   if (!Subtarget->isTargetWin64()) {
20813     // If %al is 0, branch around the XMM save block.
20814     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
20815     BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
20816     MBB->addSuccessor(EndMBB);
20817   }
20818
20819   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
20820   // that was just emitted, but clearly shouldn't be "saved".
20821   assert((MI->getNumOperands() <= 3 ||
20822           !MI->getOperand(MI->getNumOperands() - 1).isReg() ||
20823           MI->getOperand(MI->getNumOperands() - 1).getReg() == X86::EFLAGS)
20824          && "Expected last argument to be EFLAGS");
20825   unsigned MOVOpc = Subtarget->hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
20826   // In the XMM save block, save all the XMM argument registers.
20827   for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) {
20828     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
20829     MachineMemOperand *MMO =
20830       F->getMachineMemOperand(
20831           MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
20832         MachineMemOperand::MOStore,
20833         /*Size=*/16, /*Align=*/16);
20834     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
20835       .addFrameIndex(RegSaveFrameIndex)
20836       .addImm(/*Scale=*/1)
20837       .addReg(/*IndexReg=*/0)
20838       .addImm(/*Disp=*/Offset)
20839       .addReg(/*Segment=*/0)
20840       .addReg(MI->getOperand(i).getReg())
20841       .addMemOperand(MMO);
20842   }
20843
20844   MI->eraseFromParent();   // The pseudo instruction is gone now.
20845
20846   return EndMBB;
20847 }
20848
20849 // The EFLAGS operand of SelectItr might be missing a kill marker
20850 // because there were multiple uses of EFLAGS, and ISel didn't know
20851 // which to mark. Figure out whether SelectItr should have had a
20852 // kill marker, and set it if it should. Returns the correct kill
20853 // marker value.
20854 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
20855                                      MachineBasicBlock* BB,
20856                                      const TargetRegisterInfo* TRI) {
20857   // Scan forward through BB for a use/def of EFLAGS.
20858   MachineBasicBlock::iterator miI(std::next(SelectItr));
20859   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
20860     const MachineInstr& mi = *miI;
20861     if (mi.readsRegister(X86::EFLAGS))
20862       return false;
20863     if (mi.definesRegister(X86::EFLAGS))
20864       break; // Should have kill-flag - update below.
20865   }
20866
20867   // If we hit the end of the block, check whether EFLAGS is live into a
20868   // successor.
20869   if (miI == BB->end()) {
20870     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
20871                                           sEnd = BB->succ_end();
20872          sItr != sEnd; ++sItr) {
20873       MachineBasicBlock* succ = *sItr;
20874       if (succ->isLiveIn(X86::EFLAGS))
20875         return false;
20876     }
20877   }
20878
20879   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
20880   // out. SelectMI should have a kill flag on EFLAGS.
20881   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
20882   return true;
20883 }
20884
      // Expands a select pseudo-instruction (the CMOV_* opcodes routed here by
      // EmitInstrWithCustomInserter) into explicit control flow.
      //
      // Operands read from MI:
      //   0 - result register (defined by the PHI built in the sink block)
      //   1 - value that reaches the PHI through the fallthrough block copy0MBB
      //   2 - value that reaches the PHI directly from the original block
      //   3 - X86 condition code immediate, used to pick the branch opcode
      //
      // BB keeps everything up to MI plus the new conditional branch; the
      // instructions after MI, together with BB's successor edges, move to a new
      // sink block. Returns that sink block.
20885 MachineBasicBlock *
20886 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
20887                                      MachineBasicBlock *BB) const {
20888   const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo();
20889   DebugLoc DL = MI->getDebugLoc();
20890
20891   // To "insert" a SELECT_CC instruction, we actually have to insert the
20892   // diamond control-flow pattern.  The incoming instruction knows the
20893   // destination vreg to set, the condition code register to branch on, the
20894   // true/false values to select between, and a branch opcode to use.
20895   const BasicBlock *LLVM_BB = BB->getBasicBlock();
        // It names the position right after BB in the function; both new blocks
        // are inserted there, so the layout becomes BB, copy0MBB, sinkMBB.
20896   MachineFunction::iterator It = BB;
20897   ++It;
20898
20899   //  thisMBB:
20900   //  ...
20901   //   TrueVal = ...
20902   //   cmpTY ccX, r1, r2
20903   //   bCC copy1MBB
20904   //   fallthrough --> copy0MBB
20905   MachineBasicBlock *thisMBB = BB;
20906   MachineFunction *F = BB->getParent();
20907   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
20908   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
20909   F->insert(It, copy0MBB);
20910   F->insert(It, sinkMBB);
20911
20912   // If the EFLAGS register isn't dead in the terminator, then claim that it's
20913   // live into the sink and copy blocks.
20914   const TargetRegisterInfo *TRI =
20915       BB->getParent()->getSubtarget().getRegisterInfo();
        // checkAndUpdateEFLAGSKill may add the missing kill flag to MI as a side
        // effect; the live-ins are only added when it reports EFLAGS as not killed.
20916   if (!MI->killsRegister(X86::EFLAGS) &&
20917       !checkAndUpdateEFLAGSKill(MI, BB, TRI)) {
20918     copy0MBB->addLiveIn(X86::EFLAGS);
20919     sinkMBB->addLiveIn(X86::EFLAGS);
20920   }
20921
20922   // Transfer the remainder of BB and its successor edges to sinkMBB.
20923   sinkMBB->splice(sinkMBB->begin(), BB,
20924                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
20925   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
20926
20927   // Add the true and fallthrough blocks as its successors.
20928   BB->addSuccessor(copy0MBB);
20929   BB->addSuccessor(sinkMBB);
20930
20931   // Create the conditional branch instruction.
        // The branch targets sinkMBB directly; when it is not taken, control falls
        // through into copy0MBB, which is the next block in layout.
20932   unsigned Opc =
20933     X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
20934   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
20935
20936   //  copy0MBB:
20937   //   %FalseValue = ...
20938   //   # fallthrough to sinkMBB
20939   copy0MBB->addSuccessor(sinkMBB);
20940
20941   //  sinkMBB:
20942   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
20943   //  ...
20944   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
20945           TII->get(X86::PHI), MI->getOperand(0).getReg())
20946     .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
20947     .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
20948
20949   MI->eraseFromParent();   // The pseudo instruction is gone now.
20950   return sinkMBB;
20951 }
20952
      // Lowers the SEG_ALLOCA_32 / SEG_ALLOCA_64 pseudos (see the dispatch in
      // EmitInstrWithCustomInserter) for functions that use split stacks.
      //
      // Operands read from MI: 0 = register receiving the allocated pointer,
      // 1 = register holding the requested size.
      //
      // The emitted code compares (SP - size) against the limit stored at
      // TlsReg:TlsOffset. Depending on the result it either bumps the stack
      // pointer (bumpMBB) or calls __morestack_allocate_stack_space (mallocMBB);
      // the two results are merged by a PHI at the top of continueMBB, which
      // receives the remainder of BB. Returns continueMBB.
20953 MachineBasicBlock *
20954 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI,
20955                                         MachineBasicBlock *BB) const {
20956   MachineFunction *MF = BB->getParent();
20957   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
20958   DebugLoc DL = MI->getDebugLoc();
20959   const BasicBlock *LLVM_BB = BB->getBasicBlock();
20960
20961   assert(MF->shouldSplitStack());
20962
20963   const bool Is64Bit = Subtarget->is64Bit();
20964   const bool IsLP64 = Subtarget->isTarget64BitLP64();
20965
        // Segment register and displacement of the memory word that the stack
        // limit check below compares against.
20966   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
20967   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
20968
20969   // BB:
20970   //  ... [Till the alloca]
20971   // If stacklet is not large enough, jump to mallocMBB
20972   //
20973   // bumpMBB:
20974   //  Allocate by subtracting from RSP
20975   //  Jump to continueMBB
20976   //
20977   // mallocMBB:
20978   //  Allocate by call to runtime
20979   //
20980   // continueMBB:
20981   //  ...
20982   //  [rest of original BB]
20983   //
20984
20985   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20986   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20987   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
20988
20989   MachineRegisterInfo &MRI = MF->getRegInfo();
20990   const TargetRegisterClass *AddrRegClass =
20991     getRegClassFor(getPointerTy());
20992
        // mallocPtrVReg / bumpSPPtrVReg: result pointer on each path (PHI inputs).
        // tmpSPVReg: copy of the current stack pointer.
        // SPLimitVReg: stack pointer minus the requested size.
20993   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
20994     bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
20995     tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
20996     SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
20997     sizeVReg = MI->getOperand(1).getReg(),
20998     physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP;
20999
        // Insert the new blocks right after BB, in the order
        // bumpMBB, mallocMBB, continueMBB.
21000   MachineFunction::iterator MBBIter = BB;
21001   ++MBBIter;
21002
21003   MF->insert(MBBIter, bumpMBB);
21004   MF->insert(MBBIter, mallocMBB);
21005   MF->insert(MBBIter, continueMBB);
21006
        // Everything after MI, and BB's successor edges, now belong to continueMBB.
21007   continueMBB->splice(continueMBB->begin(), BB,
21008                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
21009   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
21010
21011   // Add code to the main basic block to check if the stack limit has been hit,
21012   // and if so, jump to mallocMBB otherwise to bumpMBB.
21013   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
21014   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
21015     .addReg(tmpSPVReg).addReg(sizeVReg);
21016   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
21017     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
21018     .addReg(SPLimitVReg);
21019   BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
21020
21021   // bumpMBB simply decreases the stack pointer, since we know the current
21022   // stacklet has enough space.
21023   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
21024     .addReg(SPLimitVReg);
21025   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
21026     .addReg(SPLimitVReg);
21027   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21028
21029   // Calls into a routine in libgcc to allocate more space from the heap.
21030   const uint32_t *RegMask = MF->getTarget()
21031                                 .getSubtargetImpl()
21032                                 ->getRegisterInfo()
21033                                 ->getCallPreservedMask(CallingConv::C);
        // Three variants: LP64 passes the size in RDI, 64-bit non-LP64 passes it
        // in EDI, and 32-bit pushes it on the stack.
21034   if (IsLP64) {
21035     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
21036       .addReg(sizeVReg);
21037     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21038       .addExternalSymbol("__morestack_allocate_stack_space")
21039       .addRegMask(RegMask)
21040       .addReg(X86::RDI, RegState::Implicit)
21041       .addReg(X86::RAX, RegState::ImplicitDefine);
21042   } else if (Is64Bit) {
21043     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
21044       .addReg(sizeVReg);
21045     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
21046       .addExternalSymbol("__morestack_allocate_stack_space")
21047       .addRegMask(RegMask)
21048       .addReg(X86::EDI, RegState::Implicit)
21049       .addReg(X86::EAX, RegState::ImplicitDefine);
21050   } else {
          // 12 bytes of adjustment plus the 4-byte push make 16 bytes, which the
          // ADD32ri below removes again after the call.
21051     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
21052       .addImm(12);
21053     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
21054     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
21055       .addExternalSymbol("__morestack_allocate_stack_space")
21056       .addRegMask(RegMask)
21057       .addReg(X86::EAX, RegState::ImplicitDefine);
21058   }
21059
21060   if (!Is64Bit)
21061     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
21062       .addImm(16);
21063
        // The call result (RAX for LP64, EAX otherwise) becomes this path's
        // pointer value.
21064   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
21065     .addReg(IsLP64 ? X86::RAX : X86::EAX);
21066   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
21067
21068   // Set up the CFG correctly.
21069   BB->addSuccessor(bumpMBB);
21070   BB->addSuccessor(mallocMBB);
21071   mallocMBB->addSuccessor(continueMBB);
21072   bumpMBB->addSuccessor(continueMBB);
21073
21074   // Take care of the PHI nodes.
21075   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
21076           MI->getOperand(0).getReg())
21077     .addReg(mallocPtrVReg).addMBB(mallocMBB)
21078     .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
21079
21080   // Delete the original pseudo instruction.
21081   MI->eraseFromParent();
21082
21083   // And we're done.
21084   return continueMBB;
21085 }
21086
      // Lowers the WIN_ALLOCA pseudo (see the dispatch in
      // EmitInstrWithCustomInserter) by delegating to
      // X86FrameLowering::emitStackProbeCall and then erasing the pseudo.
      // No new blocks are created; BB is returned.
21087 MachineBasicBlock *
21088 X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
21089                                         MachineBasicBlock *BB) const {
21090   // MI and its debug location are handed to the stack-probe helper below.
21091   DebugLoc DL = MI->getDebugLoc();
21092
        // This lowering is not expected to be reached for Mach-O targets.
21093   assert(!Subtarget->isTargetMachO());
21094
21095   X86FrameLowering::emitStackProbeCall(*BB->getParent(), *BB, MI, DL);
21096
21097   MI->eraseFromParent();   // The pseudo instruction is gone now.
21098   return BB;
21099 }
21100
      // Lowers the TLSCall_32 / TLSCall_64 pseudos (see the dispatch in
      // EmitInstrWithCustomInserter); asserted to be Darwin-only.
      //
      // Operand 3 of MI must be a global address. It is loaded into RDI (64-bit)
      // or EAX (32-bit), an indirect call through that register is emitted, and
      // RAX / EAX is marked as implicitly defined by the call. All new
      // instructions are inserted before MI, which is then erased. Returns BB.
21101 MachineBasicBlock *
21102 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
21103                                       MachineBasicBlock *BB) const {
21104   // This is pretty easy.  We're taking the value that we received from
21105   // our load from the relocation, sticking it in either RDI (x86-64)
21106   // or EAX and doing an indirect call.  The return value will then
21107   // be in the normal return register.
21108   MachineFunction *F = BB->getParent();
21109   const X86InstrInfo *TII =
21110       static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo());
21111   DebugLoc DL = MI->getDebugLoc();
21112
21113   assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
21114   assert(MI->getOperand(3).isGlobal() && "This should be a global");
21115
21116   // Get a register mask for the lowered call.
21117   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
21118   // proper register mask.
21119   const uint32_t *RegMask = F->getTarget()
21120                                 .getSubtargetImpl()
21121                                 ->getRegisterInfo()
21122                                 ->getCallPreservedMask(CallingConv::C);
        // The three branches differ only in the load's base register and in the
        // register/opcode width: RIP for 64-bit, no base for 32-bit non-PIC, and
        // the function's global base register for 32-bit PIC.
21123   if (Subtarget->is64Bit()) {
21124     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21125                                       TII->get(X86::MOV64rm), X86::RDI)
21126     .addReg(X86::RIP)
21127     .addImm(0).addReg(0)
21128     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21129                       MI->getOperand(3).getTargetFlags())
21130     .addReg(0);
21131     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
21132     addDirectMem(MIB, X86::RDI);
21133     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
21134   } else if (F->getTarget().getRelocationModel() != Reloc::PIC_) {
21135     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21136                                       TII->get(X86::MOV32rm), X86::EAX)
21137     .addReg(0)
21138     .addImm(0).addReg(0)
21139     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21140                       MI->getOperand(3).getTargetFlags())
21141     .addReg(0);
21142     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21143     addDirectMem(MIB, X86::EAX);
21144     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21145   } else {
21146     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
21147                                       TII->get(X86::MOV32rm), X86::EAX)
21148     .addReg(TII->getGlobalBaseReg(F))
21149     .addImm(0).addReg(0)
21150     .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
21151                       MI->getOperand(3).getTargetFlags())
21152     .addReg(0);
21153     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
21154     addDirectMem(MIB, X86::EAX);
21155     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
21156   }
21157
21158   MI->eraseFromParent(); // The pseudo instruction is gone now.
21159   return BB;
21160 }
21161
      // Lowers the EH_SjLj_SetJmp32 / EH_SjLj_SetJmp64 pseudos (see the dispatch
      // in EmitInstrWithCustomInserter).
      //
      // Operands read from MI: 0 = i32 result register, followed by the
      // X86::AddrNumOperands operands that address the jump buffer.
      //
      // The address of restoreMBB is stored into the buffer at an offset of one
      // pointer store size, then an EH_SjLj_Setup instruction is emitted. The
      // result is 0 when control continues through mainMBB and 1 when it arrives
      // via restoreMBB; both are merged by a PHI in sinkMBB, which also receives
      // the remainder of MBB. Returns sinkMBB.
21162 MachineBasicBlock *
21163 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
21164                                     MachineBasicBlock *MBB) const {
21165   DebugLoc DL = MI->getDebugLoc();
21166   MachineFunction *MF = MBB->getParent();
21167   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
21168   MachineRegisterInfo &MRI = MF->getRegInfo();
21169
21170   const BasicBlock *BB = MBB->getBasicBlock();
21171   MachineFunction::iterator I = MBB;
21172   ++I;
21173
21174   // Memory Reference
21175   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21176   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21177
21178   unsigned DstReg;
21179   unsigned MemOpndSlot = 0;
21180
21181   unsigned CurOp = 0;
21182
21183   DstReg = MI->getOperand(CurOp++).getReg();
21184   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
21185   assert(RC->hasType(MVT::i32) && "Invalid destination!");
        // One result register per incoming path of the PHI in sinkMBB.
21186   unsigned mainDstReg = MRI.createVirtualRegister(RC);
21187   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
21188
        // The memory operands of MI start right after the destination register.
21189   MemOpndSlot = CurOp;
21190
21191   MVT PVT = getPointerTy();
21192   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21193          "Invalid Pointer Size!");
21194
21195   // For v = setjmp(buf), we generate
21196   //
21197   // thisMBB:
21198   //  buf[LabelOffset] = restoreMBB
21199   //  SjLjSetup restoreMBB
21200   //
21201   // mainMBB:
21202   //  v_main = 0
21203   //
21204   // sinkMBB:
21205   //  v = phi(main, restore)
21206   //
21207   // restoreMBB:
21208   //  if base pointer being used, load it from frame
21209   //  v_restore = 1
21210
21211   MachineBasicBlock *thisMBB = MBB;
21212   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
21213   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
21214   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
        // mainMBB and sinkMBB go right after MBB; restoreMBB is appended at the
        // end of the function.
21215   MF->insert(I, mainMBB);
21216   MF->insert(I, sinkMBB);
21217   MF->push_back(restoreMBB);
21218
21219   MachineInstrBuilder MIB;
21220
21221   // Transfer the remainder of BB and its successor edges to sinkMBB.
21222   sinkMBB->splice(sinkMBB->begin(), MBB,
21223                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
21224   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
21225
21226   // thisMBB:
21227   unsigned PtrStoreOpc = 0;
21228   unsigned LabelReg = 0;
21229   const int64_t LabelOffset = 1 * PVT.getStoreSize();
21230   Reloc::Model RM = MF->getTarget().getRelocationModel();
        // The block address is stored as an immediate only for the small code
        // model with static or dynamic-no-pic relocation; otherwise it is first
        // materialized into LabelReg with an LEA.
21231   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
21232                      (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
21233
21234   // Prepare IP either in reg or imm.
21235   if (!UseImmLabel) {
21236     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
21237     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
21238     LabelReg = MRI.createVirtualRegister(PtrRC);
21239     if (Subtarget->is64Bit()) {
21240       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
21241               .addReg(X86::RIP)
21242               .addImm(0)
21243               .addReg(0)
21244               .addMBB(restoreMBB)
21245               .addReg(0);
21246     } else {
21247       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
21248       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
21249               .addReg(XII->getGlobalBaseReg(MF))
21250               .addImm(0)
21251               .addReg(0)
21252               .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
21253               .addReg(0);
21254     }
21255   } else
21256     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
21257   // Store IP
        // Reuse MI's address operands, shifting only the displacement by
        // LabelOffset.
21258   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
21259   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21260     if (i == X86::AddrDisp)
21261       MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
21262     else
21263       MIB.addOperand(MI->getOperand(MemOpndSlot + i));
21264   }
21265   if (!UseImmLabel)
21266     MIB.addReg(LabelReg);
21267   else
21268     MIB.addMBB(restoreMBB);
21269   MIB.setMemRefs(MMOBegin, MMOEnd);
21270   // Setup
21271   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
21272           .addMBB(restoreMBB);
21273
21274   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
21275       MF->getSubtarget().getRegisterInfo());
        // The setup instruction carries the "no preserved registers" mask.
21276   MIB.addRegMask(RegInfo->getNoPreservedMask());
21277   thisMBB->addSuccessor(mainMBB);
21278   thisMBB->addSuccessor(restoreMBB);
21279
21280   // mainMBB:
21281   //  EAX = 0
21282   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
21283   mainMBB->addSuccessor(sinkMBB);
21284
21285   // sinkMBB:
21286   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
21287           TII->get(X86::PHI), DstReg)
21288     .addReg(mainDstReg).addMBB(mainMBB)
21289     .addReg(restoreDstReg).addMBB(restoreMBB);
21290
21291   // restoreMBB:
        // When a base pointer is in use, reload it from the frame slot at
        // getRestoreBasePointerOffset() relative to the frame register.
21292   if (RegInfo->hasBasePointer(*MF)) {
21293     const X86Subtarget &STI = MF->getTarget().getSubtarget<X86Subtarget>();
21294     const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
21295     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
21296     X86FI->setRestoreBasePointer(MF);
21297     unsigned FramePtr = RegInfo->getFrameRegister(*MF);
21298     unsigned BasePtr = RegInfo->getBaseRegister();
21299     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
21300     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
21301                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
21302       .setMIFlag(MachineInstr::FrameSetup);
21303   }
21304   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
21305   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
21306   restoreMBB->addSuccessor(sinkMBB);
21307
21308   MI->eraseFromParent();
21309   return sinkMBB;
21310 }
21311
      // Lowers the EH_SjLj_LongJmp32 / EH_SjLj_LongJmp64 pseudos (see the
      // dispatch in EmitInstrWithCustomInserter).
      //
      // The first X86::AddrNumOperands operands of MI address the jump buffer.
      // Three pointer-sized loads are emitted before MI, in this order: the frame
      // pointer from the buffer base, the jump target from base + LabelOffset,
      // and the stack pointer from base + SPOffset. An indirect jump to the
      // loaded target follows. MI is erased; no blocks are created. Returns MBB.
21312 MachineBasicBlock *
21313 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
21314                                      MachineBasicBlock *MBB) const {
21315   DebugLoc DL = MI->getDebugLoc();
21316   MachineFunction *MF = MBB->getParent();
21317   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
21318   MachineRegisterInfo &MRI = MF->getRegInfo();
21319
21320   // Memory Reference
21321   MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
21322   MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
21323
21324   MVT PVT = getPointerTy();
21325   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
21326          "Invalid Pointer Size!");
21327
21328   const TargetRegisterClass *RC =
21329     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
        // Tmp holds the reloaded jump target until the final indirect jump.
21330   unsigned Tmp = MRI.createVirtualRegister(RC);
21331   // Since FP is only updated here but NOT referenced, it's treated as GPR.
21332   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
21333       MF->getSubtarget().getRegisterInfo());
21334   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
21335   unsigned SP = RegInfo->getStackRegister();
21336
21337   MachineInstrBuilder MIB;
21338
        // Buffer slots, in units of the pointer store size: 1 = target, 2 = SP.
21339   const int64_t LabelOffset = 1 * PVT.getStoreSize();
21340   const int64_t SPOffset = 2 * PVT.getStoreSize();
21341
21342   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
21343   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
21344
21345   // Reload FP
21346   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
21347   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
21348     MIB.addOperand(MI->getOperand(i));
21349   MIB.setMemRefs(MMOBegin, MMOEnd);
21350   // Reload IP
21351   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
21352   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21353     if (i == X86::AddrDisp)
21354       MIB.addDisp(MI->getOperand(i), LabelOffset);
21355     else
21356       MIB.addOperand(MI->getOperand(i));
21357   }
21358   MIB.setMemRefs(MMOBegin, MMOEnd);
21359   // Reload SP
21360   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
21361   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
21362     if (i == X86::AddrDisp)
21363       MIB.addDisp(MI->getOperand(i), SPOffset);
21364     else
21365       MIB.addOperand(MI->getOperand(i));
21366   }
21367   MIB.setMemRefs(MMOBegin, MMOEnd);
21368   // Jump
21369   BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
21370
21371   MI->eraseFromParent();
21372   return MBB;
21373 }
21374
21375 // Replace 213-type (isel default) FMA3 instructions with 231-type for
21376 // accumulator loops. Writing back to the accumulator allows the coalescer
21377 // to remove extra copies in the loop.
21378 MachineBasicBlock *
21379 X86TargetLowering::emitFMA3Instr(MachineInstr *MI,
21380                                  MachineBasicBlock *MBB) const {
21381   MachineOperand &AddendOp = MI->getOperand(3);
21382
21383   // Bail out early if the addend isn't a register - we can't switch these.
21384   if (!AddendOp.isReg())
21385     return MBB;
21386
21387   MachineFunction &MF = *MBB->getParent();
21388   MachineRegisterInfo &MRI = MF.getRegInfo();
21389
21390   // Check whether the addend is defined by a PHI:
21391   assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
21392   MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
21393   if (!AddendDef.isPHI())
21394     return MBB;
21395
21396   // Look for the following pattern:
21397   // loop:
21398   //   %addend = phi [%entry, 0], [%loop, %result]
21399   //   ...
21400   //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
21401
21402   // Replace with:
21403   //   loop:
21404   //   %addend = phi [%entry, 0], [%loop, %result]
21405   //   ...
21406   //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
21407
21408   for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
21409     assert(AddendDef.getOperand(i).isReg());
21410     MachineOperand PHISrcOp = AddendDef.getOperand(i);
21411     MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
21412     if (&PHISrcInst == MI) {
21413       // Found a matching instruction.
21414       unsigned NewFMAOpc = 0;
21415       switch (MI->getOpcode()) {
21416         case X86::VFMADDPDr213r: NewFMAOpc = X86::VFMADDPDr231r; break;
21417         case X86::VFMADDPSr213r: NewFMAOpc = X86::VFMADDPSr231r; break;
21418         case X86::VFMADDSDr213r: NewFMAOpc = X86::VFMADDSDr231r; break;
21419         case X86::VFMADDSSr213r: NewFMAOpc = X86::VFMADDSSr231r; break;
21420         case X86::VFMSUBPDr213r: NewFMAOpc = X86::VFMSUBPDr231r; break;
21421         case X86::VFMSUBPSr213r: NewFMAOpc = X86::VFMSUBPSr231r; break;
21422         case X86::VFMSUBSDr213r: NewFMAOpc = X86::VFMSUBSDr231r; break;
21423         case X86::VFMSUBSSr213r: NewFMAOpc = X86::VFMSUBSSr231r; break;
21424         case X86::VFNMADDPDr213r: NewFMAOpc = X86::VFNMADDPDr231r; break;
21425         case X86::VFNMADDPSr213r: NewFMAOpc = X86::VFNMADDPSr231r; break;
21426         case X86::VFNMADDSDr213r: NewFMAOpc = X86::VFNMADDSDr231r; break;
21427         case X86::VFNMADDSSr213r: NewFMAOpc = X86::VFNMADDSSr231r; break;
21428         case X86::VFNMSUBPDr213r: NewFMAOpc = X86::VFNMSUBPDr231r; break;
21429         case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break;
21430         case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break;
21431         case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break;
21432         case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break;
21433         case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break;
21434         case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break;
21435         case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break;
21436
21437         case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break;
21438         case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break;
21439         case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break;
21440         case X86::VFMSUBPSr213rY: NewFMAOpc = X86::VFMSUBPSr231rY; break;
21441         case X86::VFNMADDPDr213rY: NewFMAOpc = X86::VFNMADDPDr231rY; break;
21442         case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break;
21443         case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break;
21444         case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break;
21445         case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break;
21446         case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break;
21447         case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break;
21448         case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break;
21449         default: llvm_unreachable("Unrecognized FMA variant.");
21450       }
21451
21452       const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
21453       MachineInstrBuilder MIB =
21454         BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc))
21455         .addOperand(MI->getOperand(0))
21456         .addOperand(MI->getOperand(3))
21457         .addOperand(MI->getOperand(2))
21458         .addOperand(MI->getOperand(1));
21459       MBB->insert(MachineBasicBlock::iterator(MI), MIB);
21460       MI->eraseFromParent();
21461     }
21462   }
21463
21464   return MBB;
21465 }
21466
21467 MachineBasicBlock *
21468 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
21469                                                MachineBasicBlock *BB) const {
21470   switch (MI->getOpcode()) {
21471   default: llvm_unreachable("Unexpected instr type to insert");
21472   case X86::TAILJMPd64:
21473   case X86::TAILJMPr64:
21474   case X86::TAILJMPm64:
21475     llvm_unreachable("TAILJMP64 would not be touched here.");
21476   case X86::TCRETURNdi64:
21477   case X86::TCRETURNri64:
21478   case X86::TCRETURNmi64:
21479     return BB;
21480   case X86::WIN_ALLOCA:
21481     return EmitLoweredWinAlloca(MI, BB);
21482   case X86::SEG_ALLOCA_32:
21483   case X86::SEG_ALLOCA_64:
21484     return EmitLoweredSegAlloca(MI, BB);
21485   case X86::TLSCall_32:
21486   case X86::TLSCall_64:
21487     return EmitLoweredTLSCall(MI, BB);
21488   case X86::CMOV_GR8:
21489   case X86::CMOV_FR32:
21490   case X86::CMOV_FR64:
21491   case X86::CMOV_V4F32:
21492   case X86::CMOV_V2F64:
21493   case X86::CMOV_V2I64:
21494   case X86::CMOV_V8F32:
21495   case X86::CMOV_V4F64:
21496   case X86::CMOV_V4I64:
21497   case X86::CMOV_V16F32:
21498   case X86::CMOV_V8F64:
21499   case X86::CMOV_V8I64:
21500   case X86::CMOV_GR16:
21501   case X86::CMOV_GR32:
21502   case X86::CMOV_RFP32:
21503   case X86::CMOV_RFP64:
21504   case X86::CMOV_RFP80:
21505     return EmitLoweredSelect(MI, BB);
21506
21507   case X86::FP32_TO_INT16_IN_MEM:
21508   case X86::FP32_TO_INT32_IN_MEM:
21509   case X86::FP32_TO_INT64_IN_MEM:
21510   case X86::FP64_TO_INT16_IN_MEM:
21511   case X86::FP64_TO_INT32_IN_MEM:
21512   case X86::FP64_TO_INT64_IN_MEM:
21513   case X86::FP80_TO_INT16_IN_MEM:
21514   case X86::FP80_TO_INT32_IN_MEM:
21515   case X86::FP80_TO_INT64_IN_MEM: {
21516     MachineFunction *F = BB->getParent();
21517     const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo();
21518     DebugLoc DL = MI->getDebugLoc();
21519
21520     // Change the floating point control register to use "round towards zero"
21521     // mode when truncating to an integer value.
21522     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
21523     addFrameReference(BuildMI(*BB, MI, DL,
21524                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
21525
21526     // Load the old value of the high byte of the control word...
21527     unsigned OldCW =
21528       F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
21529     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
21530                       CWFrameIdx);
21531
21532     // Set the high part to be round to zero...
21533     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
21534       .addImm(0xC7F);
21535
21536     // Reload the modified control word now...
21537     addFrameReference(BuildMI(*BB, MI, DL,
21538                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21539
21540     // Restore the memory image of control word to original value
21541     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
21542       .addReg(OldCW);
21543
21544     // Get the X86 opcode to use.
21545     unsigned Opc;
21546     switch (MI->getOpcode()) {
21547     default: llvm_unreachable("illegal opcode!");
21548     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
21549     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
21550     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
21551     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
21552     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
21553     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
21554     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
21555     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
21556     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
21557     }
21558
21559     X86AddressMode AM;
21560     MachineOperand &Op = MI->getOperand(0);
21561     if (Op.isReg()) {
21562       AM.BaseType = X86AddressMode::RegBase;
21563       AM.Base.Reg = Op.getReg();
21564     } else {
21565       AM.BaseType = X86AddressMode::FrameIndexBase;
21566       AM.Base.FrameIndex = Op.getIndex();
21567     }
21568     Op = MI->getOperand(1);
21569     if (Op.isImm())
21570       AM.Scale = Op.getImm();
21571     Op = MI->getOperand(2);
21572     if (Op.isImm())
21573       AM.IndexReg = Op.getImm();
21574     Op = MI->getOperand(3);
21575     if (Op.isGlobal()) {
21576       AM.GV = Op.getGlobal();
21577     } else {
21578       AM.Disp = Op.getImm();
21579     }
21580     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
21581                       .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
21582
21583     // Reload the original control word now.
21584     addFrameReference(BuildMI(*BB, MI, DL,
21585                               TII->get(X86::FLDCW16m)), CWFrameIdx);
21586
21587     MI->eraseFromParent();   // The pseudo instruction is gone now.
21588     return BB;
21589   }
21590     // String/text processing lowering.
21591   case X86::PCMPISTRM128REG:
21592   case X86::VPCMPISTRM128REG:
21593   case X86::PCMPISTRM128MEM:
21594   case X86::VPCMPISTRM128MEM:
21595   case X86::PCMPESTRM128REG:
21596   case X86::VPCMPESTRM128REG:
21597   case X86::PCMPESTRM128MEM:
21598   case X86::VPCMPESTRM128MEM:
21599     assert(Subtarget->hasSSE42() &&
21600            "Target must have SSE4.2 or AVX features enabled");
21601     return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
21602
21603   // String/text processing lowering.
21604   case X86::PCMPISTRIREG:
21605   case X86::VPCMPISTRIREG:
21606   case X86::PCMPISTRIMEM:
21607   case X86::VPCMPISTRIMEM:
21608   case X86::PCMPESTRIREG:
21609   case X86::VPCMPESTRIREG:
21610   case X86::PCMPESTRIMEM:
21611   case X86::VPCMPESTRIMEM:
21612     assert(Subtarget->hasSSE42() &&
21613            "Target must have SSE4.2 or AVX features enabled");
21614     return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
21615
21616   // Thread synchronization.
21617   case X86::MONITOR:
21618     return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(),
21619                        Subtarget);
21620
21621   // xbegin
21622   case X86::XBEGIN:
21623     return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo());
21624
21625   case X86::VASTART_SAVE_XMM_REGS:
21626     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
21627
21628   case X86::VAARG_64:
21629     return EmitVAARG64WithCustomInserter(MI, BB);
21630
21631   case X86::EH_SjLj_SetJmp32:
21632   case X86::EH_SjLj_SetJmp64:
21633     return emitEHSjLjSetJmp(MI, BB);
21634
21635   case X86::EH_SjLj_LongJmp32:
21636   case X86::EH_SjLj_LongJmp64:
21637     return emitEHSjLjLongJmp(MI, BB);
21638
21639   case TargetOpcode::STATEPOINT:
21640     // As an implementation detail, STATEPOINT shares the STACKMAP format at
21641     // this point in the process.  We diverge later.
21642     return emitPatchPoint(MI, BB);
21643
21644   case TargetOpcode::STACKMAP:
21645   case TargetOpcode::PATCHPOINT:
21646     return emitPatchPoint(MI, BB);
21647
21648   case X86::VFMADDPDr213r:
21649   case X86::VFMADDPSr213r:
21650   case X86::VFMADDSDr213r:
21651   case X86::VFMADDSSr213r:
21652   case X86::VFMSUBPDr213r:
21653   case X86::VFMSUBPSr213r:
21654   case X86::VFMSUBSDr213r:
21655   case X86::VFMSUBSSr213r:
21656   case X86::VFNMADDPDr213r:
21657   case X86::VFNMADDPSr213r:
21658   case X86::VFNMADDSDr213r:
21659   case X86::VFNMADDSSr213r:
21660   case X86::VFNMSUBPDr213r:
21661   case X86::VFNMSUBPSr213r:
21662   case X86::VFNMSUBSDr213r:
21663   case X86::VFNMSUBSSr213r:
21664   case X86::VFMADDSUBPDr213r:
21665   case X86::VFMADDSUBPSr213r:
21666   case X86::VFMSUBADDPDr213r:
21667   case X86::VFMSUBADDPSr213r:
21668   case X86::VFMADDPDr213rY:
21669   case X86::VFMADDPSr213rY:
21670   case X86::VFMSUBPDr213rY:
21671   case X86::VFMSUBPSr213rY:
21672   case X86::VFNMADDPDr213rY:
21673   case X86::VFNMADDPSr213rY:
21674   case X86::VFNMSUBPDr213rY:
21675   case X86::VFNMSUBPSr213rY:
21676   case X86::VFMADDSUBPDr213rY:
21677   case X86::VFMADDSUBPSr213rY:
21678   case X86::VFMSUBADDPDr213rY:
21679   case X86::VFMSUBADDPSr213rY:
21680     return emitFMA3Instr(MI, BB);
21681   }
21682 }
21683
21684 //===----------------------------------------------------------------------===//
21685 //                           X86 Optimization Hooks
21686 //===----------------------------------------------------------------------===//
21687
21688 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
21689                                                       APInt &KnownZero,
21690                                                       APInt &KnownOne,
21691                                                       const SelectionDAG &DAG,
21692                                                       unsigned Depth) const {
21693   unsigned BitWidth = KnownZero.getBitWidth();
21694   unsigned Opc = Op.getOpcode();
21695   assert((Opc >= ISD::BUILTIN_OP_END ||
21696           Opc == ISD::INTRINSIC_WO_CHAIN ||
21697           Opc == ISD::INTRINSIC_W_CHAIN ||
21698           Opc == ISD::INTRINSIC_VOID) &&
21699          "Should use MaskedValueIsZero if you don't know whether Op"
21700          " is a target node!");
21701
21702   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
21703   switch (Opc) {
21704   default: break;
21705   case X86ISD::ADD:
21706   case X86ISD::SUB:
21707   case X86ISD::ADC:
21708   case X86ISD::SBB:
21709   case X86ISD::SMUL:
21710   case X86ISD::UMUL:
21711   case X86ISD::INC:
21712   case X86ISD::DEC:
21713   case X86ISD::OR:
21714   case X86ISD::XOR:
21715   case X86ISD::AND:
21716     // These nodes' second result is a boolean.
21717     if (Op.getResNo() == 0)
21718       break;
21719     // Fallthrough
21720   case X86ISD::SETCC:
21721     KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
21722     break;
21723   case ISD::INTRINSIC_WO_CHAIN: {
21724     unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21725     unsigned NumLoBits = 0;
21726     switch (IntId) {
21727     default: break;
21728     case Intrinsic::x86_sse_movmsk_ps:
21729     case Intrinsic::x86_avx_movmsk_ps_256:
21730     case Intrinsic::x86_sse2_movmsk_pd:
21731     case Intrinsic::x86_avx_movmsk_pd_256:
21732     case Intrinsic::x86_mmx_pmovmskb:
21733     case Intrinsic::x86_sse2_pmovmskb_128:
21734     case Intrinsic::x86_avx2_pmovmskb: {
21735       // High bits of movmskp{s|d}, pmovmskb are known zero.
21736       switch (IntId) {
21737         default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
21738         case Intrinsic::x86_sse_movmsk_ps:      NumLoBits = 4; break;
21739         case Intrinsic::x86_avx_movmsk_ps_256:  NumLoBits = 8; break;
21740         case Intrinsic::x86_sse2_movmsk_pd:     NumLoBits = 2; break;
21741         case Intrinsic::x86_avx_movmsk_pd_256:  NumLoBits = 4; break;
21742         case Intrinsic::x86_mmx_pmovmskb:       NumLoBits = 8; break;
21743         case Intrinsic::x86_sse2_pmovmskb_128:  NumLoBits = 16; break;
21744         case Intrinsic::x86_avx2_pmovmskb:      NumLoBits = 32; break;
21745       }
21746       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
21747       break;
21748     }
21749     }
21750     break;
21751   }
21752   }
21753 }
21754
21755 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
21756   SDValue Op,
21757   const SelectionDAG &,
21758   unsigned Depth) const {
21759   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
21760   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
21761     return Op.getValueType().getScalarType().getSizeInBits();
21762
21763   // Fallback case.
21764   return 1;
21765 }
21766
21767 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
21768 /// node is a GlobalAddress + offset.
21769 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
21770                                        const GlobalValue* &GA,
21771                                        int64_t &Offset) const {
21772   if (N->getOpcode() == X86ISD::Wrapper) {
21773     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
21774       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
21775       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
21776       return true;
21777     }
21778   }
21779   return TargetLowering::isGAPlusOffset(N, GA, Offset);
21780 }
21781
21782 /// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
21783 /// same as extracting the high 128-bit part of 256-bit vector and then
21784 /// inserting the result into the low part of a new 256-bit vector
21785 static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
21786   EVT VT = SVOp->getValueType(0);
21787   unsigned NumElems = VT.getVectorNumElements();
21788
21789   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
21790   for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j)
21791     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21792         SVOp->getMaskElt(j) >= 0)
21793       return false;
21794
21795   return true;
21796 }
21797
21798 /// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
21799 /// same as extracting the low 128-bit part of 256-bit vector and then
21800 /// inserting the result into the high part of a new 256-bit vector
21801 static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
21802   EVT VT = SVOp->getValueType(0);
21803   unsigned NumElems = VT.getVectorNumElements();
21804
21805   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
21806   for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j)
21807     if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
21808         SVOp->getMaskElt(j) >= 0)
21809       return false;
21810
21811   return true;
21812 }
21813
21814 /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
21815 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
21816                                         TargetLowering::DAGCombinerInfo &DCI,
21817                                         const X86Subtarget* Subtarget) {
21818   SDLoc dl(N);
21819   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
21820   SDValue V1 = SVOp->getOperand(0);
21821   SDValue V2 = SVOp->getOperand(1);
21822   EVT VT = SVOp->getValueType(0);
21823   unsigned NumElems = VT.getVectorNumElements();
21824
21825   if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
21826       V2.getOpcode() == ISD::CONCAT_VECTORS) {
21827     //
21828     //                   0,0,0,...
21829     //                      |
21830     //    V      UNDEF    BUILD_VECTOR    UNDEF
21831     //     \      /           \           /
21832     //  CONCAT_VECTOR         CONCAT_VECTOR
21833     //         \                  /
21834     //          \                /
21835     //          RESULT: V + zero extended
21836     //
21837     if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
21838         V2.getOperand(1).getOpcode() != ISD::UNDEF ||
21839         V1.getOperand(1).getOpcode() != ISD::UNDEF)
21840       return SDValue();
21841
21842     if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
21843       return SDValue();
21844
21845     // To match the shuffle mask, the first half of the mask should
21846     // be exactly the first vector, and all the rest a splat with the
21847     // first element of the second one.
21848     for (unsigned i = 0; i != NumElems/2; ++i)
21849       if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
21850           !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
21851         return SDValue();
21852
21853     // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
21854     if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
21855       if (Ld->hasNUsesOfValue(1, 0)) {
21856         SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
21857         SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
21858         SDValue ResNode =
21859           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
21860                                   Ld->getMemoryVT(),
21861                                   Ld->getPointerInfo(),
21862                                   Ld->getAlignment(),
21863                                   false/*isVolatile*/, true/*ReadMem*/,
21864                                   false/*WriteMem*/);
21865
21866         // Make sure the newly-created LOAD is in the same position as Ld in
21867         // terms of dependency. We create a TokenFactor for Ld and ResNode,
21868         // and update uses of Ld's output chain to use the TokenFactor.
21869         if (Ld->hasAnyUseOfValue(1)) {
21870           SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
21871                              SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
21872           DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
21873           DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
21874                                  SDValue(ResNode.getNode(), 1));
21875         }
21876
21877         return DAG.getNode(ISD::BITCAST, dl, VT, ResNode);
21878       }
21879     }
21880
21881     // Emit a zeroed vector and insert the desired subvector on its
21882     // first half.
21883     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
21884     SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
21885     return DCI.CombineTo(N, InsV);
21886   }
21887
21888   //===--------------------------------------------------------------------===//
21889   // Combine some shuffles into subvector extracts and inserts:
21890   //
21891
21892   // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
21893   if (isShuffleHigh128VectorInsertLow(SVOp)) {
21894     SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl);
21895     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl);
21896     return DCI.CombineTo(N, InsV);
21897   }
21898
21899   // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
21900   if (isShuffleLow128VectorInsertHigh(SVOp)) {
21901     SDValue V = Extract128BitVector(V1, 0, DAG, dl);
21902     SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl);
21903     return DCI.CombineTo(N, InsV);
21904   }
21905
21906   return SDValue();
21907 }
21908
21909 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
21910 /// possible.
21911 ///
21912 /// This is the leaf of the recursive combinine below. When we have found some
21913 /// chain of single-use x86 shuffle instructions and accumulated the combined
21914 /// shuffle mask represented by them, this will try to pattern match that mask
21915 /// into either a single instruction if there is a special purpose instruction
21916 /// for this operation, or into a PSHUFB instruction which is a fully general
21917 /// instruction but should only be used to replace chains over a certain depth.
21918 static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
21919                                    int Depth, bool HasPSHUFB, SelectionDAG &DAG,
21920                                    TargetLowering::DAGCombinerInfo &DCI,
21921                                    const X86Subtarget *Subtarget) {
21922   assert(!Mask.empty() && "Cannot combine an empty shuffle mask!");
21923
21924   // Find the operand that enters the chain. Note that multiple uses are OK
21925   // here, we're not going to remove the operand we find.
21926   SDValue Input = Op.getOperand(0);
21927   while (Input.getOpcode() == ISD::BITCAST)
21928     Input = Input.getOperand(0);
21929
21930   MVT VT = Input.getSimpleValueType();
21931   MVT RootVT = Root.getSimpleValueType();
21932   SDLoc DL(Root);
21933
21934   // Just remove no-op shuffle masks.
21935   if (Mask.size() == 1) {
21936     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input),
21937                   /*AddTo*/ true);
21938     return true;
21939   }
21940
21941   // Use the float domain if the operand type is a floating point type.
21942   bool FloatDomain = VT.isFloatingPoint();
21943
21944   // For floating point shuffles, we don't have free copies in the shuffle
21945   // instructions or the ability to load as part of the instruction, so
21946   // canonicalize their shuffles to UNPCK or MOV variants.
21947   //
21948   // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
21949   // vectors because it can have a load folded into it that UNPCK cannot. This
21950   // doesn't preclude something switching to the shorter encoding post-RA.
21951   if (FloatDomain) {
21952     if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
21953       bool Lo = Mask.equals(0, 0);
21954       unsigned Shuffle;
21955       MVT ShuffleVT;
21956       // Check if we have SSE3 which will let us use MOVDDUP. That instruction
21957       // is no slower than UNPCKLPD but has the option to fold the input operand
21958       // into even an unaligned memory load.
21959       if (Lo && Subtarget->hasSSE3()) {
21960         Shuffle = X86ISD::MOVDDUP;
21961         ShuffleVT = MVT::v2f64;
21962       } else {
21963         // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
21964         // than the UNPCK variants.
21965         Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
21966         ShuffleVT = MVT::v4f32;
21967       }
21968       if (Depth == 1 && Root->getOpcode() == Shuffle)
21969         return false; // Nothing to do!
21970       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
21971       DCI.AddToWorklist(Op.getNode());
21972       if (Shuffle == X86ISD::MOVDDUP)
21973         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
21974       else
21975         Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
21976       DCI.AddToWorklist(Op.getNode());
21977       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
21978                     /*AddTo*/ true);
21979       return true;
21980     }
21981     if (Subtarget->hasSSE3() &&
21982         (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) {
21983       bool Lo = Mask.equals(0, 0, 2, 2);
21984       unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP;
21985       MVT ShuffleVT = MVT::v4f32;
21986       if (Depth == 1 && Root->getOpcode() == Shuffle)
21987         return false; // Nothing to do!
21988       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
21989       DCI.AddToWorklist(Op.getNode());
21990       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
21991       DCI.AddToWorklist(Op.getNode());
21992       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
21993                     /*AddTo*/ true);
21994       return true;
21995     }
21996     if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) {
21997       bool Lo = Mask.equals(0, 0, 1, 1);
21998       unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
21999       MVT ShuffleVT = MVT::v4f32;
22000       if (Depth == 1 && Root->getOpcode() == Shuffle)
22001         return false; // Nothing to do!
22002       Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22003       DCI.AddToWorklist(Op.getNode());
22004       Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22005       DCI.AddToWorklist(Op.getNode());
22006       DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22007                     /*AddTo*/ true);
22008       return true;
22009     }
22010   }
22011
22012   // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
22013   // variants as none of these have single-instruction variants that are
22014   // superior to the UNPCK formulation.
22015   if (!FloatDomain &&
22016       (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
22017        Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
22018        Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
22019        Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
22020                    15))) {
22021     bool Lo = Mask[0] == 0;
22022     unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
22023     if (Depth == 1 && Root->getOpcode() == Shuffle)
22024       return false; // Nothing to do!
22025     MVT ShuffleVT;
22026     switch (Mask.size()) {
22027     case 8:
22028       ShuffleVT = MVT::v8i16;
22029       break;
22030     case 16:
22031       ShuffleVT = MVT::v16i8;
22032       break;
22033     default:
22034       llvm_unreachable("Impossible mask size!");
22035     };
22036     Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
22037     DCI.AddToWorklist(Op.getNode());
22038     Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
22039     DCI.AddToWorklist(Op.getNode());
22040     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22041                   /*AddTo*/ true);
22042     return true;
22043   }
22044
22045   // Don't try to re-form single instruction chains under any circumstances now
22046   // that we've done encoding canonicalization for them.
22047   if (Depth < 2)
22048     return false;
22049
22050   // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we
22051   // can replace them with a single PSHUFB instruction profitably. Intel's
22052   // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but
22053   // in practice PSHUFB tends to be *very* fast so we're more aggressive.
22054   if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) {
22055     SmallVector<SDValue, 16> PSHUFBMask;
22056     assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!");
22057     int Ratio = 16 / Mask.size();
22058     for (unsigned i = 0; i < 16; ++i) {
22059       if (Mask[i / Ratio] == SM_SentinelUndef) {
22060         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
22061         continue;
22062       }
22063       int M = Mask[i / Ratio] != SM_SentinelZero
22064                   ? Ratio * Mask[i / Ratio] + i % Ratio
22065                   : 255;
22066       PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8));
22067     }
22068     Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input);
22069     DCI.AddToWorklist(Op.getNode());
22070     SDValue PSHUFBMaskOp =
22071         DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask);
22072     DCI.AddToWorklist(PSHUFBMaskOp.getNode());
22073     Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp);
22074     DCI.AddToWorklist(Op.getNode());
22075     DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
22076                   /*AddTo*/ true);
22077     return true;
22078   }
22079
22080   // Failed to find any combines.
22081   return false;
22082 }
22083
22084 /// \brief Fully generic combining of x86 shuffle instructions.
22085 ///
22086 /// This should be the last combine run over the x86 shuffle instructions. Once
22087 /// they have been fully optimized, this will recursively consider all chains
22088 /// of single-use shuffle instructions, build a generic model of the cumulative
22089 /// shuffle operation, and check for simpler instructions which implement this
22090 /// operation. We use this primarily for two purposes:
22091 ///
22092 /// 1) Collapse generic shuffles to specialized single instructions when
22093 ///    equivalent. In most cases, this is just an encoding size win, but
22094 ///    sometimes we will collapse multiple generic shuffles into a single
22095 ///    special-purpose shuffle.
22096 /// 2) Look for sequences of shuffle instructions with 3 or more total
22097 ///    instructions, and replace them with the slightly more expensive SSSE3
22098 ///    PSHUFB instruction if available. We do this as the last combining step
22099 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
22100 ///    a suitable short sequence of other instructions. The PHUFB will either
22101 ///    use a register or have to read from memory and so is slightly (but only
22102 ///    slightly) more expensive than the other shuffle instructions.
22103 ///
22104 /// Because this is inherently a quadratic operation (for each shuffle in
22105 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
22106 /// This should never be an issue in practice as the shuffle lowering doesn't
22107 /// produce sequences of more than 8 instructions.
22108 ///
22109 /// FIXME: We will currently miss some cases where the redundant shuffling
22110 /// would simplify under the threshold for PSHUFB formation because of
22111 /// combine-ordering. To fix this, we should do the redundant instruction
22112 /// combining in this recursive walk.
22113 static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
22114                                           ArrayRef<int> RootMask,
22115                                           int Depth, bool HasPSHUFB,
22116                                           SelectionDAG &DAG,
22117                                           TargetLowering::DAGCombinerInfo &DCI,
22118                                           const X86Subtarget *Subtarget) {
22119   // Bound the depth of our recursive combine because this is ultimately
22120   // quadratic in nature.
22121   if (Depth > 8)
22122     return false;
22123
22124   // Directly rip through bitcasts to find the underlying operand.
22125   while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
22126     Op = Op.getOperand(0);
22127
22128   MVT VT = Op.getSimpleValueType();
22129   if (!VT.isVector())
22130     return false; // Bail if we hit a non-vector.
22131   // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit
22132   // version should be added.
22133   if (VT.getSizeInBits() != 128)
22134     return false;
22135
22136   assert(Root.getSimpleValueType().isVector() &&
22137          "Shuffles operate on vector types!");
22138   assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
22139          "Can only combine shuffles of the same vector register size.");
22140
22141   if (!isTargetShuffle(Op.getOpcode()))
22142     return false;
22143   SmallVector<int, 16> OpMask;
22144   bool IsUnary;
22145   bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary);
22146   // We only can combine unary shuffles which we can decode the mask for.
22147   if (!HaveMask || !IsUnary)
22148     return false;
22149
22150   assert(VT.getVectorNumElements() == OpMask.size() &&
22151          "Different mask size from vector size!");
22152   assert(((RootMask.size() > OpMask.size() &&
22153            RootMask.size() % OpMask.size() == 0) ||
22154           (OpMask.size() > RootMask.size() &&
22155            OpMask.size() % RootMask.size() == 0) ||
22156           OpMask.size() == RootMask.size()) &&
22157          "The smaller number of elements must divide the larger.");
22158   int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
22159   int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
22160   assert(((RootRatio == 1 && OpRatio == 1) ||
22161           (RootRatio == 1) != (OpRatio == 1)) &&
22162          "Must not have a ratio for both incoming and op masks!");
22163
22164   SmallVector<int, 16> Mask;
22165   Mask.reserve(std::max(OpMask.size(), RootMask.size()));
22166
22167   // Merge this shuffle operation's mask into our accumulated mask. Note that
22168   // this shuffle's mask will be the first applied to the input, followed by the
22169   // root mask to get us all the way to the root value arrangement. The reason
22170   // for this order is that we are recursing up the operation chain.
22171   for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) {
22172     int RootIdx = i / RootRatio;
22173     if (RootMask[RootIdx] < 0) {
22174       // This is a zero or undef lane, we're done.
22175       Mask.push_back(RootMask[RootIdx]);
22176       continue;
22177     }
22178
22179     int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
22180     int OpIdx = RootMaskedIdx / OpRatio;
22181     if (OpMask[OpIdx] < 0) {
22182       // The incoming lanes are zero or undef, it doesn't matter which ones we
22183       // are using.
22184       Mask.push_back(OpMask[OpIdx]);
22185       continue;
22186     }
22187
22188     // Ok, we have non-zero lanes, map them through.
22189     Mask.push_back(OpMask[OpIdx] * OpRatio +
22190                    RootMaskedIdx % OpRatio);
22191   }
22192
22193   // See if we can recurse into the operand to combine more things.
22194   switch (Op.getOpcode()) {
22195     case X86ISD::PSHUFB:
22196       HasPSHUFB = true;
22197     case X86ISD::PSHUFD:
22198     case X86ISD::PSHUFHW:
22199     case X86ISD::PSHUFLW:
22200       if (Op.getOperand(0).hasOneUse() &&
22201           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22202                                         HasPSHUFB, DAG, DCI, Subtarget))
22203         return true;
22204       break;
22205
22206     case X86ISD::UNPCKL:
22207     case X86ISD::UNPCKH:
22208       assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!");
22209       // We can't check for single use, we have to check that this shuffle is the only user.
22210       if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
22211           combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1,
22212                                         HasPSHUFB, DAG, DCI, Subtarget))
22213           return true;
22214       break;
22215   }
22216
22217   // Minor canonicalization of the accumulated shuffle mask to make it easier
22218   // to match below. All this does is detect masks with squential pairs of
22219   // elements, and shrink them to the half-width mask. It does this in a loop
22220   // so it will reduce the size of the mask to the minimal width mask which
22221   // performs an equivalent shuffle.
22222   SmallVector<int, 16> WidenedMask;
22223   while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
22224     Mask = std::move(WidenedMask);
22225     WidenedMask.clear();
22226   }
22227
22228   return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI,
22229                                 Subtarget);
22230 }
22231
22232 /// \brief Get the PSHUF-style mask from PSHUF node.
22233 ///
22234 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
22235 /// PSHUF-style masks that can be reused with such instructions.
22236 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
22237   SmallVector<int, 4> Mask;
22238   bool IsUnary;
22239   bool HaveMask = getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), Mask, IsUnary);
22240   (void)HaveMask;
22241   assert(HaveMask);
22242
22243   switch (N.getOpcode()) {
22244   case X86ISD::PSHUFD:
22245     return Mask;
22246   case X86ISD::PSHUFLW:
22247     Mask.resize(4);
22248     return Mask;
22249   case X86ISD::PSHUFHW:
22250     Mask.erase(Mask.begin(), Mask.begin() + 4);
22251     for (int &M : Mask)
22252       M -= 4;
22253     return Mask;
22254   default:
22255     llvm_unreachable("No valid shuffle instruction found!");
22256   }
22257 }
22258
22259 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
22260 ///
22261 /// We walk up the chain and look for a combinable shuffle, skipping over
22262 /// shuffles that we could hoist this shuffle's transformation past without
22263 /// altering anything.
22264 static SDValue
22265 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
22266                              SelectionDAG &DAG,
22267                              TargetLowering::DAGCombinerInfo &DCI) {
22268   assert(N.getOpcode() == X86ISD::PSHUFD &&
22269          "Called with something other than an x86 128-bit half shuffle!");
22270   SDLoc DL(N);
22271
22272   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
22273   // of the shuffles in the chain so that we can form a fresh chain to replace
22274   // this one.
22275   SmallVector<SDValue, 8> Chain;
22276   SDValue V = N.getOperand(0);
22277   for (; V.hasOneUse(); V = V.getOperand(0)) {
22278     switch (V.getOpcode()) {
22279     default:
22280       return SDValue(); // Nothing combined!
22281
22282     case ISD::BITCAST:
22283       // Skip bitcasts as we always know the type for the target specific
22284       // instructions.
22285       continue;
22286
22287     case X86ISD::PSHUFD:
22288       // Found another dword shuffle.
22289       break;
22290
22291     case X86ISD::PSHUFLW:
22292       // Check that the low words (being shuffled) are the identity in the
22293       // dword shuffle, and the high words are self-contained.
22294       if (Mask[0] != 0 || Mask[1] != 1 ||
22295           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
22296         return SDValue();
22297
22298       Chain.push_back(V);
22299       continue;
22300
22301     case X86ISD::PSHUFHW:
22302       // Check that the high words (being shuffled) are the identity in the
22303       // dword shuffle, and the low words are self-contained.
22304       if (Mask[2] != 2 || Mask[3] != 3 ||
22305           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
22306         return SDValue();
22307
22308       Chain.push_back(V);
22309       continue;
22310
22311     case X86ISD::UNPCKL:
22312     case X86ISD::UNPCKH:
22313       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
22314       // shuffle into a preceding word shuffle.
22315       if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16)
22316         return SDValue();
22317
22318       // Search for a half-shuffle which we can combine with.
22319       unsigned CombineOp =
22320           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
22321       if (V.getOperand(0) != V.getOperand(1) ||
22322           !V->isOnlyUserOf(V.getOperand(0).getNode()))
22323         return SDValue();
22324       Chain.push_back(V);
22325       V = V.getOperand(0);
22326       do {
22327         switch (V.getOpcode()) {
22328         default:
22329           return SDValue(); // Nothing to combine.
22330
22331         case X86ISD::PSHUFLW:
22332         case X86ISD::PSHUFHW:
22333           if (V.getOpcode() == CombineOp)
22334             break;
22335
22336           Chain.push_back(V);
22337
22338           // Fallthrough!
22339         case ISD::BITCAST:
22340           V = V.getOperand(0);
22341           continue;
22342         }
22343         break;
22344       } while (V.hasOneUse());
22345       break;
22346     }
22347     // Break out of the loop if we break out of the switch.
22348     break;
22349   }
22350
22351   if (!V.hasOneUse())
22352     // We fell out of the loop without finding a viable combining instruction.
22353     return SDValue();
22354
22355   // Merge this node's mask and our incoming mask.
22356   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22357   for (int &M : Mask)
22358     M = VMask[M];
22359   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
22360                   getV4X86ShuffleImm8ForMask(Mask, DAG));
22361
22362   // Rebuild the chain around this new shuffle.
22363   while (!Chain.empty()) {
22364     SDValue W = Chain.pop_back_val();
22365
22366     if (V.getValueType() != W.getOperand(0).getValueType())
22367       V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V);
22368
22369     switch (W.getOpcode()) {
22370     default:
22371       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
22372
22373     case X86ISD::UNPCKL:
22374     case X86ISD::UNPCKH:
22375       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
22376       break;
22377
22378     case X86ISD::PSHUFD:
22379     case X86ISD::PSHUFLW:
22380     case X86ISD::PSHUFHW:
22381       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
22382       break;
22383     }
22384   }
22385   if (V.getValueType() != N.getValueType())
22386     V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V);
22387
22388   // Return the new chain to replace N.
22389   return V;
22390 }
22391
22392 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
22393 ///
22394 /// We walk up the chain, skipping shuffles of the other half and looking
22395 /// through shuffles which switch halves trying to find a shuffle of the same
22396 /// pair of dwords.
22397 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
22398                                         SelectionDAG &DAG,
22399                                         TargetLowering::DAGCombinerInfo &DCI) {
22400   assert(
22401       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
22402       "Called with something other than an x86 128-bit half shuffle!");
22403   SDLoc DL(N);
22404   unsigned CombineOpcode = N.getOpcode();
22405
22406   // Walk up a single-use chain looking for a combinable shuffle.
22407   SDValue V = N.getOperand(0);
22408   for (; V.hasOneUse(); V = V.getOperand(0)) {
22409     switch (V.getOpcode()) {
22410     default:
22411       return false; // Nothing combined!
22412
22413     case ISD::BITCAST:
22414       // Skip bitcasts as we always know the type for the target specific
22415       // instructions.
22416       continue;
22417
22418     case X86ISD::PSHUFLW:
22419     case X86ISD::PSHUFHW:
22420       if (V.getOpcode() == CombineOpcode)
22421         break;
22422
22423       // Other-half shuffles are no-ops.
22424       continue;
22425     }
22426     // Break out of the loop if we break out of the switch.
22427     break;
22428   }
22429
22430   if (!V.hasOneUse())
22431     // We fell out of the loop without finding a viable combining instruction.
22432     return false;
22433
22434   // Combine away the bottom node as its shuffle will be accumulated into
22435   // a preceding shuffle.
22436   DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22437
22438   // Record the old value.
22439   SDValue Old = V;
22440
22441   // Merge this node's mask and our incoming mask (adjusted to account for all
22442   // the pshufd instructions encountered).
22443   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22444   for (int &M : Mask)
22445     M = VMask[M];
22446   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
22447                   getV4X86ShuffleImm8ForMask(Mask, DAG));
22448
22449   // Check that the shuffles didn't cancel each other out. If not, we need to
22450   // combine to the new one.
22451   if (Old != V)
22452     // Replace the combinable shuffle with the combined one, updating all users
22453     // so that we re-evaluate the chain here.
22454     DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
22455
22456   return true;
22457 }
22458
22459 /// \brief Try to combine x86 target specific shuffles.
22460 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
22461                                            TargetLowering::DAGCombinerInfo &DCI,
22462                                            const X86Subtarget *Subtarget) {
22463   SDLoc DL(N);
22464   MVT VT = N.getSimpleValueType();
22465   SmallVector<int, 4> Mask;
22466
22467   switch (N.getOpcode()) {
22468   case X86ISD::PSHUFD:
22469   case X86ISD::PSHUFLW:
22470   case X86ISD::PSHUFHW:
22471     Mask = getPSHUFShuffleMask(N);
22472     assert(Mask.size() == 4);
22473     break;
22474   default:
22475     return SDValue();
22476   }
22477
22478   // Nuke no-op shuffles that show up after combining.
22479   if (isNoopShuffleMask(Mask))
22480     return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
22481
22482   // Look for simplifications involving one or two shuffle instructions.
22483   SDValue V = N.getOperand(0);
22484   switch (N.getOpcode()) {
22485   default:
22486     break;
22487   case X86ISD::PSHUFLW:
22488   case X86ISD::PSHUFHW:
22489     assert(VT == MVT::v8i16);
22490     (void)VT;
22491
22492     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
22493       return SDValue(); // We combined away this shuffle, so we're done.
22494
22495     // See if this reduces to a PSHUFD which is no more expensive and can
22496     // combine with more operations. Note that it has to at least flip the
22497     // dwords as otherwise it would have been removed as a no-op.
22498     if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) {
22499       int DMask[] = {0, 1, 2, 3};
22500       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
22501       DMask[DOffset + 0] = DOffset + 1;
22502       DMask[DOffset + 1] = DOffset + 0;
22503       V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V);
22504       DCI.AddToWorklist(V.getNode());
22505       V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V,
22506                       getV4X86ShuffleImm8ForMask(DMask, DAG));
22507       DCI.AddToWorklist(V.getNode());
22508       return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
22509     }
22510
22511     // Look for shuffle patterns which can be implemented as a single unpack.
22512     // FIXME: This doesn't handle the location of the PSHUFD generically, and
22513     // only works when we have a PSHUFD followed by two half-shuffles.
22514     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
22515         (V.getOpcode() == X86ISD::PSHUFLW ||
22516          V.getOpcode() == X86ISD::PSHUFHW) &&
22517         V.getOpcode() != N.getOpcode() &&
22518         V.hasOneUse()) {
22519       SDValue D = V.getOperand(0);
22520       while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
22521         D = D.getOperand(0);
22522       if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
22523         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
22524         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
22525         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22526         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
22527         int WordMask[8];
22528         for (int i = 0; i < 4; ++i) {
22529           WordMask[i + NOffset] = Mask[i] + NOffset;
22530           WordMask[i + VOffset] = VMask[i] + VOffset;
22531         }
22532         // Map the word mask through the DWord mask.
22533         int MappedMask[8];
22534         for (int i = 0; i < 8; ++i)
22535           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
22536         const int UnpackLoMask[] = {0, 0, 1, 1, 2, 2, 3, 3};
22537         const int UnpackHiMask[] = {4, 4, 5, 5, 6, 6, 7, 7};
22538         if (std::equal(std::begin(MappedMask), std::end(MappedMask),
22539                        std::begin(UnpackLoMask)) ||
22540             std::equal(std::begin(MappedMask), std::end(MappedMask),
22541                        std::begin(UnpackHiMask))) {
22542           // We can replace all three shuffles with an unpack.
22543           V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, D.getOperand(0));
22544           DCI.AddToWorklist(V.getNode());
22545           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
22546                                                 : X86ISD::UNPCKH,
22547                              DL, MVT::v8i16, V, V);
22548         }
22549       }
22550     }
22551
22552     break;
22553
22554   case X86ISD::PSHUFD:
22555     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
22556       return NewN;
22557
22558     break;
22559   }
22560
22561   return SDValue();
22562 }
22563
22564 /// \brief Try to combine a shuffle into a target-specific add-sub node.
22565 ///
22566 /// We combine this directly on the abstract vector shuffle nodes so it is
22567 /// easier to generically match. We also insert dummy vector shuffle nodes for
22568 /// the operands which explicitly discard the lanes which are unused by this
22569 /// operation to try to flow through the rest of the combiner the fact that
22570 /// they're unused.
22571 static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) {
22572   SDLoc DL(N);
22573   EVT VT = N->getValueType(0);
22574
22575   // We only handle target-independent shuffles.
22576   // FIXME: It would be easy and harmless to use the target shuffle mask
22577   // extraction tool to support more.
22578   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
22579     return SDValue();
22580
22581   auto *SVN = cast<ShuffleVectorSDNode>(N);
22582   ArrayRef<int> Mask = SVN->getMask();
22583   SDValue V1 = N->getOperand(0);
22584   SDValue V2 = N->getOperand(1);
22585
22586   // We require the first shuffle operand to be the SUB node, and the second to
22587   // be the ADD node.
22588   // FIXME: We should support the commuted patterns.
22589   if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD)
22590     return SDValue();
22591
22592   // If there are other uses of these operations we can't fold them.
22593   if (!V1->hasOneUse() || !V2->hasOneUse())
22594     return SDValue();
22595
22596   // Ensure that both operations have the same operands. Note that we can
22597   // commute the FADD operands.
22598   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
22599   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
22600       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
22601     return SDValue();
22602
22603   // We're looking for blends between FADD and FSUB nodes. We insist on these
22604   // nodes being lined up in a specific expected pattern.
22605   if (!(isShuffleEquivalent(Mask, 0, 3) ||
22606         isShuffleEquivalent(Mask, 0, 5, 2, 7) ||
22607         isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15)))
22608     return SDValue();
22609
22610   // Only specific types are legal at this point, assert so we notice if and
22611   // when these change.
22612   assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 ||
22613           VT == MVT::v4f64) &&
22614          "Unknown vector type encountered!");
22615
22616   return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
22617 }
22618
22619 /// PerformShuffleCombine - Performs several different shuffle combines.
22620 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
22621                                      TargetLowering::DAGCombinerInfo &DCI,
22622                                      const X86Subtarget *Subtarget) {
22623   SDLoc dl(N);
22624   SDValue N0 = N->getOperand(0);
22625   SDValue N1 = N->getOperand(1);
22626   EVT VT = N->getValueType(0);
22627
22628   // Don't create instructions with illegal types after legalize types has run.
22629   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22630   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
22631     return SDValue();
22632
22633   // If we have legalized the vector types, look for blends of FADD and FSUB
22634   // nodes that we can fuse into an ADDSUB node.
22635   if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3())
22636     if (SDValue AddSub = combineShuffleToAddSub(N, DAG))
22637       return AddSub;
22638
22639   // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
22640   if (Subtarget->hasFp256() && VT.is256BitVector() &&
22641       N->getOpcode() == ISD::VECTOR_SHUFFLE)
22642     return PerformShuffleCombine256(N, DAG, DCI, Subtarget);
22643
22644   // During Type Legalization, when promoting illegal vector types,
22645   // the backend might introduce new shuffle dag nodes and bitcasts.
22646   //
22647   // This code performs the following transformation:
22648   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
22649   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
22650   //
22651   // We do this only if both the bitcast and the BINOP dag nodes have
22652   // one use. Also, perform this transformation only if the new binary
22653   // operation is legal. This is to avoid introducing dag nodes that
22654   // potentially need to be further expanded (or custom lowered) into a
22655   // less optimal sequence of dag nodes.
22656   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
22657       N1.getOpcode() == ISD::UNDEF && N0.hasOneUse() &&
22658       N0.getOpcode() == ISD::BITCAST) {
22659     SDValue BC0 = N0.getOperand(0);
22660     EVT SVT = BC0.getValueType();
22661     unsigned Opcode = BC0.getOpcode();
22662     unsigned NumElts = VT.getVectorNumElements();
22663
22664     if (BC0.hasOneUse() && SVT.isVector() &&
22665         SVT.getVectorNumElements() * 2 == NumElts &&
22666         TLI.isOperationLegal(Opcode, VT)) {
22667       bool CanFold = false;
22668       switch (Opcode) {
22669       default : break;
22670       case ISD::ADD :
22671       case ISD::FADD :
22672       case ISD::SUB :
22673       case ISD::FSUB :
22674       case ISD::MUL :
22675       case ISD::FMUL :
22676         CanFold = true;
22677       }
22678
22679       unsigned SVTNumElts = SVT.getVectorNumElements();
22680       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
22681       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
22682         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
22683       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
22684         CanFold = SVOp->getMaskElt(i) < 0;
22685
22686       if (CanFold) {
22687         SDValue BC00 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(0));
22688         SDValue BC01 = DAG.getNode(ISD::BITCAST, dl, VT, BC0.getOperand(1));
22689         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
22690         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, &SVOp->getMask()[0]);
22691       }
22692     }
22693   }
22694
22695   // Only handle 128 wide vector from here on.
22696   if (!VT.is128BitVector())
22697     return SDValue();
22698
22699   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
22700   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
22701   // consecutive, non-overlapping, and in the right order.
22702   SmallVector<SDValue, 16> Elts;
22703   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
22704     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
22705
22706   SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
22707   if (LD.getNode())
22708     return LD;
22709
22710   if (isTargetShuffle(N->getOpcode())) {
22711     SDValue Shuffle =
22712         PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget);
22713     if (Shuffle.getNode())
22714       return Shuffle;
22715
22716     // Try recursively combining arbitrary sequences of x86 shuffle
22717     // instructions into higher-order shuffles. We do this after combining
22718     // specific PSHUF instruction sequences into their minimal form so that we
22719     // can evaluate how many specialized shuffle instructions are involved in
22720     // a particular chain.
22721     SmallVector<int, 1> NonceMask; // Just a placeholder.
22722     NonceMask.push_back(0);
22723     if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
22724                                       /*Depth*/ 1, /*HasPSHUFB*/ false, DAG,
22725                                       DCI, Subtarget))
22726       return SDValue(); // This routine will use CombineTo to replace N.
22727   }
22728
22729   return SDValue();
22730 }
22731
22732 /// PerformTruncateCombine - Converts truncate operation to
22733 /// a sequence of vector shuffle operations.
22734 /// It is possible when we truncate 256-bit vector to 128-bit vector
22735 static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
22736                                       TargetLowering::DAGCombinerInfo &DCI,
22737                                       const X86Subtarget *Subtarget)  {
22738   return SDValue();
22739 }
22740
22741 /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target
22742 /// specific shuffle of a load can be folded into a single element load.
22743 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
22744 /// shuffles have been custom lowered so we need to handle those here.
22745 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
22746                                          TargetLowering::DAGCombinerInfo &DCI) {
22747   if (DCI.isBeforeLegalizeOps())
22748     return SDValue();
22749
22750   SDValue InVec = N->getOperand(0);
22751   SDValue EltNo = N->getOperand(1);
22752
22753   if (!isa<ConstantSDNode>(EltNo))
22754     return SDValue();
22755
22756   EVT OriginalVT = InVec.getValueType();
22757
22758   if (InVec.getOpcode() == ISD::BITCAST) {
22759     // Don't duplicate a load with other uses.
22760     if (!InVec.hasOneUse())
22761       return SDValue();
22762     EVT BCVT = InVec.getOperand(0).getValueType();
22763     if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
22764       return SDValue();
22765     InVec = InVec.getOperand(0);
22766   }
22767
22768   EVT CurrentVT = InVec.getValueType();
22769
22770   if (!isTargetShuffle(InVec.getOpcode()))
22771     return SDValue();
22772
22773   // Don't duplicate a load with other uses.
22774   if (!InVec.hasOneUse())
22775     return SDValue();
22776
22777   SmallVector<int, 16> ShuffleMask;
22778   bool UnaryShuffle;
22779   if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(),
22780                             ShuffleMask, UnaryShuffle))
22781     return SDValue();
22782
22783   // Select the input vector, guarding against out of range extract vector.
22784   unsigned NumElems = CurrentVT.getVectorNumElements();
22785   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
22786   int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt];
22787   SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0)
22788                                          : InVec.getOperand(1);
22789
22790   // If inputs to shuffle are the same for both ops, then allow 2 uses
22791   unsigned AllowedUses = InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1;
22792
22793   if (LdNode.getOpcode() == ISD::BITCAST) {
22794     // Don't duplicate a load with other uses.
22795     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
22796       return SDValue();
22797
22798     AllowedUses = 1; // only allow 1 load use if we have a bitcast
22799     LdNode = LdNode.getOperand(0);
22800   }
22801
22802   if (!ISD::isNormalLoad(LdNode.getNode()))
22803     return SDValue();
22804
22805   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
22806
22807   if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
22808     return SDValue();
22809
22810   EVT EltVT = N->getValueType(0);
22811   // If there's a bitcast before the shuffle, check if the load type and
22812   // alignment is valid.
22813   unsigned Align = LN0->getAlignment();
22814   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22815   unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment(
22816       EltVT.getTypeForEVT(*DAG.getContext()));
22817
22818   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
22819     return SDValue();
22820
22821   // All checks match so transform back to vector_shuffle so that DAG combiner
22822   // can finish the job
22823   SDLoc dl(N);
22824
22825   // Create shuffle node taking into account the case that its a unary shuffle
22826   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT)
22827                                    : InVec.getOperand(1);
22828   Shuffle = DAG.getVectorShuffle(CurrentVT, dl,
22829                                  InVec.getOperand(0), Shuffle,
22830                                  &ShuffleMask[0]);
22831   Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle);
22832   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
22833                      EltNo);
22834 }
22835
22836 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
22837 /// generation and convert it from being a bunch of shuffles and extracts
22838 /// into a somewhat faster sequence. For i686, the best sequence is apparently
22839 /// storing the value and loading scalars back, while for x64 we should
22840 /// use 64-bit extracts and shifts.
22841 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
22842                                          TargetLowering::DAGCombinerInfo &DCI) {
22843   SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI);
22844   if (NewOp.getNode())
22845     return NewOp;
22846
22847   SDValue InputVector = N->getOperand(0);
22848
22849   // Detect whether we are trying to convert from mmx to i32 and the bitcast
22850   // from mmx to v2i32 has a single usage.
22851   if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
22852       InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
22853       InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
22854     return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
22855                        N->getValueType(0),
22856                        InputVector.getNode()->getOperand(0));
22857
22858   // Only operate on vectors of 4 elements, where the alternative shuffling
22859   // gets to be more expensive.
22860   if (InputVector.getValueType() != MVT::v4i32)
22861     return SDValue();
22862
22863   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
22864   // single use which is a sign-extend or zero-extend, and all elements are
22865   // used.
22866   SmallVector<SDNode *, 4> Uses;
22867   unsigned ExtractedElements = 0;
22868   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
22869        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
22870     if (UI.getUse().getResNo() != InputVector.getResNo())
22871       return SDValue();
22872
22873     SDNode *Extract = *UI;
22874     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22875       return SDValue();
22876
22877     if (Extract->getValueType(0) != MVT::i32)
22878       return SDValue();
22879     if (!Extract->hasOneUse())
22880       return SDValue();
22881     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
22882         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
22883       return SDValue();
22884     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
22885       return SDValue();
22886
22887     // Record which element was extracted.
22888     ExtractedElements |=
22889       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
22890
22891     Uses.push_back(Extract);
22892   }
22893
22894   // If not all the elements were used, this may not be worthwhile.
22895   if (ExtractedElements != 15)
22896     return SDValue();
22897
22898   // Ok, we've now decided to do the transformation.
22899   // If 64-bit shifts are legal, use the extract-shift sequence,
22900   // otherwise bounce the vector off the cache.
22901   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22902   SDValue Vals[4];
22903   SDLoc dl(InputVector);
22904
22905   if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
22906     SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector);
22907     EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
22908     SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
22909       DAG.getConstant(0, VecIdxTy));
22910     SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
22911       DAG.getConstant(1, VecIdxTy));
22912
22913     SDValue ShAmt = DAG.getConstant(32,
22914       DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64));
22915     Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
22916     Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
22917       DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
22918     Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
22919     Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
22920       DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
22921   } else {
22922     // Store the value to a temporary stack slot.
22923     SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
22924     SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
22925       MachinePointerInfo(), false, false, 0);
22926
22927     EVT ElementType = InputVector.getValueType().getVectorElementType();
22928     unsigned EltSize = ElementType.getSizeInBits() / 8;
22929
22930     // Replace each use (extract) with a load of the appropriate element.
22931     for (unsigned i = 0; i < 4; ++i) {
22932       uint64_t Offset = EltSize * i;
22933       SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
22934
22935       SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
22936                                        StackPtr, OffsetVal);
22937
22938       // Load the scalar.
22939       Vals[i] = DAG.getLoad(ElementType, dl, Ch,
22940                             ScalarAddr, MachinePointerInfo(),
22941                             false, false, false, 0);
22942
22943     }
22944   }
22945
22946   // Replace the extracts
22947   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
22948     UE = Uses.end(); UI != UE; ++UI) {
22949     SDNode *Extract = *UI;
22950
22951     SDValue Idx = Extract->getOperand(1);
22952     uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
22953     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
22954   }
22955
22956   // The replacement was made in place; don't return anything.
22957   return SDValue();
22958 }
22959
22960 /// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
22961 static std::pair<unsigned, bool>
22962 matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
22963                    SelectionDAG &DAG, const X86Subtarget *Subtarget) {
22964   if (!VT.isVector())
22965     return std::make_pair(0, false);
22966
22967   bool NeedSplit = false;
22968   switch (VT.getSimpleVT().SimpleTy) {
22969   default: return std::make_pair(0, false);
22970   case MVT::v4i64:
22971   case MVT::v2i64:
22972     if (!Subtarget->hasVLX())
22973       return std::make_pair(0, false);
22974     break;
22975   case MVT::v64i8:
22976   case MVT::v32i16:
22977     if (!Subtarget->hasBWI())
22978       return std::make_pair(0, false);
22979     break;
22980   case MVT::v16i32:
22981   case MVT::v8i64:
22982     if (!Subtarget->hasAVX512())
22983       return std::make_pair(0, false);
22984     break;
22985   case MVT::v32i8:
22986   case MVT::v16i16:
22987   case MVT::v8i32:
22988     if (!Subtarget->hasAVX2())
22989       NeedSplit = true;
22990     if (!Subtarget->hasAVX())
22991       return std::make_pair(0, false);
22992     break;
22993   case MVT::v16i8:
22994   case MVT::v8i16:
22995   case MVT::v4i32:
22996     if (!Subtarget->hasSSE2())
22997       return std::make_pair(0, false);
22998   }
22999
23000   // SSE2 has only a small subset of the operations.
23001   bool hasUnsigned = Subtarget->hasSSE41() ||
23002                      (Subtarget->hasSSE2() && VT == MVT::v16i8);
23003   bool hasSigned = Subtarget->hasSSE41() ||
23004                    (Subtarget->hasSSE2() && VT == MVT::v8i16);
23005
23006   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23007
23008   unsigned Opc = 0;
23009   // Check for x CC y ? x : y.
23010   if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23011       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23012     switch (CC) {
23013     default: break;
23014     case ISD::SETULT:
23015     case ISD::SETULE:
23016       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23017     case ISD::SETUGT:
23018     case ISD::SETUGE:
23019       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23020     case ISD::SETLT:
23021     case ISD::SETLE:
23022       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23023     case ISD::SETGT:
23024     case ISD::SETGE:
23025       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23026     }
23027   // Check for x CC y ? y : x -- a min/max with reversed arms.
23028   } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23029              DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23030     switch (CC) {
23031     default: break;
23032     case ISD::SETULT:
23033     case ISD::SETULE:
23034       Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
23035     case ISD::SETUGT:
23036     case ISD::SETUGE:
23037       Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
23038     case ISD::SETLT:
23039     case ISD::SETLE:
23040       Opc = hasSigned ? X86ISD::SMAX : 0; break;
23041     case ISD::SETGT:
23042     case ISD::SETGE:
23043       Opc = hasSigned ? X86ISD::SMIN : 0; break;
23044     }
23045   }
23046
23047   return std::make_pair(Opc, NeedSplit);
23048 }
23049
      /// Try to rewrite the select-like node \p N (operand 0 = condition,
      /// operand 1 = LHS, operand 2 = RHS) as a vector shuffle of LHS and RHS when
      /// its condition is a BUILD_VECTOR of constants.
      ///
      /// A condition of the form (sign_extend (sign_extend_inreg X)) is looked
      /// through to X before the check. An empty SDValue is returned when:
      ///  - the (possibly looked-through) condition is not a constant build vector,
      ///  - both LHS and RHS are constant build vectors,
      ///  - BUILD_VECTORtoBlendMask rejects the condition, or
      ///  - the target reports the resulting shuffle mask as not legal for the
      ///    result type.
      ///
      /// \p Subtarget is not referenced in the body of this function.
23050 static SDValue
23051 transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
23052                                       const X86Subtarget *Subtarget) {
23053   SDLoc dl(N);
23054   SDValue Cond = N->getOperand(0);
23055   SDValue LHS = N->getOperand(1);
23056   SDValue RHS = N->getOperand(2);
23057
        // Look through (sign_extend (sign_extend_inreg X)) and use X as the
        // condition.
23058   if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
23059     SDValue CondSrc = Cond->getOperand(0);
23060     if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
23061       Cond = CondSrc->getOperand(0);
23062   }
23063
23064   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
23065     return SDValue();
23066
23067   // A vselect where all conditions and data are constants can be optimized into
23068   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
23069   if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
23070       ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
23071     return SDValue();
23072
        // MaskValue receives one bit per lane from BUILD_VECTORtoBlendMask.
23073   unsigned MaskValue = 0;
23074   if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
23075     return SDValue();
23076
23077   MVT VT = N->getSimpleValueType(0);
23078   unsigned NumElems = VT.getVectorNumElements();
23079   SmallVector<int, 8> ShuffleMask(NumElems, -1);
23080   for (unsigned i = 0; i < NumElems; ++i) {
23081     // Be sure we emit undef where we can.
23082     if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
23083       ShuffleMask[i] = -1;
23084     else
            // Bit i of MaskValue clear -> index i (lane i of the first shuffle
            // operand, LHS); set -> index i + NumElems (lane i of the second
            // shuffle operand, RHS).
23085       ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
23086   }
23087
23088   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23089   if (!TLI.isShuffleMaskLegal(ShuffleMask, VT))
23090     return SDValue();
23091   return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
23092 }
23093
23094 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
23095 /// nodes.
      ///
      /// Operand 0 of \p N is the condition, operand 1 the true value (LHS) and
      /// operand 2 the false value (RHS). The combines are tried in the order they
      /// appear below and the first one that applies wins:
      ///  - floating-point select on a SETCC of the same two operands
      ///    -> X86ISD::FMIN / X86ISD::FMAX;
      ///  - AVX-512 without both BWI and VLX: sign-extend a vXi1 condition of a
      ///    128/256-bit i8/i16 vector select to the operand type;
      ///  - select between two integer constants -> zext/shl/add/mul arithmetic
      ///    on the condition;
      ///  - scalar SELECT (x < y ? x : y) / (x > y ? x : y) -> use <= / >=;
      ///  - (legal result type only from here on) VSELECT with a zero arm
      ///    -> X86ISD::SUBUS;
      ///  - VSELECT integer min/max via matchIntegerMINMAX;
      ///  - VSELECT with an all-ones / all-zeros arm -> OR / AND with the condition;
      ///  - VSELECT -> X86ISD::SHRUNKBLEND after simplifying the condition down to
      ///    its per-element high bit;
      ///  - VSELECT / SHRUNKBLEND with a constant condition -> vector shuffle.
      ///
      /// Returns the replacement value, or an empty SDValue when no replacement is
      /// returned. Note that the SHRUNKBLEND path replaces uses directly in the DAG
      /// and still returns an empty SDValue.
23096 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
23097                                     TargetLowering::DAGCombinerInfo &DCI,
23098                                     const X86Subtarget *Subtarget) {
23099   SDLoc DL(N);
23100   SDValue Cond = N->getOperand(0);
23101   // Get the LHS/RHS of the select.
23102   SDValue LHS = N->getOperand(1);
23103   SDValue RHS = N->getOperand(2);
23104   EVT VT = LHS.getValueType();
23105   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23106
23107   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
23108   // instructions match the semantics of the common C idiom x<y?x:y but not
23109   // x<=y?x:y, because of how they handle negative zero (which can be
23110   // ignored in unsafe-math mode).
23111   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
23112   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
23113       VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
23114       (Subtarget->hasSSE2() ||
23115        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
23116     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23117
          // Opcode stays 0 unless one of the cases below decides the select can be
          // expressed as FMIN/FMAX; LHS/RHS may be swapped along the way.
23118     unsigned Opcode = 0;
23119     // Check for x CC y ? x : y.
23120     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23121         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23122       switch (CC) {
23123       default: break;
23124       case ISD::SETULT:
23125         // Converting this to a min would handle NaNs incorrectly, and swapping
23126         // the operands would cause it to handle comparisons between positive
23127         // and negative zero incorrectly.
23128         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23129           if (!DAG.getTarget().Options.UnsafeFPMath &&
23130               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23131             break;
23132           std::swap(LHS, RHS);
23133         }
23134         Opcode = X86ISD::FMIN;
23135         break;
23136       case ISD::SETOLE:
23137         // Converting this to a min would handle comparisons between positive
23138         // and negative zero incorrectly.
23139         if (!DAG.getTarget().Options.UnsafeFPMath &&
23140             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23141           break;
23142         Opcode = X86ISD::FMIN;
23143         break;
23144       case ISD::SETULE:
23145         // Converting this to a min would handle both negative zeros and NaNs
23146         // incorrectly, but we can swap the operands to fix both.
23147         std::swap(LHS, RHS);
              // FALL THROUGH
23148       case ISD::SETOLT:
23149       case ISD::SETLT:
23150       case ISD::SETLE:
23151         Opcode = X86ISD::FMIN;
23152         break;
23153
23154       case ISD::SETOGE:
23155         // Converting this to a max would handle comparisons between positive
23156         // and negative zero incorrectly.
23157         if (!DAG.getTarget().Options.UnsafeFPMath &&
23158             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
23159           break;
23160         Opcode = X86ISD::FMAX;
23161         break;
23162       case ISD::SETUGT:
23163         // Converting this to a max would handle NaNs incorrectly, and swapping
23164         // the operands would cause it to handle comparisons between positive
23165         // and negative zero incorrectly.
23166         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
23167           if (!DAG.getTarget().Options.UnsafeFPMath &&
23168               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
23169             break;
23170           std::swap(LHS, RHS);
23171         }
23172         Opcode = X86ISD::FMAX;
23173         break;
23174       case ISD::SETUGE:
23175         // Converting this to a max would handle both negative zeros and NaNs
23176         // incorrectly, but we can swap the operands to fix both.
23177         std::swap(LHS, RHS);
              // FALL THROUGH
23178       case ISD::SETOGT:
23179       case ISD::SETGT:
23180       case ISD::SETGE:
23181         Opcode = X86ISD::FMAX;
23182         break;
23183       }
23184     // Check for x CC y ? y : x -- a min/max with reversed arms.
23185     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
23186                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
23187       switch (CC) {
23188       default: break;
23189       case ISD::SETOGE:
23190         // Converting this to a min would handle comparisons between positive
23191         // and negative zero incorrectly, and swapping the operands would
23192         // cause it to handle NaNs incorrectly.
23193         if (!DAG.getTarget().Options.UnsafeFPMath &&
23194             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
23195           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23196             break;
23197           std::swap(LHS, RHS);
23198         }
23199         Opcode = X86ISD::FMIN;
23200         break;
23201       case ISD::SETUGT:
23202         // Converting this to a min would handle NaNs incorrectly.
23203         if (!DAG.getTarget().Options.UnsafeFPMath &&
23204             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
23205           break;
23206         Opcode = X86ISD::FMIN;
23207         break;
23208       case ISD::SETUGE:
23209         // Converting this to a min would handle both negative zeros and NaNs
23210         // incorrectly, but we can swap the operands to fix both.
23211         std::swap(LHS, RHS);
              // FALL THROUGH
23212       case ISD::SETOGT:
23213       case ISD::SETGT:
23214       case ISD::SETGE:
23215         Opcode = X86ISD::FMIN;
23216         break;
23217
23218       case ISD::SETULT:
23219         // Converting this to a max would handle NaNs incorrectly.
23220         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23221           break;
23222         Opcode = X86ISD::FMAX;
23223         break;
23224       case ISD::SETOLE:
23225         // Converting this to a max would handle comparisons between positive
23226         // and negative zero incorrectly, and swapping the operands would
23227         // cause it to handle NaNs incorrectly.
23228         if (!DAG.getTarget().Options.UnsafeFPMath &&
23229             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
23230           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
23231             break;
23232           std::swap(LHS, RHS);
23233         }
23234         Opcode = X86ISD::FMAX;
23235         break;
23236       case ISD::SETULE:
23237         // Converting this to a max would handle both negative zeros and NaNs
23238         // incorrectly, but we can swap the operands to fix both.
23239         std::swap(LHS, RHS);
              // FALL THROUGH
23240       case ISD::SETOLT:
23241       case ISD::SETLT:
23242       case ISD::SETLE:
23243         Opcode = X86ISD::FMAX;
23244         break;
23245       }
23246     }
23247
23248     if (Opcode)
23249       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
23250   }
23251
23252   EVT CondVT = Cond.getValueType();
23253   if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
23254       CondVT.getVectorElementType() == MVT::i1) {
23255     // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
23256     // lowering on KNL. In this case we convert it to
23257     // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
23258     // The same situation for all 128 and 256-bit vectors of i8 and i16.
23259     // Since SKX these selects have a proper lowering.
23260     EVT OpVT = LHS.getValueType();
23261     if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
23262         (OpVT.getVectorElementType() == MVT::i8 ||
23263          OpVT.getVectorElementType() == MVT::i16) &&
23264         !(Subtarget->hasBWI() && Subtarget->hasVLX())) {
23265       Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
23266       DCI.AddToWorklist(Cond.getNode());
23267       return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
23268     }
23269   }
23270   // If this is a select between two integer constants, try to do some
23271   // optimizations.
23272   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
23273     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
23274       // Don't do this for crazy integer types.
23275       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
23276         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
23277         // so that TrueC (the true value) is larger than FalseC.
23278         bool NeedsCondInvert = false;
23279
23280         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
23281             // Efficiently invertible.
23282             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
23283              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
23284               isa<ConstantSDNode>(Cond.getOperand(1))))) {
                // Only the local TrueC/FalseC pointers are swapped here; the
                // condition itself is inverted (xor with 1) later, inside
                // whichever transform below actually fires.
23285           NeedsCondInvert = true;
23286           std::swap(TrueC, FalseC);
23287         }
23288
23289         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
23290         if (FalseC->getAPIntValue() == 0 &&
23291             TrueC->getAPIntValue().isPowerOf2()) {
23292           if (NeedsCondInvert) // Invert the condition if needed.
23293             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23294                                DAG.getConstant(1, Cond.getValueType()));
23295
23296           // Zero extend the condition if needed.
23297           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
23298
23299           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23300           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
23301                              DAG.getConstant(ShAmt, MVT::i8));
23302         }
23303
23304         // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
23305         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23306           if (NeedsCondInvert) // Invert the condition if needed.
23307             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23308                                DAG.getConstant(1, Cond.getValueType()));
23309
23310           // Zero extend the condition if needed.
23311           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23312                              FalseC->getValueType(0), Cond);
23313           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23314                              SDValue(FalseC, 0));
23315         }
23316
23317         // Optimize cases that will turn into an LEA instruction.  This requires
23318         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
23319         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23320           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
                // For i32 keep only the low 32 bits of the 64-bit difference.
23321           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23322
23323           bool isFastMultiplier = false;
23324           if (Diff < 10) {
23325             switch ((unsigned char)Diff) {
23326               default: break;
23327               case 1:  // result = add base, cond
23328               case 2:  // result = lea base(    , cond*2)
23329               case 3:  // result = lea base(cond, cond*2)
23330               case 4:  // result = lea base(    , cond*4)
23331               case 5:  // result = lea base(cond, cond*4)
23332               case 8:  // result = lea base(    , cond*8)
23333               case 9:  // result = lea base(cond, cond*8)
23334                 isFastMultiplier = true;
23335                 break;
23336             }
23337           }
23338
23339           if (isFastMultiplier) {
                  // Note: this APInt deliberately shadows the uint64_t Diff
                  // above; it is recomputed from the APInt values and used as
                  // the multiply constant below.
23340             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23341             if (NeedsCondInvert) // Invert the condition if needed.
23342               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
23343                                  DAG.getConstant(1, Cond.getValueType()));
23344
23345             // Zero extend the condition if needed.
23346             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23347                                Cond);
23348             // Scale the condition by the difference.
23349             if (Diff != 1)
23350               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23351                                  DAG.getConstant(Diff, Cond.getValueType()));
23352
23353             // Add the base if non-zero.
23354             if (FalseC->getAPIntValue() != 0)
23355               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23356                                  SDValue(FalseC, 0));
23357             return Cond;
23358           }
23359         }
23360       }
23361   }
23362
23363   // Canonicalize max and min:
23364   // (x > y) ? x : y -> (x >= y) ? x : y
23365   // (x < y) ? x : y -> (x <= y) ? x : y
23366   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
23367   // the need for an extra compare
23368   // against zero. e.g.
23369   // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
23370   // subl   %esi, %edi
23371   // testl  %edi, %edi
23372   // movl   $0, %eax
23373   // cmovgl %edi, %eax
23374   // =>
23375   // xorl   %eax, %eax
23376   // subl   %esi, %edi
23377   // cmovsl %eax, %edi
23378   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
23379       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
23380       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
23381     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23382     switch (CC) {
23383     default: break;
23384     case ISD::SETLT:
23385     case ISD::SETGT: {
23386       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
23387       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
23388                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
23389       return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
23390     }
23391     }
23392   }
23393
23394   // Early exit check: every combine below this point requires a legal VT.
23395   if (!TLI.isTypeLegal(VT))
23396     return SDValue();
23397
23398   // Match VSELECTs into subs with unsigned saturation.
23399   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
23400       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
23401       ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
23402        (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
23403     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
23404
23405     // Check if one of the arms of the VSELECT is a zero vector. If it's on the
23406     // left side invert the predicate to simplify logic below.
23407     SDValue Other;
23408     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
23409       Other = RHS;
23410       CC = ISD::getSetCCInverse(CC, true);
23411     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
23412       Other = LHS;
23413     }
23414
          // Other is the non-zero arm (left empty if neither arm is all zeros).
          // It must be a two-operand node whose first operand is the value being
          // compared in the condition.
23415     if (Other.getNode() && Other->getNumOperands() == 2 &&
23416         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
23417       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
23418       SDValue CondRHS = Cond->getOperand(1);
23419
23420       // Look for a general sub with unsigned saturation first.
23421       // x >= y ? x-y : 0 --> subus x, y
23422       // x >  y ? x-y : 0 --> subus x, y
23423       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
23424           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
23425         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
23426
23427       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
23428         if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
23429           if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
23430             if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
23431               // If the RHS is a constant we have to reverse the const
23432               // canonicalization.
23433               // x > C-1 ? x+-C : 0 --> subus x, C
23434               if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
23435                   CondRHSConst->getAPIntValue() ==
23436                       (-OpRHSConst->getAPIntValue() - 1))
23437                 return DAG.getNode(
23438                     X86ISD::SUBUS, DL, VT, OpLHS,
23439                     DAG.getConstant(-OpRHSConst->getAPIntValue(), VT));
23440
23441           // Another special case: If C was a sign bit, the sub has been
23442           // canonicalized into a xor.
23443           // FIXME: Would it be better to use computeKnownBits to determine
23444           //        whether it's safe to decanonicalize the xor?
23445           // x s< 0 ? x^C : 0 --> subus x, C
23446           if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
23447               ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
23448               OpRHSConst->getAPIntValue().isSignBit())
23449             // Note that we have to rebuild the RHS constant here to ensure we
23450             // don't rely on particular values of undef lanes.
23451             return DAG.getNode(
23452                 X86ISD::SUBUS, DL, VT, OpLHS,
23453                 DAG.getConstant(OpRHSConst->getAPIntValue(), VT));
23454         }
23455     }
23456   }
23457
23458   // Try to match a min/max vector operation.
23459   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
23460     std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
23461     unsigned Opc = ret.first;
23462     bool NeedSplit = ret.second;
23463
23464     if (Opc && NeedSplit) {
23465       unsigned NumElems = VT.getVectorNumElements();
23466       // Extract the LHS vectors
23467       SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
23468       SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
23469
23470       // Extract the RHS vectors
23471       SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
23472       SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
23473
23474       // Create min/max for each subvector
            // (LHS and RHS are reused to hold the first-half and second-half
            // results respectively.)
23475       LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
23476       RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
23477
23478       // Merge the result
23479       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
23480     } else if (Opc)
23481       return DAG.getNode(Opc, DL, VT, LHS, RHS);
23482   }
23483
23484   // Simplify vector selection if condition value type matches vselect
23485   // operand type
23486   if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
23487     assert(Cond.getValueType().isVector() &&
23488            "vector select expects a vector selector!");
23489
23490     bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
23491     bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
23492
23493     // Try invert the condition if true value is not all 1s and false value
23494     // is not all 0s.
23495     if (!TValIsAllOnes && !FValIsAllZeros &&
23496         // Check if the selector will be produced by CMPP*/PCMP*
23497         Cond.getOpcode() == ISD::SETCC &&
23498         // Check if SETCC has already been promoted
23499         TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) {
23500       bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
23501       bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
23502
23503       if (TValIsAllZeros || FValIsAllOnes) {
23504         SDValue CC = Cond.getOperand(2);
23505         ISD::CondCode NewCC =
23506           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
23507                                Cond.getOperand(0).getValueType().isInteger());
23508         Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
23509         std::swap(LHS, RHS);
              // The arms were swapped, so the old false-value/true-value
              // properties now describe the new true/false values.
23510         TValIsAllOnes = FValIsAllOnes;
23511         FValIsAllZeros = TValIsAllZeros;
23512       }
23513     }
23514
23515     if (TValIsAllOnes || FValIsAllZeros) {
23516       SDValue Ret;
23517
            // (Cond ? -1 : 0) -> Cond
            // (Cond ? -1 : F) -> Cond | F
            // (Cond ?  T : 0) -> Cond & T
23518       if (TValIsAllOnes && FValIsAllZeros)
23519         Ret = Cond;
23520       else if (TValIsAllOnes)
23521         Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond,
23522                           DAG.getNode(ISD::BITCAST, DL, CondVT, RHS));
23523       else if (FValIsAllZeros)
23524         Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
23525                           DAG.getNode(ISD::BITCAST, DL, CondVT, LHS));
23526
23527       return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
23528     }
23529   }
23530
23531   // If we know that this node is legal then we know that it is going to be
23532   // matched by one of the SSE/AVX BLEND instructions. These instructions only
23533   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
23534   // to simplify previous instructions.
23535   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
23536       !DCI.isBeforeLegalize() &&
23537       // We explicitly check against v8i16 and v16i16 because, although
23538       // they're marked as Custom, they might only be legal when Cond is a
23539       // build_vector of constants. This will be taken care in a later
23540       // condition.
23541       (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
23542        VT != MVT::v8i16) &&
23543       // Don't optimize vector of constants. Those are handled by
23544       // the generic code and all the bits must be properly set for
23545       // the generic optimizer.
23546       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
23547     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
23548
23549     // Don't optimize vector selects that map to mask-registers.
23550     if (BitWidth == 1)
23551       return SDValue();
23552
23553     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
          // Only the top bit of each condition element is demanded.
23554     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
23555
23556     APInt KnownZero, KnownOne;
23557     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
23558                                           DCI.isBeforeLegalizeOps());
23559     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
23560         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
23561                                  TLO)) {
23562       // If we changed the computation somewhere in the DAG, this change
23563       // will affect all users of Cond.
23564       // Make sure it is fine and update all the nodes so that we do not
23565       // use the generic VSELECT anymore. Otherwise, we may perform
23566       // wrong optimizations as we messed up with the actual expectation
23567       // for the vector boolean values.
23568       if (Cond != TLO.Old) {
23569         // Check all uses of that condition operand to check whether it will be
23570         // consumed by non-BLEND instructions, which may depend on all bits are
23571         // set properly.
23572         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23573              I != E; ++I)
23574           if (I->getOpcode() != ISD::VSELECT)
23575             // TODO: Add other opcodes eventually lowered into BLEND.
23576             return SDValue();
23577
23578         // Update all the users of the condition, before committing the change,
23579         // so that the VSELECT optimizations that expect the correct vector
23580         // boolean value will not be triggered.
23581         for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
23582              I != E; ++I)
23583           DAG.ReplaceAllUsesOfValueWith(
23584               SDValue(*I, 0),
23585               DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
23586                           Cond, I->getOperand(1), I->getOperand(2)));
23587         DCI.CommitTargetLoweringOpt(TLO);
23588         return SDValue();
23589       }
23590       // At this point, only Cond is changed. Change the condition
23591       // just for N to keep the opportunity to optimize all other
23592       // users their own way.
23593       DAG.ReplaceAllUsesOfValueWith(
23594           SDValue(N, 0),
23595           DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
23596                       TLO.New, N->getOperand(1), N->getOperand(2)));
23597       return SDValue();
23598     }
23599   }
23600
23601   // We should generate an X86ISD::BLENDI from a vselect if its argument
23602   // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
23603   // constants. This specific pattern gets generated when we split a
23604   // selector for a 512 bit vector in a machine without AVX512 (but with
23605   // 256-bit vectors), during legalization:
23606   //
23607   // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
23608   //
23609   // Iff we find this pattern and the build_vectors are built from
23610   // constants, we translate the vselect into a shuffle_vector that we
23611   // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
23612   if ((N->getOpcode() == ISD::VSELECT ||
23613        N->getOpcode() == X86ISD::SHRUNKBLEND) &&
23614       !DCI.isBeforeLegalize()) {
23615     SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
23616     if (Shuffle.getNode())
23617       return Shuffle;
23618   }
23619
23620   return SDValue();
23621 }
23622
23623 // Check whether a boolean test is testing a boolean value generated by
23624 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
23625 // code.
23626 //
23627 // Simplify the following patterns:
23628 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
23629 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
23630 // to (Op EFLAGS Cond)
23631 //
23632 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
23633 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
23634 // to (Op EFLAGS !Cond)
23635 //
23636 // where Op could be BRCOND or CMOV.
23637 //
      // Cmp is the flag-producing node being tested. CC is an in/out parameter:
      // on entry it is the condition under which Cmp is tested (only COND_E and
      // COND_NE are handled); on success it is overwritten with the condition
      // code to use against the returned flags value. CC is only written on the
      // paths that return a flags operand; every path that returns an empty
      // SDValue leaves it untouched.
      //
      // Besides X86ISD::SETCC, the tested value may also come from
      // X86ISD::SETCC_CARRY or from an X86ISD::CMOV selecting between the
      // constants 0 and 1, optionally wrapped in zext / trunc / (and x, 1).
      //
23638 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
23639   // Quit unless this is a CMP, or a SUB whose value result (result 0) is
        // unused.
23640   if (Cmp.getOpcode() != X86ISD::CMP &&
23641       (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
23642       return SDValue();
23643
23644   // Quit if not used as a boolean value.
23645   if (CC != X86::COND_E && CC != X86::COND_NE)
23646     return SDValue();
23647
23648   // Check CMP operands. One of them should be 0 or 1 and the other should be
23649   // an SetCC or extended from it.
23650   SDValue Op1 = Cmp.getOperand(0);
23651   SDValue Op2 = Cmp.getOperand(1);
23652
23653   SDValue SetCC;
23654   const ConstantSDNode* C = nullptr;
        // Testing "== 0" means the original condition must be inverted; this is
        // flipped again below when the comparison is against 1.
23655   bool needOppositeCond = (CC == X86::COND_E);
23656   bool checkAgainstTrue = false; // Is it a comparison against 1?
23657
23658   if ((C = dyn_cast<ConstantSDNode>(Op1)))
23659     SetCC = Op2;
23660   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
23661     SetCC = Op1;
23662   else // Quit if neither operand is a constant.
23663     return SDValue();
23664
23665   if (C->getZExtValue() == 1) {
23666     needOppositeCond = !needOppositeCond;
23667     checkAgainstTrue = true;
23668   } else if (C->getZExtValue() != 0)
23669     // Quit if the constant is neither 0 or 1.
23670     return SDValue();
23671
23672   bool truncatedToBoolWithAnd = false;
23673   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
        // An AND is only looked through when one of its operands is the constant
        // 1; any other AND stops the walk and is left for the switch below.
23674   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
23675          SetCC.getOpcode() == ISD::TRUNCATE ||
23676          SetCC.getOpcode() == ISD::AND) {
23677     if (SetCC.getOpcode() == ISD::AND) {
            // OpIdx is the index of the non-constant AND operand, or -1 if
            // neither operand is the constant 1.
23678       int OpIdx = -1;
23679       ConstantSDNode *CS;
23680       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) &&
23681           CS->getZExtValue() == 1)
23682         OpIdx = 1;
23683       if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) &&
23684           CS->getZExtValue() == 1)
23685         OpIdx = 0;
23686       if (OpIdx == -1)
23687         break;
23688       SetCC = SetCC.getOperand(OpIdx);
23689       truncatedToBoolWithAnd = true;
23690     } else
23691       SetCC = SetCC.getOperand(0);
23692   }
23693
23694   switch (SetCC.getOpcode()) {
23695   case X86ISD::SETCC_CARRY:
23696     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
23697     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
23698     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
23699     // truncated to i1 using 'and'.
23700     if (checkAgainstTrue && !truncatedToBoolWithAnd)
23701       break;
23702     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
23703            "Invalid use of SETCC_CARRY!");
23704     // FALL THROUGH
23705   case X86ISD::SETCC:
23706     // Set the condition code or opposite one if necessary.
          // Operand 0 is read as the condition code; operand 1 is returned as
          // the flags value.
23707     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
23708     if (needOppositeCond)
23709       CC = X86::GetOppositeBranchCondition(CC);
23710     return SetCC.getOperand(1);
23711   case X86ISD::CMOV: {
23712     // Check whether false/true value has canonical one, i.e. 0 or 1.
          // CMOV operands as used here: 0 = false value, 1 = true value,
          // 2 = condition code, 3 = flags.
23713     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
23714     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
23715     // Quit if true value is not a constant.
23716     if (!TVal)
23717       return SDValue();
23718     // Quit if false value is not a constant.
          // (The only non-constant false value accepted is result 0 of
          // RDRAND/RDSEED, optionally behind a single zext/trunc.)
23719     if (!FVal) {
23720       SDValue Op = SetCC.getOperand(0);
23721       // Skip 'zext' or 'trunc' node.
23722       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
23723           Op.getOpcode() == ISD::TRUNCATE)
23724         Op = Op.getOperand(0);
23725       // A special case for rdrand/rdseed, where 0 is set if false cond is
23726       // found.
23727       if ((Op.getOpcode() != X86ISD::RDRAND &&
23728            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
23729         return SDValue();
23730     }
23731     // Quit if false value is not the constant 0 or 1.
23732     bool FValIsFalse = true;
23733     if (FVal && FVal->getZExtValue() != 0) {
23734       if (FVal->getZExtValue() != 1)
23735         return SDValue();
23736       // If FVal is 1, opposite cond is needed.
23737       needOppositeCond = !needOppositeCond;
23738       FValIsFalse = false;
23739     }
23740     // Quit if TVal is not the constant opposite of FVal.
23741     if (FValIsFalse && TVal->getZExtValue() != 1)
23742       return SDValue();
23743     if (!FValIsFalse && TVal->getZExtValue() != 0)
23744       return SDValue();
23745     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
23746     if (needOppositeCond)
23747       CC = X86::GetOppositeBranchCondition(CC);
23748     return SetCC.getOperand(3);
23749   }
23750   }
23751
        // No recognizable SETCC / SETCC_CARRY / CMOV source was found.
23752   return SDValue();
23753 }
23754
23755 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
23756 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
23757                                   TargetLowering::DAGCombinerInfo &DCI,
23758                                   const X86Subtarget *Subtarget) {
23759   SDLoc DL(N);
23760
23761   // If the flag operand isn't dead, don't touch this CMOV.
23762   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
23763     return SDValue();
23764
23765   SDValue FalseOp = N->getOperand(0);
23766   SDValue TrueOp = N->getOperand(1);
23767   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
23768   SDValue Cond = N->getOperand(3);
23769
23770   if (CC == X86::COND_E || CC == X86::COND_NE) {
23771     switch (Cond.getOpcode()) {
23772     default: break;
23773     case X86ISD::BSR:
23774     case X86ISD::BSF:
23775       // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
23776       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
23777         return (CC == X86::COND_E) ? FalseOp : TrueOp;
23778     }
23779   }
23780
23781   SDValue Flags;
23782
23783   Flags = checkBoolTestSetCCCombine(Cond, CC);
23784   if (Flags.getNode() &&
23785       // Extra check as FCMOV only supports a subset of X86 cond.
23786       (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
23787     SDValue Ops[] = { FalseOp, TrueOp,
23788                       DAG.getConstant(CC, MVT::i8), Flags };
23789     return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
23790   }
23791
23792   // If this is a select between two integer constants, try to do some
23793   // optimizations.  Note that the operands are ordered the opposite of SELECT
23794   // operands.
23795   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
23796     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
23797       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
23798       // larger than FalseC (the false value).
23799       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
23800         CC = X86::GetOppositeBranchCondition(CC);
23801         std::swap(TrueC, FalseC);
23802         std::swap(TrueOp, FalseOp);
23803       }
23804
23805       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
23806       // This is efficient for any integer data type (including i8/i16) and
23807       // shift amount.
23808       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
23809         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23810                            DAG.getConstant(CC, MVT::i8), Cond);
23811
23812         // Zero extend the condition if needed.
23813         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
23814
23815         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
23816         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
23817                            DAG.getConstant(ShAmt, MVT::i8));
23818         if (N->getNumValues() == 2)  // Dead flag value?
23819           return DCI.CombineTo(N, Cond, SDValue());
23820         return Cond;
23821       }
23822
23823       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
23824       // for any integer data type, including i8/i16.
23825       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
23826         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23827                            DAG.getConstant(CC, MVT::i8), Cond);
23828
23829         // Zero extend the condition if needed.
23830         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
23831                            FalseC->getValueType(0), Cond);
23832         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23833                            SDValue(FalseC, 0));
23834
23835         if (N->getNumValues() == 2)  // Dead flag value?
23836           return DCI.CombineTo(N, Cond, SDValue());
23837         return Cond;
23838       }
23839
23840       // Optimize cases that will turn into an LEA instruction.  This requires
23841       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
23842       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
23843         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
23844         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
23845
23846         bool isFastMultiplier = false;
23847         if (Diff < 10) {
23848           switch ((unsigned char)Diff) {
23849           default: break;
23850           case 1:  // result = add base, cond
23851           case 2:  // result = lea base(    , cond*2)
23852           case 3:  // result = lea base(cond, cond*2)
23853           case 4:  // result = lea base(    , cond*4)
23854           case 5:  // result = lea base(cond, cond*4)
23855           case 8:  // result = lea base(    , cond*8)
23856           case 9:  // result = lea base(cond, cond*8)
23857             isFastMultiplier = true;
23858             break;
23859           }
23860         }
23861
23862         if (isFastMultiplier) {
23863           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
23864           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
23865                              DAG.getConstant(CC, MVT::i8), Cond);
23866           // Zero extend the condition if needed.
23867           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
23868                              Cond);
23869           // Scale the condition by the difference.
23870           if (Diff != 1)
23871             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
23872                                DAG.getConstant(Diff, Cond.getValueType()));
23873
23874           // Add the base if non-zero.
23875           if (FalseC->getAPIntValue() != 0)
23876             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
23877                                SDValue(FalseC, 0));
23878           if (N->getNumValues() == 2)  // Dead flag value?
23879             return DCI.CombineTo(N, Cond, SDValue());
23880           return Cond;
23881         }
23882       }
23883     }
23884   }
23885
23886   // Handle these cases:
23887   //   (select (x != c), e, c) -> select (x != c), e, x),
23888   //   (select (x == c), c, e) -> select (x == c), x, e)
23889   // where the c is an integer constant, and the "select" is the combination
23890   // of CMOV and CMP.
23891   //
23892   // The rationale for this change is that the conditional-move from a constant
23893   // needs two instructions, however, conditional-move from a register needs
23894   // only one instruction.
23895   //
23896   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
23897   //  some instruction-combining opportunities. This opt needs to be
23898   //  postponed as late as possible.
23899   //
23900   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
23901     // the DCI.xxxx conditions are provided to postpone the optimization as
23902     // late as possible.
23903
23904     ConstantSDNode *CmpAgainst = nullptr;
23905     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
23906         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
23907         !isa<ConstantSDNode>(Cond.getOperand(0))) {
23908
23909       if (CC == X86::COND_NE &&
23910           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
23911         CC = X86::GetOppositeBranchCondition(CC);
23912         std::swap(TrueOp, FalseOp);
23913       }
23914
23915       if (CC == X86::COND_E &&
23916           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
23917         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
23918                           DAG.getConstant(CC, MVT::i8), Cond };
23919         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
23920       }
23921     }
23922   }
23923
23924   return SDValue();
23925 }
23926
23927 static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
23928                                                 const X86Subtarget *Subtarget) {
23929   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
23930   switch (IntNo) {
23931   default: return SDValue();
23932   // SSE/AVX/AVX2 blend intrinsics.
23933   case Intrinsic::x86_avx2_pblendvb:
23934   case Intrinsic::x86_avx2_pblendw:
23935   case Intrinsic::x86_avx2_pblendd_128:
23936   case Intrinsic::x86_avx2_pblendd_256:
23937     // Don't try to simplify this intrinsic if we don't have AVX2.
23938     if (!Subtarget->hasAVX2())
23939       return SDValue();
23940     // FALL-THROUGH
23941   case Intrinsic::x86_avx_blend_pd_256:
23942   case Intrinsic::x86_avx_blend_ps_256:
23943   case Intrinsic::x86_avx_blendv_pd_256:
23944   case Intrinsic::x86_avx_blendv_ps_256:
23945     // Don't try to simplify this intrinsic if we don't have AVX.
23946     if (!Subtarget->hasAVX())
23947       return SDValue();
23948     // FALL-THROUGH
23949   case Intrinsic::x86_sse41_pblendw:
23950   case Intrinsic::x86_sse41_blendpd:
23951   case Intrinsic::x86_sse41_blendps:
23952   case Intrinsic::x86_sse41_blendvps:
23953   case Intrinsic::x86_sse41_blendvpd:
23954   case Intrinsic::x86_sse41_pblendvb: {
23955     SDValue Op0 = N->getOperand(1);
23956     SDValue Op1 = N->getOperand(2);
23957     SDValue Mask = N->getOperand(3);
23958
23959     // Don't try to simplify this intrinsic if we don't have SSE4.1.
23960     if (!Subtarget->hasSSE41())
23961       return SDValue();
23962
23963     // fold (blend A, A, Mask) -> A
23964     if (Op0 == Op1)
23965       return Op0;
23966     // fold (blend A, B, allZeros) -> A
23967     if (ISD::isBuildVectorAllZeros(Mask.getNode()))
23968       return Op0;
23969     // fold (blend A, B, allOnes) -> B
23970     if (ISD::isBuildVectorAllOnes(Mask.getNode()))
23971       return Op1;
23972
23973     // Simplify the case where the mask is a constant i32 value.
23974     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
23975       if (C->isNullValue())
23976         return Op0;
23977       if (C->isAllOnesValue())
23978         return Op1;
23979     }
23980
23981     return SDValue();
23982   }
23983
23984   // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
23985   case Intrinsic::x86_sse2_psrai_w:
23986   case Intrinsic::x86_sse2_psrai_d:
23987   case Intrinsic::x86_avx2_psrai_w:
23988   case Intrinsic::x86_avx2_psrai_d:
23989   case Intrinsic::x86_sse2_psra_w:
23990   case Intrinsic::x86_sse2_psra_d:
23991   case Intrinsic::x86_avx2_psra_w:
23992   case Intrinsic::x86_avx2_psra_d: {
23993     SDValue Op0 = N->getOperand(1);
23994     SDValue Op1 = N->getOperand(2);
23995     EVT VT = Op0.getValueType();
23996     assert(VT.isVector() && "Expected a vector type!");
23997
23998     if (isa<BuildVectorSDNode>(Op1))
23999       Op1 = Op1.getOperand(0);
24000
24001     if (!isa<ConstantSDNode>(Op1))
24002       return SDValue();
24003
24004     EVT SVT = VT.getVectorElementType();
24005     unsigned SVTBits = SVT.getSizeInBits();
24006
24007     ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
24008     const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
24009     uint64_t ShAmt = C.getZExtValue();
24010
24011     // Don't try to convert this shift into a ISD::SRA if the shift
24012     // count is bigger than or equal to the element size.
24013     if (ShAmt >= SVTBits)
24014       return SDValue();
24015
24016     // Trivial case: if the shift count is zero, then fold this
24017     // into the first operand.
24018     if (ShAmt == 0)
24019       return Op0;
24020
24021     // Replace this packed shift intrinsic with a target independent
24022     // shift dag node.
24023     SDValue Splat = DAG.getConstant(C, VT);
24024     return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
24025   }
24026   }
24027 }
24028
24029 /// PerformMulCombine - Optimize a single multiply with constant into two
24030 /// in order to implement it with two cheaper instructions, e.g.
24031 /// LEA + SHL, LEA + LEA.
24032 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
24033                                  TargetLowering::DAGCombinerInfo &DCI) {
24034   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
24035     return SDValue();
24036
24037   EVT VT = N->getValueType(0);
24038   if (VT != MVT::i64)
24039     return SDValue();
24040
24041   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
24042   if (!C)
24043     return SDValue();
24044   uint64_t MulAmt = C->getZExtValue();
24045   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
24046     return SDValue();
24047
24048   uint64_t MulAmt1 = 0;
24049   uint64_t MulAmt2 = 0;
24050   if ((MulAmt % 9) == 0) {
24051     MulAmt1 = 9;
24052     MulAmt2 = MulAmt / 9;
24053   } else if ((MulAmt % 5) == 0) {
24054     MulAmt1 = 5;
24055     MulAmt2 = MulAmt / 5;
24056   } else if ((MulAmt % 3) == 0) {
24057     MulAmt1 = 3;
24058     MulAmt2 = MulAmt / 3;
24059   }
24060   if (MulAmt2 &&
24061       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
24062     SDLoc DL(N);
24063
24064     if (isPowerOf2_64(MulAmt2) &&
24065         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
24066       // If second multiplifer is pow2, issue it first. We want the multiply by
24067       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
24068       // is an add.
24069       std::swap(MulAmt1, MulAmt2);
24070
24071     SDValue NewMul;
24072     if (isPowerOf2_64(MulAmt1))
24073       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
24074                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
24075     else
24076       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
24077                            DAG.getConstant(MulAmt1, VT));
24078
24079     if (isPowerOf2_64(MulAmt2))
24080       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
24081                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
24082     else
24083       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
24084                            DAG.getConstant(MulAmt2, VT));
24085
24086     // Do not add new nodes to DAG combiner worklist.
24087     DCI.CombineTo(N, NewMul, false);
24088   }
24089   return SDValue();
24090 }
24091
24092 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
24093   SDValue N0 = N->getOperand(0);
24094   SDValue N1 = N->getOperand(1);
24095   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
24096   EVT VT = N0.getValueType();
24097
24098   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
24099   // since the result of setcc_c is all zero's or all ones.
24100   if (VT.isInteger() && !VT.isVector() &&
24101       N1C && N0.getOpcode() == ISD::AND &&
24102       N0.getOperand(1).getOpcode() == ISD::Constant) {
24103     SDValue N00 = N0.getOperand(0);
24104     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
24105         ((N00.getOpcode() == ISD::ANY_EXTEND ||
24106           N00.getOpcode() == ISD::ZERO_EXTEND) &&
24107          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
24108       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
24109       APInt ShAmt = N1C->getAPIntValue();
24110       Mask = Mask.shl(ShAmt);
24111       if (Mask != 0)
24112         return DAG.getNode(ISD::AND, SDLoc(N), VT,
24113                            N00, DAG.getConstant(Mask, VT));
24114     }
24115   }
24116
24117   // Hardware support for vector shifts is sparse which makes us scalarize the
24118   // vector operations in many cases. Also, on sandybridge ADD is faster than
24119   // shl.
24120   // (shl V, 1) -> add V,V
24121   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
24122     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
24123       assert(N0.getValueType().isVector() && "Invalid vector shift type");
24124       // We shift all of the values by one. In many cases we do not have
24125       // hardware support for this operation. This is better expressed as an ADD
24126       // of two values.
24127       if (N1SplatC->getZExtValue() == 1)
24128         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
24129     }
24130
24131   return SDValue();
24132 }
24133
24134 /// \brief Returns a vector of 0s if the node in input is a vector logical
24135 /// shift by a constant amount which is known to be bigger than or equal
24136 /// to the vector element size in bits.
24137 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
24138                                       const X86Subtarget *Subtarget) {
24139   EVT VT = N->getValueType(0);
24140
24141   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
24142       (!Subtarget->hasInt256() ||
24143        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
24144     return SDValue();
24145
24146   SDValue Amt = N->getOperand(1);
24147   SDLoc DL(N);
24148   if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
24149     if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
24150       APInt ShiftAmt = AmtSplat->getAPIntValue();
24151       unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
24152
24153       // SSE2/AVX2 logical shifts always return a vector of 0s
24154       // if the shift amount is bigger than or equal to
24155       // the element size. The constant shift amount will be
24156       // encoded as a 8-bit immediate.
24157       if (ShiftAmt.trunc(8).uge(MaxAmount))
24158         return getZeroVector(VT, Subtarget, DAG, DL);
24159     }
24160
24161   return SDValue();
24162 }
24163
24164 /// PerformShiftCombine - Combine shifts.
24165 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
24166                                    TargetLowering::DAGCombinerInfo &DCI,
24167                                    const X86Subtarget *Subtarget) {
24168   if (N->getOpcode() == ISD::SHL) {
24169     SDValue V = PerformSHLCombine(N, DAG);
24170     if (V.getNode()) return V;
24171   }
24172
24173   if (N->getOpcode() != ISD::SRA) {
24174     // Try to fold this logical shift into a zero vector.
24175     SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
24176     if (V.getNode()) return V;
24177   }
24178
24179   return SDValue();
24180 }
24181
24182 // CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
24183 // where both setccs reference the same FP CMP, and rewrite for CMPEQSS
24184 // and friends.  Likewise for OR -> CMPNEQSS.
24185 static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
24186                             TargetLowering::DAGCombinerInfo &DCI,
24187                             const X86Subtarget *Subtarget) {
24188   unsigned opcode;
24189
24190   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
24191   // we're requiring SSE2 for both.
24192   if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
24193     SDValue N0 = N->getOperand(0);
24194     SDValue N1 = N->getOperand(1);
24195     SDValue CMP0 = N0->getOperand(1);
24196     SDValue CMP1 = N1->getOperand(1);
24197     SDLoc DL(N);
24198
24199     // The SETCCs should both refer to the same CMP.
24200     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
24201       return SDValue();
24202
24203     SDValue CMP00 = CMP0->getOperand(0);
24204     SDValue CMP01 = CMP0->getOperand(1);
24205     EVT     VT    = CMP00.getValueType();
24206
24207     if (VT == MVT::f32 || VT == MVT::f64) {
24208       bool ExpectingFlags = false;
24209       // Check for any users that want flags:
24210       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
24211            !ExpectingFlags && UI != UE; ++UI)
24212         switch (UI->getOpcode()) {
24213         default:
24214         case ISD::BR_CC:
24215         case ISD::BRCOND:
24216         case ISD::SELECT:
24217           ExpectingFlags = true;
24218           break;
24219         case ISD::CopyToReg:
24220         case ISD::SIGN_EXTEND:
24221         case ISD::ZERO_EXTEND:
24222         case ISD::ANY_EXTEND:
24223           break;
24224         }
24225
24226       if (!ExpectingFlags) {
24227         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
24228         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
24229
24230         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
24231           X86::CondCode tmp = cc0;
24232           cc0 = cc1;
24233           cc1 = tmp;
24234         }
24235
24236         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
24237             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
24238           // FIXME: need symbolic constants for these magic numbers.
24239           // See X86ATTInstPrinter.cpp:printSSECC().
24240           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
24241           if (Subtarget->hasAVX512()) {
24242             SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
24243                                          CMP01, DAG.getConstant(x86cc, MVT::i8));
24244             if (N->getValueType(0) != MVT::i1)
24245               return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
24246                                  FSetCC);
24247             return FSetCC;
24248           }
24249           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
24250                                               CMP00.getValueType(), CMP00, CMP01,
24251                                               DAG.getConstant(x86cc, MVT::i8));
24252
24253           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
24254           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
24255
24256           if (is64BitFP && !Subtarget->is64Bit()) {
24257             // On a 32-bit target, we cannot bitcast the 64-bit float to a
24258             // 64-bit integer, since that's not a legal type. Since
24259             // OnesOrZeroesF is all ones of all zeroes, we don't need all the
24260             // bits, but can do this little dance to extract the lowest 32 bits
24261             // and work with those going forward.
24262             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
24263                                            OnesOrZeroesF);
24264             SDValue Vector32 = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32,
24265                                            Vector64);
24266             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
24267                                         Vector32, DAG.getIntPtrConstant(0));
24268             IntVT = MVT::i32;
24269           }
24270
24271           SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, IntVT, OnesOrZeroesF);
24272           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
24273                                       DAG.getConstant(1, IntVT));
24274           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
24275           return OneBitOfTruth;
24276         }
24277       }
24278     }
24279   }
24280   return SDValue();
24281 }
24282
24283 /// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
24284 /// so it can be folded inside ANDNP.
24285 static bool CanFoldXORWithAllOnes(const SDNode *N) {
24286   EVT VT = N->getValueType(0);
24287
24288   // Match direct AllOnes for 128 and 256-bit vectors
24289   if (ISD::isBuildVectorAllOnes(N))
24290     return true;
24291
24292   // Look through a bit convert.
24293   if (N->getOpcode() == ISD::BITCAST)
24294     N = N->getOperand(0).getNode();
24295
24296   // Sometimes the operand may come from a insert_subvector building a 256-bit
24297   // allones vector
24298   if (VT.is256BitVector() &&
24299       N->getOpcode() == ISD::INSERT_SUBVECTOR) {
24300     SDValue V1 = N->getOperand(0);
24301     SDValue V2 = N->getOperand(1);
24302
24303     if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
24304         V1.getOperand(0).getOpcode() == ISD::UNDEF &&
24305         ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
24306         ISD::isBuildVectorAllOnes(V2.getNode()))
24307       return true;
24308   }
24309
24310   return false;
24311 }
24312
24313 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
24314 // register. In most cases we actually compare or select YMM-sized registers
24315 // and mixing the two types creates horrible code. This method optimizes
24316 // some of the transition sequences.
24317 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
24318                                  TargetLowering::DAGCombinerInfo &DCI,
24319                                  const X86Subtarget *Subtarget) {
24320   EVT VT = N->getValueType(0);
24321   if (!VT.is256BitVector())
24322     return SDValue();
24323
24324   assert((N->getOpcode() == ISD::ANY_EXTEND ||
24325           N->getOpcode() == ISD::ZERO_EXTEND ||
24326           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
24327
24328   SDValue Narrow = N->getOperand(0);
24329   EVT NarrowVT = Narrow->getValueType(0);
24330   if (!NarrowVT.is128BitVector())
24331     return SDValue();
24332
24333   if (Narrow->getOpcode() != ISD::XOR &&
24334       Narrow->getOpcode() != ISD::AND &&
24335       Narrow->getOpcode() != ISD::OR)
24336     return SDValue();
24337
24338   SDValue N0  = Narrow->getOperand(0);
24339   SDValue N1  = Narrow->getOperand(1);
24340   SDLoc DL(Narrow);
24341
24342   // The Left side has to be a trunc.
24343   if (N0.getOpcode() != ISD::TRUNCATE)
24344     return SDValue();
24345
24346   // The type of the truncated inputs.
24347   EVT WideVT = N0->getOperand(0)->getValueType(0);
24348   if (WideVT != VT)
24349     return SDValue();
24350
24351   // The right side has to be a 'trunc' or a constant vector.
24352   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
24353   ConstantSDNode *RHSConstSplat = nullptr;
24354   if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
24355     RHSConstSplat = RHSBV->getConstantSplatNode();
24356   if (!RHSTrunc && !RHSConstSplat)
24357     return SDValue();
24358
24359   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24360
24361   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
24362     return SDValue();
24363
24364   // Set N0 and N1 to hold the inputs to the new wide operation.
24365   N0 = N0->getOperand(0);
24366   if (RHSConstSplat) {
24367     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
24368                      SDValue(RHSConstSplat, 0));
24369     SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
24370     N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
24371   } else if (RHSTrunc) {
24372     N1 = N1->getOperand(0);
24373   }
24374
24375   // Generate the wide operation.
24376   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
24377   unsigned Opcode = N->getOpcode();
24378   switch (Opcode) {
24379   case ISD::ANY_EXTEND:
24380     return Op;
24381   case ISD::ZERO_EXTEND: {
24382     unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
24383     APInt Mask = APInt::getAllOnesValue(InBits);
24384     Mask = Mask.zext(VT.getScalarType().getSizeInBits());
24385     return DAG.getNode(ISD::AND, DL, VT,
24386                        Op, DAG.getConstant(Mask, VT));
24387   }
24388   case ISD::SIGN_EXTEND:
24389     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
24390                        Op, DAG.getValueType(NarrowVT));
24391   default:
24392     llvm_unreachable("Unexpected opcode");
24393   }
24394 }
24395
24396 static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
24397                                  TargetLowering::DAGCombinerInfo &DCI,
24398                                  const X86Subtarget *Subtarget) {
24399   EVT VT = N->getValueType(0);
24400   if (DCI.isBeforeLegalizeOps())
24401     return SDValue();
24402
24403   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24404   if (R.getNode())
24405     return R;
24406
24407   // Create BEXTR instructions
24408   // BEXTR is ((X >> imm) & (2**size-1))
24409   if (VT == MVT::i32 || VT == MVT::i64) {
24410     SDValue N0 = N->getOperand(0);
24411     SDValue N1 = N->getOperand(1);
24412     SDLoc DL(N);
24413
24414     // Check for BEXTR.
24415     if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
24416         (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
24417       ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
24418       ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
24419       if (MaskNode && ShiftNode) {
24420         uint64_t Mask = MaskNode->getZExtValue();
24421         uint64_t Shift = ShiftNode->getZExtValue();
24422         if (isMask_64(Mask)) {
24423           uint64_t MaskSize = CountPopulation_64(Mask);
24424           if (Shift + MaskSize <= VT.getSizeInBits())
24425             return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
24426                                DAG.getConstant(Shift | (MaskSize << 8), VT));
24427         }
24428       }
24429     } // BEXTR
24430
24431     return SDValue();
24432   }
24433
24434   // Want to form ANDNP nodes:
24435   // 1) In the hopes of then easily combining them with OR and AND nodes
24436   //    to form PBLEND/PSIGN.
24437   // 2) To match ANDN packed intrinsics
24438   if (VT != MVT::v2i64 && VT != MVT::v4i64)
24439     return SDValue();
24440
24441   SDValue N0 = N->getOperand(0);
24442   SDValue N1 = N->getOperand(1);
24443   SDLoc DL(N);
24444
24445   // Check LHS for vnot
24446   if (N0.getOpcode() == ISD::XOR &&
24447       //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
24448       CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
24449     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
24450
24451   // Check RHS for vnot
24452   if (N1.getOpcode() == ISD::XOR &&
24453       //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
24454       CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
24455     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
24456
24457   return SDValue();
24458 }
24459 /// PerformOrCombine - Do target-specific dag combines on OR nodes.
24460 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
24461                                 TargetLowering::DAGCombinerInfo &DCI,
24462                                 const X86Subtarget *Subtarget) {
24463   if (DCI.isBeforeLegalizeOps())
24464     return SDValue();
24465
24466   SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
24467   if (R.getNode())
24468     return R;
24469
24470   SDValue N0 = N->getOperand(0);
24471   SDValue N1 = N->getOperand(1);
24472   EVT VT = N->getValueType(0);
24473
24474   // Look for psign/blend patterns on v2i64/v4i64 ORs.
24475   if (VT == MVT::v2i64 || VT == MVT::v4i64) {
24476     if (!Subtarget->hasSSSE3() ||
24477         (VT == MVT::v4i64 && !Subtarget->hasInt256()))
24478       return SDValue();
24479
24480     // Canonicalize pandn to RHS
24481     if (N0.getOpcode() == X86ISD::ANDNP)
24482       std::swap(N0, N1);
24483     // or (and (m, y), (pandn m, x))
24484     if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
24485       SDValue Mask = N1.getOperand(0);
24486       SDValue X    = N1.getOperand(1);
24487       SDValue Y;
24488       if (N0.getOperand(0) == Mask)
24489         Y = N0.getOperand(1);
24490       if (N0.getOperand(1) == Mask)
24491         Y = N0.getOperand(0);
24492
24493       // Bail out unless the mask appeared in both the AND and the ANDNP.
24494       if (!Y.getNode())
24495         return SDValue();
24496
24497       // Mask, X and Y may each be wrapped in a BITCAST; look through it so
24498       // that the underlying value types can be inspected below.
24499       if (Mask.getOpcode() == ISD::BITCAST)
24500         Mask = Mask.getOperand(0);
24501       if (X.getOpcode() == ISD::BITCAST)
24502         X = X.getOperand(0);
24503       if (Y.getOpcode() == ISD::BITCAST)
24504         Y = Y.getOperand(0);
24505
24506       EVT MaskVT = Mask.getValueType();
24507
24508       // Validate that the Mask operand is a vector sra node.
24509       // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
24510       // there is no psrai.b
24511       unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
24512       unsigned SraAmt = ~0; // Sentinel: SraAmt + 1 wraps to 0, never EltBits.
24513       if (Mask.getOpcode() == ISD::SRA) {
24514         if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
24515           if (auto *AmtConst = AmtBV->getConstantSplatNode())
24516             SraAmt = AmtConst->getZExtValue();
24517       } else if (Mask.getOpcode() == X86ISD::VSRAI) {
24518         SDValue SraC = Mask.getOperand(1);
24519         SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
24520       }
24521       if ((SraAmt + 1) != EltBits)
24522         return SDValue();
24523
24524       SDLoc DL(N);
24525
24526       // Now we know we at least have a pblendvb with the mask val.  See if
24527       // we can form a psignb/w/d.
24528       // psign = x.type == y.type == mask.type && y = sub(0, x);
24529       if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
24530           ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
24531           X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
24532         assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
24533                "Unsupported VT for PSIGN");
24534         Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
24535         return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24536       }
24537       // PBLENDVB only available on SSE 4.1
24538       if (!Subtarget->hasSSE41())
24539         return SDValue();
24540
24541       EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
24542       // Blend on byte vectors: VSELECT with Mask as condition, Y then X.
24543       X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
24544       Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
24545       Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
24546       Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
24547       return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
24548     }
24549   }
24550   // The remaining combine only applies to scalar i16/i32/i64 ORs.
24551   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
24552     return SDValue();
24553
24554   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
24555   MachineFunction &MF = DAG.getMachineFunction();
24556   bool OptForSize = MF.getFunction()->getAttributes().
24557     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
24558
24559   // SHLD/SHRD instructions have lower register pressure, but on some
24560   // platforms they have higher latency than the equivalent
24561   // series of shifts/or that would otherwise be generated.
24562   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
24563   // have higher latencies and we are not optimizing for size.
24564   if (!OptForSize && Subtarget->isSHLDSlow())
24565     return SDValue();
24566   // Canonicalize to N0 = SHL, N1 = SRL; both shifts must have a single use.
24567   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
24568     std::swap(N0, N1);
24569   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
24570     return SDValue();
24571   if (!N0.hasOneUse() || !N1.hasOneUse())
24572     return SDValue();
24573   // Shift amounts must be i8; look through a TRUNCATE to the wider value.
24574   SDValue ShAmt0 = N0.getOperand(1);
24575   if (ShAmt0.getValueType() != MVT::i8)
24576     return SDValue();
24577   SDValue ShAmt1 = N1.getOperand(1);
24578   if (ShAmt1.getValueType() != MVT::i8)
24579     return SDValue();
24580   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
24581     ShAmt0 = ShAmt0.getOperand(0);
24582   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
24583     ShAmt1 = ShAmt1.getOperand(0);
24584   // Default to SHLD; a SUB as the SHL amount selects the SHRD form instead.
24585   SDLoc DL(N);
24586   unsigned Opc = X86ISD::SHLD;
24587   SDValue Op0 = N0.getOperand(0);
24588   SDValue Op1 = N1.getOperand(0);
24589   if (ShAmt0.getOpcode() == ISD::SUB) {
24590     Opc = X86ISD::SHRD;
24591     std::swap(Op0, Op1);
24592     std::swap(ShAmt0, ShAmt1);
24593   }
24594   // Accept ShAmt1 == (Bits - ShAmt0), or constant amounts that sum to Bits.
24595   unsigned Bits = VT.getSizeInBits();
24596   if (ShAmt1.getOpcode() == ISD::SUB) {
24597     SDValue Sum = ShAmt1.getOperand(0);
24598     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
24599       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
24600       if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
24601         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
24602       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
24603         return DAG.getNode(Opc, DL, VT,
24604                            Op0, Op1,
24605                            DAG.getNode(ISD::TRUNCATE, DL,
24606                                        MVT::i8, ShAmt0));
24607     }
24608   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
24609     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
24610     if (ShAmt0C &&
24611         ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
24612       return DAG.getNode(Opc, DL, VT,
24613                          N0.getOperand(0), N1.getOperand(0),
24614                          DAG.getNode(ISD::TRUNCATE, DL,
24615                                        MVT::i8, ShAmt0));
24616   }
24617
24618   return SDValue();
24619 }
24620
24621 // Generate NEG and CMOV for integer abs.
24622 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
24623   EVT VT = N->getValueType(0);
24624
24625   // Since X86 does not have CMOV for 8-bit integer, we don't convert
24626   // 8-bit integer abs to NEG and CMOV.
24627   if (VT.isInteger() && VT.getSizeInBits() == 8)
24628     return SDValue();
24629
24630   SDValue N0 = N->getOperand(0);
24631   SDValue N1 = N->getOperand(1);
24632   SDLoc DL(N);
24633
24634   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
24635   // and change it to SUB and CMOV.
24636   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
24637       N0.getOpcode() == ISD::ADD &&
24638       N0.getOperand(1) == N1 &&
24639       N1.getOpcode() == ISD::SRA &&
24640       N1.getOperand(0) == N0.getOperand(0))
24641     if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
24642       if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
24643         // Generate SUB & CMOV.
24644         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
24645                                   DAG.getConstant(0, VT), N0.getOperand(0));
24646
24647         SDValue Ops[] = { N0.getOperand(0), Neg,
24648                           DAG.getConstant(X86::COND_GE, MVT::i8),
24649                           SDValue(Neg.getNode(), 1) };
24650         return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
24651       }
24652   return SDValue();
24653 }
24654
24655 // PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes
24656 static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
24657                                  TargetLowering::DAGCombinerInfo &DCI,
24658                                  const X86Subtarget *Subtarget) {
24659   if (DCI.isBeforeLegalizeOps())
24660     return SDValue();
24661
24662   if (Subtarget->hasCMov()) {
24663     SDValue RV = performIntegerAbsCombine(N, DAG);
24664     if (RV.getNode())
24665       return RV;
24666   }
24667
24668   return SDValue();
24669 }
24670
24671 /// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
24672 static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
24673                                   TargetLowering::DAGCombinerInfo &DCI,
24674                                   const X86Subtarget *Subtarget) {
24675   LoadSDNode *Ld = cast<LoadSDNode>(N);
24676   EVT RegVT = Ld->getValueType(0);
24677   EVT MemVT = Ld->getMemoryVT();
24678   SDLoc dl(Ld);
24679   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24680
24681   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
24682   // into two 16-byte operations.
24683   ISD::LoadExtType Ext = Ld->getExtensionType();
24684   unsigned Alignment = Ld->getAlignment();
24685   bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
24686   if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
24687       !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
24688     unsigned NumElems = RegVT.getVectorNumElements();
24689     if (NumElems < 2)
24690       return SDValue();
24691
24692     SDValue Ptr = Ld->getBasePtr();
24693     SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
24694
24695     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
24696                                   NumElems/2);
24697     SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24698                                 Ld->getPointerInfo(), Ld->isVolatile(),
24699                                 Ld->isNonTemporal(), Ld->isInvariant(),
24700                                 Alignment);
24701     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
24702     SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
24703                                 Ld->getPointerInfo(), Ld->isVolatile(),
24704                                 Ld->isNonTemporal(), Ld->isInvariant(),
24705                                 std::min(16U, Alignment));
24706     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
24707                              Load1.getValue(1),
24708                              Load2.getValue(1));
24709
24710     SDValue NewVec = DAG.getUNDEF(RegVT);
24711     NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
24712     NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
24713     return DCI.CombineTo(N, NewVec, TF, true);
24714   }
24715
24716   return SDValue();
24717 }
24718
24719 /// PerformMLOADCombine - Resolve extending loads
24720 static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG,
24721                                    TargetLowering::DAGCombinerInfo &DCI,
24722                                    const X86Subtarget *Subtarget) {
24723   MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
24724   if (Mld->getExtensionType() != ISD::SEXTLOAD)
24725     return SDValue();
24726
24727   EVT VT = Mld->getValueType(0);
24728   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24729   unsigned NumElems = VT.getVectorNumElements();
24730   EVT LdVT = Mld->getMemoryVT();
24731   SDLoc dl(Mld);
24732
24733   assert(LdVT != VT && "Cannot extend to the same type");
24734   unsigned ToSz = VT.getVectorElementType().getSizeInBits();
24735   unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
24736   // From, To sizes and ElemCount must be pow of two
24737   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
24738     "Unexpected size for extending masked load");
24739
24740   unsigned SizeRatio  = ToSz / FromSz;
24741   assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
24742
24743   // Create a type on which we perform the shuffle
24744   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24745           LdVT.getScalarType(), NumElems*SizeRatio);
24746   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24747
24748   // Convert Src0 value
24749   SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0());
24750   if (Mld->getSrc0().getOpcode() != ISD::UNDEF) {
24751     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24752     for (unsigned i = 0; i != NumElems; ++i)
24753       ShuffleVec[i] = i * SizeRatio;
24754
24755     // Can't shuffle using an illegal type.
24756     assert (TLI.isTypeLegal(WideVecVT) && "WideVecVT should be legal");
24757     WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
24758                                     DAG.getUNDEF(WideVecVT), &ShuffleVec[0]);
24759   }
24760   // Prepare the new mask
24761   SDValue NewMask;
24762   SDValue Mask = Mld->getMask();
24763   if (Mask.getValueType() == VT) {
24764     // Mask and original value have the same type
24765     NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
24766     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24767     for (unsigned i = 0; i != NumElems; ++i)
24768       ShuffleVec[i] = i * SizeRatio;
24769     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
24770       ShuffleVec[i] = NumElems*SizeRatio;
24771     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
24772                                    DAG.getConstant(0, WideVecVT),
24773                                    &ShuffleVec[0]);
24774   }
24775   else {
24776     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
24777     unsigned WidenNumElts = NumElems*SizeRatio;
24778     unsigned MaskNumElts = VT.getVectorNumElements();
24779     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
24780                                      WidenNumElts);
24781
24782     unsigned NumConcat = WidenNumElts / MaskNumElts;
24783     SmallVector<SDValue, 16> Ops(NumConcat);
24784     SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
24785     Ops[0] = Mask;
24786     for (unsigned i = 1; i != NumConcat; ++i)
24787       Ops[i] = ZeroVal;
24788
24789     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
24790   }
24791
24792   SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
24793                                      Mld->getBasePtr(), NewMask, WideSrc0,
24794                                      Mld->getMemoryVT(), Mld->getMemOperand(),
24795                                      ISD::NON_EXTLOAD);
24796   SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
24797   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
24798
24799 }
24800 /// PerformMSTORECombine - Resolve truncating stores
24801 static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
24802                                     const X86Subtarget *Subtarget) {
24803   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
24804   if (!Mst->isTruncatingStore())
24805     return SDValue();
24806
24807   EVT VT = Mst->getValue().getValueType();
24808   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24809   unsigned NumElems = VT.getVectorNumElements();
24810   EVT StVT = Mst->getMemoryVT();
24811   SDLoc dl(Mst);
24812
24813   assert(StVT != VT && "Cannot truncate to the same type");
24814   unsigned FromSz = VT.getVectorElementType().getSizeInBits();
24815   unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
24816
24817   // From, To sizes and ElemCount must be pow of two
24818   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
24819     "Unexpected size for truncating masked store");
24820   // We are going to use the original vector elt for storing.
24821   // Accumulated smaller vector elements must be a multiple of the store size.
24822   assert (((NumElems * FromSz) % ToSz) == 0 &&
24823           "Unexpected ratio for truncating masked store");
24824
24825   unsigned SizeRatio  = FromSz / ToSz;
24826   assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
24827
24828   // Create a type on which we perform the shuffle
24829   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24830           StVT.getScalarType(), NumElems*SizeRatio);
24831
24832   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24833
24834   SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue());
24835   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
24836   for (unsigned i = 0; i != NumElems; ++i)
24837     ShuffleVec[i] = i * SizeRatio;
24838
24839   // Can't shuffle using an illegal type.
24840   assert (TLI.isTypeLegal(WideVecVT) && "WideVecVT should be legal");
24841
24842   SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
24843                                         DAG.getUNDEF(WideVecVT),
24844                                         &ShuffleVec[0]);
24845
24846   SDValue NewMask;
24847   SDValue Mask = Mst->getMask();
24848   if (Mask.getValueType() == VT) {
24849     // Mask and original value have the same type
24850     NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask);
24851     for (unsigned i = 0; i != NumElems; ++i)
24852       ShuffleVec[i] = i * SizeRatio;
24853     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
24854       ShuffleVec[i] = NumElems*SizeRatio;
24855     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
24856                                    DAG.getConstant(0, WideVecVT),
24857                                    &ShuffleVec[0]);
24858   }
24859   else {
24860     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
24861     unsigned WidenNumElts = NumElems*SizeRatio;
24862     unsigned MaskNumElts = VT.getVectorNumElements();
24863     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
24864                                      WidenNumElts);
24865
24866     unsigned NumConcat = WidenNumElts / MaskNumElts;
24867     SmallVector<SDValue, 16> Ops(NumConcat);
24868     SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType());
24869     Ops[0] = Mask;
24870     for (unsigned i = 1; i != NumConcat; ++i)
24871       Ops[i] = ZeroVal;
24872
24873     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
24874   }
24875
24876   return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(),
24877                             NewMask, StVT, Mst->getMemOperand(), false);
24878 }
24879 /// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
24880 static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
24881                                    const X86Subtarget *Subtarget) {
24882   StoreSDNode *St = cast<StoreSDNode>(N);
24883   EVT VT = St->getValue().getValueType();
24884   EVT StVT = St->getMemoryVT();
24885   SDLoc dl(St);
24886   SDValue StoredVal = St->getOperand(1);
24887   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24888
24889   // If we are saving a concatenation of two XMM registers and 32-byte stores
24890   // are slow, such as on Sandy Bridge, perform two 16-byte stores.
24891   unsigned Alignment = St->getAlignment();
24892   bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
24893   if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
24894       StVT == VT && !IsAligned) {
24895     unsigned NumElems = VT.getVectorNumElements();
24896     if (NumElems < 2)
24897       return SDValue();
24898
24899     SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
24900     SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
24901
24902     SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
24903     SDValue Ptr0 = St->getBasePtr();
24904     SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
24905
24906     SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
24907                                 St->getPointerInfo(), St->isVolatile(),
24908                                 St->isNonTemporal(), Alignment);
24909     SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
24910                                 St->getPointerInfo(), St->isVolatile(),
24911                                 St->isNonTemporal(),
24912                                 std::min(16U, Alignment));
24913     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
24914   }
24915
24916   // Optimize trunc store (of multiple scalars) to shuffle and store.
24917   // First, pack all of the elements in one place. Next, store to memory
24918   // in fewer chunks.
24919   if (St->isTruncatingStore() && VT.isVector()) {
24920     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24921     unsigned NumElems = VT.getVectorNumElements();
24922     assert(StVT != VT && "Cannot truncate to the same type");
24923     unsigned FromSz = VT.getVectorElementType().getSizeInBits();
24924     unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
24925
24926     // From, To sizes and ElemCount must be pow of two
24927     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
24928     // We are going to use the original vector elt for storing.
24929     // Accumulated smaller vector elements must be a multiple of the store size.
24930     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
24931
24932     unsigned SizeRatio  = FromSz / ToSz;
24933
24934     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
24935
24936     // Create a type on which we perform the shuffle
24937     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
24938             StVT.getScalarType(), NumElems*SizeRatio);
24939
24940     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
24941
24942     SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
24943     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
24944     for (unsigned i = 0; i != NumElems; ++i)
24945       ShuffleVec[i] = i * SizeRatio;
24946
24947     // Can't shuffle using an illegal type.
24948     if (!TLI.isTypeLegal(WideVecVT))
24949       return SDValue();
24950
24951     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
24952                                          DAG.getUNDEF(WideVecVT),
24953                                          &ShuffleVec[0]);
24954     // At this point all of the data is stored at the bottom of the
24955     // register. We now need to save it to mem.
24956
24957     // Find the largest store unit
24958     MVT StoreType = MVT::i8;
24959     for (MVT Tp : MVT::integer_valuetypes()) {
24960       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
24961         StoreType = Tp;
24962     }
24963
24964     // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
24965     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
24966         (64 <= NumElems * ToSz))
24967       StoreType = MVT::f64;
24968
24969     // Bitcast the original vector into a vector of store-size units
24970     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
24971             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
24972     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
24973     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
24974     SmallVector<SDValue, 8> Chains;
24975     SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
24976                                         TLI.getPointerTy());
24977     SDValue Ptr = St->getBasePtr();
24978
24979     // Perform one or more big stores into memory.
24980     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
24981       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
24982                                    StoreType, ShuffWide,
24983                                    DAG.getIntPtrConstant(i));
24984       SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
24985                                 St->getPointerInfo(), St->isVolatile(),
24986                                 St->isNonTemporal(), St->getAlignment());
24987       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
24988       Chains.push_back(Ch);
24989     }
24990
24991     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
24992   }
24993
24994   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
24995   // the FP state in cases where an emms may be missing.
24996   // A preferable solution to the general problem is to figure out the right
24997   // places to insert EMMS.  This qualifies as a quick hack.
24998
24999   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
25000   if (VT.getSizeInBits() != 64)
25001     return SDValue();
25002
25003   const Function *F = DAG.getMachineFunction().getFunction();
25004   bool NoImplicitFloatOps = F->getAttributes().
25005     hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
25006   bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
25007                      && Subtarget->hasSSE2();
25008   if ((VT.isVector() ||
25009        (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
25010       isa<LoadSDNode>(St->getValue()) &&
25011       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
25012       St->getChain().hasOneUse() && !St->isVolatile()) {
25013     SDNode* LdVal = St->getValue().getNode();
25014     LoadSDNode *Ld = nullptr;
25015     int TokenFactorIndex = -1;
25016     SmallVector<SDValue, 8> Ops;
25017     SDNode* ChainVal = St->getChain().getNode();
25018     // Must be a store of a load.  We currently handle two cases:  the load
25019     // is a direct child, and it's under an intervening TokenFactor.  It is
25020     // possible to dig deeper under nested TokenFactors.
25021     if (ChainVal == LdVal)
25022       Ld = cast<LoadSDNode>(St->getChain());
25023     else if (St->getValue().hasOneUse() &&
25024              ChainVal->getOpcode() == ISD::TokenFactor) {
25025       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
25026         if (ChainVal->getOperand(i).getNode() == LdVal) {
25027           TokenFactorIndex = i;
25028           Ld = cast<LoadSDNode>(St->getValue());
25029         } else
25030           Ops.push_back(ChainVal->getOperand(i));
25031       }
25032     }
25033
25034     if (!Ld || !ISD::isNormalLoad(Ld))
25035       return SDValue();
25036
25037     // If this is not the MMX case, i.e. we are just turning i64 load/store
25038     // into f64 load/store, avoid the transformation if there are multiple
25039     // uses of the loaded value.
25040     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
25041       return SDValue();
25042
25043     SDLoc LdDL(Ld);
25044     SDLoc StDL(N);
25045     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
25046     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
25047     // pair instead.
25048     if (Subtarget->is64Bit() || F64IsLegal) {
25049       EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
25050       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
25051                                   Ld->getPointerInfo(), Ld->isVolatile(),
25052                                   Ld->isNonTemporal(), Ld->isInvariant(),
25053                                   Ld->getAlignment());
25054       SDValue NewChain = NewLd.getValue(1);
25055       if (TokenFactorIndex != -1) {
25056         Ops.push_back(NewChain);
25057         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25058       }
25059       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
25060                           St->getPointerInfo(),
25061                           St->isVolatile(), St->isNonTemporal(),
25062                           St->getAlignment());
25063     }
25064
25065     // Otherwise, lower to two pairs of 32-bit loads / stores.
25066     SDValue LoAddr = Ld->getBasePtr();
25067     SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
25068                                  DAG.getConstant(4, MVT::i32));
25069
25070     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
25071                                Ld->getPointerInfo(),
25072                                Ld->isVolatile(), Ld->isNonTemporal(),
25073                                Ld->isInvariant(), Ld->getAlignment());
25074     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
25075                                Ld->getPointerInfo().getWithOffset(4),
25076                                Ld->isVolatile(), Ld->isNonTemporal(),
25077                                Ld->isInvariant(),
25078                                MinAlign(Ld->getAlignment(), 4));
25079
25080     SDValue NewChain = LoLd.getValue(1);
25081     if (TokenFactorIndex != -1) {
25082       Ops.push_back(LoLd);
25083       Ops.push_back(HiLd);
25084       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
25085     }
25086
25087     LoAddr = St->getBasePtr();
25088     HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
25089                          DAG.getConstant(4, MVT::i32));
25090
25091     SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
25092                                 St->getPointerInfo(),
25093                                 St->isVolatile(), St->isNonTemporal(),
25094                                 St->getAlignment());
25095     SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
25096                                 St->getPointerInfo().getWithOffset(4),
25097                                 St->isVolatile(),
25098                                 St->isNonTemporal(),
25099                                 MinAlign(St->getAlignment(), 4));
25100     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
25101   }
25102   return SDValue();
25103 }
25104
25105 /// Return 'true' if this vector operation is "horizontal"
25106 /// and return the operands for the horizontal operation in LHS and RHS.  A
25107 /// horizontal operation performs the binary operation on successive elements
25108 /// of its first operand, then on successive elements of its second operand,
25109 /// returning the resulting values in a vector.  For example, if
25110 ///   A = < float a0, float a1, float a2, float a3 >
25111 /// and
25112 ///   B = < float b0, float b1, float b2, float b3 >
25113 /// then the result of doing a horizontal operation on A and B is
25114 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
25115 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
25116 /// A horizontal-op B, for some already available A and B, and if so then LHS is
25117 /// set to A, RHS to B, and the routine returns 'true'.
25118 /// Note that the binary operation should have the property that if one of the
25119 /// operands is UNDEF then the result is UNDEF.
25120 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
25121   // Look for the following pattern: if
25122   //   A = < float a0, float a1, float a2, float a3 >
25123   //   B = < float b0, float b1, float b2, float b3 >
25124   // and
25125   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
25126   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
25127   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
25128   // which is A horizontal-op B.
25129
25130   // At least one of the operands should be a vector shuffle.
25131   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
25132       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
25133     return false;
25134
25135   MVT VT = LHS.getSimpleValueType();
25136
25137   assert((VT.is128BitVector() || VT.is256BitVector()) &&
25138          "Unsupported vector type for horizontal add/sub");
25139
25140   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
25141   // operate independently on 128-bit lanes.
25142   unsigned NumElts = VT.getVectorNumElements();
25143   unsigned NumLanes = VT.getSizeInBits()/128;
25144   unsigned NumLaneElts = NumElts / NumLanes;
25145   assert((NumLaneElts % 2 == 0) &&
25146          "Vector type should have an even number of elements in each lane");
25147   unsigned HalfLaneElts = NumLaneElts/2;
25148
25149   // View LHS in the form
25150   //   LHS = VECTOR_SHUFFLE A, B, LMask
25151   // If LHS is not a shuffle then pretend it is the shuffle
25152   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
25153   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
25154   // type VT.
25155   SDValue A, B;
25156   SmallVector<int, 16> LMask(NumElts);
25157   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25158     if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
25159       A = LHS.getOperand(0);
25160     if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
25161       B = LHS.getOperand(1);
25162     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
25163     std::copy(Mask.begin(), Mask.end(), LMask.begin());
25164   } else {
25165     if (LHS.getOpcode() != ISD::UNDEF)
25166       A = LHS;
25167     for (unsigned i = 0; i != NumElts; ++i)
25168       LMask[i] = i;
25169   }
25170
25171   // Likewise, view RHS in the form
25172   //   RHS = VECTOR_SHUFFLE C, D, RMask
25173   SDValue C, D;
25174   SmallVector<int, 16> RMask(NumElts);
25175   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
25176     if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
25177       C = RHS.getOperand(0);
25178     if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
25179       D = RHS.getOperand(1);
25180     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
25181     std::copy(Mask.begin(), Mask.end(), RMask.begin());
25182   } else {
25183     if (RHS.getOpcode() != ISD::UNDEF)
25184       C = RHS;
25185     for (unsigned i = 0; i != NumElts; ++i)
25186       RMask[i] = i;
25187   }
25188
25189   // Check that the shuffles are both shuffling the same vectors.
25190   if (!(A == C && B == D) && !(A == D && B == C))
25191     return false;
25192
25193   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
25194   if (!A.getNode() && !B.getNode())
25195     return false;
25196
25197   // If A and B occur in reverse order in RHS, then "swap" them (which means
25198   // rewriting the mask).
25199   if (A != C)
25200     CommuteVectorShuffleMask(RMask, NumElts);
25201
25202   // At this point LHS and RHS are equivalent to
25203   //   LHS = VECTOR_SHUFFLE A, B, LMask
25204   //   RHS = VECTOR_SHUFFLE A, B, RMask
25205   // Check that the masks correspond to performing a horizontal operation.
25206   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
25207     for (unsigned i = 0; i != NumLaneElts; ++i) {
25208       int LIdx = LMask[i+l], RIdx = RMask[i+l];
25209
25210       // Ignore any UNDEF components.
25211       if (LIdx < 0 || RIdx < 0 ||
25212           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
25213           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
25214         continue;
25215
25216       // Check that successive elements are being operated on.  If not, this is
25217       // not a horizontal operation.
25218       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
25219       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
25220       if (!(LIdx == Index && RIdx == Index + 1) &&
25221           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
25222         return false;
25223     }
25224   }
25225
25226   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
25227   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
25228   return true;
25229 }
25230
25231 /// Do target-specific dag combines on floating point adds.
25232 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
25233                                   const X86Subtarget *Subtarget) {
25234   EVT VT = N->getValueType(0);
25235   SDValue LHS = N->getOperand(0);
25236   SDValue RHS = N->getOperand(1);
25237
25238   // Try to synthesize horizontal adds from adds of shuffles.
25239   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25240        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25241       isHorizontalBinOp(LHS, RHS, true))
25242     return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
25243   return SDValue();
25244 }
25245
25246 /// Do target-specific dag combines on floating point subs.
25247 static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
25248                                   const X86Subtarget *Subtarget) {
25249   EVT VT = N->getValueType(0);
25250   SDValue LHS = N->getOperand(0);
25251   SDValue RHS = N->getOperand(1);
25252
25253   // Try to synthesize horizontal subs from subs of shuffles.
25254   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
25255        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
25256       isHorizontalBinOp(LHS, RHS, false))
25257     return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
25258   return SDValue();
25259 }
25260
25261 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
25262 static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
25263   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
25264   // F[X]OR(0.0, x) -> x
25265   // F[X]OR(x, 0.0) -> x
25266   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25267     if (C->getValueAPF().isPosZero())
25268       return N->getOperand(1);
25269   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25270     if (C->getValueAPF().isPosZero())
25271       return N->getOperand(0);
25272   return SDValue();
25273 }
25274
25275 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
25276 static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
25277   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
25278
25279   // Only perform optimizations if UnsafeMath is used.
25280   if (!DAG.getTarget().Options.UnsafeFPMath)
25281     return SDValue();
25282
25283   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
25284   // into FMINC and FMAXC, which are Commutative operations.
25285   unsigned NewOp = 0;
25286   switch (N->getOpcode()) {
25287     default: llvm_unreachable("unknown opcode");
25288     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
25289     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
25290   }
25291
25292   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
25293                      N->getOperand(0), N->getOperand(1));
25294 }
25295
25296 /// Do target-specific dag combines on X86ISD::FAND nodes.
25297 static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
25298   // FAND(0.0, x) -> 0.0
25299   // FAND(x, 0.0) -> 0.0
25300   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25301     if (C->getValueAPF().isPosZero())
25302       return N->getOperand(0);
25303   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25304     if (C->getValueAPF().isPosZero())
25305       return N->getOperand(1);
25306   return SDValue();
25307 }
25308
25309 /// Do target-specific dag combines on X86ISD::FANDN nodes
25310 static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
25311   // FANDN(x, 0.0) -> 0.0
25312   // FANDN(0.0, x) -> x
25313   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
25314     if (C->getValueAPF().isPosZero())
25315       return N->getOperand(1);
25316   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
25317     if (C->getValueAPF().isPosZero())
25318       return N->getOperand(1);
25319   return SDValue();
25320 }
25321
25322 static SDValue PerformBTCombine(SDNode *N,
25323                                 SelectionDAG &DAG,
25324                                 TargetLowering::DAGCombinerInfo &DCI) {
25325   // BT ignores high bits in the bit index operand.
25326   SDValue Op1 = N->getOperand(1);
25327   if (Op1.hasOneUse()) {
25328     unsigned BitWidth = Op1.getValueSizeInBits();
25329     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
25330     APInt KnownZero, KnownOne;
25331     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
25332                                           !DCI.isBeforeLegalizeOps());
25333     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25334     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
25335         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
25336       DCI.CommitTargetLoweringOpt(TLO);
25337   }
25338   return SDValue();
25339 }
25340
25341 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
25342   SDValue Op = N->getOperand(0);
25343   if (Op.getOpcode() == ISD::BITCAST)
25344     Op = Op.getOperand(0);
25345   EVT VT = N->getValueType(0), OpVT = Op.getValueType();
25346   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
25347       VT.getVectorElementType().getSizeInBits() ==
25348       OpVT.getVectorElementType().getSizeInBits()) {
25349     return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
25350   }
25351   return SDValue();
25352 }
25353
25354 static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
25355                                                const X86Subtarget *Subtarget) {
25356   EVT VT = N->getValueType(0);
25357   if (!VT.isVector())
25358     return SDValue();
25359
25360   SDValue N0 = N->getOperand(0);
25361   SDValue N1 = N->getOperand(1);
25362   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
25363   SDLoc dl(N);
25364
25365   // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
25366   // both SSE and AVX2 since there is no sign-extended shift right
25367   // operation on a vector with 64-bit elements.
25368   //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
25369   // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
25370   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
25371       N0.getOpcode() == ISD::SIGN_EXTEND)) {
25372     SDValue N00 = N0.getOperand(0);
25373
25374     // EXTLOAD has a better solution on AVX2,
25375     // it may be replaced with X86ISD::VSEXT node.
25376     if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
25377       if (!ISD::isNormalLoad(N00.getNode()))
25378         return SDValue();
25379
25380     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
25381         SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
25382                                   N00, N1);
25383       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
25384     }
25385   }
25386   return SDValue();
25387 }
25388
25389 static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
25390                                   TargetLowering::DAGCombinerInfo &DCI,
25391                                   const X86Subtarget *Subtarget) {
25392   SDValue N0 = N->getOperand(0);
25393   EVT VT = N->getValueType(0);
25394
25395   // (i8,i32 sext (sdivrem (i8 x, i8 y)) ->
25396   // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y)
25397   // This exposes the sext to the sdivrem lowering, so that it directly extends
25398   // from AH (which we otherwise need to do contortions to access).
25399   if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 &&
25400       N0.getValueType() == MVT::i8 && VT == MVT::i32) {
25401     SDLoc dl(N);
25402     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25403     SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys,
25404                             N0.getOperand(0), N0.getOperand(1));
25405     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25406     return R.getValue(1);
25407   }
25408
25409   if (!DCI.isBeforeLegalizeOps())
25410     return SDValue();
25411
25412   if (!Subtarget->hasFp256())
25413     return SDValue();
25414
25415   if (VT.isVector() && VT.getSizeInBits() == 256) {
25416     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25417     if (R.getNode())
25418       return R;
25419   }
25420
25421   return SDValue();
25422 }
25423
25424 static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
25425                                  const X86Subtarget* Subtarget) {
25426   SDLoc dl(N);
25427   EVT VT = N->getValueType(0);
25428
25429   // Let legalize expand this if it isn't a legal type yet.
25430   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
25431     return SDValue();
25432
25433   EVT ScalarVT = VT.getScalarType();
25434   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
25435       (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
25436     return SDValue();
25437
25438   SDValue A = N->getOperand(0);
25439   SDValue B = N->getOperand(1);
25440   SDValue C = N->getOperand(2);
25441
25442   bool NegA = (A.getOpcode() == ISD::FNEG);
25443   bool NegB = (B.getOpcode() == ISD::FNEG);
25444   bool NegC = (C.getOpcode() == ISD::FNEG);
25445
25446   // Negative multiplication when NegA xor NegB
25447   bool NegMul = (NegA != NegB);
25448   if (NegA)
25449     A = A.getOperand(0);
25450   if (NegB)
25451     B = B.getOperand(0);
25452   if (NegC)
25453     C = C.getOperand(0);
25454
25455   unsigned Opcode;
25456   if (!NegMul)
25457     Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
25458   else
25459     Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
25460
25461   return DAG.getNode(Opcode, dl, VT, A, B, C);
25462 }
25463
25464 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
25465                                   TargetLowering::DAGCombinerInfo &DCI,
25466                                   const X86Subtarget *Subtarget) {
25467   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
25468   //           (and (i32 x86isd::setcc_carry), 1)
25469   // This eliminates the zext. This transformation is necessary because
25470   // ISD::SETCC is always legalized to i8.
25471   SDLoc dl(N);
25472   SDValue N0 = N->getOperand(0);
25473   EVT VT = N->getValueType(0);
25474
25475   if (N0.getOpcode() == ISD::AND &&
25476       N0.hasOneUse() &&
25477       N0.getOperand(0).hasOneUse()) {
25478     SDValue N00 = N0.getOperand(0);
25479     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25480       ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
25481       if (!C || C->getZExtValue() != 1)
25482         return SDValue();
25483       return DAG.getNode(ISD::AND, dl, VT,
25484                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25485                                      N00.getOperand(0), N00.getOperand(1)),
25486                          DAG.getConstant(1, VT));
25487     }
25488   }
25489
25490   if (N0.getOpcode() == ISD::TRUNCATE &&
25491       N0.hasOneUse() &&
25492       N0.getOperand(0).hasOneUse()) {
25493     SDValue N00 = N0.getOperand(0);
25494     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
25495       return DAG.getNode(ISD::AND, dl, VT,
25496                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
25497                                      N00.getOperand(0), N00.getOperand(1)),
25498                          DAG.getConstant(1, VT));
25499     }
25500   }
25501   if (VT.is256BitVector()) {
25502     SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
25503     if (R.getNode())
25504       return R;
25505   }
25506
25507   // (i8,i32 zext (udivrem (i8 x, i8 y)) ->
25508   // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y)
25509   // This exposes the zext to the udivrem lowering, so that it directly extends
25510   // from AH (which we otherwise need to do contortions to access).
25511   if (N0.getOpcode() == ISD::UDIVREM &&
25512       N0.getResNo() == 1 && N0.getValueType() == MVT::i8 &&
25513       (VT == MVT::i32 || VT == MVT::i64)) {
25514     SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
25515     SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys,
25516                             N0.getOperand(0), N0.getOperand(1));
25517     DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
25518     return R.getValue(1);
25519   }
25520
25521   return SDValue();
25522 }
25523
25524 // Optimize x == -y --> x+y == 0
25525 //          x != -y --> x+y != 0
25526 static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
25527                                       const X86Subtarget* Subtarget) {
25528   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
25529   SDValue LHS = N->getOperand(0);
25530   SDValue RHS = N->getOperand(1);
25531   EVT VT = N->getValueType(0);
25532   SDLoc DL(N);
25533
25534   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
25535     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
25536       if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
25537         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), LHS.getValueType(), RHS,
25538                                    LHS.getOperand(1));
25539         return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV,
25540                             DAG.getConstant(0, addV.getValueType()), CC);
25541       }
25542   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
25543     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
25544       if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
25545         SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N), RHS.getValueType(), LHS,
25546                                    RHS.getOperand(1));
25547         return DAG.getSetCC(SDLoc(N), N->getValueType(0), addV,
25548                             DAG.getConstant(0, addV.getValueType()), CC);
25549       }
25550
25551   if (VT.getScalarType() == MVT::i1 &&
25552       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
25553     bool IsSEXT0 =
25554         (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
25555         (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
25556     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
25557
25558     if (!IsSEXT0 || !IsVZero1) {
25559       // Swap the operands and update the condition code.
25560       std::swap(LHS, RHS);
25561       CC = ISD::getSetCCSwappedOperands(CC);
25562
25563       IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
25564                 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
25565       IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
25566     }
25567
25568     if (IsSEXT0 && IsVZero1) {
25569       assert(VT == LHS.getOperand(0).getValueType() &&
25570              "Uexpected operand type");
25571       if (CC == ISD::SETGT)
25572         return DAG.getConstant(0, VT);
25573       if (CC == ISD::SETLE)
25574         return DAG.getConstant(1, VT);
25575       if (CC == ISD::SETEQ || CC == ISD::SETGE)
25576         return DAG.getNOT(DL, LHS.getOperand(0), VT);
25577
25578       assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
25579              "Unexpected condition code!");
25580       return LHS.getOperand(0);
25581     }
25582   }
25583
25584   return SDValue();
25585 }
25586
25587 static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
25588                                       const X86Subtarget *Subtarget) {
25589   SDLoc dl(N);
25590   MVT VT = N->getOperand(1)->getSimpleValueType(0);
25591   assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
25592          "X86insertps is only defined for v4x32");
25593
25594   SDValue Ld = N->getOperand(1);
25595   if (MayFoldLoad(Ld)) {
25596     // Extract the countS bits from the immediate so we can get the proper
25597     // address when narrowing the vector load to a specific element.
25598     // When the second source op is a memory address, interps doesn't use
25599     // countS and just gets an f32 from that address.
25600     unsigned DestIndex =
25601         cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
25602     Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
25603   } else
25604     return SDValue();
25605
25606   // Create this as a scalar to vector to match the instruction pattern.
25607   SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
25608   // countS bits are ignored when loading from memory on insertps, which
25609   // means we don't need to explicitly set them to 0.
25610   return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
25611                      LoadScalarToVector, N->getOperand(2));
25612 }
25613
25614 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
25615 // as "sbb reg,reg", since it can be extended without zext and produces
25616 // an all-ones bit which is more useful than 0/1 in some cases.
25617 static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG,
25618                                MVT VT) {
25619   if (VT == MVT::i8)
25620     return DAG.getNode(ISD::AND, DL, VT,
25621                        DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25622                                    DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
25623                        DAG.getConstant(1, VT));
25624   assert (VT == MVT::i1 && "Unexpected type for SECCC node");
25625   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
25626                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
25627                                  DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS));
25628 }
25629
25630 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
25631 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
25632                                    TargetLowering::DAGCombinerInfo &DCI,
25633                                    const X86Subtarget *Subtarget) {
25634   SDLoc DL(N);
25635   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
25636   SDValue EFLAGS = N->getOperand(1);
25637
25638   if (CC == X86::COND_A) {
25639     // Try to convert COND_A into COND_B in an attempt to facilitate
25640     // materializing "setb reg".
25641     //
25642     // Do not flip "e > c", where "c" is a constant, because Cmp instruction
25643     // cannot take an immediate as its first operand.
25644     //
25645     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
25646         EFLAGS.getValueType().isInteger() &&
25647         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
25648       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
25649                                    EFLAGS.getNode()->getVTList(),
25650                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
25651       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
25652       return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
25653     }
25654   }
25655
25656   // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
25657   // a zext and produces an all-ones bit which is more useful than 0/1 in some
25658   // cases.
25659   if (CC == X86::COND_B)
25660     return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
25661
25662   SDValue Flags;
25663
25664   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25665   if (Flags.getNode()) {
25666     SDValue Cond = DAG.getConstant(CC, MVT::i8);
25667     return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
25668   }
25669
25670   return SDValue();
25671 }
25672
25673 // Optimize branch condition evaluation.
25674 //
25675 static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
25676                                     TargetLowering::DAGCombinerInfo &DCI,
25677                                     const X86Subtarget *Subtarget) {
25678   SDLoc DL(N);
25679   SDValue Chain = N->getOperand(0);
25680   SDValue Dest = N->getOperand(1);
25681   SDValue EFLAGS = N->getOperand(3);
25682   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
25683
25684   SDValue Flags;
25685
25686   Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
25687   if (Flags.getNode()) {
25688     SDValue Cond = DAG.getConstant(CC, MVT::i8);
25689     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
25690                        Flags);
25691   }
25692
25693   return SDValue();
25694 }
25695
25696 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
25697                                                          SelectionDAG &DAG) {
25698   // Take advantage of vector comparisons producing 0 or -1 in each lane to
25699   // optimize away operation when it's from a constant.
25700   //
25701   // The general transformation is:
25702   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
25703   //       AND(VECTOR_CMP(x,y), constant2)
25704   //    constant2 = UNARYOP(constant)
25705
25706   // Early exit if this isn't a vector operation, the operand of the
25707   // unary operation isn't a bitwise AND, or if the sizes of the operations
25708   // aren't the same.
25709   EVT VT = N->getValueType(0);
25710   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
25711       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
25712       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
25713     return SDValue();
25714
25715   // Now check that the other operand of the AND is a constant. We could
25716   // make the transformation for non-constant splats as well, but it's unclear
25717   // that would be a benefit as it would not eliminate any operations, just
25718   // perform one more step in scalar code before moving to the vector unit.
25719   if (BuildVectorSDNode *BV =
25720           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
25721     // Bail out if the vector isn't a constant.
25722     if (!BV->isConstant())
25723       return SDValue();
25724
25725     // Everything checks out. Build up the new and improved node.
25726     SDLoc DL(N);
25727     EVT IntVT = BV->getValueType(0);
25728     // Create a new constant of the appropriate type for the transformed
25729     // DAG.
25730     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
25731     // The AND node needs bitcasts to/from an integer vector type around it.
25732     SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
25733     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
25734                                  N->getOperand(0)->getOperand(0), MaskConst);
25735     SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
25736     return Res;
25737   }
25738
25739   return SDValue();
25740 }
25741
25742 static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
25743                                         const X86TargetLowering *XTLI) {
25744   // First try to optimize away the conversion entirely when it's
25745   // conditionally from a constant. Vectors only.
25746   SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
25747   if (Res != SDValue())
25748     return Res;
25749
25750   // Now move on to more general possibilities.
25751   SDValue Op0 = N->getOperand(0);
25752   EVT InVT = Op0->getValueType(0);
25753
25754   // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
25755   if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
25756     SDLoc dl(N);
25757     MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
25758     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
25759     return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
25760   }
25761
25762   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
25763   // a 32-bit target where SSE doesn't support i64->FP operations.
25764   if (Op0.getOpcode() == ISD::LOAD) {
25765     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
25766     EVT VT = Ld->getValueType(0);
25767     if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
25768         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
25769         !XTLI->getSubtarget()->is64Bit() &&
25770         VT == MVT::i64) {
25771       SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
25772                                           Ld->getChain(), Op0, DAG);
25773       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
25774       return FILDChain;
25775     }
25776   }
25777   return SDValue();
25778 }
25779
25780 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
25781 static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
25782                                  X86TargetLowering::DAGCombinerInfo &DCI) {
25783   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
25784   // the result is either zero or one (depending on the input carry bit).
25785   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
25786   if (X86::isZeroNode(N->getOperand(0)) &&
25787       X86::isZeroNode(N->getOperand(1)) &&
25788       // We don't have a good way to replace an EFLAGS use, so only do this when
25789       // dead right now.
25790       SDValue(N, 1).use_empty()) {
25791     SDLoc DL(N);
25792     EVT VT = N->getValueType(0);
25793     SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
25794     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
25795                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
25796                                            DAG.getConstant(X86::COND_B,MVT::i8),
25797                                            N->getOperand(2)),
25798                                DAG.getConstant(1, VT));
25799     return DCI.CombineTo(N, Res1, CarryOut);
25800   }
25801
25802   return SDValue();
25803 }
25804
25805 // fold (add Y, (sete  X, 0)) -> adc  0, Y
25806 //      (add Y, (setne X, 0)) -> sbb -1, Y
25807 //      (sub (sete  X, 0), Y) -> sbb  0, Y
25808 //      (sub (setne X, 0), Y) -> adc -1, Y
25809 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
25810   SDLoc DL(N);
25811
25812   // Look through ZExts.
25813   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
25814   if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
25815     return SDValue();
25816
25817   SDValue SetCC = Ext.getOperand(0);
25818   if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
25819     return SDValue();
25820
25821   X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
25822   if (CC != X86::COND_E && CC != X86::COND_NE)
25823     return SDValue();
25824
25825   SDValue Cmp = SetCC.getOperand(1);
25826   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
25827       !X86::isZeroNode(Cmp.getOperand(1)) ||
25828       !Cmp.getOperand(0).getValueType().isInteger())
25829     return SDValue();
25830
25831   SDValue CmpOp0 = Cmp.getOperand(0);
25832   SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
25833                                DAG.getConstant(1, CmpOp0.getValueType()));
25834
25835   SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
25836   if (CC == X86::COND_NE)
25837     return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
25838                        DL, OtherVal.getValueType(), OtherVal,
25839                        DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
25840   return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
25841                      DL, OtherVal.getValueType(), OtherVal,
25842                      DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
25843 }
25844
25845 /// PerformADDCombine - Do target-specific dag combines on integer adds.
25846 static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
25847                                  const X86Subtarget *Subtarget) {
25848   EVT VT = N->getValueType(0);
25849   SDValue Op0 = N->getOperand(0);
25850   SDValue Op1 = N->getOperand(1);
25851
25852   // Try to synthesize horizontal adds from adds of shuffles.
25853   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
25854        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
25855       isHorizontalBinOp(Op0, Op1, true))
25856     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
25857
25858   return OptimizeConditionalInDecrement(N, DAG);
25859 }
25860
25861 static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
25862                                  const X86Subtarget *Subtarget) {
25863   SDValue Op0 = N->getOperand(0);
25864   SDValue Op1 = N->getOperand(1);
25865
25866   // X86 can't encode an immediate LHS of a sub. See if we can push the
25867   // negation into a preceding instruction.
25868   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
25869     // If the RHS of the sub is a XOR with one use and a constant, invert the
25870     // immediate. Then add one to the LHS of the sub so we can turn
25871     // X-Y -> X+~Y+1, saving one register.
25872     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
25873         isa<ConstantSDNode>(Op1.getOperand(1))) {
25874       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
25875       EVT VT = Op0.getValueType();
25876       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
25877                                    Op1.getOperand(0),
25878                                    DAG.getConstant(~XorC, VT));
25879       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
25880                          DAG.getConstant(C->getAPIntValue()+1, VT));
25881     }
25882   }
25883
25884   // Try to synthesize horizontal adds from adds of shuffles.
25885   EVT VT = N->getValueType(0);
25886   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
25887        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
25888       isHorizontalBinOp(Op0, Op1, true))
25889     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
25890
25891   return OptimizeConditionalInDecrement(N, DAG);
25892 }
25893
25894 /// performVZEXTCombine - Performs build vector combines
25895 static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
25896                                    TargetLowering::DAGCombinerInfo &DCI,
25897                                    const X86Subtarget *Subtarget) {
25898   SDLoc DL(N);
25899   MVT VT = N->getSimpleValueType(0);
25900   SDValue Op = N->getOperand(0);
25901   MVT OpVT = Op.getSimpleValueType();
25902   MVT OpEltVT = OpVT.getVectorElementType();
25903   unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
25904
25905   // (vzext (bitcast (vzext (x)) -> (vzext x)
25906   SDValue V = Op;
25907   while (V.getOpcode() == ISD::BITCAST)
25908     V = V.getOperand(0);
25909
25910   if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
25911     MVT InnerVT = V.getSimpleValueType();
25912     MVT InnerEltVT = InnerVT.getVectorElementType();
25913
25914     // If the element sizes match exactly, we can just do one larger vzext. This
25915     // is always an exact type match as vzext operates on integer types.
25916     if (OpEltVT == InnerEltVT) {
25917       assert(OpVT == InnerVT && "Types must match for vzext!");
25918       return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
25919     }
25920
25921     // The only other way we can combine them is if only a single element of the
25922     // inner vzext is used in the input to the outer vzext.
25923     if (InnerEltVT.getSizeInBits() < InputBits)
25924       return SDValue();
25925
25926     // In this case, the inner vzext is completely dead because we're going to
25927     // only look at bits inside of the low element. Just do the outer vzext on
25928     // a bitcast of the input to the inner.
25929     return DAG.getNode(X86ISD::VZEXT, DL, VT,
25930                        DAG.getNode(ISD::BITCAST, DL, OpVT, V));
25931   }
25932
25933   // Check if we can bypass extracting and re-inserting an element of an input
25934   // vector. Essentialy:
25935   // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
25936   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
25937       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
25938       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
25939     SDValue ExtractedV = V.getOperand(0);
25940     SDValue OrigV = ExtractedV.getOperand(0);
25941     if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1)))
25942       if (ExtractIdx->getZExtValue() == 0) {
25943         MVT OrigVT = OrigV.getSimpleValueType();
25944         // Extract a subvector if necessary...
25945         if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
25946           int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
25947           OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
25948                                     OrigVT.getVectorNumElements() / Ratio);
25949           OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
25950                               DAG.getIntPtrConstant(0));
25951         }
25952         Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV);
25953         return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
25954       }
25955   }
25956
25957   return SDValue();
25958 }
25959
25960 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
25961                                              DAGCombinerInfo &DCI) const {
25962   SelectionDAG &DAG = DCI.DAG;
25963   switch (N->getOpcode()) {
25964   default: break;
25965   case ISD::EXTRACT_VECTOR_ELT:
25966     return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
25967   case ISD::VSELECT:
25968   case ISD::SELECT:
25969   case X86ISD::SHRUNKBLEND:
25970     return PerformSELECTCombine(N, DAG, DCI, Subtarget);
25971   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI, Subtarget);
25972   case ISD::ADD:            return PerformAddCombine(N, DAG, Subtarget);
25973   case ISD::SUB:            return PerformSubCombine(N, DAG, Subtarget);
25974   case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
25975   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
25976   case ISD::SHL:
25977   case ISD::SRA:
25978   case ISD::SRL:            return PerformShiftCombine(N, DAG, DCI, Subtarget);
25979   case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
25980   case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
25981   case ISD::XOR:            return PerformXorCombine(N, DAG, DCI, Subtarget);
25982   case ISD::LOAD:           return PerformLOADCombine(N, DAG, DCI, Subtarget);
25983   case ISD::MLOAD:          return PerformMLOADCombine(N, DAG, DCI, Subtarget);
25984   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
25985   case ISD::MSTORE:         return PerformMSTORECombine(N, DAG, Subtarget);
25986   case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
25987   case ISD::FADD:           return PerformFADDCombine(N, DAG, Subtarget);
25988   case ISD::FSUB:           return PerformFSUBCombine(N, DAG, Subtarget);
25989   case X86ISD::FXOR:
25990   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
25991   case X86ISD::FMIN:
25992   case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
25993   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
25994   case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
25995   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
25996   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
25997   case ISD::ANY_EXTEND:
25998   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
25999   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
26000   case ISD::SIGN_EXTEND_INREG:
26001     return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
26002   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
26003   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
26004   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
26005   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
26006   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
26007   case X86ISD::SHUFP:       // Handle all target specific shuffles
26008   case X86ISD::PALIGNR:
26009   case X86ISD::UNPCKH:
26010   case X86ISD::UNPCKL:
26011   case X86ISD::MOVHLPS:
26012   case X86ISD::MOVLHPS:
26013   case X86ISD::PSHUFB:
26014   case X86ISD::PSHUFD:
26015   case X86ISD::PSHUFHW:
26016   case X86ISD::PSHUFLW:
26017   case X86ISD::MOVSS:
26018   case X86ISD::MOVSD:
26019   case X86ISD::VPERMILPI:
26020   case X86ISD::VPERM2X128:
26021   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
26022   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
26023   case ISD::INTRINSIC_WO_CHAIN:
26024     return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
26025   case X86ISD::INSERTPS:
26026     return PerformINSERTPSCombine(N, DAG, Subtarget);
26027   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DAG, Subtarget);
26028   }
26029
26030   return SDValue();
26031 }
26032
26033 /// isTypeDesirableForOp - Return true if the target has native support for
26034 /// the specified value type and it is 'desirable' to use the type for the
26035 /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
26036 /// instruction encodings are longer and some i16 instructions are slow.
26037 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
26038   if (!isTypeLegal(VT))
26039     return false;
26040   if (VT != MVT::i16)
26041     return true;
26042
26043   switch (Opc) {
26044   default:
26045     return true;
26046   case ISD::LOAD:
26047   case ISD::SIGN_EXTEND:
26048   case ISD::ZERO_EXTEND:
26049   case ISD::ANY_EXTEND:
26050   case ISD::SHL:
26051   case ISD::SRL:
26052   case ISD::SUB:
26053   case ISD::ADD:
26054   case ISD::MUL:
26055   case ISD::AND:
26056   case ISD::OR:
26057   case ISD::XOR:
26058     return false;
26059   }
26060 }
26061
26062 /// IsDesirableToPromoteOp - This method query the target whether it is
26063 /// beneficial for dag combiner to promote the specified node. If true, it
26064 /// should return the desired promotion type by reference.
26065 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
26066   EVT VT = Op.getValueType();
26067   if (VT != MVT::i16)
26068     return false;
26069
26070   bool Promote = false;
26071   bool Commute = false;
26072   switch (Op.getOpcode()) {
26073   default: break;
26074   case ISD::LOAD: {
26075     LoadSDNode *LD = cast<LoadSDNode>(Op);
26076     // If the non-extending load has a single use and it's not live out, then it
26077     // might be folded.
26078     if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
26079                                                      Op.hasOneUse()*/) {
26080       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
26081              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
26082         // The only case where we'd want to promote LOAD (rather then it being
26083         // promoted as an operand is when it's only use is liveout.
26084         if (UI->getOpcode() != ISD::CopyToReg)
26085           return false;
26086       }
26087     }
26088     Promote = true;
26089     break;
26090   }
26091   case ISD::SIGN_EXTEND:
26092   case ISD::ZERO_EXTEND:
26093   case ISD::ANY_EXTEND:
26094     Promote = true;
26095     break;
26096   case ISD::SHL:
26097   case ISD::SRL: {
26098     SDValue N0 = Op.getOperand(0);
26099     // Look out for (store (shl (load), x)).
26100     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
26101       return false;
26102     Promote = true;
26103     break;
26104   }
26105   case ISD::ADD:
26106   case ISD::MUL:
26107   case ISD::AND:
26108   case ISD::OR:
26109   case ISD::XOR:
26110     Commute = true;
26111     // fallthrough
26112   case ISD::SUB: {
26113     SDValue N0 = Op.getOperand(0);
26114     SDValue N1 = Op.getOperand(1);
26115     if (!Commute && MayFoldLoad(N1))
26116       return false;
26117     // Avoid disabling potential load folding opportunities.
26118     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
26119       return false;
26120     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
26121       return false;
26122     Promote = true;
26123   }
26124   }
26125
26126   PVT = MVT::i32;
26127   return Promote;
26128 }
26129
26130 //===----------------------------------------------------------------------===//
26131 //                           X86 Inline Assembly Support
26132 //===----------------------------------------------------------------------===//
26133
26134 namespace {
26135   // Helper to match a string separated by whitespace.
26136   bool matchAsmImpl(StringRef s, ArrayRef<const StringRef *> args) {
26137     s = s.substr(s.find_first_not_of(" \t")); // Skip leading whitespace.
26138
26139     for (unsigned i = 0, e = args.size(); i != e; ++i) {
26140       StringRef piece(*args[i]);
26141       if (!s.startswith(piece)) // Check if the piece matches.
26142         return false;
26143
26144       s = s.substr(piece.size());
26145       StringRef::size_type pos = s.find_first_not_of(" \t");
26146       if (pos == 0) // We matched a prefix.
26147         return false;
26148
26149       s = s.substr(pos);
26150     }
26151
26152     return s.empty();
26153   }
26154   const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
26155 }
26156
26157 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
26158
26159   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
26160     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
26161         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
26162         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
26163
26164       if (AsmPieces.size() == 3)
26165         return true;
26166       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
26167         return true;
26168     }
26169   }
26170   return false;
26171 }
26172
26173 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
26174   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
26175
26176   std::string AsmStr = IA->getAsmString();
26177
26178   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
26179   if (!Ty || Ty->getBitWidth() % 16 != 0)
26180     return false;
26181
26182   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
26183   SmallVector<StringRef, 4> AsmPieces;
26184   SplitString(AsmStr, AsmPieces, ";\n");
26185
26186   switch (AsmPieces.size()) {
26187   default: return false;
26188   case 1:
26189     // FIXME: this should verify that we are targeting a 486 or better.  If not,
26190     // we will turn this bswap into something that will be lowered to logical
26191     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
26192     // lower so don't worry about this.
26193     // bswap $0
26194     if (matchAsm(AsmPieces[0], "bswap", "$0") ||
26195         matchAsm(AsmPieces[0], "bswapl", "$0") ||
26196         matchAsm(AsmPieces[0], "bswapq", "$0") ||
26197         matchAsm(AsmPieces[0], "bswap", "${0:q}") ||
26198         matchAsm(AsmPieces[0], "bswapl", "${0:q}") ||
26199         matchAsm(AsmPieces[0], "bswapq", "${0:q}")) {
26200       // No need to check constraints, nothing other than the equivalent of
26201       // "=r,0" would be valid here.
26202       return IntrinsicLowering::LowerToByteSwap(CI);
26203     }
26204
26205     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
26206     if (CI->getType()->isIntegerTy(16) &&
26207         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26208         (matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") ||
26209          matchAsm(AsmPieces[0], "rolw", "$$8,", "${0:w}"))) {
26210       AsmPieces.clear();
26211       const std::string &ConstraintsStr = IA->getConstraintString();
26212       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26213       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26214       if (clobbersFlagRegisters(AsmPieces))
26215         return IntrinsicLowering::LowerToByteSwap(CI);
26216     }
26217     break;
26218   case 3:
26219     if (CI->getType()->isIntegerTy(32) &&
26220         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
26221         matchAsm(AsmPieces[0], "rorw", "$$8,", "${0:w}") &&
26222         matchAsm(AsmPieces[1], "rorl", "$$16,", "$0") &&
26223         matchAsm(AsmPieces[2], "rorw", "$$8,", "${0:w}")) {
26224       AsmPieces.clear();
26225       const std::string &ConstraintsStr = IA->getConstraintString();
26226       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
26227       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
26228       if (clobbersFlagRegisters(AsmPieces))
26229         return IntrinsicLowering::LowerToByteSwap(CI);
26230     }
26231
26232     if (CI->getType()->isIntegerTy(64)) {
26233       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
26234       if (Constraints.size() >= 2 &&
26235           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
26236           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
26237         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
26238         if (matchAsm(AsmPieces[0], "bswap", "%eax") &&
26239             matchAsm(AsmPieces[1], "bswap", "%edx") &&
26240             matchAsm(AsmPieces[2], "xchgl", "%eax,", "%edx"))
26241           return IntrinsicLowering::LowerToByteSwap(CI);
26242       }
26243     }
26244     break;
26245   }
26246   return false;
26247 }
26248
26249 /// getConstraintType - Given a constraint letter, return the type of
26250 /// constraint it is for this target.
26251 X86TargetLowering::ConstraintType
26252 X86TargetLowering::getConstraintType(const std::string &Constraint) const {
26253   if (Constraint.size() == 1) {
26254     switch (Constraint[0]) {
26255     case 'R':
26256     case 'q':
26257     case 'Q':
26258     case 'f':
26259     case 't':
26260     case 'u':
26261     case 'y':
26262     case 'x':
26263     case 'Y':
26264     case 'l':
26265       return C_RegisterClass;
26266     case 'a':
26267     case 'b':
26268     case 'c':
26269     case 'd':
26270     case 'S':
26271     case 'D':
26272     case 'A':
26273       return C_Register;
26274     case 'I':
26275     case 'J':
26276     case 'K':
26277     case 'L':
26278     case 'M':
26279     case 'N':
26280     case 'G':
26281     case 'C':
26282     case 'e':
26283     case 'Z':
26284       return C_Other;
26285     default:
26286       break;
26287     }
26288   }
26289   return TargetLowering::getConstraintType(Constraint);
26290 }
26291
26292 /// Examine constraint type and operand type and determine a weight value.
26293 /// This object must already have been set up with the operand type
26294 /// and the current alternative constraint selected.
26295 TargetLowering::ConstraintWeight
26296   X86TargetLowering::getSingleConstraintMatchWeight(
26297     AsmOperandInfo &info, const char *constraint) const {
26298   ConstraintWeight weight = CW_Invalid;
26299   Value *CallOperandVal = info.CallOperandVal;
26300     // If we don't have a value, we can't do a match,
26301     // but allow it at the lowest weight.
26302   if (!CallOperandVal)
26303     return CW_Default;
26304   Type *type = CallOperandVal->getType();
26305   // Look at the constraint type.
26306   switch (*constraint) {
26307   default:
26308     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
26309   case 'R':
26310   case 'q':
26311   case 'Q':
26312   case 'a':
26313   case 'b':
26314   case 'c':
26315   case 'd':
26316   case 'S':
26317   case 'D':
26318   case 'A':
26319     if (CallOperandVal->getType()->isIntegerTy())
26320       weight = CW_SpecificReg;
26321     break;
26322   case 'f':
26323   case 't':
26324   case 'u':
26325     if (type->isFloatingPointTy())
26326       weight = CW_SpecificReg;
26327     break;
26328   case 'y':
26329     if (type->isX86_MMXTy() && Subtarget->hasMMX())
26330       weight = CW_SpecificReg;
26331     break;
26332   case 'x':
26333   case 'Y':
26334     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
26335         ((type->getPrimitiveSizeInBits() == 256) && Subtarget->hasFp256()))
26336       weight = CW_Register;
26337     break;
26338   case 'I':
26339     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
26340       if (C->getZExtValue() <= 31)
26341         weight = CW_Constant;
26342     }
26343     break;
26344   case 'J':
26345     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26346       if (C->getZExtValue() <= 63)
26347         weight = CW_Constant;
26348     }
26349     break;
26350   case 'K':
26351     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26352       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
26353         weight = CW_Constant;
26354     }
26355     break;
26356   case 'L':
26357     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26358       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
26359         weight = CW_Constant;
26360     }
26361     break;
26362   case 'M':
26363     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26364       if (C->getZExtValue() <= 3)
26365         weight = CW_Constant;
26366     }
26367     break;
26368   case 'N':
26369     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26370       if (C->getZExtValue() <= 0xff)
26371         weight = CW_Constant;
26372     }
26373     break;
26374   case 'G':
26375   case 'C':
26376     if (dyn_cast<ConstantFP>(CallOperandVal)) {
26377       weight = CW_Constant;
26378     }
26379     break;
26380   case 'e':
26381     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26382       if ((C->getSExtValue() >= -0x80000000LL) &&
26383           (C->getSExtValue() <= 0x7fffffffLL))
26384         weight = CW_Constant;
26385     }
26386     break;
26387   case 'Z':
26388     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
26389       if (C->getZExtValue() <= 0xffffffff)
26390         weight = CW_Constant;
26391     }
26392     break;
26393   }
26394   return weight;
26395 }
26396
26397 /// LowerXConstraint - try to replace an X constraint, which matches anything,
26398 /// with another that has more specific requirements based on the type of the
26399 /// corresponding operand.
26400 const char *X86TargetLowering::
26401 LowerXConstraint(EVT ConstraintVT) const {
26402   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
26403   // 'f' like normal targets.
26404   if (ConstraintVT.isFloatingPoint()) {
26405     if (Subtarget->hasSSE2())
26406       return "Y";
26407     if (Subtarget->hasSSE1())
26408       return "x";
26409   }
26410
26411   return TargetLowering::LowerXConstraint(ConstraintVT);
26412 }
26413
26414 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
26415 /// vector.  If it is invalid, don't add anything to Ops.
26416 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
26417                                                      std::string &Constraint,
26418                                                      std::vector<SDValue>&Ops,
26419                                                      SelectionDAG &DAG) const {
26420   SDValue Result;
26421
26422   // Only support length 1 constraints for now.
26423   if (Constraint.length() > 1) return;
26424
26425   char ConstraintLetter = Constraint[0];
26426   switch (ConstraintLetter) {
26427   default: break;
26428   case 'I':
26429     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26430       if (C->getZExtValue() <= 31) {
26431         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26432         break;
26433       }
26434     }
26435     return;
26436   case 'J':
26437     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26438       if (C->getZExtValue() <= 63) {
26439         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26440         break;
26441       }
26442     }
26443     return;
26444   case 'K':
26445     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26446       if (isInt<8>(C->getSExtValue())) {
26447         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26448         break;
26449       }
26450     }
26451     return;
26452   case 'L':
26453     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26454       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
26455           (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) {
26456         Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType());
26457         break;
26458       }
26459     }
26460     return;
26461   case 'M':
26462     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26463       if (C->getZExtValue() <= 3) {
26464         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26465         break;
26466       }
26467     }
26468     return;
26469   case 'N':
26470     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26471       if (C->getZExtValue() <= 255) {
26472         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26473         break;
26474       }
26475     }
26476     return;
26477   case 'O':
26478     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26479       if (C->getZExtValue() <= 127) {
26480         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26481         break;
26482       }
26483     }
26484     return;
26485   case 'e': {
26486     // 32-bit signed value
26487     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26488       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26489                                            C->getSExtValue())) {
26490         // Widen to 64 bits here to get it sign extended.
26491         Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
26492         break;
26493       }
26494     // FIXME gcc accepts some relocatable values here too, but only in certain
26495     // memory models; it's complicated.
26496     }
26497     return;
26498   }
26499   case 'Z': {
26500     // 32-bit unsigned value
26501     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
26502       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
26503                                            C->getZExtValue())) {
26504         Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
26505         break;
26506       }
26507     }
26508     // FIXME gcc accepts some relocatable values here too, but only in certain
26509     // memory models; it's complicated.
26510     return;
26511   }
26512   case 'i': {
26513     // Literal immediates are always ok.
26514     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
26515       // Widen to 64 bits here to get it sign extended.
26516       Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
26517       break;
26518     }
26519
26520     // In any sort of PIC mode addresses need to be computed at runtime by
26521     // adding in a register or some sort of table lookup.  These can't
26522     // be used as immediates.
26523     if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
26524       return;
26525
26526     // If we are in non-pic codegen mode, we allow the address of a global (with
26527     // an optional displacement) to be used with 'i'.
26528     GlobalAddressSDNode *GA = nullptr;
26529     int64_t Offset = 0;
26530
26531     // Match either (GA), (GA+C), (GA+C1+C2), etc.
26532     while (1) {
26533       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
26534         Offset += GA->getOffset();
26535         break;
26536       } else if (Op.getOpcode() == ISD::ADD) {
26537         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26538           Offset += C->getZExtValue();
26539           Op = Op.getOperand(0);
26540           continue;
26541         }
26542       } else if (Op.getOpcode() == ISD::SUB) {
26543         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26544           Offset += -C->getZExtValue();
26545           Op = Op.getOperand(0);
26546           continue;
26547         }
26548       }
26549
26550       // Otherwise, this isn't something we can handle, reject it.
26551       return;
26552     }
26553
26554     const GlobalValue *GV = GA->getGlobal();
26555     // If we require an extra load to get this address, as in PIC mode, we
26556     // can't accept it.
26557     if (isGlobalStubReference(
26558             Subtarget->ClassifyGlobalReference(GV, DAG.getTarget())))
26559       return;
26560
26561     Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
26562                                         GA->getValueType(0), Offset);
26563     break;
26564   }
26565   }
26566
26567   if (Result.getNode()) {
26568     Ops.push_back(Result);
26569     return;
26570   }
26571   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
26572 }
26573
26574 std::pair<unsigned, const TargetRegisterClass*>
26575 X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
26576                                                 MVT VT) const {
26577   // First, see if this is a constraint that directly corresponds to an LLVM
26578   // register class.
26579   if (Constraint.size() == 1) {
26580     // GCC Constraint Letters
26581     switch (Constraint[0]) {
26582     default: break;
26583       // TODO: Slight differences here in allocation order and leaving
26584       // RIP in the class. Do they matter any more here than they do
26585       // in the normal allocation?
26586     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
26587       if (Subtarget->is64Bit()) {
26588         if (VT == MVT::i32 || VT == MVT::f32)
26589           return std::make_pair(0U, &X86::GR32RegClass);
26590         if (VT == MVT::i16)
26591           return std::make_pair(0U, &X86::GR16RegClass);
26592         if (VT == MVT::i8 || VT == MVT::i1)
26593           return std::make_pair(0U, &X86::GR8RegClass);
26594         if (VT == MVT::i64 || VT == MVT::f64)
26595           return std::make_pair(0U, &X86::GR64RegClass);
26596         break;
26597       }
26598       // 32-bit fallthrough
26599     case 'Q':   // Q_REGS
26600       if (VT == MVT::i32 || VT == MVT::f32)
26601         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
26602       if (VT == MVT::i16)
26603         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
26604       if (VT == MVT::i8 || VT == MVT::i1)
26605         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
26606       if (VT == MVT::i64)
26607         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
26608       break;
26609     case 'r':   // GENERAL_REGS
26610     case 'l':   // INDEX_REGS
26611       if (VT == MVT::i8 || VT == MVT::i1)
26612         return std::make_pair(0U, &X86::GR8RegClass);
26613       if (VT == MVT::i16)
26614         return std::make_pair(0U, &X86::GR16RegClass);
26615       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget->is64Bit())
26616         return std::make_pair(0U, &X86::GR32RegClass);
26617       return std::make_pair(0U, &X86::GR64RegClass);
26618     case 'R':   // LEGACY_REGS
26619       if (VT == MVT::i8 || VT == MVT::i1)
26620         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
26621       if (VT == MVT::i16)
26622         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
26623       if (VT == MVT::i32 || !Subtarget->is64Bit())
26624         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
26625       return std::make_pair(0U, &X86::GR64_NOREXRegClass);
26626     case 'f':  // FP Stack registers.
26627       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
26628       // value to the correct fpstack register class.
26629       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
26630         return std::make_pair(0U, &X86::RFP32RegClass);
26631       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
26632         return std::make_pair(0U, &X86::RFP64RegClass);
26633       return std::make_pair(0U, &X86::RFP80RegClass);
26634     case 'y':   // MMX_REGS if MMX allowed.
26635       if (!Subtarget->hasMMX()) break;
26636       return std::make_pair(0U, &X86::VR64RegClass);
26637     case 'Y':   // SSE_REGS if SSE2 allowed
26638       if (!Subtarget->hasSSE2()) break;
26639       // FALL THROUGH.
26640     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
26641       if (!Subtarget->hasSSE1()) break;
26642
26643       switch (VT.SimpleTy) {
26644       default: break;
26645       // Scalar SSE types.
26646       case MVT::f32:
26647       case MVT::i32:
26648         return std::make_pair(0U, &X86::FR32RegClass);
26649       case MVT::f64:
26650       case MVT::i64:
26651         return std::make_pair(0U, &X86::FR64RegClass);
26652       // Vector types.
26653       case MVT::v16i8:
26654       case MVT::v8i16:
26655       case MVT::v4i32:
26656       case MVT::v2i64:
26657       case MVT::v4f32:
26658       case MVT::v2f64:
26659         return std::make_pair(0U, &X86::VR128RegClass);
26660       // AVX types.
26661       case MVT::v32i8:
26662       case MVT::v16i16:
26663       case MVT::v8i32:
26664       case MVT::v4i64:
26665       case MVT::v8f32:
26666       case MVT::v4f64:
26667         return std::make_pair(0U, &X86::VR256RegClass);
26668       case MVT::v8f64:
26669       case MVT::v16f32:
26670       case MVT::v16i32:
26671       case MVT::v8i64:
26672         return std::make_pair(0U, &X86::VR512RegClass);
26673       }
26674       break;
26675     }
26676   }
26677
26678   // Use the default implementation in TargetLowering to convert the register
26679   // constraint into a member of a register class.
26680   std::pair<unsigned, const TargetRegisterClass*> Res;
26681   Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
26682
26683   // Not found as a standard register?
26684   if (!Res.second) {
26685     // Map st(0) -> st(7) -> ST0
26686     if (Constraint.size() == 7 && Constraint[0] == '{' &&
26687         tolower(Constraint[1]) == 's' &&
26688         tolower(Constraint[2]) == 't' &&
26689         Constraint[3] == '(' &&
26690         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
26691         Constraint[5] == ')' &&
26692         Constraint[6] == '}') {
26693
26694       Res.first = X86::FP0+Constraint[4]-'0';
26695       Res.second = &X86::RFP80RegClass;
26696       return Res;
26697     }
26698
26699     // GCC allows "st(0)" to be called just plain "st".
26700     if (StringRef("{st}").equals_lower(Constraint)) {
26701       Res.first = X86::FP0;
26702       Res.second = &X86::RFP80RegClass;
26703       return Res;
26704     }
26705
26706     // flags -> EFLAGS
26707     if (StringRef("{flags}").equals_lower(Constraint)) {
26708       Res.first = X86::EFLAGS;
26709       Res.second = &X86::CCRRegClass;
26710       return Res;
26711     }
26712
26713     // 'A' means EAX + EDX.
26714     if (Constraint == "A") {
26715       Res.first = X86::EAX;
26716       Res.second = &X86::GR32_ADRegClass;
26717       return Res;
26718     }
26719     return Res;
26720   }
26721
26722   // Otherwise, check to see if this is a register class of the wrong value
26723   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
26724   // turn into {ax},{dx}.
26725   if (Res.second->hasType(VT))
26726     return Res;   // Correct type already, nothing to do.
26727
26728   // All of the single-register GCC register classes map their values onto
26729   // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
26730   // really want an 8-bit or 32-bit register, map to the appropriate register
26731   // class and return the appropriate register.
26732   if (Res.second == &X86::GR16RegClass) {
26733     if (VT == MVT::i8 || VT == MVT::i1) {
26734       unsigned DestReg = 0;
26735       switch (Res.first) {
26736       default: break;
26737       case X86::AX: DestReg = X86::AL; break;
26738       case X86::DX: DestReg = X86::DL; break;
26739       case X86::CX: DestReg = X86::CL; break;
26740       case X86::BX: DestReg = X86::BL; break;
26741       }
26742       if (DestReg) {
26743         Res.first = DestReg;
26744         Res.second = &X86::GR8RegClass;
26745       }
26746     } else if (VT == MVT::i32 || VT == MVT::f32) {
26747       unsigned DestReg = 0;
26748       switch (Res.first) {
26749       default: break;
26750       case X86::AX: DestReg = X86::EAX; break;
26751       case X86::DX: DestReg = X86::EDX; break;
26752       case X86::CX: DestReg = X86::ECX; break;
26753       case X86::BX: DestReg = X86::EBX; break;
26754       case X86::SI: DestReg = X86::ESI; break;
26755       case X86::DI: DestReg = X86::EDI; break;
26756       case X86::BP: DestReg = X86::EBP; break;
26757       case X86::SP: DestReg = X86::ESP; break;
26758       }
26759       if (DestReg) {
26760         Res.first = DestReg;
26761         Res.second = &X86::GR32RegClass;
26762       }
26763     } else if (VT == MVT::i64 || VT == MVT::f64) {
26764       unsigned DestReg = 0;
26765       switch (Res.first) {
26766       default: break;
26767       case X86::AX: DestReg = X86::RAX; break;
26768       case X86::DX: DestReg = X86::RDX; break;
26769       case X86::CX: DestReg = X86::RCX; break;
26770       case X86::BX: DestReg = X86::RBX; break;
26771       case X86::SI: DestReg = X86::RSI; break;
26772       case X86::DI: DestReg = X86::RDI; break;
26773       case X86::BP: DestReg = X86::RBP; break;
26774       case X86::SP: DestReg = X86::RSP; break;
26775       }
26776       if (DestReg) {
26777         Res.first = DestReg;
26778         Res.second = &X86::GR64RegClass;
26779       }
26780     }
26781   } else if (Res.second == &X86::FR32RegClass ||
26782              Res.second == &X86::FR64RegClass ||
26783              Res.second == &X86::VR128RegClass ||
26784              Res.second == &X86::VR256RegClass ||
26785              Res.second == &X86::FR32XRegClass ||
26786              Res.second == &X86::FR64XRegClass ||
26787              Res.second == &X86::VR128XRegClass ||
26788              Res.second == &X86::VR256XRegClass ||
26789              Res.second == &X86::VR512RegClass) {
26790     // Handle references to XMM physical registers that got mapped into the
26791     // wrong class.  This can happen with constraints like {xmm0} where the
26792     // target independent register mapper will just pick the first match it can
26793     // find, ignoring the required type.
26794
26795     if (VT == MVT::f32 || VT == MVT::i32)
26796       Res.second = &X86::FR32RegClass;
26797     else if (VT == MVT::f64 || VT == MVT::i64)
26798       Res.second = &X86::FR64RegClass;
26799     else if (X86::VR128RegClass.hasType(VT))
26800       Res.second = &X86::VR128RegClass;
26801     else if (X86::VR256RegClass.hasType(VT))
26802       Res.second = &X86::VR256RegClass;
26803     else if (X86::VR512RegClass.hasType(VT))
26804       Res.second = &X86::VR512RegClass;
26805   }
26806
26807   return Res;
26808 }
26809
26810 int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
26811                                             Type *Ty) const {
26812   // Returns 1 if AM is legal for Ty and AM.Scale is nonzero, 0 if AM is legal
26813   // with a zero Scale, and -1 if AM is not a legal addressing mode for Ty.
26814   //
26815   // Scaling factors are not free at all: an indexed folded instruction, i.e.,
26816   // inst (reg1, reg2, scale), will take 2 allocations in the out of order
26817   // engine instead of 1 for plain addressing mode, i.e. inst (reg1). E.g.,
26818   //   vaddps (%rsi,%rdx), %ymm0, %ymm1
26819   // needs two allocations (one for the load, one for the computation), whereas
26820   //   vaddps (%rsi), %ymm0, %ymm1
26821   // requires just 1 allocation, i.e., freeing allocations for other operations
26822   // and having fewer micro operations to execute.
26823   //
26824   // For some X86 architectures, this is even worse because for instance for
26825   // stores, the complex addressing mode forces the instruction to use the
26826   // "load" ports instead of the dedicated "store" port.
26827   // E.g., on Haswell:
26828   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
26829   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
26830   if (isLegalAddressingMode(AM, Ty))
26831     // Scale represents reg2 * scale, thus account for 1
26832     // as soon as we use a second register.
26833     return AM.Scale != 0; // The comparison result converts to int 0 or 1.
26834   return -1; // AM is not a legal addressing mode for Ty.
26835 }
26836
26837 bool X86TargetLowering::isTargetFTOL() const { // True only for a known Windows MSVC subtarget that is not 64-bit.
26838   return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit();
26839 }